Diffstat (limited to 'arch/x86/kernel'): 105 files changed, 3165 insertions, 9966 deletions
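Two patterns dominate the hunks below. The first is a mechanical rename of the 64-bit MSR accessors from the *msrl names to the *msrq names (rdmsrl/wrmsrl -> rdmsrq/wrmsrq, plus the _safe and _safe_on_cpu variants), with <asm/msr.h> now included explicitly where those helpers are used. A minimal sketch of the new calling convention, using MSR_IA32_SPEC_CTRL as it appears in the bugs.c hunks; the example_* function names are hypothetical placeholders, only the accessors and the MSR constant come from the diff:

	#include <asm/msr.h>

	/* Illustrative read-modify-write with the renamed helpers. */
	static void example_set_spec_ctrl_bit(u64 bit)
	{
		u64 val;

		rdmsrq(MSR_IA32_SPEC_CTRL, val);	/* was rdmsrl() */
		val |= bit;
		wrmsrq(MSR_IA32_SPEC_CTRL, val);	/* was wrmsrl() */
	}

	/* The _safe variants still return an error code instead of faulting. */
	static int example_read_spec_ctrl(u64 *val)
	{
		return rdmsrq_safe(MSR_IA32_SPEC_CTRL, val);	/* was rdmsrl_safe() */
	}

The second is the text-patching rework in alternative.c: the INT3-based SMP patching interface is renamed (text_poke_queue/text_poke_finish/text_poke_bp become smp_text_poke_batch_add/smp_text_poke_batch_finish/smp_text_poke_single) and the pending-patch vector moves into a static text_poke_array with per-CPU refcounts. A rough usage sketch, assuming a caller that holds text_mutex as the code asserts; the example_* name and the addrs[]/nr_addrs placeholders are hypothetical, while the batching calls, x86_nops and text_mutex are taken from the diff:

	#include <linux/memory.h>		/* text_mutex */
	#include <asm/text-patching.h>

	/* Hypothetical caller: NOP out nr_addrs 5-byte call sites in one batch. */
	static void example_nop_out_calls(void **addrs, int nr_addrs)
	{
		int i;

		mutex_lock(&text_mutex);
		for (i = 0; i < nr_addrs; i++)
			smp_text_poke_batch_add(addrs[i], x86_nops[5], 5, NULL);
		/* Flush the whole queue with one INT3 install/patch/remove cycle: */
		smp_text_poke_batch_finish();
		mutex_unlock(&text_mutex);
	}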
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 84cfa179802c..99a783fd4691 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -141,7 +141,6 @@ obj-$(CONFIG_OF) += devicetree.o obj-$(CONFIG_UPROBES) += uprobes.o obj-$(CONFIG_PERF_EVENTS) += perf_regs.o -obj-$(CONFIG_TRACING) += tracepoint.o obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o obj-$(CONFIG_X86_UMIP) += umip.o diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c index 77bfb846490c..7047124490f6 100644 --- a/arch/x86/kernel/acpi/cppc.c +++ b/arch/x86/kernel/acpi/cppc.c @@ -49,7 +49,7 @@ int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val) { int err; - err = rdmsrl_safe_on_cpu(cpunum, reg->address, val); + err = rdmsrq_safe_on_cpu(cpunum, reg->address, val); if (!err) { u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, reg->bit_offset); @@ -65,7 +65,7 @@ int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) u64 rd_val; int err; - err = rdmsrl_safe_on_cpu(cpunum, reg->address, &rd_val); + err = rdmsrq_safe_on_cpu(cpunum, reg->address, &rd_val); if (!err) { u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, reg->bit_offset); @@ -74,7 +74,7 @@ int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) val &= mask; rd_val &= ~mask; rd_val |= val; - err = wrmsrl_safe_on_cpu(cpunum, reg->address, rd_val); + err = wrmsrq_safe_on_cpu(cpunum, reg->address, rd_val); } return err; } @@ -147,7 +147,7 @@ int amd_get_highest_perf(unsigned int cpu, u32 *highest_perf) int ret; if (cpu_feature_enabled(X86_FEATURE_CPPC)) { - ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &val); + ret = rdmsrq_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &val); if (ret) goto out; @@ -272,7 +272,7 @@ int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) } /* detect if running on heterogeneous design */ - if (cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES)) { + if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES)) { switch (core_type) { case TOPO_CPU_TYPE_UNKNOWN: pr_warn("Undefined core type found for cpu %d\n", cpu); diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index d5ac34186555..8698d66563ed 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -14,7 +14,7 @@ #include <acpi/processor.h> #include <asm/cpu_device_id.h> -#include <asm/cpuid.h> +#include <asm/cpuid/api.h> #include <asm/mwait.h> #include <asm/special_insns.h> #include <asm/smp.h> diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 6dfecb27b846..91fa262f0e30 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -16,6 +16,7 @@ #include <asm/cacheflush.h> #include <asm/realmode.h> #include <asm/hypervisor.h> +#include <asm/msr.h> #include <asm/smp.h> #include <linux/ftrace.h> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 45bcff181cba..ecfe7b497cad 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -1,39 +1,17 @@ // SPDX-License-Identifier: GPL-2.0-only #define pr_fmt(fmt) "SMP alternatives: " fmt -#include <linux/module.h> -#include <linux/sched.h> +#include <linux/mmu_context.h> #include <linux/perf_event.h> -#include <linux/mutex.h> -#include <linux/list.h> -#include <linux/stringify.h> -#include <linux/highmem.h> -#include <linux/mm.h> #include <linux/vmalloc.h> #include <linux/memory.h> -#include <linux/stop_machine.h> -#include <linux/slab.h> -#include <linux/kdebug.h> -#include <linux/kprobes.h> -#include <linux/mmu_context.h> 
-#include <linux/bsearch.h> -#include <linux/sync_core.h> #include <linux/execmem.h> + #include <asm/text-patching.h> -#include <asm/alternative.h> -#include <asm/sections.h> -#include <asm/mce.h> -#include <asm/nmi.h> -#include <asm/cacheflush.h> -#include <asm/tlbflush.h> #include <asm/insn.h> -#include <asm/io.h> -#include <asm/fixmap.h> -#include <asm/paravirt.h> -#include <asm/asm-prototypes.h> -#include <asm/cfi.h> #include <asm/ibt.h> #include <asm/set_memory.h> +#include <asm/nmi.h> int __read_mostly alternatives_patched; @@ -339,13 +317,6 @@ static void add_nop(u8 *buf, unsigned int len) *buf = INT3_INSN_OPCODE; } -extern s32 __retpoline_sites[], __retpoline_sites_end[]; -extern s32 __return_sites[], __return_sites_end[]; -extern s32 __cfi_sites[], __cfi_sites_end[]; -extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[]; -extern s32 __smp_locks[], __smp_locks_end[]; -void text_poke_early(void *addr, const void *opcode, size_t len); - /* * Matches NOP and NOPL, not any of the other possible NOPs. */ @@ -537,7 +508,7 @@ static void __apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, } } -void apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len) +void text_poke_apply_relocation(u8 *buf, const u8 * const instr, size_t instrlen, u8 *repl, size_t repl_len) { __apply_relocation(buf, instr, instrlen, repl, repl_len); optimize_nops(instr, buf, instrlen); @@ -625,7 +596,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, DPRINTK(ALT, "alt table %px, -> %px", start, end); /* - * In the case CONFIG_X86_5LEVEL=y, KASAN_SHADOW_START is defined using + * KASAN_SHADOW_START is defined using * cpu_feature_enabled(X86_FEATURE_LA57) and is therefore patched here. * During the process, KASAN becomes confused seeing partial LA57 * conversion and triggers a false-positive out-of-bound report. @@ -693,7 +664,7 @@ void __init_or_module noinline apply_alternatives(struct alt_instr *start, for (; insn_buff_sz < a->instrlen; insn_buff_sz++) insn_buff[insn_buff_sz] = 0x90; - apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen); + text_poke_apply_relocation(insn_buff, instr, a->instrlen, replacement, a->replacementlen); DUMP_BYTES(ALT, instr, a->instrlen, "%px: old_insn: ", instr); DUMP_BYTES(ALT, replacement, a->replacementlen, "%px: rpl_insn: ", replacement); @@ -2305,7 +2276,7 @@ __visible noinline void __init __alt_reloc_selftest(void *arg) static noinline void __init alt_reloc_selftest(void) { /* - * Tests apply_relocation(). + * Tests text_poke_apply_relocation(). * * This has a relative immediate (CALL) in a place other than the first * instruction and additionally on x86_64 we get a RIP-relative LEA: @@ -2442,76 +2413,8 @@ void __init_or_module text_poke_early(void *addr, const void *opcode, } } -typedef struct { - struct mm_struct *mm; -} temp_mm_state_t; - -/* - * Using a temporary mm allows to set temporary mappings that are not accessible - * by other CPUs. Such mappings are needed to perform sensitive memory writes - * that override the kernel memory protections (e.g., W^X), without exposing the - * temporary page-table mappings that are required for these write operations to - * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the - * mapping is torn down. - * - * Context: The temporary mm needs to be used exclusively by a single core. 
To - * harden security IRQs must be disabled while the temporary mm is - * loaded, thereby preventing interrupt handler bugs from overriding - * the kernel memory protection. - */ -static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) -{ - temp_mm_state_t temp_state; - - lockdep_assert_irqs_disabled(); - - /* - * Make sure not to be in TLB lazy mode, as otherwise we'll end up - * with a stale address space WITHOUT being in lazy mode after - * restoring the previous mm. - */ - if (this_cpu_read(cpu_tlbstate_shared.is_lazy)) - leave_mm(); - - temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm); - switch_mm_irqs_off(NULL, mm, current); - - /* - * If breakpoints are enabled, disable them while the temporary mm is - * used. Userspace might set up watchpoints on addresses that are used - * in the temporary mm, which would lead to wrong signals being sent or - * crashes. - * - * Note that breakpoints are not disabled selectively, which also causes - * kernel breakpoints (e.g., perf's) to be disabled. This might be - * undesirable, but still seems reasonable as the code that runs in the - * temporary mm should be short. - */ - if (hw_breakpoint_active()) - hw_breakpoint_disable(); - - return temp_state; -} - -__ro_after_init struct mm_struct *poking_mm; -__ro_after_init unsigned long poking_addr; - -static inline void unuse_temporary_mm(temp_mm_state_t prev_state) -{ - lockdep_assert_irqs_disabled(); - - switch_mm_irqs_off(NULL, prev_state.mm, current); - - /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */ - cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(poking_mm)); - - /* - * Restore the breakpoints if they were disabled before the temporary mm - * was loaded. - */ - if (hw_breakpoint_active()) - hw_breakpoint_restore(); -} +__ro_after_init struct mm_struct *text_poke_mm; +__ro_after_init unsigned long text_poke_mm_addr; static void text_poke_memcpy(void *dst, const void *src, size_t len) { @@ -2531,7 +2434,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l { bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE; struct page *pages[2] = {NULL}; - temp_mm_state_t prev; + struct mm_struct *prev_mm; unsigned long flags; pte_t pte, *ptep; spinlock_t *ptl; @@ -2568,7 +2471,7 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l /* * The lock is not really needed, but this allows to avoid open-coding. */ - ptep = get_locked_pte(poking_mm, poking_addr, &ptl); + ptep = get_locked_pte(text_poke_mm, text_poke_mm_addr, &ptl); /* * This must not fail; preallocated in poking_init(). @@ -2578,21 +2481,21 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l local_irq_save(flags); pte = mk_pte(pages[0], pgprot); - set_pte_at(poking_mm, poking_addr, ptep, pte); + set_pte_at(text_poke_mm, text_poke_mm_addr, ptep, pte); if (cross_page_boundary) { pte = mk_pte(pages[1], pgprot); - set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte); + set_pte_at(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1, pte); } /* * Loading the temporary mm behaves as a compiler barrier, which * guarantees that the PTE will be set at the time memcpy() is done. 
*/ - prev = use_temporary_mm(poking_mm); + prev_mm = use_temporary_mm(text_poke_mm); kasan_disable_current(); - func((u8 *)poking_addr + offset_in_page(addr), src, len); + func((u8 *)text_poke_mm_addr + offset_in_page(addr), src, len); kasan_enable_current(); /* @@ -2601,22 +2504,22 @@ static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t l */ barrier(); - pte_clear(poking_mm, poking_addr, ptep); + pte_clear(text_poke_mm, text_poke_mm_addr, ptep); if (cross_page_boundary) - pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1); + pte_clear(text_poke_mm, text_poke_mm_addr + PAGE_SIZE, ptep + 1); /* * Loading the previous page-table hierarchy requires a serializing * instruction that already allows the core to see the updated version. * Xen-PV is assumed to serialize execution in a similar manner. */ - unuse_temporary_mm(prev); + unuse_temporary_mm(prev_mm); /* * Flushing the TLB might involve IPIs, which would require enabled * IRQs, but not if the mm is not used, as it is in this point. */ - flush_tlb_mm_range(poking_mm, poking_addr, poking_addr + + flush_tlb_mm_range(text_poke_mm, text_poke_mm_addr, text_poke_mm_addr + (cross_page_boundary ? 2 : 1) * PAGE_SIZE, PAGE_SHIFT, false); @@ -2752,7 +2655,7 @@ static void do_sync_core(void *info) sync_core(); } -void text_poke_sync(void) +void smp_text_poke_sync_each_cpu(void) { on_each_cpu(do_sync_core, NULL, 1); } @@ -2762,64 +2665,66 @@ void text_poke_sync(void) * this thing. When len == 6 everything is prefixed with 0x0f and we map * opcode to Jcc.d8, using len to distinguish. */ -struct text_poke_loc { +struct smp_text_poke_loc { /* addr := _stext + rel_addr */ s32 rel_addr; s32 disp; u8 len; u8 opcode; - const u8 text[POKE_MAX_OPCODE_SIZE]; - /* see text_poke_bp_batch() */ + const u8 text[TEXT_POKE_MAX_OPCODE_SIZE]; + /* see smp_text_poke_batch_finish() */ u8 old; }; -struct bp_patching_desc { - struct text_poke_loc *vec; +#define TEXT_POKE_ARRAY_MAX (PAGE_SIZE / sizeof(struct smp_text_poke_loc)) + +static struct smp_text_poke_array { + struct smp_text_poke_loc vec[TEXT_POKE_ARRAY_MAX]; int nr_entries; - atomic_t refs; -}; +} text_poke_array; -static struct bp_patching_desc bp_desc; +static DEFINE_PER_CPU(atomic_t, text_poke_array_refs); -static __always_inline -struct bp_patching_desc *try_get_desc(void) +/* + * These four __always_inline annotations imply noinstr, necessary + * due to smp_text_poke_int3_handler() being noinstr: + */ + +static __always_inline bool try_get_text_poke_array(void) { - struct bp_patching_desc *desc = &bp_desc; + atomic_t *refs = this_cpu_ptr(&text_poke_array_refs); - if (!raw_atomic_inc_not_zero(&desc->refs)) - return NULL; + if (!raw_atomic_inc_not_zero(refs)) + return false; - return desc; + return true; } -static __always_inline void put_desc(void) +static __always_inline void put_text_poke_array(void) { - struct bp_patching_desc *desc = &bp_desc; + atomic_t *refs = this_cpu_ptr(&text_poke_array_refs); smp_mb__before_atomic(); - raw_atomic_dec(&desc->refs); + raw_atomic_dec(refs); } -static __always_inline void *text_poke_addr(struct text_poke_loc *tp) +static __always_inline void *text_poke_addr(const struct smp_text_poke_loc *tpl) { - return _stext + tp->rel_addr; + return _stext + tpl->rel_addr; } -static __always_inline int patch_cmp(const void *key, const void *elt) +static __always_inline int patch_cmp(const void *tpl_a, const void *tpl_b) { - struct text_poke_loc *tp = (struct text_poke_loc *) elt; - - if (key < text_poke_addr(tp)) + if (tpl_a < text_poke_addr(tpl_b)) return 
-1; - if (key > text_poke_addr(tp)) + if (tpl_a > text_poke_addr(tpl_b)) return 1; return 0; } -noinstr int poke_int3_handler(struct pt_regs *regs) +noinstr int smp_text_poke_int3_handler(struct pt_regs *regs) { - struct bp_patching_desc *desc; - struct text_poke_loc *tp; + struct smp_text_poke_loc *tpl; int ret = 0; void *ip; @@ -2828,41 +2733,40 @@ noinstr int poke_int3_handler(struct pt_regs *regs) /* * Having observed our INT3 instruction, we now must observe - * bp_desc with non-zero refcount: + * text_poke_array with non-zero refcount: * - * bp_desc.refs = 1 INT3 - * WMB RMB - * write INT3 if (bp_desc.refs != 0) + * text_poke_array_refs = 1 INT3 + * WMB RMB + * write INT3 if (text_poke_array_refs != 0) */ smp_rmb(); - desc = try_get_desc(); - if (!desc) + if (!try_get_text_poke_array()) return 0; /* - * Discount the INT3. See text_poke_bp_batch(). + * Discount the INT3. See smp_text_poke_batch_finish(). */ ip = (void *) regs->ip - INT3_INSN_SIZE; /* * Skip the binary search if there is a single member in the vector. */ - if (unlikely(desc->nr_entries > 1)) { - tp = __inline_bsearch(ip, desc->vec, desc->nr_entries, - sizeof(struct text_poke_loc), + if (unlikely(text_poke_array.nr_entries > 1)) { + tpl = __inline_bsearch(ip, text_poke_array.vec, text_poke_array.nr_entries, + sizeof(struct smp_text_poke_loc), patch_cmp); - if (!tp) + if (!tpl) goto out_put; } else { - tp = desc->vec; - if (text_poke_addr(tp) != ip) + tpl = text_poke_array.vec; + if (text_poke_addr(tpl) != ip) goto out_put; } - ip += tp->len; + ip += tpl->len; - switch (tp->opcode) { + switch (tpl->opcode) { case INT3_INSN_OPCODE: /* * Someone poked an explicit INT3, they'll want to handle it, @@ -2875,16 +2779,16 @@ noinstr int poke_int3_handler(struct pt_regs *regs) break; case CALL_INSN_OPCODE: - int3_emulate_call(regs, (long)ip + tp->disp); + int3_emulate_call(regs, (long)ip + tpl->disp); break; case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: - int3_emulate_jmp(regs, (long)ip + tp->disp); + int3_emulate_jmp(regs, (long)ip + tpl->disp); break; case 0x70 ... 0x7f: /* Jcc */ - int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp); + int3_emulate_jcc(regs, tpl->opcode & 0xf, (long)ip, tpl->disp); break; default: @@ -2894,51 +2798,50 @@ noinstr int poke_int3_handler(struct pt_regs *regs) ret = 1; out_put: - put_desc(); + put_text_poke_array(); return ret; } -#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc)) -static struct text_poke_loc tp_vec[TP_VEC_MAX]; -static int tp_vec_nr; - /** - * text_poke_bp_batch() -- update instructions on live kernel on SMP - * @tp: vector of instructions to patch - * @nr_entries: number of entries in the vector + * smp_text_poke_batch_finish() -- update instructions on live kernel on SMP * - * Modify multi-byte instruction by using int3 breakpoint on SMP. - * We completely avoid stop_machine() here, and achieve the - * synchronization using int3 breakpoint. + * Input state: + * text_poke_array.vec: vector of instructions to patch + * text_poke_array.nr_entries: number of entries in the vector + * + * Modify multi-byte instructions by using INT3 breakpoints on SMP. + * We completely avoid using stop_machine() here, and achieve the + * synchronization using INT3 breakpoints and SMP cross-calls. 
* * The way it is done: * - For each entry in the vector: - * - add a int3 trap to the address that will be patched - * - sync cores + * - add an INT3 trap to the address that will be patched + * - SMP sync all CPUs * - For each entry in the vector: * - update all but the first byte of the patched range - * - sync cores + * - SMP sync all CPUs * - For each entry in the vector: - * - replace the first byte (int3) by the first byte of + * - replace the first byte (INT3) by the first byte of the * replacing opcode - * - sync cores + * - SMP sync all CPUs */ -static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries) +void smp_text_poke_batch_finish(void) { unsigned char int3 = INT3_INSN_OPCODE; unsigned int i; int do_sync; - lockdep_assert_held(&text_mutex); + if (!text_poke_array.nr_entries) + return; - bp_desc.vec = tp; - bp_desc.nr_entries = nr_entries; + lockdep_assert_held(&text_mutex); /* - * Corresponds to the implicit memory barrier in try_get_desc() to - * ensure reading a non-zero refcount provides up to date bp_desc data. + * Corresponds to the implicit memory barrier in try_get_text_poke_array() to + * ensure reading a non-zero refcount provides up to date text_poke_array data. */ - atomic_set_release(&bp_desc.refs, 1); + for_each_possible_cpu(i) + atomic_set_release(per_cpu_ptr(&text_poke_array_refs, i), 1); /* * Function tracing can enable thousands of places that need to be @@ -2951,33 +2854,33 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries cond_resched(); /* - * Corresponding read barrier in int3 notifier for making sure the - * nr_entries and handler are correctly ordered wrt. patching. + * Corresponding read barrier in INT3 notifier for making sure the + * text_poke_array.nr_entries and handler are correctly ordered wrt. patching. */ smp_wmb(); /* - * First step: add a int3 trap to the address that will be patched. + * First step: add a INT3 trap to the address that will be patched. */ - for (i = 0; i < nr_entries; i++) { - tp[i].old = *(u8 *)text_poke_addr(&tp[i]); - text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE); + for (i = 0; i < text_poke_array.nr_entries; i++) { + text_poke_array.vec[i].old = *(u8 *)text_poke_addr(&text_poke_array.vec[i]); + text_poke(text_poke_addr(&text_poke_array.vec[i]), &int3, INT3_INSN_SIZE); } - text_poke_sync(); + smp_text_poke_sync_each_cpu(); /* * Second step: update all but the first byte of the patched range. 
*/ - for (do_sync = 0, i = 0; i < nr_entries; i++) { - u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, }; - u8 _new[POKE_MAX_OPCODE_SIZE+1]; - const u8 *new = tp[i].text; - int len = tp[i].len; + for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) { + u8 old[TEXT_POKE_MAX_OPCODE_SIZE+1] = { text_poke_array.vec[i].old, }; + u8 _new[TEXT_POKE_MAX_OPCODE_SIZE+1]; + const u8 *new = text_poke_array.vec[i].text; + int len = text_poke_array.vec[i].len; if (len - INT3_INSN_SIZE > 0) { memcpy(old + INT3_INSN_SIZE, - text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE, len - INT3_INSN_SIZE); if (len == 6) { @@ -2986,7 +2889,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries new = _new; } - text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE, + text_poke(text_poke_addr(&text_poke_array.vec[i]) + INT3_INSN_SIZE, new + INT3_INSN_SIZE, len - INT3_INSN_SIZE); @@ -3017,7 +2920,7 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries * The old instruction is recorded so that the event can be * processed forwards or backwards. */ - perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len); + perf_event_text_poke(text_poke_addr(&text_poke_array.vec[i]), old, len, new, len); } if (do_sync) { @@ -3026,63 +2929,79 @@ static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries * not necessary and we'd be safe even without it. But * better safe than sorry (plus there's not only Intel). */ - text_poke_sync(); + smp_text_poke_sync_each_cpu(); } /* - * Third step: replace the first byte (int3) by the first byte of + * Third step: replace the first byte (INT3) by the first byte of the * replacing opcode. */ - for (do_sync = 0, i = 0; i < nr_entries; i++) { - u8 byte = tp[i].text[0]; + for (do_sync = 0, i = 0; i < text_poke_array.nr_entries; i++) { + u8 byte = text_poke_array.vec[i].text[0]; - if (tp[i].len == 6) + if (text_poke_array.vec[i].len == 6) byte = 0x0f; if (byte == INT3_INSN_OPCODE) continue; - text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE); + text_poke(text_poke_addr(&text_poke_array.vec[i]), &byte, INT3_INSN_SIZE); do_sync++; } if (do_sync) - text_poke_sync(); + smp_text_poke_sync_each_cpu(); /* * Remove and wait for refs to be zero. + * + * Notably, if after step-3 above the INT3 got removed, then the + * smp_text_poke_sync_each_cpu() will have serialized against any running INT3 + * handlers and the below spin-wait will not happen. + * + * IOW. unless the replacement instruction is INT3, this case goes + * unused. 
*/ - if (!atomic_dec_and_test(&bp_desc.refs)) - atomic_cond_read_acquire(&bp_desc.refs, !VAL); + for_each_possible_cpu(i) { + atomic_t *refs = per_cpu_ptr(&text_poke_array_refs, i); + + if (unlikely(!atomic_dec_and_test(refs))) + atomic_cond_read_acquire(refs, !VAL); + } + + /* They are all completed: */ + text_poke_array.nr_entries = 0; } -static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, - const void *opcode, size_t len, const void *emulate) +static void __smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate) { + struct smp_text_poke_loc *tpl; struct insn insn; int ret, i = 0; + tpl = &text_poke_array.vec[text_poke_array.nr_entries++]; + if (len == 6) i = 1; - memcpy((void *)tp->text, opcode+i, len-i); + memcpy((void *)tpl->text, opcode+i, len-i); if (!emulate) emulate = opcode; ret = insn_decode_kernel(&insn, emulate); BUG_ON(ret < 0); - tp->rel_addr = addr - (void *)_stext; - tp->len = len; - tp->opcode = insn.opcode.bytes[0]; + tpl->rel_addr = addr - (void *)_stext; + tpl->len = len; + tpl->opcode = insn.opcode.bytes[0]; if (is_jcc32(&insn)) { /* * Map Jcc.d32 onto Jcc.d8 and use len to distinguish. */ - tp->opcode = insn.opcode.bytes[1] - 0x10; + tpl->opcode = insn.opcode.bytes[1] - 0x10; } - switch (tp->opcode) { + switch (tpl->opcode) { case RET_INSN_OPCODE: case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: @@ -3091,14 +3010,14 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, * next instruction can be padded with INT3. */ for (i = insn.length; i < len; i++) - BUG_ON(tp->text[i] != INT3_INSN_OPCODE); + BUG_ON(tpl->text[i] != INT3_INSN_OPCODE); break; default: BUG_ON(len != insn.length); } - switch (tp->opcode) { + switch (tpl->opcode) { case INT3_INSN_OPCODE: case RET_INSN_OPCODE: break; @@ -3107,21 +3026,21 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, case JMP32_INSN_OPCODE: case JMP8_INSN_OPCODE: case 0x70 ... 0x7f: /* Jcc */ - tp->disp = insn.immediate.value; + tpl->disp = insn.immediate.value; break; default: /* assume NOP */ switch (len) { case 2: /* NOP2 -- emulate as JMP8+0 */ BUG_ON(memcmp(emulate, x86_nops[len], len)); - tp->opcode = JMP8_INSN_OPCODE; - tp->disp = 0; + tpl->opcode = JMP8_INSN_OPCODE; + tpl->disp = 0; break; case 5: /* NOP5 -- emulate as JMP32+0 */ BUG_ON(memcmp(emulate, x86_nops[len], len)); - tp->opcode = JMP32_INSN_OPCODE; - tp->disp = 0; + tpl->opcode = JMP32_INSN_OPCODE; + tpl->disp = 0; break; default: /* unknown instruction */ @@ -3132,51 +3051,50 @@ static void text_poke_loc_init(struct text_poke_loc *tp, void *addr, } /* - * We hard rely on the tp_vec being ordered; ensure this is so by flushing + * We hard rely on the text_poke_array.vec being ordered; ensure this is so by flushing * early if needed. 
*/ -static bool tp_order_fail(void *addr) +static bool text_poke_addr_ordered(void *addr) { - struct text_poke_loc *tp; - - if (!tp_vec_nr) - return false; + WARN_ON_ONCE(!addr); - if (!addr) /* force */ + if (!text_poke_array.nr_entries) return true; - tp = &tp_vec[tp_vec_nr - 1]; - if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr) - return true; - - return false; -} - -static void text_poke_flush(void *addr) -{ - if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) { - text_poke_bp_batch(tp_vec, tp_vec_nr); - tp_vec_nr = 0; - } -} + /* + * If the last current entry's address is higher than the + * new entry's address we'd like to add, then ordering + * is violated and we must first flush all pending patching + * requests: + */ + if (text_poke_addr(text_poke_array.vec + text_poke_array.nr_entries-1) > addr) + return false; -void text_poke_finish(void) -{ - text_poke_flush(NULL); + return true; } -void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate) +/** + * smp_text_poke_batch_add() -- update instruction on live kernel on SMP, batched + * @addr: address to patch + * @opcode: opcode of new instruction + * @len: length to copy + * @emulate: instruction to be emulated + * + * Add a new instruction to the current queue of to-be-patched instructions + * the kernel maintains. The patching request will not be executed immediately, + * but becomes part of an array of patching requests, optimized for batched + * execution. All pending patching requests will be executed on the next + * smp_text_poke_batch_finish() call. + */ +void __ref smp_text_poke_batch_add(void *addr, const void *opcode, size_t len, const void *emulate) { - struct text_poke_loc *tp; - - text_poke_flush(addr); - - tp = &tp_vec[tp_vec_nr++]; - text_poke_loc_init(tp, addr, opcode, len, emulate); + if (text_poke_array.nr_entries == TEXT_POKE_ARRAY_MAX || !text_poke_addr_ordered(addr)) + smp_text_poke_batch_finish(); + __smp_text_poke_batch_add(addr, opcode, len, emulate); } /** - * text_poke_bp() -- update instructions on live kernel on SMP + * smp_text_poke_single() -- update instruction on live kernel on SMP immediately * @addr: address to patch * @opcode: opcode of new instruction * @len: length to copy @@ -3184,12 +3102,11 @@ void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const voi * * Update a single instruction with the vector in the stack, avoiding * dynamically allocated memory. This function should be used when it is - * not possible to allocate memory. + * not possible to allocate memory for a vector. The single instruction + * is patched in immediately. 
*/ -void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate) +void __ref smp_text_poke_single(void *addr, const void *opcode, size_t len, const void *emulate) { - struct text_poke_loc tp; - - text_poke_loc_init(&tp, addr, opcode, len, emulate); - text_poke_bp_batch(&tp, 1); + __smp_text_poke_batch_add(addr, opcode, len, emulate); + smp_text_poke_batch_finish(); } diff --git a/arch/x86/kernel/amd_gart_64.c b/arch/x86/kernel/amd_gart_64.c index c884deca839b..3485d419c2f5 100644 --- a/arch/x86/kernel/amd_gart_64.c +++ b/arch/x86/kernel/amd_gart_64.c @@ -39,7 +39,7 @@ #include <asm/gart.h> #include <asm/set_memory.h> #include <asm/dma.h> -#include <asm/amd_nb.h> +#include <asm/amd/nb.h> #include <asm/x86_init.h> static unsigned long iommu_bus_base; /* GART remapping area (physical) */ diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 6d12a9b69432..c1acead6227a 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -13,7 +13,9 @@ #include <linux/export.h> #include <linux/spinlock.h> #include <linux/pci_ids.h> -#include <asm/amd_nb.h> + +#include <asm/amd/nb.h> +#include <asm/cpuid/api.h> static u32 *flush_words; @@ -91,10 +93,7 @@ static int amd_cache_northbridges(void) if (amd_gart_present()) amd_northbridges.flags |= AMD_NB_GART; - /* - * Check for L3 cache presence. - */ - if (!cpuid_edx(0x80000006)) + if (!cpuid_amd_hygon_has_l3_cache()) return 0; /* @@ -151,7 +150,7 @@ struct resource *amd_get_mmconfig_range(struct resource *res) /* Assume CPUs from Fam10h have mmconfig, although not all VMs do */ if (boot_cpu_data.x86 < 0x10 || - rdmsrl_safe(MSR_FAM10H_MMIO_CONF_BASE, &msr)) + rdmsrq_safe(MSR_FAM10H_MMIO_CONF_BASE, &msr)) return NULL; /* mmconfig is not enabled */ diff --git a/arch/x86/kernel/amd_node.c b/arch/x86/kernel/amd_node.c index b670fa85c61b..a40176b62eb5 100644 --- a/arch/x86/kernel/amd_node.c +++ b/arch/x86/kernel/amd_node.c @@ -9,7 +9,7 @@ */ #include <linux/debugfs.h> -#include <asm/amd_node.h> +#include <asm/amd/node.h> /* * AMD Nodes are a physical collection of I/O devices within an SoC. 
There can be one diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 89c0c8a3fc7e..769321185a08 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -29,7 +29,7 @@ #include <asm/gart.h> #include <asm/pci-direct.h> #include <asm/dma.h> -#include <asm/amd_nb.h> +#include <asm/amd/nb.h> #include <asm/x86_init.h> #include <linux/crash_dump.h> diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 62584a347931..d73ba5a7b623 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -59,6 +59,7 @@ #include <asm/time.h> #include <asm/smp.h> #include <asm/mce.h> +#include <asm/msr.h> #include <asm/tsc.h> #include <asm/hypervisor.h> #include <asm/cpu_device_id.h> @@ -425,7 +426,7 @@ static int lapic_next_deadline(unsigned long delta, weak_wrmsr_fence(); tsc = rdtsc(); - wrmsrl(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); + wrmsrq(MSR_IA32_TSC_DEADLINE, tsc + (((u64) delta) * TSC_DIVISOR)); return 0; } @@ -449,7 +450,7 @@ static int lapic_timer_shutdown(struct clock_event_device *evt) * the timer _and_ zero the counter registers: */ if (v & APIC_LVT_TIMER_TSCDEADLINE) - wrmsrl(MSR_IA32_TSC_DEADLINE, 0); + wrmsrq(MSR_IA32_TSC_DEADLINE, 0); else apic_write(APIC_TMICT, 0); @@ -1694,7 +1695,7 @@ static bool x2apic_hw_locked(void) x86_arch_cap_msr = x86_read_arch_cap_msr(); if (x86_arch_cap_msr & ARCH_CAP_XAPIC_DISABLE) { - rdmsrl(MSR_IA32_XAPIC_DISABLE_STATUS, msr); + rdmsrq(MSR_IA32_XAPIC_DISABLE_STATUS, msr); return (msr & LEGACY_XAPIC_DISABLED); } return false; @@ -1707,12 +1708,12 @@ static void __x2apic_disable(void) if (!boot_cpu_has(X86_FEATURE_APIC)) return; - rdmsrl(MSR_IA32_APICBASE, msr); + rdmsrq(MSR_IA32_APICBASE, msr); if (!(msr & X2APIC_ENABLE)) return; /* Disable xapic and x2apic first and then reenable xapic mode */ - wrmsrl(MSR_IA32_APICBASE, msr & ~(X2APIC_ENABLE | XAPIC_ENABLE)); - wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE); + wrmsrq(MSR_IA32_APICBASE, msr & ~(X2APIC_ENABLE | XAPIC_ENABLE)); + wrmsrq(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE); printk_once(KERN_INFO "x2apic disabled\n"); } @@ -1720,10 +1721,10 @@ static void __x2apic_enable(void) { u64 msr; - rdmsrl(MSR_IA32_APICBASE, msr); + rdmsrq(MSR_IA32_APICBASE, msr); if (msr & X2APIC_ENABLE) return; - wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE); + wrmsrq(MSR_IA32_APICBASE, msr | X2APIC_ENABLE); printk_once(KERN_INFO "x2apic enabled\n"); } diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c index 16410f087b7a..e272bc7fdc8e 100644 --- a/arch/x86/kernel/apic/apic_numachip.c +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -14,6 +14,7 @@ #include <linux/init.h> #include <linux/pgtable.h> +#include <asm/msr.h> #include <asm/numachip/numachip.h> #include <asm/numachip/numachip_csr.h> @@ -31,7 +32,7 @@ static u32 numachip1_get_apic_id(u32 x) unsigned int id = (x >> 24) & 0xff; if (static_cpu_has(X86_FEATURE_NODEID_MSR)) { - rdmsrl(MSR_FAM10H_NODE_ID, value); + rdmsrq(MSR_FAM10H_NODE_ID, value); id |= (value << 2) & 0xff00; } @@ -42,7 +43,7 @@ static u32 numachip2_get_apic_id(u32 x) { u64 mcfg; - rdmsrl(MSR_FAM10H_MMIO_CONF_BASE, mcfg); + rdmsrq(MSR_FAM10H_MMIO_CONF_BASE, mcfg); return ((mcfg >> (28 - 8)) & 0xfff00) | (x >> 24); } @@ -150,7 +151,7 @@ static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) /* Account for nodes per socket in multi-core-module processors */ if (boot_cpu_has(X86_FEATURE_NODEID_MSR)) { - rdmsrl(MSR_FAM10H_NODE_ID, val); + rdmsrq(MSR_FAM10H_NODE_ID, 
val); nodes = ((val >> 3) & 7) + 1; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index eebc360ed1bb..5ba2feb2c04c 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -1486,7 +1486,7 @@ static void __init delay_with_tsc(void) * 1 GHz == 40 jiffies */ do { - rep_nop(); + native_pause(); now = rdtsc(); } while ((now - start) < 40000000000ULL / HZ && time_before_eq(jiffies, end)); } @@ -2225,7 +2225,7 @@ static int mp_irqdomain_create(int ioapic) /* Handle device tree enumerated APICs proper */ if (cfg->dev) { - fn = of_node_to_fwnode(cfg->dev); + fn = of_fwnode_handle(cfg->dev); } else { fn = irq_domain_alloc_named_id_fwnode("IO-APIC", mpc_ioapic_id(ioapic)); if (!fn) diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index fee42a73d64a..93069b13d3af 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -864,7 +864,7 @@ void lapic_offline(void) __vector_cleanup(cl, false); irq_matrix_offline(vector_matrix); - WARN_ON_ONCE(try_to_del_timer_sync(&cl->timer) < 0); + WARN_ON_ONCE(timer_delete_sync_try(&cl->timer) < 0); WARN_ON_ONCE(!hlist_empty(&cl->head)); unlock_vector_lock(); diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index ad4ea6fb3b6c..6259b474073b 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -33,6 +33,14 @@ static void __used common(void) { + OFFSET(CPUINFO_x86, cpuinfo_x86, x86); + OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor); + OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model); + OFFSET(CPUINFO_x86_stepping, cpuinfo_x86, x86_stepping); + OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level); + OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability); + OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); + BLANK(); OFFSET(TASK_threadsp, task_struct, thread.sp); #ifdef CONFIG_STACKPROTECTOR diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 2b411cd00a4e..e0a292db97b2 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -12,15 +12,6 @@ void foo(void); void foo(void) { - OFFSET(CPUINFO_x86, cpuinfo_x86, x86); - OFFSET(CPUINFO_x86_vendor, cpuinfo_x86, x86_vendor); - OFFSET(CPUINFO_x86_model, cpuinfo_x86, x86_model); - OFFSET(CPUINFO_x86_stepping, cpuinfo_x86, x86_stepping); - OFFSET(CPUINFO_cpuid_level, cpuinfo_x86, cpuid_level); - OFFSET(CPUINFO_x86_capability, cpuinfo_x86, x86_capability); - OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); - BLANK(); - OFFSET(PT_EBX, pt_regs, bx); OFFSET(PT_ECX, pt_regs, cx); OFFSET(PT_EDX, pt_regs, dx); diff --git a/arch/x86/kernel/callthunks.c b/arch/x86/kernel/callthunks.c index d86d7d6e750c..a951333c5995 100644 --- a/arch/x86/kernel/callthunks.c +++ b/arch/x86/kernel/callthunks.c @@ -185,7 +185,7 @@ static void *patch_dest(void *dest, bool direct) u8 *pad = dest - tsize; memcpy(insn_buff, skl_call_thunk_template, tsize); - apply_relocation(insn_buff, pad, tsize, skl_call_thunk_template, tsize); + text_poke_apply_relocation(insn_buff, pad, tsize, skl_call_thunk_template, tsize); /* Already patched? 
*/ if (!bcmp(pad, insn_buff, tsize)) @@ -294,7 +294,7 @@ static bool is_callthunk(void *addr) pad = (void *)(dest - tmpl_size); memcpy(insn_buff, skl_call_thunk_template, tmpl_size); - apply_relocation(insn_buff, pad, tmpl_size, skl_call_thunk_template, tmpl_size); + text_poke_apply_relocation(insn_buff, pad, tmpl_size, skl_call_thunk_template, tmpl_size); return !bcmp(pad, insn_buff, tmpl_size); } @@ -312,7 +312,7 @@ int x86_call_depth_emit_accounting(u8 **pprog, void *func, void *ip) return 0; memcpy(insn_buff, skl_call_thunk_template, tmpl_size); - apply_relocation(insn_buff, ip, tmpl_size, skl_call_thunk_template, tmpl_size); + text_poke_apply_relocation(insn_buff, ip, tmpl_size, skl_call_thunk_template, tmpl_size); memcpy(*pprog, insn_buff, tmpl_size); *pprog += tmpl_size; diff --git a/arch/x86/kernel/cet.c b/arch/x86/kernel/cet.c index 303bf74d175b..99444409c026 100644 --- a/arch/x86/kernel/cet.c +++ b/arch/x86/kernel/cet.c @@ -2,6 +2,7 @@ #include <linux/ptrace.h> #include <asm/bugs.h> +#include <asm/msr.h> #include <asm/traps.h> enum cp_error_code { @@ -55,7 +56,7 @@ static void do_user_cp_fault(struct pt_regs *regs, unsigned long error_code) * will be whatever is live in userspace. So read the SSP before enabling * interrupts so locking the fpregs to do it later is not required. */ - rdmsrl(MSR_IA32_PL3_SSP, ssp); + rdmsrq(MSR_IA32_PL3_SSP, ssp); cond_local_irq_enable(regs); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 4efdf5c2efc8..1e26179ff18c 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -24,7 +24,7 @@ obj-y += rdrand.o obj-y += match.o obj-y += bugs.o obj-y += aperfmperf.o -obj-y += cpuid-deps.o +obj-y += cpuid-deps.o cpuid_0x2_table.o obj-y += umwait.o obj-y += capflags.o powerflags.o @@ -38,6 +38,9 @@ obj-y += intel.o tsx.o obj-$(CONFIG_PM) += intel_epb.o endif obj-$(CONFIG_CPU_SUP_AMD) += amd.o +ifeq ($(CONFIG_AMD_NB)$(CONFIG_SYSFS),yy) +obj-y += amd_cache_disable.o +endif obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 4e06baab40bb..93da466dfe2c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -9,6 +9,7 @@ #include <linux/sched/clock.h> #include <linux/random.h> #include <linux/topology.h> +#include <asm/amd/fch.h> #include <asm/processor.h> #include <asm/apic.h> #include <asm/cacheinfo.h> @@ -21,6 +22,7 @@ #include <asm/delay.h> #include <asm/debugreg.h> #include <asm/resctrl.h> +#include <asm/msr.h> #include <asm/sev.h> #ifdef CONFIG_X86_64 @@ -31,7 +33,7 @@ u16 invlpgb_count_max __ro_after_init; -static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) +static inline int rdmsrq_amd_safe(unsigned msr, u64 *p) { u32 gprs[8] = { 0 }; int err; @@ -49,7 +51,7 @@ static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) return err; } -static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) +static inline int wrmsrq_amd_safe(unsigned msr, u64 val) { u32 gprs[8] = { 0 }; @@ -383,7 +385,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) (c->x86 == 0x10 && c->x86_model >= 0x2)) { u64 val; - rdmsrl(MSR_K7_HWCR, val); + rdmsrq(MSR_K7_HWCR, val); if (!(val & BIT(24))) pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n"); } @@ -422,7 +424,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) * Try to cache the base value so further operations can * avoid RMW. If that faults, do not enable SSBD. 
*/ - if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { + if (!rdmsrq_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD); setup_force_cpu_cap(X86_FEATURE_SSBD); x86_amd_ls_cfg_ssbd_mask = 1ULL << bit; @@ -513,7 +515,7 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) */ if (cpu_has(c, X86_FEATURE_SME) || cpu_has(c, X86_FEATURE_SEV)) { /* Check if memory encryption is enabled */ - rdmsrl(MSR_AMD64_SYSCFG, msr); + rdmsrq(MSR_AMD64_SYSCFG, msr); if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) goto clear_all; @@ -530,7 +532,7 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) if (!sme_me_mask) setup_clear_cpu_cap(X86_FEATURE_SME); - rdmsrl(MSR_K7_HWCR, msr); + rdmsrq(MSR_K7_HWCR, msr); if (!(msr & MSR_K7_HWCR_SMMLOCK)) goto clear_sev; @@ -617,7 +619,7 @@ static void early_init_amd(struct cpuinfo_x86 *c) if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_IBPB_BRTYPE)) { if (c->x86 == 0x17 && boot_cpu_has(X86_FEATURE_AMD_IBPB)) setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); - else if (c->x86 >= 0x19 && !wrmsrl_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) { + else if (c->x86 >= 0x19 && !wrmsrq_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) { setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); setup_force_cpu_cap(X86_FEATURE_SBPB); } @@ -641,14 +643,14 @@ static void init_amd_k8(struct cpuinfo_x86 *c) */ if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM) && !cpu_has(c, X86_FEATURE_HYPERVISOR)) { clear_cpu_cap(c, X86_FEATURE_LAHF_LM); - if (!rdmsrl_amd_safe(0xc001100d, &value)) { + if (!rdmsrq_amd_safe(0xc001100d, &value)) { value &= ~BIT_64(32); - wrmsrl_amd_safe(0xc001100d, value); + wrmsrq_amd_safe(0xc001100d, value); } } if (!c->x86_model_id[0]) - strcpy(c->x86_model_id, "Hammer"); + strscpy(c->x86_model_id, "Hammer"); #ifdef CONFIG_SMP /* @@ -793,9 +795,9 @@ static void init_amd_bd(struct cpuinfo_x86 *c) * Disable it on the affected CPUs. */ if ((c->x86_model >= 0x02) && (c->x86_model < 0x20)) { - if (!rdmsrl_safe(MSR_F15H_IC_CFG, &value) && !(value & 0x1E)) { + if (!rdmsrq_safe(MSR_F15H_IC_CFG, &value) && !(value & 0x1E)) { value |= 0x1E; - wrmsrl_safe(MSR_F15H_IC_CFG, value); + wrmsrq_safe(MSR_F15H_IC_CFG, value); } } @@ -844,9 +846,9 @@ void init_spectral_chicken(struct cpuinfo_x86 *c) * suppresses non-branch predictions. 
*/ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { - if (!rdmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) { + if (!rdmsrq_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) { value |= MSR_ZEN2_SPECTRAL_CHICKEN_BIT; - wrmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value); + wrmsrq_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value); } } #endif @@ -1030,7 +1032,7 @@ static void init_amd(struct cpuinfo_x86 *c) init_amd_cacheinfo(c); if (cpu_has(c, X86_FEATURE_SVM)) { - rdmsrl(MSR_VM_CR, vm_cr); + rdmsrq(MSR_VM_CR, vm_cr); if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) { pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n"); clear_cpu_cap(c, X86_FEATURE_SVM); @@ -1211,7 +1213,7 @@ void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr) if (per_cpu(amd_dr_addr_mask, cpu)[dr] == mask) return; - wrmsr(amd_msr_dr_addr_masks[dr], mask, 0); + wrmsrq(amd_msr_dr_addr_masks[dr], mask); per_cpu(amd_dr_addr_mask, cpu)[dr] = mask; } @@ -1242,3 +1244,56 @@ void amd_check_microcode(void) if (cpu_feature_enabled(X86_FEATURE_ZEN2)) on_each_cpu(zenbleed_check_cpu, NULL, 1); } + +static const char * const s5_reset_reason_txt[] = { + [0] = "thermal pin BP_THERMTRIP_L was tripped", + [1] = "power button was pressed for 4 seconds", + [2] = "shutdown pin was tripped", + [4] = "remote ASF power off command was received", + [9] = "internal CPU thermal limit was tripped", + [16] = "system reset pin BP_SYS_RST_L was tripped", + [17] = "software issued PCI reset", + [18] = "software wrote 0x4 to reset control register 0xCF9", + [19] = "software wrote 0x6 to reset control register 0xCF9", + [20] = "software wrote 0xE to reset control register 0xCF9", + [21] = "ACPI power state transition occurred", + [22] = "keyboard reset pin KB_RST_L was tripped", + [23] = "internal CPU shutdown event occurred", + [24] = "system failed to boot before failed boot timer expired", + [25] = "hardware watchdog timer expired", + [26] = "remote ASF reset command was received", + [27] = "an uncorrected error caused a data fabric sync flood event", + [29] = "FCH and MP1 failed warm reset handshake", + [30] = "a parity error occurred", + [31] = "a software sync flood event occurred", +}; + +static __init int print_s5_reset_status_mmio(void) +{ + unsigned long value; + void __iomem *addr; + int i; + + if (!cpu_feature_enabled(X86_FEATURE_ZEN)) + return 0; + + addr = ioremap(FCH_PM_BASE + FCH_PM_S5_RESET_STATUS, sizeof(value)); + if (!addr) + return 0; + + value = ioread32(addr); + iounmap(addr); + + for (i = 0; i < ARRAY_SIZE(s5_reset_reason_txt); i++) { + if (!(value & BIT(i))) + continue; + + if (s5_reset_reason_txt[i]) { + pr_info("x86/amd: Previous system reset reason [0x%08lx]: %s\n", + value, s5_reset_reason_txt[i]); + } + } + + return 0; +} +late_initcall(print_s5_reset_status_mmio); diff --git a/arch/x86/kernel/cpu/amd_cache_disable.c b/arch/x86/kernel/cpu/amd_cache_disable.c new file mode 100644 index 000000000000..8843b9557aea --- /dev/null +++ b/arch/x86/kernel/cpu/amd_cache_disable.c @@ -0,0 +1,301 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD L3 cache_disable_{0,1} sysfs handling + * Documentation/ABI/testing/sysfs-devices-system-cpu + */ + +#include <linux/cacheinfo.h> +#include <linux/capability.h> +#include <linux/pci.h> +#include <linux/sysfs.h> + +#include <asm/amd/nb.h> + +#include "cpu.h" + +/* + * L3 cache descriptors + */ +static void amd_calc_l3_indices(struct amd_northbridge *nb) +{ + struct amd_l3_cache *l3 = &nb->l3_cache; + unsigned int sc0, sc1, sc2, sc3; + u32 val = 0; + + pci_read_config_dword(nb->misc, 0x1C4, &val); + + /* calculate subcache sizes 
*/ + l3->subcaches[0] = sc0 = !(val & BIT(0)); + l3->subcaches[1] = sc1 = !(val & BIT(4)); + + if (boot_cpu_data.x86 == 0x15) { + l3->subcaches[0] = sc0 += !(val & BIT(1)); + l3->subcaches[1] = sc1 += !(val & BIT(5)); + } + + l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); + l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); + + l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; +} + +/* + * check whether a slot used for disabling an L3 index is occupied. + * @l3: L3 cache descriptor + * @slot: slot number (0..1) + * + * @returns: the disabled index if used or negative value if slot free. + */ +static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned int slot) +{ + unsigned int reg = 0; + + pci_read_config_dword(nb->misc, 0x1BC + slot * 4, ®); + + /* check whether this slot is activated already */ + if (reg & (3UL << 30)) + return reg & 0xfff; + + return -1; +} + +static ssize_t show_cache_disable(struct cacheinfo *ci, char *buf, unsigned int slot) +{ + int index; + struct amd_northbridge *nb = ci->priv; + + index = amd_get_l3_disable_slot(nb, slot); + if (index >= 0) + return sysfs_emit(buf, "%d\n", index); + + return sysfs_emit(buf, "FREE\n"); +} + +#define SHOW_CACHE_DISABLE(slot) \ +static ssize_t \ +cache_disable_##slot##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct cacheinfo *ci = dev_get_drvdata(dev); \ + return show_cache_disable(ci, buf, slot); \ +} + +SHOW_CACHE_DISABLE(0) +SHOW_CACHE_DISABLE(1) + +static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu, + unsigned int slot, unsigned long idx) +{ + int i; + + idx |= BIT(30); + + /* + * disable index in all 4 subcaches + */ + for (i = 0; i < 4; i++) { + u32 reg = idx | (i << 20); + + if (!nb->l3_cache.subcaches[i]) + continue; + + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); + + /* + * We need to WBINVD on a core on the node containing the L3 + * cache which indices we disable therefore a simple wbinvd() + * is not sufficient. 
+ */ + wbinvd_on_cpu(cpu); + + reg |= BIT(31); + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); + } +} + +/* + * disable a L3 cache index by using a disable-slot + * + * @l3: L3 cache descriptor + * @cpu: A CPU on the node containing the L3 cache + * @slot: slot number (0..1) + * @index: index to disable + * + * @return: 0 on success, error status on failure + */ +static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, + unsigned int slot, unsigned long index) +{ + int ret = 0; + + /* check if @slot is already used or the index is already disabled */ + ret = amd_get_l3_disable_slot(nb, slot); + if (ret >= 0) + return -EEXIST; + + if (index > nb->l3_cache.indices) + return -EINVAL; + + /* check whether the other slot has disabled the same index already */ + if (index == amd_get_l3_disable_slot(nb, !slot)) + return -EEXIST; + + amd_l3_disable_index(nb, cpu, slot, index); + + return 0; +} + +static ssize_t store_cache_disable(struct cacheinfo *ci, const char *buf, + size_t count, unsigned int slot) +{ + struct amd_northbridge *nb = ci->priv; + unsigned long val = 0; + int cpu, err = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + cpu = cpumask_first(&ci->shared_cpu_map); + + if (kstrtoul(buf, 10, &val) < 0) + return -EINVAL; + + err = amd_set_l3_disable_slot(nb, cpu, slot, val); + if (err) { + if (err == -EEXIST) + pr_warn("L3 slot %d in use/index already disabled!\n", + slot); + return err; + } + return count; +} + +#define STORE_CACHE_DISABLE(slot) \ +static ssize_t \ +cache_disable_##slot##_store(struct device *dev, \ + struct device_attribute *attr, \ + const char *buf, size_t count) \ +{ \ + struct cacheinfo *ci = dev_get_drvdata(dev); \ + return store_cache_disable(ci, buf, count, slot); \ +} + +STORE_CACHE_DISABLE(0) +STORE_CACHE_DISABLE(1) + +static ssize_t subcaches_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cacheinfo *ci = dev_get_drvdata(dev); + int cpu = cpumask_first(&ci->shared_cpu_map); + + return sysfs_emit(buf, "%x\n", amd_get_subcaches(cpu)); +} + +static ssize_t subcaches_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cacheinfo *ci = dev_get_drvdata(dev); + int cpu = cpumask_first(&ci->shared_cpu_map); + unsigned long val; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (kstrtoul(buf, 16, &val) < 0) + return -EINVAL; + + if (amd_set_subcaches(cpu, val)) + return -EINVAL; + + return count; +} + +static DEVICE_ATTR_RW(cache_disable_0); +static DEVICE_ATTR_RW(cache_disable_1); +static DEVICE_ATTR_RW(subcaches); + +static umode_t cache_private_attrs_is_visible(struct kobject *kobj, + struct attribute *attr, int unused) +{ + struct device *dev = kobj_to_dev(kobj); + struct cacheinfo *ci = dev_get_drvdata(dev); + umode_t mode = attr->mode; + + if (!ci->priv) + return 0; + + if ((attr == &dev_attr_subcaches.attr) && + amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return mode; + + if ((attr == &dev_attr_cache_disable_0.attr || + attr == &dev_attr_cache_disable_1.attr) && + amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + return mode; + + return 0; +} + +static struct attribute_group cache_private_group = { + .is_visible = cache_private_attrs_is_visible, +}; + +static void init_amd_l3_attrs(void) +{ + static struct attribute **amd_l3_attrs; + int n = 1; + + if (amd_l3_attrs) /* already initialized */ + return; + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + n += 2; + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + n += 1; + + amd_l3_attrs 
= kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL); + if (!amd_l3_attrs) + return; + + n = 0; + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { + amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr; + amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr; + } + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + amd_l3_attrs[n++] = &dev_attr_subcaches.attr; + + cache_private_group.attrs = amd_l3_attrs; +} + +const struct attribute_group *cache_get_priv_group(struct cacheinfo *ci) +{ + struct amd_northbridge *nb = ci->priv; + + if (ci->level < 3 || !nb) + return NULL; + + if (nb && nb->l3_cache.indices) + init_amd_l3_attrs(); + + return &cache_private_group; +} + +struct amd_northbridge *amd_init_l3_cache(int index) +{ + struct amd_northbridge *nb; + int node; + + /* only for L3, and not in virtualized environments */ + if (index < 3) + return NULL; + + node = topology_amd_node_id(smp_processor_id()); + nb = node_to_amd_nb(node); + if (nb && !nb->l3_cache.indices) + amd_calc_l3_indices(nb); + + return nb; +} diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 6cf31a1649c4..a315b0627dfb 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -20,6 +20,7 @@ #include <asm/cpu.h> #include <asm/cpu_device_id.h> #include <asm/intel-family.h> +#include <asm/msr.h> #include "cpu.h" @@ -40,8 +41,8 @@ static void init_counter_refs(void) { u64 aperf, mperf; - rdmsrl(MSR_IA32_APERF, aperf); - rdmsrl(MSR_IA32_MPERF, mperf); + rdmsrq(MSR_IA32_APERF, aperf); + rdmsrq(MSR_IA32_MPERF, mperf); this_cpu_write(cpu_samples.aperf, aperf); this_cpu_write(cpu_samples.mperf, mperf); @@ -99,7 +100,7 @@ static bool __init turbo_disabled(void) u64 misc_en; int err; - err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en); + err = rdmsrq_safe(MSR_IA32_MISC_ENABLE, &misc_en); if (err) return false; @@ -110,11 +111,11 @@ static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) { int err; - err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq); + err = rdmsrq_safe(MSR_ATOM_CORE_RATIOS, base_freq); if (err) return false; - err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); + err = rdmsrq_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); if (err) return false; @@ -152,13 +153,13 @@ static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int err, i; u64 msr; - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr); if (err) return false; @@ -190,17 +191,17 @@ static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int s u32 group_size; int err, i; - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios); + err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &ratios); if (err) return false; - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts); + err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT1, &counts); if (err) return false; @@ -220,11 +221,11 @@ static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) u64 msr; int err; - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + err = 
rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr); if (err) return false; @@ -474,8 +475,8 @@ void arch_scale_freq_tick(void) if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) return; - rdmsrl(MSR_IA32_APERF, aperf); - rdmsrl(MSR_IA32_MPERF, mperf); + rdmsrq(MSR_IA32_APERF, aperf); + rdmsrq(MSR_IA32_MPERF, mperf); acnt = aperf - s->aperf; mcnt = mperf - s->mperf; diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 8596ce85026c..7f94e6a5497d 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -34,22 +34,66 @@ #include "cpu.h" +/* + * Speculation Vulnerability Handling + * + * Each vulnerability is handled with the following functions: + * <vuln>_select_mitigation() -- Selects a mitigation to use. This should + * take into account all relevant command line + * options. + * <vuln>_update_mitigation() -- This is called after all vulnerabilities have + * selected a mitigation, in case the selection + * may want to change based on other choices + * made. This function is optional. + * <vuln>_apply_mitigation() -- Enable the selected mitigation. + * + * The compile-time mitigation in all cases should be AUTO. An explicit + * command-line option can override AUTO. If no such option is + * provided, <vuln>_select_mitigation() will override AUTO to the best + * mitigation option. + */ + static void __init spectre_v1_select_mitigation(void); +static void __init spectre_v1_apply_mitigation(void); static void __init spectre_v2_select_mitigation(void); +static void __init spectre_v2_update_mitigation(void); +static void __init spectre_v2_apply_mitigation(void); static void __init retbleed_select_mitigation(void); +static void __init retbleed_update_mitigation(void); +static void __init retbleed_apply_mitigation(void); static void __init spectre_v2_user_select_mitigation(void); +static void __init spectre_v2_user_update_mitigation(void); +static void __init spectre_v2_user_apply_mitigation(void); static void __init ssb_select_mitigation(void); +static void __init ssb_apply_mitigation(void); static void __init l1tf_select_mitigation(void); +static void __init l1tf_apply_mitigation(void); static void __init mds_select_mitigation(void); -static void __init md_clear_update_mitigation(void); -static void __init md_clear_select_mitigation(void); +static void __init mds_update_mitigation(void); +static void __init mds_apply_mitigation(void); static void __init taa_select_mitigation(void); +static void __init taa_update_mitigation(void); +static void __init taa_apply_mitigation(void); static void __init mmio_select_mitigation(void); +static void __init mmio_update_mitigation(void); +static void __init mmio_apply_mitigation(void); +static void __init rfds_select_mitigation(void); +static void __init rfds_update_mitigation(void); +static void __init rfds_apply_mitigation(void); static void __init srbds_select_mitigation(void); +static void __init srbds_apply_mitigation(void); static void __init l1d_flush_select_mitigation(void); static void __init srso_select_mitigation(void); +static void __init srso_update_mitigation(void); +static void __init srso_apply_mitigation(void); static void __init gds_select_mitigation(void); +static void __init gds_apply_mitigation(void); +static void __init bhi_select_mitigation(void); +static void __init bhi_update_mitigation(void); +static void __init bhi_apply_mitigation(void); static void __init its_select_mitigation(void); +static void __init its_update_mitigation(void); +static void __init its_apply_mitigation(void); /* The base value of 
the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; @@ -79,7 +123,7 @@ static void __init set_return_thunk(void *thunk) static void update_spec_ctrl(u64 val) { this_cpu_write(x86_spec_ctrl_current, val); - wrmsrl(MSR_IA32_SPEC_CTRL, val); + wrmsrq(MSR_IA32_SPEC_CTRL, val); } /* @@ -98,7 +142,7 @@ void update_spec_ctrl_cond(u64 val) * forced the update can be delayed until that time. */ if (!cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) - wrmsrl(MSR_IA32_SPEC_CTRL, val); + wrmsrq(MSR_IA32_SPEC_CTRL, val); } noinstr u64 spec_ctrl_current(void) @@ -136,9 +180,13 @@ EXPORT_SYMBOL_GPL(mds_idle_clear); */ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); -/* Controls CPU Fill buffer clear before KVM guest MMIO accesses */ -DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear); -EXPORT_SYMBOL_GPL(mmio_stale_data_clear); +/* + * Controls CPU Fill buffer clear before VMenter. This is a subset of + * X86_FEATURE_CLEAR_CPU_BUF, and should only be enabled when KVM-only + * mitigation is required. + */ +DEFINE_STATIC_KEY_FALSE(cpu_buf_vm_clear); +EXPORT_SYMBOL_GPL(cpu_buf_vm_clear); void __init cpu_select_mitigations(void) { @@ -148,7 +196,7 @@ void __init cpu_select_mitigations(void) * init code as it is not enumerated and depends on the family. */ if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) { - rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + rdmsrq(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); /* * Previously running kernel (kexec), may have some controls @@ -163,31 +211,67 @@ void __init cpu_select_mitigations(void) /* Select the proper CPU mitigations before patching alternatives: */ spectre_v1_select_mitigation(); spectre_v2_select_mitigation(); - /* - * retbleed_select_mitigation() relies on the state set by - * spectre_v2_select_mitigation(); specifically it wants to know about - * spectre_v2=ibrs. - */ retbleed_select_mitigation(); - /* - * spectre_v2_user_select_mitigation() relies on the state set by - * retbleed_select_mitigation(); specifically the STIBP selection is - * forced for UNRET or IBPB. - */ spectre_v2_user_select_mitigation(); ssb_select_mitigation(); l1tf_select_mitigation(); - md_clear_select_mitigation(); + mds_select_mitigation(); + taa_select_mitigation(); + mmio_select_mitigation(); + rfds_select_mitigation(); srbds_select_mitigation(); l1d_flush_select_mitigation(); - - /* - * srso_select_mitigation() depends and must run after - * retbleed_select_mitigation(). - */ srso_select_mitigation(); gds_select_mitigation(); its_select_mitigation(); + bhi_select_mitigation(); + + /* + * After mitigations are selected, some may need to update their + * choices. + */ + spectre_v2_update_mitigation(); + /* + * retbleed_update_mitigation() relies on the state set by + * spectre_v2_update_mitigation(); specifically it wants to know about + * spectre_v2=ibrs. + */ + retbleed_update_mitigation(); + /* + * its_update_mitigation() depends on spectre_v2_update_mitigation() + * and retbleed_update_mitigation(). + */ + its_update_mitigation(); + + /* + * spectre_v2_user_update_mitigation() depends on + * retbleed_update_mitigation(), specifically the STIBP + * selection is forced for UNRET or IBPB. + */ + spectre_v2_user_update_mitigation(); + mds_update_mitigation(); + taa_update_mitigation(); + mmio_update_mitigation(); + rfds_update_mitigation(); + bhi_update_mitigation(); + /* srso_update_mitigation() depends on retbleed_update_mitigation(). 
*/ + srso_update_mitigation(); + + spectre_v1_apply_mitigation(); + spectre_v2_apply_mitigation(); + retbleed_apply_mitigation(); + spectre_v2_user_apply_mitigation(); + ssb_apply_mitigation(); + l1tf_apply_mitigation(); + mds_apply_mitigation(); + taa_apply_mitigation(); + mmio_apply_mitigation(); + rfds_apply_mitigation(); + srbds_apply_mitigation(); + srso_apply_mitigation(); + gds_apply_mitigation(); + its_apply_mitigation(); + bhi_apply_mitigation(); } /* @@ -237,9 +321,9 @@ static void x86_amd_ssb_disable(void) u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_ssbd_mask; if (boot_cpu_has(X86_FEATURE_VIRT_SSBD)) - wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD); + wrmsrq(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD); else if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD)) - wrmsrl(MSR_AMD64_LS_CFG, msrval); + wrmsrq(MSR_AMD64_LS_CFG, msrval); } #undef pr_fmt @@ -290,6 +374,12 @@ enum rfds_mitigations { static enum rfds_mitigations rfds_mitigation __ro_after_init = IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_AUTO : RFDS_MITIGATION_OFF; +/* + * Set if any of MDS/TAA/MMIO/RFDS are going to enable VERW clearing + * through X86_FEATURE_CLEAR_CPU_BUF on kernel and guest entry. + */ +static bool verw_clear_cpu_buf_mitigation_selected __ro_after_init; + static void __init mds_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) { @@ -300,12 +390,34 @@ static void __init mds_select_mitigation(void) if (mds_mitigation == MDS_MITIGATION_AUTO) mds_mitigation = MDS_MITIGATION_FULL; + if (mds_mitigation == MDS_MITIGATION_OFF) + return; + + verw_clear_cpu_buf_mitigation_selected = true; +} + +static void __init mds_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) + return; + + /* If TAA, MMIO, or RFDS are being mitigated, MDS gets mitigated too. */ + if (verw_clear_cpu_buf_mitigation_selected) + mds_mitigation = MDS_MITIGATION_FULL; + if (mds_mitigation == MDS_MITIGATION_FULL) { if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) mds_mitigation = MDS_MITIGATION_VMWERV; + } - setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + pr_info("%s\n", mds_strings[mds_mitigation]); +} +static void __init mds_apply_mitigation(void) +{ + if (mds_mitigation == MDS_MITIGATION_FULL || + mds_mitigation == MDS_MITIGATION_VMWERV) { + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && (mds_nosmt || cpu_mitigations_auto_nosmt())) cpu_smt_disable(false); @@ -345,6 +457,11 @@ static const char * const taa_strings[] = { [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled", }; +static bool __init taa_vulnerable(void) +{ + return boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM); +} + static void __init taa_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_TAA)) { @@ -358,48 +475,63 @@ static void __init taa_select_mitigation(void) return; } - if (cpu_mitigations_off()) { + if (cpu_mitigations_off()) taa_mitigation = TAA_MITIGATION_OFF; - return; - } - /* - * TAA mitigation via VERW is turned off if both - * tsx_async_abort=off and mds=off are specified. - */ - if (taa_mitigation == TAA_MITIGATION_OFF && - mds_mitigation == MDS_MITIGATION_OFF) + /* Microcode will be checked in taa_update_mitigation(). 
*/ + if (taa_mitigation == TAA_MITIGATION_AUTO) + taa_mitigation = TAA_MITIGATION_VERW; + + if (taa_mitigation != TAA_MITIGATION_OFF) + verw_clear_cpu_buf_mitigation_selected = true; +} + +static void __init taa_update_mitigation(void) +{ + if (!taa_vulnerable() || cpu_mitigations_off()) return; - if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) + if (verw_clear_cpu_buf_mitigation_selected) taa_mitigation = TAA_MITIGATION_VERW; - else - taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; - /* - * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1. - * A microcode update fixes this behavior to clear CPU buffers. It also - * adds support for MSR_IA32_TSX_CTRL which is enumerated by the - * ARCH_CAP_TSX_CTRL_MSR bit. - * - * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode - * update is required. - */ - if ( (x86_arch_cap_msr & ARCH_CAP_MDS_NO) && - !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR)) - taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + if (taa_mitigation == TAA_MITIGATION_VERW) { + /* Check if the requisite ucode is available. */ + if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; - /* - * TSX is enabled, select alternate mitigation for TAA which is - * the same as MDS. Enable MDS static branch to clear CPU buffers. - * - * For guests that can't determine whether the correct microcode is - * present on host, enable the mitigation for UCODE_NEEDED as well. - */ - setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + /* + * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1. + * A microcode update fixes this behavior to clear CPU buffers. It also + * adds support for MSR_IA32_TSX_CTRL which is enumerated by the + * ARCH_CAP_TSX_CTRL_MSR bit. + * + * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode + * update is required. + */ + if ((x86_arch_cap_msr & ARCH_CAP_MDS_NO) && + !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR)) + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + } - if (taa_nosmt || cpu_mitigations_auto_nosmt()) - cpu_smt_disable(false); + pr_info("%s\n", taa_strings[taa_mitigation]); +} + +static void __init taa_apply_mitigation(void) +{ + if (taa_mitigation == TAA_MITIGATION_VERW || + taa_mitigation == TAA_MITIGATION_UCODE_NEEDED) { + /* + * TSX is enabled, select alternate mitigation for TAA which is + * the same as MDS. Enable MDS static branch to clear CPU buffers. + * + * For guests that can't determine whether the correct microcode is + * present on host, enable the mitigation for UCODE_NEEDED as well. + */ + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + + if (taa_nosmt || cpu_mitigations_auto_nosmt()) + cpu_smt_disable(false); + } } static int __init tsx_async_abort_parse_cmdline(char *str) @@ -437,31 +569,67 @@ static const char * const mmio_strings[] = { static void __init mmio_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || - boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN) || cpu_mitigations_off()) { mmio_mitigation = MMIO_MITIGATION_OFF; return; } + /* Microcode will be checked in mmio_update_mitigation(). */ + if (mmio_mitigation == MMIO_MITIGATION_AUTO) + mmio_mitigation = MMIO_MITIGATION_VERW; + if (mmio_mitigation == MMIO_MITIGATION_OFF) return; /* * Enable CPU buffer clear mitigation for host and VMM, if also affected - * by MDS or TAA. Otherwise, enable mitigation for VMM only. + * by MDS or TAA. 
*/ - if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) && - boot_cpu_has(X86_FEATURE_RTM))) - setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + if (boot_cpu_has_bug(X86_BUG_MDS) || taa_vulnerable()) + verw_clear_cpu_buf_mitigation_selected = true; +} + +static void __init mmio_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || cpu_mitigations_off()) + return; + + if (verw_clear_cpu_buf_mitigation_selected) + mmio_mitigation = MMIO_MITIGATION_VERW; + + if (mmio_mitigation == MMIO_MITIGATION_VERW) { + /* + * Check if the system has the right microcode. + * + * CPU Fill buffer clear mitigation is enumerated by either an explicit + * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS + * affected systems. + */ + if (!((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) || + (boot_cpu_has(X86_FEATURE_MD_CLEAR) && + boot_cpu_has(X86_FEATURE_FLUSH_L1D) && + !(x86_arch_cap_msr & ARCH_CAP_MDS_NO)))) + mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED; + } + + pr_info("%s\n", mmio_strings[mmio_mitigation]); +} + +static void __init mmio_apply_mitigation(void) +{ + if (mmio_mitigation == MMIO_MITIGATION_OFF) + return; /* - * X86_FEATURE_CLEAR_CPU_BUF could be enabled by other VERW based - * mitigations, disable KVM-only mitigation in that case. + * Only enable the VMM mitigation if the CPU buffer clear mitigation is + * not being used. */ - if (boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF)) - static_branch_disable(&mmio_stale_data_clear); - else - static_branch_enable(&mmio_stale_data_clear); + if (verw_clear_cpu_buf_mitigation_selected) { + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + static_branch_disable(&cpu_buf_vm_clear); + } else { + static_branch_enable(&cpu_buf_vm_clear); + } /* * If Processor-MMIO-Stale-Data bug is present and Fill Buffer data can @@ -471,21 +639,6 @@ static void __init mmio_select_mitigation(void) if (!(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) static_branch_enable(&mds_idle_clear); - /* - * Check if the system has the right microcode. - * - * CPU Fill buffer clear mitigation is enumerated by either an explicit - * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS - * affected systems. 
- */ - if ((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) || - (boot_cpu_has(X86_FEATURE_MD_CLEAR) && - boot_cpu_has(X86_FEATURE_FLUSH_L1D) && - !(x86_arch_cap_msr & ARCH_CAP_MDS_NO))) - mmio_mitigation = MMIO_MITIGATION_VERW; - else - mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED; - if (mmio_nosmt || cpu_mitigations_auto_nosmt()) cpu_smt_disable(false); } @@ -520,22 +673,48 @@ static const char * const rfds_strings[] = { [RFDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", }; +static inline bool __init verw_clears_cpu_reg_file(void) +{ + return (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR); +} + static void __init rfds_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off()) { rfds_mitigation = RFDS_MITIGATION_OFF; return; } + + if (rfds_mitigation == RFDS_MITIGATION_AUTO) + rfds_mitigation = RFDS_MITIGATION_VERW; + if (rfds_mitigation == RFDS_MITIGATION_OFF) return; - if (rfds_mitigation == RFDS_MITIGATION_AUTO) + if (verw_clears_cpu_reg_file()) + verw_clear_cpu_buf_mitigation_selected = true; +} + +static void __init rfds_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off()) + return; + + if (verw_clear_cpu_buf_mitigation_selected) rfds_mitigation = RFDS_MITIGATION_VERW; - if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR) + if (rfds_mitigation == RFDS_MITIGATION_VERW) { + if (!verw_clears_cpu_reg_file()) + rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED; + } + + pr_info("%s\n", rfds_strings[rfds_mitigation]); +} + +static void __init rfds_apply_mitigation(void) +{ + if (rfds_mitigation == RFDS_MITIGATION_VERW) setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); - else - rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED; } static __init int rfds_parse_cmdline(char *str) @@ -556,76 +735,11 @@ static __init int rfds_parse_cmdline(char *str) early_param("reg_file_data_sampling", rfds_parse_cmdline); #undef pr_fmt -#define pr_fmt(fmt) "" fmt - -static void __init md_clear_update_mitigation(void) -{ - if (cpu_mitigations_off()) - return; - - if (!boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF)) - goto out; - - /* - * X86_FEATURE_CLEAR_CPU_BUF is now enabled. Update MDS, TAA and MMIO - * Stale Data mitigation, if necessary. - */ - if (mds_mitigation == MDS_MITIGATION_OFF && - boot_cpu_has_bug(X86_BUG_MDS)) { - mds_mitigation = MDS_MITIGATION_FULL; - mds_select_mitigation(); - } - if (taa_mitigation == TAA_MITIGATION_OFF && - boot_cpu_has_bug(X86_BUG_TAA)) { - taa_mitigation = TAA_MITIGATION_VERW; - taa_select_mitigation(); - } - /* - * MMIO_MITIGATION_OFF is not checked here so that mmio_stale_data_clear - * gets updated correctly as per X86_FEATURE_CLEAR_CPU_BUF state. 
- */ - if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) { - mmio_mitigation = MMIO_MITIGATION_VERW; - mmio_select_mitigation(); - } - if (rfds_mitigation == RFDS_MITIGATION_OFF && - boot_cpu_has_bug(X86_BUG_RFDS)) { - rfds_mitigation = RFDS_MITIGATION_VERW; - rfds_select_mitigation(); - } -out: - if (boot_cpu_has_bug(X86_BUG_MDS)) - pr_info("MDS: %s\n", mds_strings[mds_mitigation]); - if (boot_cpu_has_bug(X86_BUG_TAA)) - pr_info("TAA: %s\n", taa_strings[taa_mitigation]); - if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) - pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]); - else if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) - pr_info("MMIO Stale Data: Unknown: No mitigations\n"); - if (boot_cpu_has_bug(X86_BUG_RFDS)) - pr_info("Register File Data Sampling: %s\n", rfds_strings[rfds_mitigation]); -} - -static void __init md_clear_select_mitigation(void) -{ - mds_select_mitigation(); - taa_select_mitigation(); - mmio_select_mitigation(); - rfds_select_mitigation(); - - /* - * As these mitigations are inter-related and rely on VERW instruction - * to clear the microarchitural buffers, update and print their status - * after mitigation selection is done for each of these vulnerabilities. - */ - md_clear_update_mitigation(); -} - -#undef pr_fmt #define pr_fmt(fmt) "SRBDS: " fmt enum srbds_mitigations { SRBDS_MITIGATION_OFF, + SRBDS_MITIGATION_AUTO, SRBDS_MITIGATION_UCODE_NEEDED, SRBDS_MITIGATION_FULL, SRBDS_MITIGATION_TSX_OFF, @@ -633,7 +747,7 @@ enum srbds_mitigations { }; static enum srbds_mitigations srbds_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_SRBDS) ? SRBDS_MITIGATION_FULL : SRBDS_MITIGATION_OFF; + IS_ENABLED(CONFIG_MITIGATION_SRBDS) ? SRBDS_MITIGATION_AUTO : SRBDS_MITIGATION_OFF; static const char * const srbds_strings[] = { [SRBDS_MITIGATION_OFF] = "Vulnerable", @@ -665,7 +779,7 @@ void update_srbds_msr(void) if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) return; - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); switch (srbds_mitigation) { case SRBDS_MITIGATION_OFF: @@ -679,13 +793,18 @@ void update_srbds_msr(void) break; } - wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); } static void __init srbds_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + if (!boot_cpu_has_bug(X86_BUG_SRBDS) || cpu_mitigations_off()) { + srbds_mitigation = SRBDS_MITIGATION_OFF; return; + } + + if (srbds_mitigation == SRBDS_MITIGATION_AUTO) + srbds_mitigation = SRBDS_MITIGATION_FULL; /* * Check to see if this is one of the MDS_NO systems supporting TSX that @@ -699,13 +818,17 @@ static void __init srbds_select_mitigation(void) srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR; else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED; - else if (cpu_mitigations_off() || srbds_off) + else if (srbds_off) srbds_mitigation = SRBDS_MITIGATION_OFF; - update_srbds_msr(); pr_info("%s\n", srbds_strings[srbds_mitigation]); } +static void __init srbds_apply_mitigation(void) +{ + update_srbds_msr(); +} + static int __init srbds_parse_cmdline(char *str) { if (!str) @@ -752,6 +875,7 @@ early_param("l1d_flush", l1d_flush_parse_cmdline); enum gds_mitigations { GDS_MITIGATION_OFF, + GDS_MITIGATION_AUTO, GDS_MITIGATION_UCODE_NEEDED, GDS_MITIGATION_FORCE, GDS_MITIGATION_FULL, @@ -760,7 +884,7 @@ enum gds_mitigations { }; static enum gds_mitigations gds_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_GDS) ? 
GDS_MITIGATION_FULL : GDS_MITIGATION_OFF; + IS_ENABLED(CONFIG_MITIGATION_GDS) ? GDS_MITIGATION_AUTO : GDS_MITIGATION_OFF; static const char * const gds_strings[] = { [GDS_MITIGATION_OFF] = "Vulnerable", @@ -785,7 +909,7 @@ void update_gds_msr(void) switch (gds_mitigation) { case GDS_MITIGATION_OFF: - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); mcu_ctrl |= GDS_MITG_DIS; break; case GDS_MITIGATION_FULL_LOCKED: @@ -795,23 +919,24 @@ void update_gds_msr(void) * CPUs. */ case GDS_MITIGATION_FULL: - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); mcu_ctrl &= ~GDS_MITG_DIS; break; case GDS_MITIGATION_FORCE: case GDS_MITIGATION_UCODE_NEEDED: case GDS_MITIGATION_HYPERVISOR: + case GDS_MITIGATION_AUTO: return; } - wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); /* * Check to make sure that the WRMSR value was not ignored. Writes to * GDS_MITG_DIS will be ignored if this processor is locked but the boot * processor was not. */ - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after); WARN_ON_ONCE(mcu_ctrl != mcu_ctrl_after); } @@ -824,33 +949,28 @@ static void __init gds_select_mitigation(void) if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { gds_mitigation = GDS_MITIGATION_HYPERVISOR; - goto out; + return; } if (cpu_mitigations_off()) gds_mitigation = GDS_MITIGATION_OFF; /* Will verify below that mitigation _can_ be disabled */ + if (gds_mitigation == GDS_MITIGATION_AUTO) + gds_mitigation = GDS_MITIGATION_FULL; + /* No microcode */ if (!(x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)) { - if (gds_mitigation == GDS_MITIGATION_FORCE) { - /* - * This only needs to be done on the boot CPU so do it - * here rather than in update_gds_msr() - */ - setup_clear_cpu_cap(X86_FEATURE_AVX); - pr_warn("Microcode update needed! Disabling AVX as mitigation.\n"); - } else { + if (gds_mitigation != GDS_MITIGATION_FORCE) gds_mitigation = GDS_MITIGATION_UCODE_NEEDED; - } - goto out; + return; } /* Microcode has mitigation, use it */ if (gds_mitigation == GDS_MITIGATION_FORCE) gds_mitigation = GDS_MITIGATION_FULL; - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); if (mcu_ctrl & GDS_MITG_LOCKED) { if (gds_mitigation == GDS_MITIGATION_OFF) pr_warn("Mitigation locked. Disable failed.\n"); @@ -864,9 +984,25 @@ static void __init gds_select_mitigation(void) */ gds_mitigation = GDS_MITIGATION_FULL_LOCKED; } +} + +static void __init gds_apply_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_GDS)) + return; + + /* Microcode is present */ + if (x86_arch_cap_msr & ARCH_CAP_GDS_CTRL) + update_gds_msr(); + else if (gds_mitigation == GDS_MITIGATION_FORCE) { + /* + * This only needs to be done on the boot CPU so do it + * here rather than in update_gds_msr() + */ + setup_clear_cpu_cap(X86_FEATURE_AVX); + pr_warn("Microcode update needed! 
Disabling AVX as mitigation.\n"); + } - update_gds_msr(); -out: pr_info("%s\n", gds_strings[gds_mitigation]); } @@ -927,10 +1063,14 @@ static bool smap_works_speculatively(void) static void __init spectre_v1_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) { + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE; +} + +static void __init spectre_v1_apply_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) return; - } if (spectre_v1_mitigation == SPECTRE_V1_MITIGATION_AUTO) { /* @@ -983,8 +1123,20 @@ enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE; #undef pr_fmt #define pr_fmt(fmt) "RETBleed: " fmt +enum its_mitigation { + ITS_MITIGATION_OFF, + ITS_MITIGATION_AUTO, + ITS_MITIGATION_VMEXIT_ONLY, + ITS_MITIGATION_ALIGNED_THUNKS, + ITS_MITIGATION_RETPOLINE_STUFF, +}; + +static enum its_mitigation its_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_ITS) ? ITS_MITIGATION_AUTO : ITS_MITIGATION_OFF; + enum retbleed_mitigation { RETBLEED_MITIGATION_NONE, + RETBLEED_MITIGATION_AUTO, RETBLEED_MITIGATION_UNRET, RETBLEED_MITIGATION_IBPB, RETBLEED_MITIGATION_IBRS, @@ -992,14 +1144,6 @@ enum retbleed_mitigation { RETBLEED_MITIGATION_STUFF, }; -enum retbleed_mitigation_cmd { - RETBLEED_CMD_OFF, - RETBLEED_CMD_AUTO, - RETBLEED_CMD_UNRET, - RETBLEED_CMD_IBPB, - RETBLEED_CMD_STUFF, -}; - static const char * const retbleed_strings[] = { [RETBLEED_MITIGATION_NONE] = "Vulnerable", [RETBLEED_MITIGATION_UNRET] = "Mitigation: untrained return thunk", @@ -1010,9 +1154,7 @@ static const char * const retbleed_strings[] = { }; static enum retbleed_mitigation retbleed_mitigation __ro_after_init = - RETBLEED_MITIGATION_NONE; -static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_RETBLEED) ? RETBLEED_CMD_AUTO : RETBLEED_CMD_OFF; + IS_ENABLED(CONFIG_MITIGATION_RETBLEED) ? 
RETBLEED_MITIGATION_AUTO : RETBLEED_MITIGATION_NONE; static int __ro_after_init retbleed_nosmt = false; @@ -1029,15 +1171,15 @@ static int __init retbleed_parse_cmdline(char *str) } if (!strcmp(str, "off")) { - retbleed_cmd = RETBLEED_CMD_OFF; + retbleed_mitigation = RETBLEED_MITIGATION_NONE; } else if (!strcmp(str, "auto")) { - retbleed_cmd = RETBLEED_CMD_AUTO; + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; } else if (!strcmp(str, "unret")) { - retbleed_cmd = RETBLEED_CMD_UNRET; + retbleed_mitigation = RETBLEED_MITIGATION_UNRET; } else if (!strcmp(str, "ibpb")) { - retbleed_cmd = RETBLEED_CMD_IBPB; + retbleed_mitigation = RETBLEED_MITIGATION_IBPB; } else if (!strcmp(str, "stuff")) { - retbleed_cmd = RETBLEED_CMD_STUFF; + retbleed_mitigation = RETBLEED_MITIGATION_STUFF; } else if (!strcmp(str, "nosmt")) { retbleed_nosmt = true; } else if (!strcmp(str, "force")) { @@ -1058,72 +1200,117 @@ early_param("retbleed", retbleed_parse_cmdline); static void __init retbleed_select_mitigation(void) { - bool mitigate_smt = false; - - if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) - return; - - switch (retbleed_cmd) { - case RETBLEED_CMD_OFF: + if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) { + retbleed_mitigation = RETBLEED_MITIGATION_NONE; return; + } - case RETBLEED_CMD_UNRET: - if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) { - retbleed_mitigation = RETBLEED_MITIGATION_UNRET; - } else { + switch (retbleed_mitigation) { + case RETBLEED_MITIGATION_UNRET: + if (!IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) { + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; pr_err("WARNING: kernel not compiled with MITIGATION_UNRET_ENTRY.\n"); - goto do_cmd_auto; } break; - - case RETBLEED_CMD_IBPB: + case RETBLEED_MITIGATION_IBPB: if (!boot_cpu_has(X86_FEATURE_IBPB)) { pr_err("WARNING: CPU does not support IBPB.\n"); - goto do_cmd_auto; - } else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { - retbleed_mitigation = RETBLEED_MITIGATION_IBPB; - } else { + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; + } else if (!IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); - goto do_cmd_auto; + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; } break; + case RETBLEED_MITIGATION_STUFF: + if (!IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) { + pr_err("WARNING: kernel not compiled with MITIGATION_CALL_DEPTH_TRACKING.\n"); + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; + } else if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + pr_err("WARNING: retbleed=stuff only supported for Intel CPUs.\n"); + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; + } + break; + default: + break; + } - case RETBLEED_CMD_STUFF: - if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) && - spectre_v2_enabled == SPECTRE_V2_RETPOLINE) { - retbleed_mitigation = RETBLEED_MITIGATION_STUFF; + if (retbleed_mitigation != RETBLEED_MITIGATION_AUTO) + return; + /* Intel mitigation selected in retbleed_update_mitigation() */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { + if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) + retbleed_mitigation = RETBLEED_MITIGATION_UNRET; + else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) && + boot_cpu_has(X86_FEATURE_IBPB)) + retbleed_mitigation = RETBLEED_MITIGATION_IBPB; + else + retbleed_mitigation = RETBLEED_MITIGATION_NONE; + } +} + +static void __init retbleed_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) + return; 
+ + if (retbleed_mitigation == RETBLEED_MITIGATION_NONE) + goto out; + + /* + * retbleed=stuff is only allowed on Intel. If stuffing can't be used + * then a different mitigation will be selected below. + * + * its=stuff will also attempt to enable stuffing. + */ + if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF || + its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF) { + if (spectre_v2_enabled != SPECTRE_V2_RETPOLINE) { + pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n"); + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; } else { - if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) - pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n"); - else - pr_err("WARNING: kernel not compiled with MITIGATION_CALL_DEPTH_TRACKING.\n"); + if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) + pr_info("Retbleed mitigation updated to stuffing\n"); - goto do_cmd_auto; + retbleed_mitigation = RETBLEED_MITIGATION_STUFF; } - break; - -do_cmd_auto: - case RETBLEED_CMD_AUTO: - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) - retbleed_mitigation = RETBLEED_MITIGATION_UNRET; - else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) && - boot_cpu_has(X86_FEATURE_IBPB)) - retbleed_mitigation = RETBLEED_MITIGATION_IBPB; + } + /* + * Let IBRS trump all on Intel without affecting the effects of the + * retbleed= cmdline option except for call depth based stuffing + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + switch (spectre_v2_enabled) { + case SPECTRE_V2_IBRS: + retbleed_mitigation = RETBLEED_MITIGATION_IBRS; + break; + case SPECTRE_V2_EIBRS: + case SPECTRE_V2_EIBRS_RETPOLINE: + case SPECTRE_V2_EIBRS_LFENCE: + retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; + break; + default: + if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) + pr_err(RETBLEED_INTEL_MSG); } + /* If nothing has set the mitigation yet, default to NONE. */ + if (retbleed_mitigation == RETBLEED_MITIGATION_AUTO) + retbleed_mitigation = RETBLEED_MITIGATION_NONE; + } +out: + pr_info("%s\n", retbleed_strings[retbleed_mitigation]); +} - /* - * The Intel mitigation (IBRS or eIBRS) was already selected in - * spectre_v2_select_mitigation(). 'retbleed_mitigation' will - * be set accordingly below. 
- */ - break; - } +static void __init retbleed_apply_mitigation(void) +{ + bool mitigate_smt = false; switch (retbleed_mitigation) { + case RETBLEED_MITIGATION_NONE: + return; + case RETBLEED_MITIGATION_UNRET: setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_UNRET); @@ -1173,47 +1360,11 @@ do_cmd_auto: if (mitigate_smt && !boot_cpu_has(X86_FEATURE_STIBP) && (retbleed_nosmt || cpu_mitigations_auto_nosmt())) cpu_smt_disable(false); - - /* - * Let IBRS trump all on Intel without affecting the effects of the - * retbleed= cmdline option except for call depth based stuffing - */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { - switch (spectre_v2_enabled) { - case SPECTRE_V2_IBRS: - retbleed_mitigation = RETBLEED_MITIGATION_IBRS; - break; - case SPECTRE_V2_EIBRS: - case SPECTRE_V2_EIBRS_RETPOLINE: - case SPECTRE_V2_EIBRS_LFENCE: - retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; - break; - default: - if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) - pr_err(RETBLEED_INTEL_MSG); - } - } - - pr_info("%s\n", retbleed_strings[retbleed_mitigation]); } #undef pr_fmt #define pr_fmt(fmt) "ITS: " fmt -enum its_mitigation_cmd { - ITS_CMD_OFF, - ITS_CMD_ON, - ITS_CMD_VMEXIT, - ITS_CMD_RSB_STUFF, -}; - -enum its_mitigation { - ITS_MITIGATION_OFF, - ITS_MITIGATION_VMEXIT_ONLY, - ITS_MITIGATION_ALIGNED_THUNKS, - ITS_MITIGATION_RETPOLINE_STUFF, -}; - static const char * const its_strings[] = { [ITS_MITIGATION_OFF] = "Vulnerable", [ITS_MITIGATION_VMEXIT_ONLY] = "Mitigation: Vulnerable, KVM: Not affected", @@ -1221,11 +1372,6 @@ static const char * const its_strings[] = { [ITS_MITIGATION_RETPOLINE_STUFF] = "Mitigation: Retpolines, Stuffing RSB", }; -static enum its_mitigation its_mitigation __ro_after_init = ITS_MITIGATION_ALIGNED_THUNKS; - -static enum its_mitigation_cmd its_cmd __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_ITS) ? 
ITS_CMD_ON : ITS_CMD_OFF; - static int __init its_parse_cmdline(char *str) { if (!str) @@ -1237,16 +1383,16 @@ static int __init its_parse_cmdline(char *str) } if (!strcmp(str, "off")) { - its_cmd = ITS_CMD_OFF; + its_mitigation = ITS_MITIGATION_OFF; } else if (!strcmp(str, "on")) { - its_cmd = ITS_CMD_ON; + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; } else if (!strcmp(str, "force")) { - its_cmd = ITS_CMD_ON; + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; setup_force_cpu_bug(X86_BUG_ITS); } else if (!strcmp(str, "vmexit")) { - its_cmd = ITS_CMD_VMEXIT; + its_mitigation = ITS_MITIGATION_VMEXIT_ONLY; } else if (!strcmp(str, "stuff")) { - its_cmd = ITS_CMD_RSB_STUFF; + its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF; } else { pr_err("Ignoring unknown indirect_target_selection option (%s).", str); } @@ -1257,85 +1403,90 @@ early_param("indirect_target_selection", its_parse_cmdline); static void __init its_select_mitigation(void) { - enum its_mitigation_cmd cmd = its_cmd; - if (!boot_cpu_has_bug(X86_BUG_ITS) || cpu_mitigations_off()) { its_mitigation = ITS_MITIGATION_OFF; return; } - /* Retpoline+CDT mitigates ITS, bail out */ - if (boot_cpu_has(X86_FEATURE_RETPOLINE) && - boot_cpu_has(X86_FEATURE_CALL_DEPTH)) { - its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF; - goto out; - } + if (its_mitigation == ITS_MITIGATION_AUTO) + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; + + if (its_mitigation == ITS_MITIGATION_OFF) + return; - /* Exit early to avoid irrelevant warnings */ - if (cmd == ITS_CMD_OFF) { - its_mitigation = ITS_MITIGATION_OFF; - goto out; - } - if (spectre_v2_enabled == SPECTRE_V2_NONE) { - pr_err("WARNING: Spectre-v2 mitigation is off, disabling ITS\n"); - its_mitigation = ITS_MITIGATION_OFF; - goto out; - } if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) || !IS_ENABLED(CONFIG_MITIGATION_RETHUNK)) { pr_err("WARNING: ITS mitigation depends on retpoline and rethunk support\n"); its_mitigation = ITS_MITIGATION_OFF; - goto out; + return; } + if (IS_ENABLED(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)) { pr_err("WARNING: ITS mitigation is not compatible with CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B\n"); its_mitigation = ITS_MITIGATION_OFF; - goto out; - } - if (boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE)) { - pr_err("WARNING: ITS mitigation is not compatible with lfence mitigation\n"); - its_mitigation = ITS_MITIGATION_OFF; - goto out; + return; } - if (cmd == ITS_CMD_RSB_STUFF && - (!boot_cpu_has(X86_FEATURE_RETPOLINE) || !IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING))) { + if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF && + !IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) { pr_err("RSB stuff mitigation not supported, using default\n"); - cmd = ITS_CMD_ON; + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; } - switch (cmd) { - case ITS_CMD_OFF: + if (its_mitigation == ITS_MITIGATION_VMEXIT_ONLY && + !boot_cpu_has_bug(X86_BUG_ITS_NATIVE_ONLY)) + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; +} + +static void __init its_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_ITS) || cpu_mitigations_off()) + return; + + switch (spectre_v2_enabled) { + case SPECTRE_V2_NONE: + pr_err("WARNING: Spectre-v2 mitigation is off, disabling ITS\n"); its_mitigation = ITS_MITIGATION_OFF; break; - case ITS_CMD_VMEXIT: - if (boot_cpu_has_bug(X86_BUG_ITS_NATIVE_ONLY)) { - its_mitigation = ITS_MITIGATION_VMEXIT_ONLY; - goto out; - } - fallthrough; - case ITS_CMD_ON: - its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; - if (!boot_cpu_has(X86_FEATURE_RETPOLINE)) - 
setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS); - setup_force_cpu_cap(X86_FEATURE_RETHUNK); - set_return_thunk(its_return_thunk); + case SPECTRE_V2_RETPOLINE: + /* Retpoline+CDT mitigates ITS */ + if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF) + its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF; break; - case ITS_CMD_RSB_STUFF: - its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF; - setup_force_cpu_cap(X86_FEATURE_RETHUNK); - setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH); - set_return_thunk(call_depth_return_thunk); - if (retbleed_mitigation == RETBLEED_MITIGATION_NONE) { - retbleed_mitigation = RETBLEED_MITIGATION_STUFF; - pr_info("Retbleed mitigation updated to stuffing\n"); - } + case SPECTRE_V2_LFENCE: + case SPECTRE_V2_EIBRS_LFENCE: + pr_err("WARNING: ITS mitigation is not compatible with lfence mitigation\n"); + its_mitigation = ITS_MITIGATION_OFF; + break; + default: break; } -out: + + /* + * retbleed_update_mitigation() will try to do stuffing if its=stuff. + * If it can't, such as if spectre_v2!=retpoline, then fall back to + * aligned thunks. + */ + if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF && + retbleed_mitigation != RETBLEED_MITIGATION_STUFF) + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; + pr_info("%s\n", its_strings[its_mitigation]); } +static void __init its_apply_mitigation(void) +{ + /* its=stuff forces retbleed stuffing and is enabled there. */ + if (its_mitigation != ITS_MITIGATION_ALIGNED_THUNKS) + return; + + if (!boot_cpu_has(X86_FEATURE_RETPOLINE)) + setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS); + + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + set_return_thunk(its_return_thunk); +} + #undef pr_fmt #define pr_fmt(fmt) "Spectre V2 : " fmt @@ -1413,6 +1564,8 @@ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_IBRS, }; +static enum spectre_v2_mitigation_cmd spectre_v2_cmd __ro_after_init = SPECTRE_V2_CMD_AUTO; + enum spectre_v2_user_cmd { SPECTRE_V2_USER_CMD_NONE, SPECTRE_V2_USER_CMD_AUTO, @@ -1451,31 +1604,18 @@ static void __init spec_v2_user_print_cond(const char *reason, bool secure) pr_info("spectre_v2_user=%s forced on command line.\n", reason); } -static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd; - -static enum spectre_v2_user_cmd __init -spectre_v2_parse_user_cmdline(void) +static enum spectre_v2_user_cmd __init spectre_v2_parse_user_cmdline(void) { - enum spectre_v2_user_cmd mode; char arg[20]; int ret, i; - mode = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? - SPECTRE_V2_USER_CMD_AUTO : SPECTRE_V2_USER_CMD_NONE; - - switch (spectre_v2_cmd) { - case SPECTRE_V2_CMD_NONE: + if (cpu_mitigations_off() || !IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2)) return SPECTRE_V2_USER_CMD_NONE; - case SPECTRE_V2_CMD_FORCE: - return SPECTRE_V2_USER_CMD_FORCE; - default: - break; - } ret = cmdline_find_option(boot_command_line, "spectre_v2_user", arg, sizeof(arg)); if (ret < 0) - return mode; + return SPECTRE_V2_USER_CMD_AUTO; for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) { if (match_option(arg, ret, v2_user_options[i].option)) { @@ -1486,7 +1626,7 @@ spectre_v2_parse_user_cmdline(void) } pr_err("Unknown user space protection option (%s). 
Switching to default\n", arg); - return mode; + return SPECTRE_V2_USER_CMD_AUTO; } static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) @@ -1494,60 +1634,72 @@ static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) return spectre_v2_in_eibrs_mode(mode) || mode == SPECTRE_V2_IBRS; } -static void __init -spectre_v2_user_select_mitigation(void) +static void __init spectre_v2_user_select_mitigation(void) { - enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE; - enum spectre_v2_user_cmd cmd; - if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) return; - cmd = spectre_v2_parse_user_cmdline(); - switch (cmd) { + switch (spectre_v2_parse_user_cmdline()) { case SPECTRE_V2_USER_CMD_NONE: - goto set_mode; + return; case SPECTRE_V2_USER_CMD_FORCE: - mode = SPECTRE_V2_USER_STRICT; + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; + spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT; break; case SPECTRE_V2_USER_CMD_AUTO: case SPECTRE_V2_USER_CMD_PRCTL: + spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL; + spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL; + break; case SPECTRE_V2_USER_CMD_PRCTL_IBPB: - mode = SPECTRE_V2_USER_PRCTL; + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; + spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL; break; case SPECTRE_V2_USER_CMD_SECCOMP: + if (IS_ENABLED(CONFIG_SECCOMP)) + spectre_v2_user_ibpb = SPECTRE_V2_USER_SECCOMP; + else + spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL; + spectre_v2_user_stibp = spectre_v2_user_ibpb; + break; case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; if (IS_ENABLED(CONFIG_SECCOMP)) - mode = SPECTRE_V2_USER_SECCOMP; + spectre_v2_user_stibp = SPECTRE_V2_USER_SECCOMP; else - mode = SPECTRE_V2_USER_PRCTL; + spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL; break; } - /* Initialize Indirect Branch Prediction Barrier */ - if (boot_cpu_has(X86_FEATURE_IBPB)) { - static_branch_enable(&switch_vcpu_ibpb); + /* + * At this point, an STIBP mode other than "off" has been set. + * If STIBP support is not being forced, check if STIBP always-on + * is preferred. + */ + if ((spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL || + spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) && + boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) + spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT_PREFERRED; - spectre_v2_user_ibpb = mode; - switch (cmd) { - case SPECTRE_V2_USER_CMD_NONE: - break; - case SPECTRE_V2_USER_CMD_FORCE: - case SPECTRE_V2_USER_CMD_PRCTL_IBPB: - case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: - static_branch_enable(&switch_mm_always_ibpb); - spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; - break; - case SPECTRE_V2_USER_CMD_PRCTL: - case SPECTRE_V2_USER_CMD_AUTO: - case SPECTRE_V2_USER_CMD_SECCOMP: - static_branch_enable(&switch_mm_cond_ibpb); - break; - } + if (!boot_cpu_has(X86_FEATURE_IBPB)) + spectre_v2_user_ibpb = SPECTRE_V2_USER_NONE; - pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", - static_key_enabled(&switch_mm_always_ibpb) ? 
- "always-on" : "conditional"); + if (!boot_cpu_has(X86_FEATURE_STIBP)) + spectre_v2_user_stibp = SPECTRE_V2_USER_NONE; +} + +static void __init spectre_v2_user_update_mitigation(void) +{ + if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) + return; + + /* The spectre_v2 cmd line can override spectre_v2_user options */ + if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE) { + spectre_v2_user_ibpb = SPECTRE_V2_USER_NONE; + spectre_v2_user_stibp = SPECTRE_V2_USER_NONE; + } else if (spectre_v2_cmd == SPECTRE_V2_CMD_FORCE) { + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; + spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT; } /* @@ -1565,30 +1717,44 @@ spectre_v2_user_select_mitigation(void) if (!boot_cpu_has(X86_FEATURE_STIBP) || !cpu_smt_possible() || (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && - !boot_cpu_has(X86_FEATURE_AUTOIBRS))) + !boot_cpu_has(X86_FEATURE_AUTOIBRS))) { + spectre_v2_user_stibp = SPECTRE_V2_USER_NONE; return; + } - /* - * At this point, an STIBP mode other than "off" has been set. - * If STIBP support is not being forced, check if STIBP always-on - * is preferred. - */ - if (mode != SPECTRE_V2_USER_STRICT && - boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) - mode = SPECTRE_V2_USER_STRICT_PREFERRED; - - if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET || - retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { - if (mode != SPECTRE_V2_USER_STRICT && - mode != SPECTRE_V2_USER_STRICT_PREFERRED) + if (spectre_v2_user_stibp != SPECTRE_V2_USER_NONE && + (retbleed_mitigation == RETBLEED_MITIGATION_UNRET || + retbleed_mitigation == RETBLEED_MITIGATION_IBPB)) { + if (spectre_v2_user_stibp != SPECTRE_V2_USER_STRICT && + spectre_v2_user_stibp != SPECTRE_V2_USER_STRICT_PREFERRED) pr_info("Selecting STIBP always-on mode to complement retbleed mitigation\n"); - mode = SPECTRE_V2_USER_STRICT_PREFERRED; + spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT_PREFERRED; } + pr_info("%s\n", spectre_v2_user_strings[spectre_v2_user_stibp]); +} - spectre_v2_user_stibp = mode; +static void __init spectre_v2_user_apply_mitigation(void) +{ + /* Initialize Indirect Branch Prediction Barrier */ + if (spectre_v2_user_ibpb != SPECTRE_V2_USER_NONE) { + static_branch_enable(&switch_vcpu_ibpb); + + switch (spectre_v2_user_ibpb) { + case SPECTRE_V2_USER_STRICT: + static_branch_enable(&switch_mm_always_ibpb); + break; + case SPECTRE_V2_USER_PRCTL: + case SPECTRE_V2_USER_SECCOMP: + static_branch_enable(&switch_mm_cond_ibpb); + break; + default: + break; + } -set_mode: - pr_info("%s\n", spectre_v2_user_strings[mode]); + pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", + static_key_enabled(&switch_mm_always_ibpb) ? + "always-on" : "conditional"); + } } static const char * const spectre_v2_strings[] = { @@ -1808,12 +1974,13 @@ static bool __init spec_ctrl_bhi_dis(void) enum bhi_mitigations { BHI_MITIGATION_OFF, + BHI_MITIGATION_AUTO, BHI_MITIGATION_ON, BHI_MITIGATION_VMEXIT_ONLY, }; static enum bhi_mitigations bhi_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_ON : BHI_MITIGATION_OFF; + IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? 
BHI_MITIGATION_AUTO : BHI_MITIGATION_OFF; static int __init spectre_bhi_parse_cmdline(char *str) { @@ -1835,6 +2002,25 @@ early_param("spectre_bhi", spectre_bhi_parse_cmdline); static void __init bhi_select_mitigation(void) { + if (!boot_cpu_has(X86_BUG_BHI) || cpu_mitigations_off()) + bhi_mitigation = BHI_MITIGATION_OFF; + + if (bhi_mitigation == BHI_MITIGATION_AUTO) + bhi_mitigation = BHI_MITIGATION_ON; +} + +static void __init bhi_update_mitigation(void) +{ + if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE) + bhi_mitigation = BHI_MITIGATION_OFF; + + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && + spectre_v2_cmd == SPECTRE_V2_CMD_AUTO) + bhi_mitigation = BHI_MITIGATION_OFF; +} + +static void __init bhi_apply_mitigation(void) +{ if (bhi_mitigation == BHI_MITIGATION_OFF) return; @@ -1855,86 +2041,92 @@ static void __init bhi_select_mitigation(void) if (bhi_mitigation == BHI_MITIGATION_VMEXIT_ONLY) { pr_info("Spectre BHI mitigation: SW BHB clearing on VM exit only\n"); - setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT); + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_VMEXIT); return; } pr_info("Spectre BHI mitigation: SW BHB clearing on syscall and VM exit\n"); setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP); - setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT); + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_VMEXIT); } static void __init spectre_v2_select_mitigation(void) { - enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); - enum spectre_v2_mitigation mode = SPECTRE_V2_NONE; + spectre_v2_cmd = spectre_v2_parse_cmdline(); - /* - * If the CPU is not affected and the command line mode is NONE or AUTO - * then nothing to do. - */ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && - (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO)) + (spectre_v2_cmd == SPECTRE_V2_CMD_NONE || spectre_v2_cmd == SPECTRE_V2_CMD_AUTO)) return; - switch (cmd) { + switch (spectre_v2_cmd) { case SPECTRE_V2_CMD_NONE: return; case SPECTRE_V2_CMD_FORCE: case SPECTRE_V2_CMD_AUTO: if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { - mode = SPECTRE_V2_EIBRS; - break; - } - - if (IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY) && - boot_cpu_has_bug(X86_BUG_RETBLEED) && - retbleed_cmd != RETBLEED_CMD_OFF && - retbleed_cmd != RETBLEED_CMD_STUFF && - boot_cpu_has(X86_FEATURE_IBRS) && - boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { - mode = SPECTRE_V2_IBRS; + spectre_v2_enabled = SPECTRE_V2_EIBRS; break; } - mode = spectre_v2_select_retpoline(); + spectre_v2_enabled = spectre_v2_select_retpoline(); break; case SPECTRE_V2_CMD_RETPOLINE_LFENCE: pr_err(SPECTRE_V2_LFENCE_MSG); - mode = SPECTRE_V2_LFENCE; + spectre_v2_enabled = SPECTRE_V2_LFENCE; break; case SPECTRE_V2_CMD_RETPOLINE_GENERIC: - mode = SPECTRE_V2_RETPOLINE; + spectre_v2_enabled = SPECTRE_V2_RETPOLINE; break; case SPECTRE_V2_CMD_RETPOLINE: - mode = spectre_v2_select_retpoline(); + spectre_v2_enabled = spectre_v2_select_retpoline(); break; case SPECTRE_V2_CMD_IBRS: - mode = SPECTRE_V2_IBRS; + spectre_v2_enabled = SPECTRE_V2_IBRS; break; case SPECTRE_V2_CMD_EIBRS: - mode = SPECTRE_V2_EIBRS; + spectre_v2_enabled = SPECTRE_V2_EIBRS; break; case SPECTRE_V2_CMD_EIBRS_LFENCE: - mode = SPECTRE_V2_EIBRS_LFENCE; + spectre_v2_enabled = SPECTRE_V2_EIBRS_LFENCE; break; case SPECTRE_V2_CMD_EIBRS_RETPOLINE: - mode = SPECTRE_V2_EIBRS_RETPOLINE; + spectre_v2_enabled = SPECTRE_V2_EIBRS_RETPOLINE; break; } +} + +static void __init spectre_v2_update_mitigation(void) +{ + if (spectre_v2_cmd == SPECTRE_V2_CMD_AUTO && + !spectre_v2_in_eibrs_mode(spectre_v2_enabled)) { + if 
(IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY) && + boot_cpu_has_bug(X86_BUG_RETBLEED) && + retbleed_mitigation != RETBLEED_MITIGATION_NONE && + retbleed_mitigation != RETBLEED_MITIGATION_STUFF && + boot_cpu_has(X86_FEATURE_IBRS) && + boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + spectre_v2_enabled = SPECTRE_V2_IBRS; + } + } - if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && !cpu_mitigations_off()) + pr_info("%s\n", spectre_v2_strings[spectre_v2_enabled]); +} + +static void __init spectre_v2_apply_mitigation(void) +{ + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); - if (spectre_v2_in_ibrs_mode(mode)) { + if (spectre_v2_in_ibrs_mode(spectre_v2_enabled)) { if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) { msr_set_bit(MSR_EFER, _EFER_AUTOIBRS); } else { @@ -1943,8 +2135,10 @@ static void __init spectre_v2_select_mitigation(void) } } - switch (mode) { + switch (spectre_v2_enabled) { case SPECTRE_V2_NONE: + return; + case SPECTRE_V2_EIBRS: break; @@ -1970,18 +2164,12 @@ static void __init spectre_v2_select_mitigation(void) * JMPs gets protection against BHI and Intramode-BTI, but RET * prediction from a non-RSB predictor is still a risk. */ - if (mode == SPECTRE_V2_EIBRS_LFENCE || - mode == SPECTRE_V2_EIBRS_RETPOLINE || - mode == SPECTRE_V2_RETPOLINE) + if (spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE || + spectre_v2_enabled == SPECTRE_V2_EIBRS_RETPOLINE || + spectre_v2_enabled == SPECTRE_V2_RETPOLINE) spec_ctrl_disable_kernel_rrsba(); - if (boot_cpu_has(X86_BUG_BHI)) - bhi_select_mitigation(); - - spectre_v2_enabled = mode; - pr_info("%s\n", spectre_v2_strings[mode]); - - spectre_v2_select_rsb_mitigation(mode); + spectre_v2_select_rsb_mitigation(spectre_v2_enabled); /* * Retpoline protects the kernel, but doesn't protect firmware. IBRS @@ -1989,28 +2177,26 @@ static void __init spectre_v2_select_mitigation(void) * firmware calls only when IBRS / Enhanced / Automatic IBRS aren't * otherwise enabled. * - * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because - * the user might select retpoline on the kernel command line and if - * the CPU supports Enhanced IBRS, kernel might un-intentionally not - * enable IBRS around firmware calls. + * Use "spectre_v2_enabled" to check Enhanced IBRS instead of + * boot_cpu_has(), because the user might select retpoline on the kernel + * command line and if the CPU supports Enhanced IBRS, kernel might + * un-intentionally not enable IBRS around firmware calls. 
*/ if (boot_cpu_has_bug(X86_BUG_RETBLEED) && boot_cpu_has(X86_FEATURE_IBPB) && (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) { - if (retbleed_cmd != RETBLEED_CMD_IBPB) { + if (retbleed_mitigation != RETBLEED_MITIGATION_IBPB) { setup_force_cpu_cap(X86_FEATURE_USE_IBPB_FW); pr_info("Enabling Speculation Barrier for firmware calls\n"); } - } else if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_ibrs_mode(mode)) { + } else if (boot_cpu_has(X86_FEATURE_IBRS) && + !spectre_v2_in_ibrs_mode(spectre_v2_enabled)) { setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); pr_info("Enabling Restricted Speculation for firmware calls\n"); } - - /* Set up IBPB and STIBP depending on the general spectre V2 command */ - spectre_v2_cmd = cmd; } static void update_stibp_msr(void * __unused) @@ -2199,19 +2385,18 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) return cmd; } -static enum ssb_mitigation __init __ssb_select_mitigation(void) +static void __init ssb_select_mitigation(void) { - enum ssb_mitigation mode = SPEC_STORE_BYPASS_NONE; enum ssb_mitigation_cmd cmd; if (!boot_cpu_has(X86_FEATURE_SSBD)) - return mode; + goto out; cmd = ssb_parse_cmdline(); if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) && (cmd == SPEC_STORE_BYPASS_CMD_NONE || cmd == SPEC_STORE_BYPASS_CMD_AUTO)) - return mode; + return; switch (cmd) { case SPEC_STORE_BYPASS_CMD_SECCOMP: @@ -2220,28 +2405,35 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) * enabled. */ if (IS_ENABLED(CONFIG_SECCOMP)) - mode = SPEC_STORE_BYPASS_SECCOMP; + ssb_mode = SPEC_STORE_BYPASS_SECCOMP; else - mode = SPEC_STORE_BYPASS_PRCTL; + ssb_mode = SPEC_STORE_BYPASS_PRCTL; break; case SPEC_STORE_BYPASS_CMD_ON: - mode = SPEC_STORE_BYPASS_DISABLE; + ssb_mode = SPEC_STORE_BYPASS_DISABLE; break; case SPEC_STORE_BYPASS_CMD_AUTO: case SPEC_STORE_BYPASS_CMD_PRCTL: - mode = SPEC_STORE_BYPASS_PRCTL; + ssb_mode = SPEC_STORE_BYPASS_PRCTL; break; case SPEC_STORE_BYPASS_CMD_NONE: break; } +out: + if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + pr_info("%s\n", ssb_strings[ssb_mode]); +} + +static void __init ssb_apply_mitigation(void) +{ /* * We have three CPU feature flags that are in play here: * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible. * - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass * - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation */ - if (mode == SPEC_STORE_BYPASS_DISABLE) { + if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) { setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE); /* * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may @@ -2255,16 +2447,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) update_spec_ctrl(x86_spec_ctrl_base); } } - - return mode; -} - -static void ssb_select_mitigation(void) -{ - ssb_mode = __ssb_select_mitigation(); - - if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) - pr_info("%s\n", ssb_strings[ssb_mode]); } #undef pr_fmt @@ -2520,7 +2702,7 @@ EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); /* Default mitigation for L1TF-affected CPUs */ enum l1tf_mitigations l1tf_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_FLUSH : L1TF_MITIGATION_OFF; + IS_ENABLED(CONFIG_MITIGATION_L1TF) ? 
L1TF_MITIGATION_AUTO : L1TF_MITIGATION_OFF; #if IS_ENABLED(CONFIG_KVM_INTEL) EXPORT_SYMBOL_GPL(l1tf_mitigation); #endif @@ -2568,22 +2750,33 @@ static void override_cache_bits(struct cpuinfo_x86 *c) static void __init l1tf_select_mitigation(void) { + if (!boot_cpu_has_bug(X86_BUG_L1TF) || cpu_mitigations_off()) { + l1tf_mitigation = L1TF_MITIGATION_OFF; + return; + } + + if (l1tf_mitigation == L1TF_MITIGATION_AUTO) { + if (cpu_mitigations_auto_nosmt()) + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; + else + l1tf_mitigation = L1TF_MITIGATION_FLUSH; + } +} + +static void __init l1tf_apply_mitigation(void) +{ u64 half_pa; if (!boot_cpu_has_bug(X86_BUG_L1TF)) return; - if (cpu_mitigations_off()) - l1tf_mitigation = L1TF_MITIGATION_OFF; - else if (cpu_mitigations_auto_nosmt()) - l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; - override_cache_bits(&boot_cpu_data); switch (l1tf_mitigation) { case L1TF_MITIGATION_OFF: case L1TF_MITIGATION_FLUSH_NOWARN: case L1TF_MITIGATION_FLUSH: + case L1TF_MITIGATION_AUTO: break; case L1TF_MITIGATION_FLUSH_NOSMT: case L1TF_MITIGATION_FULL: @@ -2643,6 +2836,7 @@ early_param("l1tf", l1tf_cmdline); enum srso_mitigation { SRSO_MITIGATION_NONE, + SRSO_MITIGATION_AUTO, SRSO_MITIGATION_UCODE_NEEDED, SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED, SRSO_MITIGATION_MICROCODE, @@ -2652,14 +2846,6 @@ enum srso_mitigation { SRSO_MITIGATION_BP_SPEC_REDUCE, }; -enum srso_mitigation_cmd { - SRSO_CMD_OFF, - SRSO_CMD_MICROCODE, - SRSO_CMD_SAFE_RET, - SRSO_CMD_IBPB, - SRSO_CMD_IBPB_ON_VMEXIT, -}; - static const char * const srso_strings[] = { [SRSO_MITIGATION_NONE] = "Vulnerable", [SRSO_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", @@ -2671,8 +2857,7 @@ static const char * const srso_strings[] = { [SRSO_MITIGATION_BP_SPEC_REDUCE] = "Mitigation: Reduced Speculation" }; -static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE; -static enum srso_mitigation_cmd srso_cmd __ro_after_init = SRSO_CMD_SAFE_RET; +static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_AUTO; static int __init srso_parse_cmdline(char *str) { @@ -2680,15 +2865,15 @@ static int __init srso_parse_cmdline(char *str) return -EINVAL; if (!strcmp(str, "off")) - srso_cmd = SRSO_CMD_OFF; + srso_mitigation = SRSO_MITIGATION_NONE; else if (!strcmp(str, "microcode")) - srso_cmd = SRSO_CMD_MICROCODE; + srso_mitigation = SRSO_MITIGATION_MICROCODE; else if (!strcmp(str, "safe-ret")) - srso_cmd = SRSO_CMD_SAFE_RET; + srso_mitigation = SRSO_MITIGATION_SAFE_RET; else if (!strcmp(str, "ibpb")) - srso_cmd = SRSO_CMD_IBPB; + srso_mitigation = SRSO_MITIGATION_IBPB; else if (!strcmp(str, "ibpb-vmexit")) - srso_cmd = SRSO_CMD_IBPB_ON_VMEXIT; + srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; else pr_err("Ignoring unknown SRSO option (%s).", str); @@ -2700,132 +2885,85 @@ early_param("spec_rstack_overflow", srso_parse_cmdline); static void __init srso_select_mitigation(void) { - bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE); + bool has_microcode; - if (!boot_cpu_has_bug(X86_BUG_SRSO) || - cpu_mitigations_off() || - srso_cmd == SRSO_CMD_OFF) { - if (boot_cpu_has(X86_FEATURE_SBPB)) - x86_pred_cmd = PRED_CMD_SBPB; - goto out; - } + if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off()) + srso_mitigation = SRSO_MITIGATION_NONE; + + if (srso_mitigation == SRSO_MITIGATION_NONE) + return; + + if (srso_mitigation == SRSO_MITIGATION_AUTO) + srso_mitigation = SRSO_MITIGATION_SAFE_RET; + has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE); if (has_microcode) { /* * Zen1/2 
with SMT off aren't vulnerable after the right * IBPB microcode has been applied. - * - * Zen1/2 don't have SBPB, no need to try to enable it here. */ if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) { setup_force_cpu_cap(X86_FEATURE_SRSO_NO); - goto out; - } - - if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { - srso_mitigation = SRSO_MITIGATION_IBPB; - goto out; + srso_mitigation = SRSO_MITIGATION_NONE; + return; } } else { pr_warn("IBPB-extending microcode not applied!\n"); pr_warn(SRSO_NOTICE); - - /* may be overwritten by SRSO_CMD_SAFE_RET below */ - srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED; } - switch (srso_cmd) { - case SRSO_CMD_MICROCODE: - if (has_microcode) { - srso_mitigation = SRSO_MITIGATION_MICROCODE; - pr_warn(SRSO_NOTICE); - } - break; - - case SRSO_CMD_SAFE_RET: - if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO)) + switch (srso_mitigation) { + case SRSO_MITIGATION_SAFE_RET: + if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO)) { + srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; goto ibpb_on_vmexit; + } - if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) { - /* - * Enable the return thunk for generated code - * like ftrace, static_call, etc. - */ - setup_force_cpu_cap(X86_FEATURE_RETHUNK); - setup_force_cpu_cap(X86_FEATURE_UNRET); - - if (boot_cpu_data.x86 == 0x19) { - setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS); - set_return_thunk(srso_alias_return_thunk); - } else { - setup_force_cpu_cap(X86_FEATURE_SRSO); - set_return_thunk(srso_return_thunk); - } - if (has_microcode) - srso_mitigation = SRSO_MITIGATION_SAFE_RET; - else - srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED; - } else { + if (!IS_ENABLED(CONFIG_MITIGATION_SRSO)) { pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n"); + srso_mitigation = SRSO_MITIGATION_NONE; } - break; - case SRSO_CMD_IBPB: - if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { - if (has_microcode) { - setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); - setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); - srso_mitigation = SRSO_MITIGATION_IBPB; - - /* - * IBPB on entry already obviates the need for - * software-based untraining so clear those in case some - * other mitigation like Retbleed has selected them. - */ - setup_clear_cpu_cap(X86_FEATURE_UNRET); - setup_clear_cpu_cap(X86_FEATURE_RETHUNK); - - /* - * There is no need for RSB filling: write_ibpb() ensures - * all predictions, including the RSB, are invalidated, - * regardless of IBPB implementation. - */ - setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); - } - } else { - pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); - } + if (!has_microcode) + srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED; break; - ibpb_on_vmexit: - case SRSO_CMD_IBPB_ON_VMEXIT: + case SRSO_MITIGATION_IBPB_ON_VMEXIT: if (boot_cpu_has(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) { pr_notice("Reducing speculation to address VM/HV SRSO attack vector.\n"); srso_mitigation = SRSO_MITIGATION_BP_SPEC_REDUCE; break; } - - if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { - if (has_microcode) { - setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); - srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; - - /* - * There is no need for RSB filling: write_ibpb() ensures - * all predictions, including the RSB, are invalidated, - * regardless of IBPB implementation. 
- */ - setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); - } - } else { + fallthrough; + case SRSO_MITIGATION_IBPB: + if (!IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); + srso_mitigation = SRSO_MITIGATION_NONE; } + + if (!has_microcode) + srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED; break; default: break; } +} -out: +static void __init srso_update_mitigation(void) +{ + /* If retbleed is using IBPB, that works for SRSO as well */ + if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB && + boot_cpu_has(X86_FEATURE_IBPB_BRTYPE)) + srso_mitigation = SRSO_MITIGATION_IBPB; + + if (boot_cpu_has_bug(X86_BUG_SRSO) && + !cpu_mitigations_off() && + !boot_cpu_has(X86_FEATURE_SRSO_NO)) + pr_info("%s\n", srso_strings[srso_mitigation]); +} + +static void __init srso_apply_mitigation(void) +{ /* * Clear the feature flag if this mitigation is not selected as that * feature flag controls the BpSpecReduce MSR bit toggling in KVM. @@ -2833,8 +2971,52 @@ out: if (srso_mitigation != SRSO_MITIGATION_BP_SPEC_REDUCE) setup_clear_cpu_cap(X86_FEATURE_SRSO_BP_SPEC_REDUCE); - if (srso_mitigation != SRSO_MITIGATION_NONE) - pr_info("%s\n", srso_strings[srso_mitigation]); + if (srso_mitigation == SRSO_MITIGATION_NONE) { + if (boot_cpu_has(X86_FEATURE_SBPB)) + x86_pred_cmd = PRED_CMD_SBPB; + return; + } + + switch (srso_mitigation) { + case SRSO_MITIGATION_SAFE_RET: + case SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED: + /* + * Enable the return thunk for generated code + * like ftrace, static_call, etc. + */ + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + setup_force_cpu_cap(X86_FEATURE_UNRET); + + if (boot_cpu_data.x86 == 0x19) { + setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS); + set_return_thunk(srso_alias_return_thunk); + } else { + setup_force_cpu_cap(X86_FEATURE_SRSO); + set_return_thunk(srso_return_thunk); + } + break; + case SRSO_MITIGATION_IBPB: + setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + /* + * IBPB on entry already obviates the need for + * software-based untraining so clear those in case some + * other mitigation like Retbleed has selected them. + */ + setup_clear_cpu_cap(X86_FEATURE_UNRET); + setup_clear_cpu_cap(X86_FEATURE_RETHUNK); + fallthrough; + case SRSO_MITIGATION_IBPB_ON_VMEXIT: + setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); + /* + * There is no need for RSB filling: entry_ibpb() ensures + * all predictions, including the RSB, are invalidated, + * regardless of IBPB implementation. 
+ */ + setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); + break; + default: + break; + } } #undef pr_fmt @@ -2929,9 +3111,6 @@ static ssize_t tsx_async_abort_show_state(char *buf) static ssize_t mmio_stale_data_show_state(char *buf) { - if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) - return sysfs_emit(buf, "Unknown: No mitigations\n"); - if (mmio_mitigation == MMIO_MITIGATION_OFF) return sysfs_emit(buf, "%s\n", mmio_strings[mmio_mitigation]); @@ -2949,6 +3128,14 @@ static ssize_t rfds_show_state(char *buf) return sysfs_emit(buf, "%s\n", rfds_strings[rfds_mitigation]); } +static ssize_t old_microcode_show_state(char *buf) +{ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return sysfs_emit(buf, "Unknown: running under hypervisor"); + + return sysfs_emit(buf, "Vulnerable\n"); +} + static ssize_t its_show_state(char *buf) { return sysfs_emit(buf, "%s\n", its_strings[its_mitigation]); @@ -3012,7 +3199,7 @@ static const char *spectre_bhi_state(void) !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE) && rrsba_disabled) return "; BHI: Retpoline"; - else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT)) + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_VMEXIT)) return "; BHI: Vulnerable, KVM: SW loop"; return "; BHI: Vulnerable"; @@ -3121,7 +3308,6 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr return srbds_show_state(buf); case X86_BUG_MMIO_STALE_DATA: - case X86_BUG_MMIO_UNKNOWN: return mmio_stale_data_show_state(buf); case X86_BUG_RETBLEED: @@ -3136,6 +3322,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_RFDS: return rfds_show_state(buf); + case X86_BUG_OLD_MICROCODE: + return old_microcode_show_state(buf); + case X86_BUG_ITS: return its_show_state(buf); @@ -3193,10 +3382,7 @@ ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char * ssize_t cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *attr, char *buf) { - if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) - return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_UNKNOWN); - else - return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); + return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); } ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf) @@ -3219,6 +3405,11 @@ ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attrib return cpu_show_common(dev, attr, buf, X86_BUG_RFDS); } +ssize_t cpu_show_old_microcode(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_OLD_MICROCODE); +} + ssize_t cpu_show_indirect_target_selection(struct device *dev, struct device_attribute *attr, char *buf) { return cpu_show_common(dev, attr, buf, X86_BUG_ITS); diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c index 237faf7e700c..981f8b1f0792 100644 --- a/arch/x86/kernel/cpu/bus_lock.c +++ b/arch/x86/kernel/cpu/bus_lock.c @@ -10,6 +10,7 @@ #include <asm/cmdline.h> #include <asm/traps.h> #include <asm/cpu.h> +#include <asm/msr.h> enum split_lock_detect_state { sld_off = 0, @@ -95,15 +96,15 @@ static bool split_lock_verify_msr(bool on) { u64 ctrl, tmp; - if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl)) + if (rdmsrq_safe(MSR_TEST_CTRL, &ctrl)) return false; if (on) ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; else ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT; - if (wrmsrl_safe(MSR_TEST_CTRL, ctrl)) + if (wrmsrq_safe(MSR_TEST_CTRL, ctrl)) return false; - rdmsrl(MSR_TEST_CTRL, tmp); + rdmsrq(MSR_TEST_CTRL, tmp); 
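/*
 * Editor's note -- illustrative sketch only, not part of the patch above.
 * The bus_lock.c hunks here show the series' mechanical MSR-accessor rename:
 * rdmsrl()/wrmsrl() become rdmsrq()/wrmsrq(), and two-argument
 * wrmsr(msr, lo, 0) calls become single-argument wrmsrq(msr, val).
 * A minimal usage sketch of the renamed helpers, reusing only the MSR
 * constants already present in this file (the helper name is made up):
 */
static bool __maybe_unused split_lock_toggle_example(bool on)
{
	u64 ctrl;

	/* The _safe variants return non-zero instead of faulting on #GP */
	if (rdmsrq_safe(MSR_TEST_CTRL, &ctrl))
		return false;

	if (on)
		ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT;
	else
		ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT;

	/* Plain variants assume the MSR exists and may fault otherwise */
	wrmsrq(MSR_TEST_CTRL, ctrl);
	rdmsrq(MSR_TEST_CTRL, ctrl);

	/* Read back to confirm the write took effect, as the code above does */
	return !!(ctrl & MSR_TEST_CTRL_SPLIT_LOCK_DETECT) == on;
}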
return ctrl == tmp; } @@ -137,7 +138,7 @@ static void __init __split_lock_setup(void) return; } - rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + rdmsrq(MSR_TEST_CTRL, msr_test_ctrl_cache); if (!split_lock_verify_msr(true)) { pr_info("MSR access failed: Disabled\n"); @@ -145,7 +146,7 @@ static void __init __split_lock_setup(void) } /* Restore the MSR to its cached value. */ - wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + wrmsrq(MSR_TEST_CTRL, msr_test_ctrl_cache); setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT); } @@ -162,7 +163,7 @@ static void sld_update_msr(bool on) if (on) test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; - wrmsrl(MSR_TEST_CTRL, test_ctrl_val); + wrmsrq(MSR_TEST_CTRL, test_ctrl_val); } void split_lock_init(void) @@ -297,7 +298,7 @@ void bus_lock_init(void) if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) return; - rdmsrl(MSR_IA32_DEBUGCTLMSR, val); + rdmsrq(MSR_IA32_DEBUGCTLMSR, val); if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && (sld_state == sld_warn || sld_state == sld_fatal)) || @@ -311,7 +312,7 @@ void bus_lock_init(void) val |= DEBUGCTLMSR_BUS_LOCK_DETECT; } - wrmsrl(MSR_IA32_DEBUGCTLMSR, val); + wrmsrq(MSR_IA32_DEBUGCTLMSR, val); } bool handle_user_split_lock(struct pt_regs *regs, long error_code) @@ -375,7 +376,7 @@ static void __init split_lock_setup(struct cpuinfo_x86 *c) * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set * it have split lock detection. */ - rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps); + rdmsrq(MSR_IA32_CORE_CAPS, ia32_core_caps); if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT) goto supported; diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index b3a520959b51..adfa7e8bb865 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -1,35 +1,28 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Routines to identify caches on Intel CPU. + * x86 CPU caches detection and configuration * - * Changes: - * Venkatesh Pallipadi : Adding cache identification through cpuid(4) - * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. - * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. + * Previous changes + * - Venkatesh Pallipadi: Cache identification through CPUID(0x4) + * - Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure + * - Andi Kleen / Andreas Herrmann: CPUID(0x4) emulation on AMD */ #include <linux/cacheinfo.h> -#include <linux/capability.h> #include <linux/cpu.h> #include <linux/cpuhotplug.h> -#include <linux/pci.h> #include <linux/stop_machine.h> -#include <linux/sysfs.h> -#include <asm/amd_nb.h> +#include <asm/amd/nb.h> #include <asm/cacheinfo.h> #include <asm/cpufeature.h> +#include <asm/cpuid/api.h> #include <asm/mtrr.h> #include <asm/smp.h> #include <asm/tlbflush.h> #include "cpu.h" -#define LVL_1_INST 1 -#define LVL_1_DATA 2 -#define LVL_2 3 -#define LVL_3 4 - /* Shared last level cache maps */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); @@ -41,208 +34,127 @@ static cpumask_var_t cpu_cacheinfo_mask; /* Kernel controls MTRR and/or PAT MSRs. 
*/ unsigned int memory_caching_control __ro_after_init; -struct _cache_table { - unsigned char descriptor; - char cache_type; - short size; -}; - -#define MB(x) ((x) * 1024) - -/* All the cache descriptor types we care about (no TLB or - trace cache entries) */ - -static const struct _cache_table cache_table[] = -{ - { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ - { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */ - { 0x09, LVL_1_INST, 32 }, /* 4-way set assoc, 64 byte line size */ - { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */ - { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */ - { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ - { 0x0e, LVL_1_DATA, 24 }, /* 6-way set assoc, 64 byte line size */ - { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ - { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x25, LVL_3, MB(2) }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x29, LVL_3, MB(4) }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */ - { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */ - { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x3a, LVL_2, 192 }, /* 6-way set assoc, sectored cache, 64 byte line size */ - { 0x3b, LVL_2, 128 }, /* 2-way set assoc, sectored cache, 64 byte line size */ - { 0x3c, LVL_2, 256 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x3d, LVL_2, 384 }, /* 6-way set assoc, sectored cache, 64 byte line size */ - { 0x3e, LVL_2, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x3f, LVL_2, 256 }, /* 2-way set assoc, 64 byte line size */ - { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */ - { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */ - { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */ - { 0x44, LVL_2, MB(1) }, /* 4-way set assoc, 32 byte line size */ - { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */ - { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */ - { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */ - { 0x48, LVL_2, MB(3) }, /* 12-way set assoc, 64 byte line size */ - { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ - { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */ - { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ - { 0x4c, LVL_3, MB(12) }, /* 12-way set assoc, 64 byte line size */ - { 0x4d, LVL_3, MB(16) }, /* 16-way set assoc, 64 byte line size */ - { 0x4e, LVL_2, MB(6) }, /* 24-way set assoc, 64 byte line size */ - { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */ - { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ 
- { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */ - { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ - { 0x80, LVL_2, 512 }, /* 8-way set assoc, 64 byte line size */ - { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ - { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ - { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */ - { 0x85, LVL_2, MB(2) }, /* 8-way set assoc, 32 byte line size */ - { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */ - { 0x87, LVL_2, MB(1) }, /* 8-way set assoc, 64 byte line size */ - { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */ - { 0xd1, LVL_3, MB(1) }, /* 4-way set assoc, 64 byte line size */ - { 0xd2, LVL_3, MB(2) }, /* 4-way set assoc, 64 byte line size */ - { 0xd6, LVL_3, MB(1) }, /* 8-way set assoc, 64 byte line size */ - { 0xd7, LVL_3, MB(2) }, /* 8-way set assoc, 64 byte line size */ - { 0xd8, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */ - { 0xdc, LVL_3, MB(2) }, /* 12-way set assoc, 64 byte line size */ - { 0xdd, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */ - { 0xde, LVL_3, MB(8) }, /* 12-way set assoc, 64 byte line size */ - { 0xe2, LVL_3, MB(2) }, /* 16-way set assoc, 64 byte line size */ - { 0xe3, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ - { 0xe4, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ - { 0xea, LVL_3, MB(12) }, /* 24-way set assoc, 64 byte line size */ - { 0xeb, LVL_3, MB(18) }, /* 24-way set assoc, 64 byte line size */ - { 0xec, LVL_3, MB(24) }, /* 24-way set assoc, 64 byte line size */ - { 0x00, 0, 0} -}; - - enum _cache_type { - CTYPE_NULL = 0, - CTYPE_DATA = 1, - CTYPE_INST = 2, - CTYPE_UNIFIED = 3 + CTYPE_NULL = 0, + CTYPE_DATA = 1, + CTYPE_INST = 2, + CTYPE_UNIFIED = 3 }; union _cpuid4_leaf_eax { struct { - enum _cache_type type:5; - unsigned int level:3; - unsigned int is_self_initializing:1; - unsigned int is_fully_associative:1; - unsigned int reserved:4; - unsigned int num_threads_sharing:12; - unsigned int num_cores_on_die:6; + enum _cache_type type :5; + unsigned int level :3; + unsigned int is_self_initializing :1; + unsigned int is_fully_associative :1; + unsigned int reserved :4; + unsigned int num_threads_sharing :12; + unsigned int num_cores_on_die :6; } split; u32 full; }; union _cpuid4_leaf_ebx { struct { - unsigned int coherency_line_size:12; - unsigned int physical_line_partition:10; - unsigned int ways_of_associativity:10; + unsigned int coherency_line_size :12; + unsigned int physical_line_partition :10; + unsigned int ways_of_associativity :10; } split; u32 full; }; union _cpuid4_leaf_ecx { struct { - unsigned int number_of_sets:32; + unsigned int number_of_sets :32; } split; u32 full; }; -struct _cpuid4_info_regs { +struct _cpuid4_info { union _cpuid4_leaf_eax eax; union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned int id; unsigned long size; - struct amd_northbridge *nb; }; -/* AMD doesn't have CPUID4. Emulate it here to report the same - information to the user. This makes some assumptions about the machine: - L2 not shared, no SMT etc. that is currently true on AMD CPUs. 
+/* Map CPUID(0x4) EAX.cache_type to <linux/cacheinfo.h> types */ +static const enum cache_type cache_type_map[] = { + [CTYPE_NULL] = CACHE_TYPE_NOCACHE, + [CTYPE_DATA] = CACHE_TYPE_DATA, + [CTYPE_INST] = CACHE_TYPE_INST, + [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED, +}; + +/* + * Fallback AMD CPUID(0x4) emulation + * AMD CPUs with TOPOEXT can just use CPUID(0x8000001d) + * + * @AMD_L2_L3_INVALID_ASSOC: cache info for the respective L2/L3 cache should + * be determined from CPUID(0x8000001d) instead of CPUID(0x80000006). + */ + +#define AMD_CPUID4_FULLY_ASSOCIATIVE 0xffff +#define AMD_L2_L3_INVALID_ASSOC 0x9 - In theory the TLBs could be reported as fake type (they are in "dummy"). - Maybe later */ union l1_cache { struct { - unsigned line_size:8; - unsigned lines_per_tag:8; - unsigned assoc:8; - unsigned size_in_kb:8; + unsigned line_size :8; + unsigned lines_per_tag :8; + unsigned assoc :8; + unsigned size_in_kb :8; }; - unsigned val; + unsigned int val; }; union l2_cache { struct { - unsigned line_size:8; - unsigned lines_per_tag:4; - unsigned assoc:4; - unsigned size_in_kb:16; + unsigned line_size :8; + unsigned lines_per_tag :4; + unsigned assoc :4; + unsigned size_in_kb :16; }; - unsigned val; + unsigned int val; }; union l3_cache { struct { - unsigned line_size:8; - unsigned lines_per_tag:4; - unsigned assoc:4; - unsigned res:2; - unsigned size_encoded:14; + unsigned line_size :8; + unsigned lines_per_tag :4; + unsigned assoc :4; + unsigned res :2; + unsigned size_encoded :14; }; - unsigned val; + unsigned int val; }; +/* L2/L3 associativity mapping */ static const unsigned short assocs[] = { - [1] = 1, - [2] = 2, - [4] = 4, - [6] = 8, - [8] = 16, - [0xa] = 32, - [0xb] = 48, - [0xc] = 64, - [0xd] = 96, - [0xe] = 128, - [0xf] = 0xffff /* fully associative - no way to show this currently */ + [1] = 1, + [2] = 2, + [3] = 3, + [4] = 4, + [5] = 6, + [6] = 8, + [8] = 16, + [0xa] = 32, + [0xb] = 48, + [0xc] = 64, + [0xd] = 96, + [0xe] = 128, + [0xf] = AMD_CPUID4_FULLY_ASSOCIATIVE }; static const unsigned char levels[] = { 1, 1, 2, 3 }; -static const unsigned char types[] = { 1, 2, 3, 3 }; +static const unsigned char types[] = { 1, 2, 3, 3 }; -static const enum cache_type cache_type_map[] = { - [CTYPE_NULL] = CACHE_TYPE_NOCACHE, - [CTYPE_DATA] = CACHE_TYPE_DATA, - [CTYPE_INST] = CACHE_TYPE_INST, - [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED, -}; - -static void -amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, - union _cpuid4_leaf_ebx *ebx, - union _cpuid4_leaf_ecx *ecx) +static void legacy_amd_cpuid4(int index, union _cpuid4_leaf_eax *eax, + union _cpuid4_leaf_ebx *ebx, union _cpuid4_leaf_ecx *ecx) { - unsigned dummy; - unsigned line_size, lines_per_tag, assoc, size_in_kb; - union l1_cache l1i, l1d; + unsigned int dummy, line_size, lines_per_tag, assoc, size_in_kb; + union l1_cache l1i, l1d, *l1; union l2_cache l2; union l3_cache l3; - union l1_cache *l1 = &l1d; eax->full = 0; ebx->full = 0; @@ -251,430 +163,155 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val); cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val); - switch (leaf) { + l1 = &l1d; + switch (index) { case 1: l1 = &l1i; fallthrough; case 0: if (!l1->val) return; - assoc = assocs[l1->assoc]; - line_size = l1->line_size; - lines_per_tag = l1->lines_per_tag; - size_in_kb = l1->size_in_kb; + + assoc = (l1->assoc == 0xff) ? 
AMD_CPUID4_FULLY_ASSOCIATIVE : l1->assoc; + line_size = l1->line_size; + lines_per_tag = l1->lines_per_tag; + size_in_kb = l1->size_in_kb; break; case 2: - if (!l2.val) + if (!l2.assoc || l2.assoc == AMD_L2_L3_INVALID_ASSOC) return; - assoc = assocs[l2.assoc]; - line_size = l2.line_size; - lines_per_tag = l2.lines_per_tag; - /* cpu_data has errata corrections for K7 applied */ - size_in_kb = __this_cpu_read(cpu_info.x86_cache_size); + + /* Use x86_cache_size as it might have K7 errata fixes */ + assoc = assocs[l2.assoc]; + line_size = l2.line_size; + lines_per_tag = l2.lines_per_tag; + size_in_kb = __this_cpu_read(cpu_info.x86_cache_size); break; case 3: - if (!l3.val) + if (!l3.assoc || l3.assoc == AMD_L2_L3_INVALID_ASSOC) return; - assoc = assocs[l3.assoc]; - line_size = l3.line_size; - lines_per_tag = l3.lines_per_tag; - size_in_kb = l3.size_encoded * 512; + + assoc = assocs[l3.assoc]; + line_size = l3.line_size; + lines_per_tag = l3.lines_per_tag; + size_in_kb = l3.size_encoded * 512; if (boot_cpu_has(X86_FEATURE_AMD_DCM)) { - size_in_kb = size_in_kb >> 1; - assoc = assoc >> 1; + size_in_kb = size_in_kb >> 1; + assoc = assoc >> 1; } break; default: return; } - eax->split.is_self_initializing = 1; - eax->split.type = types[leaf]; - eax->split.level = levels[leaf]; - eax->split.num_threads_sharing = 0; - eax->split.num_cores_on_die = topology_num_cores_per_package(); + eax->split.is_self_initializing = 1; + eax->split.type = types[index]; + eax->split.level = levels[index]; + eax->split.num_threads_sharing = 0; + eax->split.num_cores_on_die = topology_num_cores_per_package(); - - if (assoc == 0xffff) + if (assoc == AMD_CPUID4_FULLY_ASSOCIATIVE) eax->split.is_fully_associative = 1; - ebx->split.coherency_line_size = line_size - 1; - ebx->split.ways_of_associativity = assoc - 1; - ebx->split.physical_line_partition = lines_per_tag - 1; - ecx->split.number_of_sets = (size_in_kb * 1024) / line_size / - (ebx->split.ways_of_associativity + 1) - 1; -} - -#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS) - -/* - * L3 cache descriptors - */ -static void amd_calc_l3_indices(struct amd_northbridge *nb) -{ - struct amd_l3_cache *l3 = &nb->l3_cache; - unsigned int sc0, sc1, sc2, sc3; - u32 val = 0; - - pci_read_config_dword(nb->misc, 0x1C4, &val); - - /* calculate subcache sizes */ - l3->subcaches[0] = sc0 = !(val & BIT(0)); - l3->subcaches[1] = sc1 = !(val & BIT(4)); - - if (boot_cpu_data.x86 == 0x15) { - l3->subcaches[0] = sc0 += !(val & BIT(1)); - l3->subcaches[1] = sc1 += !(val & BIT(5)); - } - - l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); - l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); - - l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; -} - -/* - * check whether a slot used for disabling an L3 index is occupied. - * @l3: L3 cache descriptor - * @slot: slot number (0..1) - * - * @returns: the disabled index if used or negative value if slot free. 
- */ -static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot) -{ - unsigned int reg = 0; - - pci_read_config_dword(nb->misc, 0x1BC + slot * 4, ®); - - /* check whether this slot is activated already */ - if (reg & (3UL << 30)) - return reg & 0xfff; - - return -1; -} - -static ssize_t show_cache_disable(struct cacheinfo *this_leaf, char *buf, - unsigned int slot) -{ - int index; - struct amd_northbridge *nb = this_leaf->priv; - - index = amd_get_l3_disable_slot(nb, slot); - if (index >= 0) - return sprintf(buf, "%d\n", index); - - return sprintf(buf, "FREE\n"); -} - -#define SHOW_CACHE_DISABLE(slot) \ -static ssize_t \ -cache_disable_##slot##_show(struct device *dev, \ - struct device_attribute *attr, char *buf) \ -{ \ - struct cacheinfo *this_leaf = dev_get_drvdata(dev); \ - return show_cache_disable(this_leaf, buf, slot); \ -} -SHOW_CACHE_DISABLE(0) -SHOW_CACHE_DISABLE(1) - -static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu, - unsigned slot, unsigned long idx) -{ - int i; - idx |= BIT(30); - - /* - * disable index in all 4 subcaches - */ - for (i = 0; i < 4; i++) { - u32 reg = idx | (i << 20); - - if (!nb->l3_cache.subcaches[i]) - continue; - - pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); - - /* - * We need to WBINVD on a core on the node containing the L3 - * cache which indices we disable therefore a simple wbinvd() - * is not sufficient. - */ - wbinvd_on_cpu(cpu); - - reg |= BIT(31); - pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); - } -} - -/* - * disable a L3 cache index by using a disable-slot - * - * @l3: L3 cache descriptor - * @cpu: A CPU on the node containing the L3 cache - * @slot: slot number (0..1) - * @index: index to disable - * - * @return: 0 on success, error status on failure - */ -static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, - unsigned slot, unsigned long index) -{ - int ret = 0; - - /* check if @slot is already used or the index is already disabled */ - ret = amd_get_l3_disable_slot(nb, slot); - if (ret >= 0) - return -EEXIST; - - if (index > nb->l3_cache.indices) - return -EINVAL; - - /* check whether the other slot has disabled the same index already */ - if (index == amd_get_l3_disable_slot(nb, !slot)) - return -EEXIST; - - amd_l3_disable_index(nb, cpu, slot, index); - - return 0; -} - -static ssize_t store_cache_disable(struct cacheinfo *this_leaf, - const char *buf, size_t count, - unsigned int slot) -{ - unsigned long val = 0; - int cpu, err = 0; - struct amd_northbridge *nb = this_leaf->priv; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - cpu = cpumask_first(&this_leaf->shared_cpu_map); - - if (kstrtoul(buf, 10, &val) < 0) - return -EINVAL; - - err = amd_set_l3_disable_slot(nb, cpu, slot, val); - if (err) { - if (err == -EEXIST) - pr_warn("L3 slot %d in use/index already disabled!\n", - slot); - return err; - } - return count; -} - -#define STORE_CACHE_DISABLE(slot) \ -static ssize_t \ -cache_disable_##slot##_store(struct device *dev, \ - struct device_attribute *attr, \ - const char *buf, size_t count) \ -{ \ - struct cacheinfo *this_leaf = dev_get_drvdata(dev); \ - return store_cache_disable(this_leaf, buf, count, slot); \ -} -STORE_CACHE_DISABLE(0) -STORE_CACHE_DISABLE(1) - -static ssize_t subcaches_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct cacheinfo *this_leaf = dev_get_drvdata(dev); - int cpu = cpumask_first(&this_leaf->shared_cpu_map); - - return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); -} - -static ssize_t 
subcaches_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct cacheinfo *this_leaf = dev_get_drvdata(dev); - int cpu = cpumask_first(&this_leaf->shared_cpu_map); - unsigned long val; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (kstrtoul(buf, 16, &val) < 0) - return -EINVAL; - - if (amd_set_subcaches(cpu, val)) - return -EINVAL; - - return count; + ebx->split.coherency_line_size = line_size - 1; + ebx->split.ways_of_associativity = assoc - 1; + ebx->split.physical_line_partition = lines_per_tag - 1; + ecx->split.number_of_sets = (size_in_kb * 1024) / line_size / + (ebx->split.ways_of_associativity + 1) - 1; } -static DEVICE_ATTR_RW(cache_disable_0); -static DEVICE_ATTR_RW(cache_disable_1); -static DEVICE_ATTR_RW(subcaches); - -static umode_t -cache_private_attrs_is_visible(struct kobject *kobj, - struct attribute *attr, int unused) +static int cpuid4_info_fill_done(struct _cpuid4_info *id4, union _cpuid4_leaf_eax eax, + union _cpuid4_leaf_ebx ebx, union _cpuid4_leaf_ecx ecx) { - struct device *dev = kobj_to_dev(kobj); - struct cacheinfo *this_leaf = dev_get_drvdata(dev); - umode_t mode = attr->mode; - - if (!this_leaf->priv) - return 0; - - if ((attr == &dev_attr_subcaches.attr) && - amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) - return mode; + if (eax.split.type == CTYPE_NULL) + return -EIO; - if ((attr == &dev_attr_cache_disable_0.attr || - attr == &dev_attr_cache_disable_1.attr) && - amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) - return mode; + id4->eax = eax; + id4->ebx = ebx; + id4->ecx = ecx; + id4->size = (ecx.split.number_of_sets + 1) * + (ebx.split.coherency_line_size + 1) * + (ebx.split.physical_line_partition + 1) * + (ebx.split.ways_of_associativity + 1); return 0; } -static struct attribute_group cache_private_group = { - .is_visible = cache_private_attrs_is_visible, -}; - -static void init_amd_l3_attrs(void) -{ - int n = 1; - static struct attribute **amd_l3_attrs; - - if (amd_l3_attrs) /* already initialized */ - return; - - if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) - n += 2; - if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) - n += 1; - - amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL); - if (!amd_l3_attrs) - return; - - n = 0; - if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { - amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr; - amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr; - } - if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) - amd_l3_attrs[n++] = &dev_attr_subcaches.attr; - - cache_private_group.attrs = amd_l3_attrs; -} - -const struct attribute_group * -cache_get_priv_group(struct cacheinfo *this_leaf) +static int amd_fill_cpuid4_info(int index, struct _cpuid4_info *id4) { - struct amd_northbridge *nb = this_leaf->priv; - - if (this_leaf->level < 3 || !nb) - return NULL; + union _cpuid4_leaf_eax eax; + union _cpuid4_leaf_ebx ebx; + union _cpuid4_leaf_ecx ecx; + u32 ignored; - if (nb && nb->l3_cache.indices) - init_amd_l3_attrs(); + if (boot_cpu_has(X86_FEATURE_TOPOEXT) || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) + cpuid_count(0x8000001d, index, &eax.full, &ebx.full, &ecx.full, &ignored); + else + legacy_amd_cpuid4(index, &eax, &ebx, &ecx); - return &cache_private_group; + return cpuid4_info_fill_done(id4, eax, ebx, ecx); } -static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) +static int intel_fill_cpuid4_info(int index, struct _cpuid4_info *id4) { - int node; + union _cpuid4_leaf_eax eax; + union _cpuid4_leaf_ebx ebx; + union _cpuid4_leaf_ecx ecx; 
+ u32 ignored; - /* only for L3, and not in virtualized environments */ - if (index < 3) - return; + cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &ignored); - node = topology_amd_node_id(smp_processor_id()); - this_leaf->nb = node_to_amd_nb(node); - if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) - amd_calc_l3_indices(this_leaf->nb); + return cpuid4_info_fill_done(id4, eax, ebx, ecx); } -#else -#define amd_init_l3_cache(x, y) -#endif /* CONFIG_AMD_NB && CONFIG_SYSFS */ -static int -cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) +static int fill_cpuid4_info(int index, struct _cpuid4_info *id4) { - union _cpuid4_leaf_eax eax; - union _cpuid4_leaf_ebx ebx; - union _cpuid4_leaf_ecx ecx; - unsigned edx; - - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) - cpuid_count(0x8000001d, index, &eax.full, - &ebx.full, &ecx.full, &edx); - else - amd_cpuid4(index, &eax, &ebx, &ecx); - amd_init_l3_cache(this_leaf, index); - } else if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - cpuid_count(0x8000001d, index, &eax.full, - &ebx.full, &ecx.full, &edx); - amd_init_l3_cache(this_leaf, index); - } else { - cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); - } + u8 cpu_vendor = boot_cpu_data.x86_vendor; - if (eax.split.type == CTYPE_NULL) - return -EIO; /* better error ? */ - - this_leaf->eax = eax; - this_leaf->ebx = ebx; - this_leaf->ecx = ecx; - this_leaf->size = (ecx.split.number_of_sets + 1) * - (ebx.split.coherency_line_size + 1) * - (ebx.split.physical_line_partition + 1) * - (ebx.split.ways_of_associativity + 1); - return 0; + return (cpu_vendor == X86_VENDOR_AMD || cpu_vendor == X86_VENDOR_HYGON) ? + amd_fill_cpuid4_info(index, id4) : + intel_fill_cpuid4_info(index, id4); } static int find_num_cache_leaves(struct cpuinfo_x86 *c) { - unsigned int eax, ebx, ecx, edx, op; - union _cpuid4_leaf_eax cache_eax; - int i = -1; - - if (c->x86_vendor == X86_VENDOR_AMD || - c->x86_vendor == X86_VENDOR_HYGON) - op = 0x8000001d; - else - op = 4; + unsigned int eax, ebx, ecx, edx, op; + union _cpuid4_leaf_eax cache_eax; + int i = -1; + /* Do a CPUID(op) loop to calculate num_cache_leaves */ + op = (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) ? 0x8000001d : 4; do { ++i; - /* Do cpuid(op) loop to find out num_cache_leaves */ cpuid_count(op, i, &eax, &ebx, &ecx, &edx); cache_eax.full = eax; } while (cache_eax.split.type != CTYPE_NULL); return i; } +/* + * AMD/Hygon CPUs may have multiple LLCs if L3 caches exist. + */ + void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id) { - /* - * We may have multiple LLCs if L3 caches exist, so check if we - * have an L3 cache by looking at the L3 cache CPUID leaf. - */ - if (!cpuid_edx(0x80000006)) + if (!cpuid_amd_hygon_has_l3_cache()) return; if (c->x86 < 0x17) { - /* LLC is at the node level. */ + /* Pre-Zen: LLC is at the node level */ c->topo.llc_id = die_id; } else if (c->x86 == 0x17 && c->x86_model <= 0x1F) { /* - * LLC is at the core complex level. - * Core complex ID is ApicId[3] for these processors. + * Family 17h up to 1F models: LLC is at the core + * complex level. Core complex ID is ApicId[3]. */ c->topo.llc_id = c->topo.apicid >> 3; } else { /* - * LLC ID is calculated from the number of threads sharing the - * cache. - * */ + * Newer families: LLC ID is calculated from the number + * of threads sharing the L3 cache. 
+ */ u32 eax, ebx, ecx, edx, num_sharing_cache = 0; u32 llc_index = find_num_cache_leaves(c) - 1; @@ -683,25 +320,21 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id) num_sharing_cache = ((eax >> 14) & 0xfff) + 1; if (num_sharing_cache) { - int bits = get_count_order(num_sharing_cache); + int index_msb = get_count_order(num_sharing_cache); - c->topo.llc_id = c->topo.apicid >> bits; + c->topo.llc_id = c->topo.apicid >> index_msb; } } } void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c) { - /* - * We may have multiple LLCs if L3 caches exist, so check if we - * have an L3 cache by looking at the L3 cache CPUID leaf. - */ - if (!cpuid_edx(0x80000006)) + if (!cpuid_amd_hygon_has_l3_cache()) return; /* - * LLC is at the core complex level. - * Core complex ID is ApicId[3] for these processors. + * Hygons are similar to AMD Family 17h up to 1F models: LLC is + * at the core complex level. Core complex ID is ApicId[3]. */ c->topo.llc_id = c->topo.apicid >> 3; } @@ -710,14 +343,10 @@ void init_amd_cacheinfo(struct cpuinfo_x86 *c) { struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index); - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) ci->num_leaves = find_num_cache_leaves(c); - } else if (c->extended_cpuid_level >= 0x80000006) { - if (cpuid_edx(0x80000006) & 0xf000) - ci->num_leaves = 4; - else - ci->num_leaves = 3; - } + else if (c->extended_cpuid_level >= 0x80000006) + ci->num_leaves = (cpuid_edx(0x80000006) & 0xf000) ? 4 : 3; } void init_hygon_cacheinfo(struct cpuinfo_x86 *c) @@ -727,148 +356,131 @@ void init_hygon_cacheinfo(struct cpuinfo_x86 *c) ci->num_leaves = find_num_cache_leaves(c); } -void init_intel_cacheinfo(struct cpuinfo_x86 *c) +static void intel_cacheinfo_done(struct cpuinfo_x86 *c, unsigned int l3, + unsigned int l2, unsigned int l1i, unsigned int l1d) { - /* Cache sizes */ - unsigned int l1i = 0, l1d = 0, l2 = 0, l3 = 0; - unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ - unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ - unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; - struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index); + /* + * If llc_id is still unset, then cpuid_level < 4, which implies + * that the only possibility left is SMT. Since CPUID(0x2) doesn't + * specify any shared caches and SMT shares all caches, we can + * unconditionally set LLC ID to the package ID so that all + * threads share it. + */ + if (c->topo.llc_id == BAD_APICID) + c->topo.llc_id = c->topo.pkg_id; - if (c->cpuid_level > 3) { - /* - * There should be at least one leaf. A non-zero value means - * that the number of leaves has been initialized. - */ - if (!ci->num_leaves) - ci->num_leaves = find_num_cache_leaves(c); + c->x86_cache_size = l3 ? l3 : (l2 ? l2 : l1i + l1d); - /* - * Whenever possible use cpuid(4), deterministic cache - * parameters cpuid leaf to find the cache details - */ - for (i = 0; i < ci->num_leaves; i++) { - struct _cpuid4_info_regs this_leaf = {}; - int retval; + if (!l2) + cpu_detect_cache_sizes(c); +} - retval = cpuid4_cache_lookup_regs(i, &this_leaf); - if (retval < 0) - continue; +/* + * Legacy Intel CPUID(0x2) path if CPUID(0x4) is not available. 
+ */ +static void intel_cacheinfo_0x2(struct cpuinfo_x86 *c) +{ + unsigned int l1i = 0, l1d = 0, l2 = 0, l3 = 0; + const struct leaf_0x2_table *desc; + union leaf_0x2_regs regs; + u8 *ptr; - switch (this_leaf.eax.split.level) { - case 1: - if (this_leaf.eax.split.type == CTYPE_DATA) - new_l1d = this_leaf.size/1024; - else if (this_leaf.eax.split.type == CTYPE_INST) - new_l1i = this_leaf.size/1024; - break; - case 2: - new_l2 = this_leaf.size/1024; - num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; - index_msb = get_count_order(num_threads_sharing); - l2_id = c->topo.apicid & ~((1 << index_msb) - 1); - break; - case 3: - new_l3 = this_leaf.size/1024; - num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; - index_msb = get_count_order(num_threads_sharing); - l3_id = c->topo.apicid & ~((1 << index_msb) - 1); - break; - default: - break; - } - } - } + if (c->cpuid_level < 2) + return; - /* Don't use CPUID(2) if CPUID(4) is supported. */ - if (!ci->num_leaves && c->cpuid_level > 1) { - /* supports eax=2 call */ - int j, n; - unsigned int regs[4]; - unsigned char *dp = (unsigned char *)regs; - - /* Number of times to iterate */ - n = cpuid_eax(2) & 0xFF; - - for (i = 0 ; i < n ; i++) { - cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); - - /* If bit 31 is set, this is an unknown format */ - for (j = 0 ; j < 4 ; j++) - if (regs[j] & (1 << 31)) - regs[j] = 0; - - /* Byte 0 is level count, not a descriptor */ - for (j = 1 ; j < 16 ; j++) { - unsigned char des = dp[j]; - unsigned char k = 0; - - /* look up this descriptor in the table */ - while (cache_table[k].descriptor != 0) { - if (cache_table[k].descriptor == des) { - switch (cache_table[k].cache_type) { - case LVL_1_INST: - l1i += cache_table[k].size; - break; - case LVL_1_DATA: - l1d += cache_table[k].size; - break; - case LVL_2: - l2 += cache_table[k].size; - break; - case LVL_3: - l3 += cache_table[k].size; - break; - } - - break; - } - - k++; - } - } + cpuid_leaf_0x2(®s); + for_each_cpuid_0x2_desc(regs, ptr, desc) { + switch (desc->c_type) { + case CACHE_L1_INST: l1i += desc->c_size; break; + case CACHE_L1_DATA: l1d += desc->c_size; break; + case CACHE_L2: l2 += desc->c_size; break; + case CACHE_L3: l3 += desc->c_size; break; } } - if (new_l1d) - l1d = new_l1d; + intel_cacheinfo_done(c, l3, l2, l1i, l1d); +} - if (new_l1i) - l1i = new_l1i; +static unsigned int calc_cache_topo_id(struct cpuinfo_x86 *c, const struct _cpuid4_info *id4) +{ + unsigned int num_threads_sharing; + int index_msb; - if (new_l2) { - l2 = new_l2; - c->topo.llc_id = l2_id; - c->topo.l2c_id = l2_id; - } + num_threads_sharing = 1 + id4->eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + return c->topo.apicid & ~((1 << index_msb) - 1); +} - if (new_l3) { - l3 = new_l3; - c->topo.llc_id = l3_id; - } +static bool intel_cacheinfo_0x4(struct cpuinfo_x86 *c) +{ + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index); + unsigned int l2_id = BAD_APICID, l3_id = BAD_APICID; + unsigned int l1d = 0, l1i = 0, l2 = 0, l3 = 0; + + if (c->cpuid_level < 4) + return false; /* - * If llc_id is not yet set, this means cpuid_level < 4 which in - * turns means that the only possibility is SMT (as indicated in - * cpuid1). Since cpuid2 doesn't specify shared caches, and we know - * that SMT shares all caches, we can unconditionally set cpu_llc_id to - * c->topo.pkg_id. + * There should be at least one leaf. A non-zero value means + * that the number of leaves has been previously initialized. 
*/ - if (c->topo.llc_id == BAD_APICID) - c->topo.llc_id = c->topo.pkg_id; + if (!ci->num_leaves) + ci->num_leaves = find_num_cache_leaves(c); + + if (!ci->num_leaves) + return false; - c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); + for (int i = 0; i < ci->num_leaves; i++) { + struct _cpuid4_info id4 = {}; + int ret; - if (!l2) - cpu_detect_cache_sizes(c); + ret = intel_fill_cpuid4_info(i, &id4); + if (ret < 0) + continue; + + switch (id4.eax.split.level) { + case 1: + if (id4.eax.split.type == CTYPE_DATA) + l1d = id4.size / 1024; + else if (id4.eax.split.type == CTYPE_INST) + l1i = id4.size / 1024; + break; + case 2: + l2 = id4.size / 1024; + l2_id = calc_cache_topo_id(c, &id4); + break; + case 3: + l3 = id4.size / 1024; + l3_id = calc_cache_topo_id(c, &id4); + break; + default: + break; + } + } + + c->topo.l2c_id = l2_id; + c->topo.llc_id = (l3_id == BAD_APICID) ? l2_id : l3_id; + intel_cacheinfo_done(c, l3, l2, l1i, l1d); + return true; +} + +void init_intel_cacheinfo(struct cpuinfo_x86 *c) +{ + /* Don't use CPUID(0x2) if CPUID(0x4) is supported. */ + if (intel_cacheinfo_0x4(c)) + return; + + intel_cacheinfo_0x2(c); } +/* + * <linux/cacheinfo.h> shared_cpu_map setup, AMD/Hygon + */ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, - struct _cpuid4_info_regs *base) + const struct _cpuid4_info *id4) { struct cpu_cacheinfo *this_cpu_ci; - struct cacheinfo *this_leaf; + struct cacheinfo *ci; int i, sibling; /* @@ -880,18 +492,18 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, this_cpu_ci = get_cpu_cacheinfo(i); if (!this_cpu_ci->info_list) continue; - this_leaf = this_cpu_ci->info_list + index; + + ci = this_cpu_ci->info_list + index; for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { if (!cpu_online(sibling)) continue; - cpumask_set_cpu(sibling, - &this_leaf->shared_cpu_map); + cpumask_set_cpu(sibling, &ci->shared_cpu_map); } } } else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { unsigned int apicid, nshared, first, last; - nshared = base->eax.split.num_threads_sharing + 1; + nshared = id4->eax.split.num_threads_sharing + 1; apicid = cpu_data(cpu).topo.apicid; first = apicid - (apicid % nshared); last = first + nshared - 1; @@ -905,14 +517,13 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, if ((apicid < first) || (apicid > last)) continue; - this_leaf = this_cpu_ci->info_list + index; + ci = this_cpu_ci->info_list + index; for_each_online_cpu(sibling) { apicid = cpu_data(sibling).topo.apicid; if ((apicid < first) || (apicid > last)) continue; - cpumask_set_cpu(sibling, - &this_leaf->shared_cpu_map); + cpumask_set_cpu(sibling, &ci->shared_cpu_map); } } } else @@ -921,25 +532,27 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, return 1; } +/* + * <linux/cacheinfo.h> shared_cpu_map setup, Intel + fallback AMD/Hygon + */ static void __cache_cpumap_setup(unsigned int cpu, int index, - struct _cpuid4_info_regs *base) + const struct _cpuid4_info *id4) { struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); - struct cacheinfo *this_leaf, *sibling_leaf; + struct cpuinfo_x86 *c = &cpu_data(cpu); + struct cacheinfo *ci, *sibling_ci; unsigned long num_threads_sharing; int index_msb, i; - struct cpuinfo_x86 *c = &cpu_data(cpu); - if (c->x86_vendor == X86_VENDOR_AMD || - c->x86_vendor == X86_VENDOR_HYGON) { - if (__cache_amd_cpumap_setup(cpu, index, base)) + if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) { + if (__cache_amd_cpumap_setup(cpu, index, id4)) return; } - this_leaf = 
this_cpu_ci->info_list + index; - num_threads_sharing = 1 + base->eax.split.num_threads_sharing; + ci = this_cpu_ci->info_list + index; + num_threads_sharing = 1 + id4->eax.split.num_threads_sharing; - cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map); + cpumask_set_cpu(cpu, &ci->shared_cpu_map); if (num_threads_sharing == 1) return; @@ -949,30 +562,29 @@ static void __cache_cpumap_setup(unsigned int cpu, int index, if (cpu_data(i).topo.apicid >> index_msb == c->topo.apicid >> index_msb) { struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i); + /* Skip if itself or no cacheinfo */ if (i == cpu || !sib_cpu_ci->info_list) - continue;/* skip if itself or no cacheinfo */ - sibling_leaf = sib_cpu_ci->info_list + index; - cpumask_set_cpu(i, &this_leaf->shared_cpu_map); - cpumask_set_cpu(cpu, &sibling_leaf->shared_cpu_map); + continue; + + sibling_ci = sib_cpu_ci->info_list + index; + cpumask_set_cpu(i, &ci->shared_cpu_map); + cpumask_set_cpu(cpu, &sibling_ci->shared_cpu_map); } } -static void ci_leaf_init(struct cacheinfo *this_leaf, - struct _cpuid4_info_regs *base) +static void ci_info_init(struct cacheinfo *ci, const struct _cpuid4_info *id4, + struct amd_northbridge *nb) { - this_leaf->id = base->id; - this_leaf->attributes = CACHE_ID; - this_leaf->level = base->eax.split.level; - this_leaf->type = cache_type_map[base->eax.split.type]; - this_leaf->coherency_line_size = - base->ebx.split.coherency_line_size + 1; - this_leaf->ways_of_associativity = - base->ebx.split.ways_of_associativity + 1; - this_leaf->size = base->size; - this_leaf->number_of_sets = base->ecx.split.number_of_sets + 1; - this_leaf->physical_line_partition = - base->ebx.split.physical_line_partition + 1; - this_leaf->priv = base->nb; + ci->id = id4->id; + ci->attributes = CACHE_ID; + ci->level = id4->eax.split.level; + ci->type = cache_type_map[id4->eax.split.type]; + ci->coherency_line_size = id4->ebx.split.coherency_line_size + 1; + ci->ways_of_associativity = id4->ebx.split.ways_of_associativity + 1; + ci->size = id4->size; + ci->number_of_sets = id4->ecx.split.number_of_sets + 1; + ci->physical_line_partition = id4->ebx.split.physical_line_partition + 1; + ci->priv = nb; } int init_cache_level(unsigned int cpu) @@ -987,38 +599,45 @@ int init_cache_level(unsigned int cpu) } /* - * The max shared threads number comes from CPUID.4:EAX[25-14] with input + * The max shared threads number comes from CPUID(0x4) EAX[25-14] with input * ECX as cache index. Then right shift apicid by the number's order to get * cache id for this cache node. 
*/ -static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs) +static void get_cache_id(int cpu, struct _cpuid4_info *id4) { struct cpuinfo_x86 *c = &cpu_data(cpu); unsigned long num_threads_sharing; int index_msb; - num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing; + num_threads_sharing = 1 + id4->eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); - id4_regs->id = c->topo.apicid >> index_msb; + id4->id = c->topo.apicid >> index_msb; } int populate_cache_leaves(unsigned int cpu) { - unsigned int idx, ret; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); - struct cacheinfo *this_leaf = this_cpu_ci->info_list; - struct _cpuid4_info_regs id4_regs = {}; + struct cacheinfo *ci = this_cpu_ci->info_list; + u8 cpu_vendor = boot_cpu_data.x86_vendor; + struct amd_northbridge *nb = NULL; + struct _cpuid4_info id4 = {}; + int idx, ret; for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) { - ret = cpuid4_cache_lookup_regs(idx, &id4_regs); + ret = fill_cpuid4_info(idx, &id4); if (ret) return ret; - get_cache_id(cpu, &id4_regs); - ci_leaf_init(this_leaf++, &id4_regs); - __cache_cpumap_setup(cpu, idx, &id4_regs); + + get_cache_id(cpu, &id4); + + if (cpu_vendor == X86_VENDOR_AMD || cpu_vendor == X86_VENDOR_HYGON) + nb = amd_init_l3_cache(idx); + + ci_info_init(ci++, &id4, nb); + __cache_cpumap_setup(cpu, idx, &id4); } - this_cpu_ci->cpu_map_populated = true; + this_cpu_ci->cpu_map_populated = true; return 0; } @@ -1034,31 +653,33 @@ int populate_cache_leaves(unsigned int cpu) static unsigned long saved_cr4; static DEFINE_RAW_SPINLOCK(cache_disable_lock); +/* + * Cache flushing is the most time-consuming step when programming the + * MTRRs. On many Intel CPUs without known erratas, it can be skipped + * if the CPU declares cache self-snooping support. + */ +static void maybe_flush_caches(void) +{ + if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) + wbinvd(); +} + void cache_disable(void) __acquires(cache_disable_lock) { unsigned long cr0; /* - * Note that this is not ideal - * since the cache is only flushed/disabled for this CPU while the - * MTRRs are changed, but changing this requires more invasive - * changes to the way the kernel boots + * This is not ideal since the cache is only flushed/disabled + * for this CPU while the MTRRs are changed, but changing this + * requires more invasive changes to the way the kernel boots. */ - raw_spin_lock(&cache_disable_lock); /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ cr0 = read_cr0() | X86_CR0_CD; write_cr0(cr0); - /* - * Cache flushing is the most time-consuming step when programming - * the MTRRs. Fortunately, as per the Intel Software Development - * Manual, we can skip it if the processor supports cache self- - * snooping. - */ - if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) - wbinvd(); + maybe_flush_caches(); /* Save value of CR4 and clear Page Global Enable (bit 7) */ if (cpu_feature_enabled(X86_FEATURE_PGE)) { @@ -1073,9 +694,7 @@ void cache_disable(void) __acquires(cache_disable_lock) if (cpu_feature_enabled(X86_FEATURE_MTRR)) mtrr_disable(); - /* Again, only flush caches if we have to. 
*/ - if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) - wbinvd(); + maybe_flush_caches(); } void cache_enable(void) __releases(cache_disable_lock) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0ff057ff11ce..8feb8fd2957a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -29,7 +29,7 @@ #include <asm/alternative.h> #include <asm/cmdline.h> -#include <asm/cpuid.h> +#include <asm/cpuid/api.h> #include <asm/perf_event.h> #include <asm/mmu_context.h> #include <asm/doublefault.h> @@ -148,7 +148,7 @@ static void ppin_init(struct cpuinfo_x86 *c) */ info = (struct ppin_info *)id->driver_data; - if (rdmsrl_safe(info->msr_ppin_ctl, &val)) + if (rdmsrq_safe(info->msr_ppin_ctl, &val)) goto clear_ppin; if ((val & 3UL) == 1UL) { @@ -158,13 +158,13 @@ static void ppin_init(struct cpuinfo_x86 *c) /* If PPIN is disabled, try to enable */ if (!(val & 2UL)) { - wrmsrl_safe(info->msr_ppin_ctl, val | 2UL); - rdmsrl_safe(info->msr_ppin_ctl, &val); + wrmsrq_safe(info->msr_ppin_ctl, val | 2UL); + rdmsrq_safe(info->msr_ppin_ctl, &val); } /* Is the enable bit set? */ if (val & 2UL) { - c->ppin = __rdmsr(info->msr_ppin); + c->ppin = native_rdmsrq(info->msr_ppin); set_cpu_cap(c, info->feature); return; } @@ -242,6 +242,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { #endif } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); +SYM_PIC_ALIAS(gdt_page); #ifdef CONFIG_X86_64 static int __init x86_nopcid_setup(char *s) @@ -321,7 +322,7 @@ static int __init cachesize_setup(char *str) __setup("cachesize=", cachesize_setup); /* Probe for the CPUID instruction */ -bool have_cpuid_p(void) +bool cpuid_feature(void) { return flag_is_changeable_p(X86_EFLAGS_ID); } @@ -562,9 +563,9 @@ __noendbr u64 ibt_save(bool disable) u64 msr = 0; if (cpu_feature_enabled(X86_FEATURE_IBT)) { - rdmsrl(MSR_IA32_S_CET, msr); + rdmsrq(MSR_IA32_S_CET, msr); if (disable) - wrmsrl(MSR_IA32_S_CET, msr & ~CET_ENDBR_EN); + wrmsrq(MSR_IA32_S_CET, msr & ~CET_ENDBR_EN); } return msr; @@ -575,10 +576,10 @@ __noendbr void ibt_restore(u64 save) u64 msr; if (cpu_feature_enabled(X86_FEATURE_IBT)) { - rdmsrl(MSR_IA32_S_CET, msr); + rdmsrq(MSR_IA32_S_CET, msr); msr &= ~CET_ENDBR_EN; msr |= (save & CET_ENDBR_EN); - wrmsrl(MSR_IA32_S_CET, msr); + wrmsrq(MSR_IA32_S_CET, msr); } } @@ -602,15 +603,15 @@ static __always_inline void setup_cet(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_USER_SHSTK); if (kernel_ibt) - wrmsrl(MSR_IA32_S_CET, CET_ENDBR_EN); + wrmsrq(MSR_IA32_S_CET, CET_ENDBR_EN); else - wrmsrl(MSR_IA32_S_CET, 0); + wrmsrq(MSR_IA32_S_CET, 0); cr4_set_bits(X86_CR4_CET); if (kernel_ibt && ibt_selftest()) { pr_err("IBT selftest: Failed!\n"); - wrmsrl(MSR_IA32_S_CET, 0); + wrmsrq(MSR_IA32_S_CET, 0); setup_clear_cpu_cap(X86_FEATURE_IBT); } } @@ -621,8 +622,8 @@ __noendbr void cet_disable(void) cpu_feature_enabled(X86_FEATURE_SHSTK))) return; - wrmsrl(MSR_IA32_S_CET, 0); - wrmsrl(MSR_IA32_U_CET, 0); + wrmsrq(MSR_IA32_S_CET, 0); + wrmsrq(MSR_IA32_U_CET, 0); } /* @@ -751,9 +752,9 @@ void __init switch_gdt_and_percpu_base(int cpu) * No need to load %gs. It is already correct. * * Writing %gs on 64bit would zero GSBASE which would make any per - * CPU operation up to the point of the wrmsrl() fault. + * CPU operation up to the point of the wrmsrq() fault. * - * Set GSBASE to the new offset. Until the wrmsrl() happens the + * Set GSBASE to the new offset. Until the wrmsrq() happens the * early mapping is still valid. 
That means the GSBASE update will * lose any prior per CPU data which was not copied over in * setup_per_cpu_areas(). @@ -761,7 +762,7 @@ void __init switch_gdt_and_percpu_base(int cpu) * This works even with stackprotector enabled because the * per CPU stack canary is 0 in both per CPU areas. */ - wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); + wrmsrq(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); #else /* * %fs is already set to __KERNEL_PERCPU, but after switching GDT @@ -1005,17 +1006,18 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_D_1_EAX] = eax; } - /* AMD-defined flags: level 0x80000001 */ + /* + * Check if extended CPUID leaves are implemented: Max extended + * CPUID leaf must be in the 0x80000001-0x8000ffff range. + */ eax = cpuid_eax(0x80000000); - c->extended_cpuid_level = eax; + c->extended_cpuid_level = ((eax & 0xffff0000) == 0x80000000) ? eax : 0; - if ((eax & 0xffff0000) == 0x80000000) { - if (eax >= 0x80000001) { - cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + if (c->extended_cpuid_level >= 0x80000001) { + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); - c->x86_capability[CPUID_8000_0001_ECX] = ecx; - c->x86_capability[CPUID_8000_0001_EDX] = edx; - } + c->x86_capability[CPUID_8000_0001_ECX] = ecx; + c->x86_capability[CPUID_8000_0001_EDX] = edx; } if (c->extended_cpuid_level >= 0x80000007) { @@ -1295,7 +1297,7 @@ u64 x86_read_arch_cap_msr(void) u64 x86_arch_cap_msr = 0; if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, x86_arch_cap_msr); + rdmsrq(MSR_IA32_ARCH_CAPABILITIES, x86_arch_cap_msr); return x86_arch_cap_msr; } @@ -1351,10 +1353,52 @@ static bool __init vulnerable_to_its(u64 x86_arch_cap_msr) return false; } +static struct x86_cpu_id cpu_latest_microcode[] = { +#include "microcode/intel-ucode-defs.h" + {} +}; + +static bool __init cpu_has_old_microcode(void) +{ + const struct x86_cpu_id *m = x86_match_cpu(cpu_latest_microcode); + + /* Give unknown CPUs a pass: */ + if (!m) { + /* Intel CPUs should be in the list. Warn if not: */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + pr_info("x86/CPU: Model not found in latest microcode list\n"); + return false; + } + + /* + * Hosts usually lie to guests with a super high microcode + * version. Just ignore what hosts tell guests: + */ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return false; + + /* Consider all debug microcode to be old: */ + if (boot_cpu_data.microcode & BIT(31)) + return true; + + /* Give new microcode a pass: */ + if (boot_cpu_data.microcode >= m->driver_data) + return false; + + /* Uh oh, too old: */ + return true; +} + static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { u64 x86_arch_cap_msr = x86_read_arch_cap_msr(); + if (cpu_has_old_microcode()) { + pr_warn("x86/CPU: Running old microcode\n"); + setup_force_cpu_bug(X86_BUG_OLD_MICROCODE); + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + } + /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) && !(x86_arch_cap_msr & ARCH_CAP_PSCHANGE_MC_NO)) @@ -1435,15 +1479,10 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * Affected CPU list is generally enough to enumerate the vulnerability, * but for virtualization case check for ARCH_CAP MSR bits also, VMM may * not want the guest to enumerate the bug. - * - * Set X86_BUG_MMIO_UNKNOWN for CPUs that are neither in the blacklist, - * nor in the whitelist and also don't enumerate MSR ARCH_CAP MMIO bits. 
*/ if (!arch_cap_mmio_immune(x86_arch_cap_msr)) { if (cpu_matches(cpu_vuln_blacklist, MMIO)) setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA); - else if (!cpu_matches(cpu_vuln_whitelist, NO_MMIO)) - setup_force_cpu_bug(X86_BUG_MMIO_UNKNOWN); } if (!cpu_has(c, X86_FEATURE_BTC_NO)) { @@ -1672,11 +1711,11 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) memset(&c->x86_capability, 0, sizeof(c->x86_capability)); c->extended_cpuid_level = 0; - if (!have_cpuid_p()) + if (!cpuid_feature()) identify_cpu_without_cpuid(c); /* cyrix could have cpuid enabled via c_identify()*/ - if (have_cpuid_p()) { + if (cpuid_feature()) { cpu_detect(c); get_cpu_vendor(c); intel_unlock_cpuid_leafs(c); @@ -1791,11 +1830,11 @@ static bool detect_null_seg_behavior(void) */ unsigned long old_base, tmp; - rdmsrl(MSR_FS_BASE, old_base); - wrmsrl(MSR_FS_BASE, 1); + rdmsrq(MSR_FS_BASE, old_base); + wrmsrq(MSR_FS_BASE, 1); loadsegment(fs, 0); - rdmsrl(MSR_FS_BASE, tmp); - wrmsrl(MSR_FS_BASE, old_base); + rdmsrq(MSR_FS_BASE, tmp); + wrmsrq(MSR_FS_BASE, old_base); return tmp == 0; } @@ -1836,11 +1875,11 @@ static void generic_identify(struct cpuinfo_x86 *c) { c->extended_cpuid_level = 0; - if (!have_cpuid_p()) + if (!cpuid_feature()) identify_cpu_without_cpuid(c); /* cyrix could have cpuid enabled via c_identify()*/ - if (!have_cpuid_p()) + if (!cpuid_feature()) return; cpu_detect(c); @@ -2024,9 +2063,9 @@ void enable_sep_cpu(void) */ tss->x86_tss.ss1 = __KERNEL_CS; - wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); - wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0); - wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); + wrmsrq(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1); + wrmsrq(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1)); + wrmsrq(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32); put_cpu(); } @@ -2133,7 +2172,7 @@ DEFINE_PER_CPU_CACHE_HOT(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_ DEFINE_PER_CPU_CACHE_HOT(u64, __x86_call_depth); EXPORT_PER_CPU_SYMBOL(__x86_call_depth); -static void wrmsrl_cstar(unsigned long val) +static void wrmsrq_cstar(unsigned long val) { /* * Intel CPUs do not support 32-bit SYSCALL. Writing to MSR_CSTAR @@ -2141,37 +2180,37 @@ static void wrmsrl_cstar(unsigned long val) * guest. Avoid the pointless write on all Intel CPUs. */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) - wrmsrl(MSR_CSTAR, val); + wrmsrq(MSR_CSTAR, val); } static inline void idt_syscall_init(void) { - wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); + wrmsrq(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); if (ia32_enabled()) { - wrmsrl_cstar((unsigned long)entry_SYSCALL_compat); + wrmsrq_cstar((unsigned long)entry_SYSCALL_compat); /* * This only works on Intel CPUs. * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. * This does not cause SYSENTER to jump to the wrong location, because * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). 
*/ - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, + wrmsrq_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + wrmsrq_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1)); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); + wrmsrq_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); } else { - wrmsrl_cstar((unsigned long)entry_SYSCALL32_ignore); - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); + wrmsrq_cstar((unsigned long)entry_SYSCALL32_ignore); + wrmsrq_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); + wrmsrq_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrq_safe(MSR_IA32_SYSENTER_EIP, 0ULL); } /* * Flags to clear on syscall; clear as much as possible * to minimize user space-kernel interference. */ - wrmsrl(MSR_SYSCALL_MASK, + wrmsrq(MSR_SYSCALL_MASK, X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF| X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF| X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF| @@ -2240,7 +2279,7 @@ static inline void setup_getcpu(int cpu) struct desc_struct d = { }; if (boot_cpu_has(X86_FEATURE_RDTSCP) || boot_cpu_has(X86_FEATURE_RDPID)) - wrmsr(MSR_TSC_AUX, cpudata, 0); + wrmsrq(MSR_TSC_AUX, cpudata); /* Store CPU and node number in limit. */ d.limit0 = cpudata; @@ -2355,8 +2394,8 @@ void cpu_init(void) memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); syscall_init(); - wrmsrl(MSR_FS_BASE, 0); - wrmsrl(MSR_KERNEL_GS_BASE, 0); + wrmsrq(MSR_FS_BASE, 0); + wrmsrq(MSR_KERNEL_GS_BASE, 0); barrier(); x2apic_setup(); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 51deb60a9d26..bc38b2d56f26 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -75,6 +75,15 @@ extern void check_null_seg_clears_base(struct cpuinfo_x86 *c); void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id); void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c); +#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS) +struct amd_northbridge *amd_init_l3_cache(int index); +#else +static inline struct amd_northbridge *amd_init_l3_cache(int index) +{ + return NULL; +} +#endif + unsigned int aperfmperf_get_khz(int cpu); void cpu_select_mitigations(void); diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index a2fbea0be535..46efcbd6afa4 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -28,6 +28,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_PKU, X86_FEATURE_XSAVE }, { X86_FEATURE_MPX, X86_FEATURE_XSAVE }, { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE }, + { X86_FEATURE_APX, X86_FEATURE_XSAVE }, { X86_FEATURE_CMOV, X86_FEATURE_FXSR }, { X86_FEATURE_MMX, X86_FEATURE_FXSR }, { X86_FEATURE_MMXEXT, X86_FEATURE_MMX }, @@ -82,8 +83,12 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, + { X86_FEATURE_AMX_FP16, X86_FEATURE_AMX_TILE }, + { X86_FEATURE_AMX_BF16, X86_FEATURE_AMX_TILE }, + { X86_FEATURE_AMX_INT8, X86_FEATURE_AMX_TILE }, { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES }, { X86_FEATURE_FRED, X86_FEATURE_LKGS }, + { X86_FEATURE_SPEC_CTRL_SSBD, X86_FEATURE_SPEC_CTRL }, {} }; diff --git a/arch/x86/kernel/cpu/cpuid_0x2_table.c b/arch/x86/kernel/cpu/cpuid_0x2_table.c new file mode 100644 index 000000000000..89bc8db5e9c6 --- /dev/null +++ 
b/arch/x86/kernel/cpu/cpuid_0x2_table.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/sizes.h> + +#include <asm/cpuid/types.h> + +#include "cpu.h" + +#define CACHE_ENTRY(_desc, _type, _size) \ + [_desc] = { \ + .c_type = (_type), \ + .c_size = (_size) / SZ_1K, \ + } + +#define TLB_ENTRY(_desc, _type, _entries) \ + [_desc] = { \ + .t_type = (_type), \ + .entries = (_entries), \ + } + +const struct leaf_0x2_table cpuid_0x2_table[256] = { + CACHE_ENTRY(0x06, CACHE_L1_INST, SZ_8K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x08, CACHE_L1_INST, SZ_16K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x09, CACHE_L1_INST, SZ_32K ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x0a, CACHE_L1_DATA, SZ_8K ), /* 2 way set assoc, 32 byte line size */ + CACHE_ENTRY(0x0c, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x0d, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x0e, CACHE_L1_DATA, SZ_24K ), /* 6-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x21, CACHE_L2, SZ_256K ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x22, CACHE_L3, SZ_512K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x23, CACHE_L3, SZ_1M ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x25, CACHE_L3, SZ_2M ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x29, CACHE_L3, SZ_4M ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x2c, CACHE_L1_DATA, SZ_32K ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x30, CACHE_L1_INST, SZ_32K ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x39, CACHE_L2, SZ_128K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3a, CACHE_L2, SZ_192K ), /* 6-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3b, CACHE_L2, SZ_128K ), /* 2-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3c, CACHE_L2, SZ_256K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3d, CACHE_L2, SZ_384K ), /* 6-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3e, CACHE_L2, SZ_512K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3f, CACHE_L2, SZ_256K ), /* 2-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x41, CACHE_L2, SZ_128K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x42, CACHE_L2, SZ_256K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x43, CACHE_L2, SZ_512K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x44, CACHE_L2, SZ_1M ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x45, CACHE_L2, SZ_2M ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x46, CACHE_L3, SZ_4M ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x47, CACHE_L3, SZ_8M ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x48, CACHE_L2, SZ_3M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x49, CACHE_L3, SZ_4M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x4a, CACHE_L3, SZ_6M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x4b, CACHE_L3, SZ_8M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x4c, CACHE_L3, SZ_12M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x4d, CACHE_L3, SZ_16M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x4e, CACHE_L2, SZ_6M ), /* 24-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x60, CACHE_L1_DATA, SZ_16K 
), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x66, CACHE_L1_DATA, SZ_8K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x67, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x68, CACHE_L1_DATA, SZ_32K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x78, CACHE_L2, SZ_1M ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x79, CACHE_L2, SZ_128K ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x7a, CACHE_L2, SZ_256K ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x7b, CACHE_L2, SZ_512K ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x7c, CACHE_L2, SZ_1M ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x7d, CACHE_L2, SZ_2M ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x7f, CACHE_L2, SZ_512K ), /* 2-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x80, CACHE_L2, SZ_512K ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x82, CACHE_L2, SZ_256K ), /* 8-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x83, CACHE_L2, SZ_512K ), /* 8-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x84, CACHE_L2, SZ_1M ), /* 8-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x85, CACHE_L2, SZ_2M ), /* 8-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x86, CACHE_L2, SZ_512K ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x87, CACHE_L2, SZ_1M ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd0, CACHE_L3, SZ_512K ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd1, CACHE_L3, SZ_1M ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd2, CACHE_L3, SZ_2M ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd6, CACHE_L3, SZ_1M ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd7, CACHE_L3, SZ_2M ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd8, CACHE_L3, SZ_4M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xdc, CACHE_L3, SZ_2M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xdd, CACHE_L3, SZ_4M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xde, CACHE_L3, SZ_8M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xe2, CACHE_L3, SZ_2M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xe3, CACHE_L3, SZ_4M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xe4, CACHE_L3, SZ_8M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xea, CACHE_L3, SZ_12M ), /* 24-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xeb, CACHE_L3, SZ_18M ), /* 24-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xec, CACHE_L3, SZ_24M ), /* 24-way set assoc, 64 byte line size */ + + TLB_ENTRY( 0x01, TLB_INST_4K, 32 ), /* TLB_INST 4 KByte pages, 4-way set associative */ + TLB_ENTRY( 0x02, TLB_INST_4M, 2 ), /* TLB_INST 4 MByte pages, full associative */ + TLB_ENTRY( 0x03, TLB_DATA_4K, 64 ), /* TLB_DATA 4 KByte pages, 4-way set associative */ + TLB_ENTRY( 0x04, TLB_DATA_4M, 8 ), /* TLB_DATA 4 MByte pages, 4-way set associative */ + TLB_ENTRY( 0x05, TLB_DATA_4M, 32 ), /* TLB_DATA 4 MByte pages, 4-way set associative */ + TLB_ENTRY( 0x0b, TLB_INST_4M, 4 ), /* TLB_INST 4 MByte pages, 4-way set associative */ + TLB_ENTRY( 0x4f, TLB_INST_4K, 32 ), /* TLB_INST 4 KByte pages */ + TLB_ENTRY( 0x50, TLB_INST_ALL, 64 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ + TLB_ENTRY( 0x51, TLB_INST_ALL, 128 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte 
pages */ + TLB_ENTRY( 0x52, TLB_INST_ALL, 256 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ + TLB_ENTRY( 0x55, TLB_INST_2M_4M, 7 ), /* TLB_INST 2-MByte or 4-MByte pages, fully associative */ + TLB_ENTRY( 0x56, TLB_DATA0_4M, 16 ), /* TLB_DATA0 4 MByte pages, 4-way set associative */ + TLB_ENTRY( 0x57, TLB_DATA0_4K, 16 ), /* TLB_DATA0 4 KByte pages, 4-way associative */ + TLB_ENTRY( 0x59, TLB_DATA0_4K, 16 ), /* TLB_DATA0 4 KByte pages, fully associative */ + TLB_ENTRY( 0x5a, TLB_DATA0_2M_4M, 32 ), /* TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative */ + TLB_ENTRY( 0x5b, TLB_DATA_4K_4M, 64 ), /* TLB_DATA 4 KByte and 4 MByte pages */ + TLB_ENTRY( 0x5c, TLB_DATA_4K_4M, 128 ), /* TLB_DATA 4 KByte and 4 MByte pages */ + TLB_ENTRY( 0x5d, TLB_DATA_4K_4M, 256 ), /* TLB_DATA 4 KByte and 4 MByte pages */ + TLB_ENTRY( 0x61, TLB_INST_4K, 48 ), /* TLB_INST 4 KByte pages, full associative */ + TLB_ENTRY( 0x63, TLB_DATA_1G_2M_4M, 4 ), /* TLB_DATA 1 GByte pages, 4-way set associative + * (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here) */ + TLB_ENTRY( 0x6b, TLB_DATA_4K, 256 ), /* TLB_DATA 4 KByte pages, 8-way associative */ + TLB_ENTRY( 0x6c, TLB_DATA_2M_4M, 128 ), /* TLB_DATA 2 MByte or 4 MByte pages, 8-way associative */ + TLB_ENTRY( 0x6d, TLB_DATA_1G, 16 ), /* TLB_DATA 1 GByte pages, fully associative */ + TLB_ENTRY( 0x76, TLB_INST_2M_4M, 8 ), /* TLB_INST 2-MByte or 4-MByte pages, fully associative */ + TLB_ENTRY( 0xb0, TLB_INST_4K, 128 ), /* TLB_INST 4 KByte pages, 4-way set associative */ + TLB_ENTRY( 0xb1, TLB_INST_2M_4M, 4 ), /* TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries */ + TLB_ENTRY( 0xb2, TLB_INST_4K, 64 ), /* TLB_INST 4KByte pages, 4-way set associative */ + TLB_ENTRY( 0xb3, TLB_DATA_4K, 128 ), /* TLB_DATA 4 KByte pages, 4-way set associative */ + TLB_ENTRY( 0xb4, TLB_DATA_4K, 256 ), /* TLB_DATA 4 KByte pages, 4-way associative */ + TLB_ENTRY( 0xb5, TLB_INST_4K, 64 ), /* TLB_INST 4 KByte pages, 8-way set associative */ + TLB_ENTRY( 0xb6, TLB_INST_4K, 128 ), /* TLB_INST 4 KByte pages, 8-way set associative */ + TLB_ENTRY( 0xba, TLB_DATA_4K, 64 ), /* TLB_DATA 4 KByte pages, 4-way associative */ + TLB_ENTRY( 0xc0, TLB_DATA_4K_4M, 8 ), /* TLB_DATA 4 KByte and 4 MByte pages, 4-way associative */ + TLB_ENTRY( 0xc1, STLB_4K_2M, 1024 ), /* STLB 4 KByte and 2 MByte pages, 8-way associative */ + TLB_ENTRY( 0xc2, TLB_DATA_2M_4M, 16 ), /* TLB_DATA 2 MByte/4MByte pages, 4-way associative */ + TLB_ENTRY( 0xca, STLB_4K, 512 ), /* STLB 4 KByte pages, 4-way associative */ +}; diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index 4a4118784c13..d69757246bde 100644 --- a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -4,6 +4,7 @@ #include <asm/cpu.h> #include <asm/cpufeature.h> #include <asm/msr-index.h> +#include <asm/msr.h> #include <asm/processor.h> #include <asm/vmx.h> @@ -118,7 +119,7 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) bool enable_vmx; u64 msr; - if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) { + if (rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr)) { clear_cpu_cap(c, X86_FEATURE_VMX); clear_cpu_cap(c, X86_FEATURE_SGX); return; @@ -165,7 +166,7 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) msr |= FEAT_CTL_SGX_LC_ENABLED; } - wrmsrl(MSR_IA32_FEAT_CTL, msr); + wrmsrq(MSR_IA32_FEAT_CTL, msr); update_caps: set_cpu_cap(c, X86_FEATURE_MSR_IA32_FEAT_CTL); diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 6af4a4a90a52..2154f12766fb 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ 
b/arch/x86/kernel/cpu/hygon.c @@ -15,6 +15,7 @@ #include <asm/cacheinfo.h> #include <asm/spec-ctrl.h> #include <asm/delay.h> +#include <asm/msr.h> #include "cpu.h" @@ -96,7 +97,7 @@ static void bsp_init_hygon(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { u64 val; - rdmsrl(MSR_K7_HWCR, val); + rdmsrq(MSR_K7_HWCR, val); if (!(val & BIT(24))) pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n"); } @@ -110,7 +111,7 @@ static void bsp_init_hygon(struct cpuinfo_x86 *c) * Try to cache the base value so further operations can * avoid RMW. If that faults, do not enable SSBD. */ - if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { + if (!rdmsrq_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD); setup_force_cpu_cap(X86_FEATURE_SSBD); x86_amd_ls_cfg_ssbd_mask = 1ULL << 10; @@ -194,7 +195,7 @@ static void init_hygon(struct cpuinfo_x86 *c) init_hygon_cacheinfo(c); if (cpu_has(c, X86_FEATURE_SVM)) { - rdmsrl(MSR_VM_CR, vm_cr); + rdmsrq(MSR_VM_CR, vm_cr); if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) { pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n"); clear_cpu_cap(c, X86_FEATURE_SVM); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index cdc9813871ef..076eaa41b8c8 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -6,6 +6,7 @@ #include <linux/minmax.h> #include <linux/smp.h> #include <linux/string.h> +#include <linux/types.h> #ifdef CONFIG_X86_64 #include <linux/topology.h> @@ -15,6 +16,7 @@ #include <asm/cpu_device_id.h> #include <asm/cpufeature.h> #include <asm/cpu.h> +#include <asm/cpuid/api.h> #include <asm/hwcap2.h> #include <asm/intel-family.h> #include <asm/microcode.h> @@ -157,7 +159,7 @@ static void detect_tme_early(struct cpuinfo_x86 *c) u64 tme_activate; int keyid_bits; - rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate); + rdmsrq(MSR_IA32_TME_ACTIVATE, tme_activate); if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) { pr_info_once("x86/tme: not enabled by BIOS\n"); @@ -299,7 +301,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) * string flag and enhanced fast string capabilities accordingly. 
*/ if (c->x86_vfm >= INTEL_PENTIUM_M_DOTHAN) { - rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + rdmsrq(MSR_IA32_MISC_ENABLE, misc_enable); if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { /* X86_FEATURE_ERMS is set based on CPUID */ set_cpu_cap(c, X86_FEATURE_REP_GOOD); @@ -488,7 +490,7 @@ static void init_cpuid_fault(struct cpuinfo_x86 *c) { u64 msr; - if (!rdmsrl_safe(MSR_PLATFORM_INFO, &msr)) { + if (!rdmsrq_safe(MSR_PLATFORM_INFO, &msr)) { if (msr & MSR_PLATFORM_INFO_CPUID_FAULT) set_cpu_cap(c, X86_FEATURE_CPUID_FAULT); } @@ -498,7 +500,7 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) { u64 msr; - if (rdmsrl_safe(MSR_MISC_FEATURES_ENABLES, &msr)) + if (rdmsrq_safe(MSR_MISC_FEATURES_ENABLES, &msr)) return; /* Clear all MISC features */ @@ -509,7 +511,7 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) probe_xeon_phi_r3mwait(c); msr = this_cpu_read(msr_misc_features_shadow); - wrmsrl(MSR_MISC_FEATURES_ENABLES, msr); + wrmsrq(MSR_MISC_FEATURES_ENABLES, msr); } /* @@ -646,103 +648,11 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) } #endif -#define TLB_INST_4K 0x01 -#define TLB_INST_4M 0x02 -#define TLB_INST_2M_4M 0x03 - -#define TLB_INST_ALL 0x05 -#define TLB_INST_1G 0x06 - -#define TLB_DATA_4K 0x11 -#define TLB_DATA_4M 0x12 -#define TLB_DATA_2M_4M 0x13 -#define TLB_DATA_4K_4M 0x14 - -#define TLB_DATA_1G 0x16 -#define TLB_DATA_1G_2M_4M 0x17 - -#define TLB_DATA0_4K 0x21 -#define TLB_DATA0_4M 0x22 -#define TLB_DATA0_2M_4M 0x23 - -#define STLB_4K 0x41 -#define STLB_4K_2M 0x42 - -/* - * All of leaf 0x2's one-byte TLB descriptors implies the same number of - * entries for their respective TLB types. The 0x63 descriptor is an - * exception: it implies 4 dTLB entries for 1GB pages 32 dTLB entries - * for 2MB or 4MB pages. Encode descriptor 0x63 dTLB entry count for - * 2MB/4MB pages here, as its count for dTLB 1GB pages is already at the - * intel_tlb_table[] mapping. 
- */ -#define TLB_0x63_2M_4M_ENTRIES 32 - -struct _tlb_table { - unsigned char descriptor; - char tlb_type; - unsigned int entries; -}; - -static const struct _tlb_table intel_tlb_table[] = { - { 0x01, TLB_INST_4K, 32}, /* TLB_INST 4 KByte pages, 4-way set associative */ - { 0x02, TLB_INST_4M, 2}, /* TLB_INST 4 MByte pages, full associative */ - { 0x03, TLB_DATA_4K, 64}, /* TLB_DATA 4 KByte pages, 4-way set associative */ - { 0x04, TLB_DATA_4M, 8}, /* TLB_DATA 4 MByte pages, 4-way set associative */ - { 0x05, TLB_DATA_4M, 32}, /* TLB_DATA 4 MByte pages, 4-way set associative */ - { 0x0b, TLB_INST_4M, 4}, /* TLB_INST 4 MByte pages, 4-way set associative */ - { 0x4f, TLB_INST_4K, 32}, /* TLB_INST 4 KByte pages */ - { 0x50, TLB_INST_ALL, 64}, /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ - { 0x51, TLB_INST_ALL, 128}, /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ - { 0x52, TLB_INST_ALL, 256}, /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ - { 0x55, TLB_INST_2M_4M, 7}, /* TLB_INST 2-MByte or 4-MByte pages, fully associative */ - { 0x56, TLB_DATA0_4M, 16}, /* TLB_DATA0 4 MByte pages, 4-way set associative */ - { 0x57, TLB_DATA0_4K, 16}, /* TLB_DATA0 4 KByte pages, 4-way associative */ - { 0x59, TLB_DATA0_4K, 16}, /* TLB_DATA0 4 KByte pages, fully associative */ - { 0x5a, TLB_DATA0_2M_4M, 32}, /* TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative */ - { 0x5b, TLB_DATA_4K_4M, 64}, /* TLB_DATA 4 KByte and 4 MByte pages */ - { 0x5c, TLB_DATA_4K_4M, 128}, /* TLB_DATA 4 KByte and 4 MByte pages */ - { 0x5d, TLB_DATA_4K_4M, 256}, /* TLB_DATA 4 KByte and 4 MByte pages */ - { 0x61, TLB_INST_4K, 48}, /* TLB_INST 4 KByte pages, full associative */ - { 0x63, TLB_DATA_1G_2M_4M, 4}, /* TLB_DATA 1 GByte pages, 4-way set associative - * (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here) */ - { 0x6b, TLB_DATA_4K, 256}, /* TLB_DATA 4 KByte pages, 8-way associative */ - { 0x6c, TLB_DATA_2M_4M, 128}, /* TLB_DATA 2 MByte or 4 MByte pages, 8-way associative */ - { 0x6d, TLB_DATA_1G, 16}, /* TLB_DATA 1 GByte pages, fully associative */ - { 0x76, TLB_INST_2M_4M, 8}, /* TLB_INST 2-MByte or 4-MByte pages, fully associative */ - { 0xb0, TLB_INST_4K, 128}, /* TLB_INST 4 KByte pages, 4-way set associative */ - { 0xb1, TLB_INST_2M_4M, 4}, /* TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries */ - { 0xb2, TLB_INST_4K, 64}, /* TLB_INST 4KByte pages, 4-way set associative */ - { 0xb3, TLB_DATA_4K, 128}, /* TLB_DATA 4 KByte pages, 4-way set associative */ - { 0xb4, TLB_DATA_4K, 256}, /* TLB_DATA 4 KByte pages, 4-way associative */ - { 0xb5, TLB_INST_4K, 64}, /* TLB_INST 4 KByte pages, 8-way set associative */ - { 0xb6, TLB_INST_4K, 128}, /* TLB_INST 4 KByte pages, 8-way set associative */ - { 0xba, TLB_DATA_4K, 64}, /* TLB_DATA 4 KByte pages, 4-way associative */ - { 0xc0, TLB_DATA_4K_4M, 8}, /* TLB_DATA 4 KByte and 4 MByte pages, 4-way associative */ - { 0xc1, STLB_4K_2M, 1024}, /* STLB 4 KByte and 2 MByte pages, 8-way associative */ - { 0xc2, TLB_DATA_2M_4M, 16}, /* TLB_DATA 2 MByte/4MByte pages, 4-way associative */ - { 0xca, STLB_4K, 512}, /* STLB 4 KByte pages, 4-way associative */ - { 0x00, 0, 0 } -}; - -static void intel_tlb_lookup(const unsigned char desc) +static void intel_tlb_lookup(const struct leaf_0x2_table *desc) { - unsigned int entries; - unsigned char k; - - if (desc == 0) - return; - - /* look up this descriptor in the table */ - for (k = 0; intel_tlb_table[k].descriptor != desc && - intel_tlb_table[k].descriptor != 0; k++) - ; + short entries = 
desc->entries; - if (intel_tlb_table[k].tlb_type == 0) - return; - - entries = intel_tlb_table[k].entries; - switch (intel_tlb_table[k].tlb_type) { + switch (desc->t_type) { case STLB_4K: tlb_lli_4k = max(tlb_lli_4k, entries); tlb_lld_4k = max(tlb_lld_4k, entries); @@ -799,28 +709,16 @@ static void intel_detect_tlb(struct cpuinfo_x86 *c) { - int i, j, n; - unsigned int regs[4]; - unsigned char *desc = (unsigned char *)regs; + const struct leaf_0x2_table *desc; + union leaf_0x2_regs regs; + u8 *ptr; if (c->cpuid_level < 2) return; - /* Number of times to iterate */ - n = cpuid_eax(2) & 0xFF; - - for (i = 0 ; i < n ; i++) { - cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]); - - /* If bit 31 is set, this is an unknown format */ - for (j = 0 ; j < 4 ; j++) - if (regs[j] & (1 << 31)) - regs[j] = 0; - - /* Byte 0 is level count, not a descriptor */ - for (j = 1 ; j < 16 ; j++) - intel_tlb_lookup(desc[j]); - } + cpuid_leaf_0x2(&regs); + for_each_cpuid_0x2_desc(regs, ptr, desc) + intel_tlb_lookup(desc); } static const struct cpu_dev intel_cpu_dev = { diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c index 30b1d63b97f3..bc7671f920a7 100644 --- a/arch/x86/kernel/cpu/intel_epb.c +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -79,7 +79,7 @@ static int intel_epb_save(void) { u64 epb; - rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + rdmsrq(MSR_IA32_ENERGY_PERF_BIAS, epb); /* * Ensure that saved_epb will always be nonzero after this write even if * the EPB value read from the MSR is 0. @@ -94,7 +94,7 @@ static void intel_epb_restore(void) u64 val = this_cpu_read(saved_epb); u64 epb; - rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + rdmsrq(MSR_IA32_ENERGY_PERF_BIAS, epb); if (val) { val &= EPB_MASK; } else { @@ -111,7 +111,7 @@ static void intel_epb_restore(void) pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); } } - wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val); + wrmsrq(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val); } static struct syscore_ops intel_epb_syscore_ops = { @@ -135,7 +135,7 @@ static ssize_t energy_perf_bias_show(struct device *dev, u64 epb; int ret; - ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + ret = rdmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); if (ret < 0) return ret; @@ -157,11 +157,11 @@ static ssize_t energy_perf_bias_store(struct device *dev, else if (kstrtou64(buf, 0, &val) || val > MAX_EPB) return -EINVAL; - ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + ret = rdmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); if (ret < 0) return ret; - ret = wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, + ret = wrmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val); if (ret < 0) return ret; diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 1075a90141da..9d852c3b2cb5 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -662,12 +662,12 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) return; } - rdmsrl(MSR_K7_HWCR, hwcr); + rdmsrq(MSR_K7_HWCR, hwcr); /* McStatusWrEn has to be set */ need_toggle = !(hwcr & BIT(18)); if (need_toggle) - wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); + wrmsrq(MSR_K7_HWCR, hwcr | BIT(18)); /* Clear CntP bit safely */ for (i = 0; i < num_msrs; i++) @@ -675,7 +675,7 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) /* restore old settings */ if (need_toggle) - wrmsrl(MSR_K7_HWCR, hwcr); + wrmsrq(MSR_K7_HWCR, 
hwcr); } /* cpu init entry point, called from mce.c with preempt off */ @@ -805,12 +805,12 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) } if (mce_flags.smca) { - rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid); + rdmsrq(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid); if (m->status & MCI_STATUS_SYNDV) { - rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd); - rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1); - rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2); + rdmsrq(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd); + rdmsrq(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1); + rdmsrq(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2); } } @@ -834,16 +834,16 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc) { u64 status, addr = 0; - rdmsrl(msr_stat, status); + rdmsrq(msr_stat, status); if (!(status & MCI_STATUS_VAL)) return false; if (status & MCI_STATUS_ADDRV) - rdmsrl(msr_addr, addr); + rdmsrq(msr_addr, addr); __log_error(bank, status, addr, misc); - wrmsrl(msr_stat, 0); + wrmsrq(msr_stat, 0); return status & MCI_STATUS_DEFERRED; } @@ -862,7 +862,7 @@ static bool _log_error_deferred(unsigned int bank, u32 misc) return true; /* Clear MCA_DESTAT if the deferred error was logged from MCA_STATUS. */ - wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); + wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); return true; } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index f6fd71b64b66..e9b3c5d4a52e 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -121,7 +121,7 @@ void mce_prep_record_common(struct mce *m) { m->cpuid = cpuid_eax(1); m->cpuvendor = boot_cpu_data.x86_vendor; - m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); + m->mcgcap = native_rdmsrq(MSR_IA32_MCG_CAP); /* need the internal __ version to avoid deadlocks */ m->time = __ktime_get_real_seconds(); } @@ -388,9 +388,9 @@ void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) } /* MSR access wrappers used for error injection */ -noinstr u64 mce_rdmsrl(u32 msr) +noinstr u64 mce_rdmsrq(u32 msr) { - DECLARE_ARGS(val, low, high); + EAX_EDX_DECLARE_ARGS(val, low, high); if (__this_cpu_read(injectm.finished)) { int offset; @@ -423,7 +423,7 @@ noinstr u64 mce_rdmsrl(u32 msr) return EAX_EDX_VAL(val, low, high); } -static noinstr void mce_wrmsrl(u32 msr, u64 v) +static noinstr void mce_wrmsrq(u32 msr, u64 v) { u32 low, high; @@ -444,7 +444,7 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v) low = (u32)v; high = (u32)(v >> 32); - /* See comment in mce_rdmsrl() */ + /* See comment in mce_rdmsrq() */ asm volatile("1: wrmsr\n" "2:\n" _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE) @@ -468,7 +468,7 @@ static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs instrumentation_end(); m = &err->m; - m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + m->mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS); if (regs) { /* * Get the address of the instruction at the time of @@ -488,7 +488,7 @@ static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs } /* Use accurate RIP reporting if available. 
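The intel_detect_tlb() rewrite a few hunks above replaces an open-coded CPUID leaf 0x2 walk with cpuid_leaf_0x2() and for_each_cpuid_0x2_desc(). The rules those helpers factor out are architectural: byte 0 of EAX is an iteration count (1 on all recent CPUs), and a register with bit 31 set carries no descriptors. Below is a standalone userspace sketch of the same walk, using the compiler's <cpuid.h> rather than the kernel helpers.

#include <cpuid.h>
#include <stdio.h>

/* Dump the one-byte cache/TLB descriptors from CPUID leaf 0x2 (Intel).
 * Byte 0 of EAX is skipped and any register with bit 31 set is ignored,
 * which is what the removed open-coded loop in intel_detect_tlb() did. */
int main(void)
{
	unsigned int regs[4];

	if (!__get_cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]))
		return 1;

	regs[0] &= ~0xffu;			/* byte 0 of EAX is not a descriptor */

	for (int i = 0; i < 4; i++) {
		if (regs[i] & (1u << 31))	/* register holds no descriptors */
			continue;
		for (int j = 0; j < 4; j++) {
			unsigned char d = (regs[i] >> (8 * j)) & 0xff;

			if (d)
				printf("descriptor 0x%02x\n", d);
		}
	}
	return 0;
}

Build with gcc on an Intel machine; each printed byte can be looked up in the cpuid_0x2_table[] added earlier in this diff.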
*/ if (mca_cfg.rip_msr) - m->ip = mce_rdmsrl(mca_cfg.rip_msr); + m->ip = mce_rdmsrq(mca_cfg.rip_msr); } } @@ -684,10 +684,10 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i) struct mce *m = &err->m; if (m->status & MCI_STATUS_MISCV) - m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC)); + m->misc = mce_rdmsrq(mca_msr_reg(i, MCA_MISC)); if (m->status & MCI_STATUS_ADDRV) { - m->addr = mce_rdmsrl(mca_msr_reg(i, MCA_ADDR)); + m->addr = mce_rdmsrq(mca_msr_reg(i, MCA_ADDR)); /* * Mask the reported address by the reported granularity. @@ -702,12 +702,12 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i) } if (mce_flags.smca) { - m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i)); + m->ipid = mce_rdmsrq(MSR_AMD64_SMCA_MCx_IPID(i)); if (m->status & MCI_STATUS_SYNDV) { - m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i)); - err->vendor.amd.synd1 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(i)); - err->vendor.amd.synd2 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(i)); + m->synd = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND(i)); + err->vendor.amd.synd1 = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND1(i)); + err->vendor.amd.synd2 = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND2(i)); } } } @@ -753,7 +753,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m->bank = i; barrier(); - m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS)); /* * Update storm tracking here, before checking for the @@ -829,7 +829,7 @@ clear_it: /* * Clear state for this bank. */ - mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); } /* @@ -887,8 +887,8 @@ quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) */ static noinstr bool quirk_skylake_repmov(void) { - u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); - u64 misc_enable = mce_rdmsrl(MSR_IA32_MISC_ENABLE); + u64 mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS); + u64 misc_enable = mce_rdmsrq(MSR_IA32_MISC_ENABLE); u64 mc1_status; /* @@ -899,7 +899,7 @@ static noinstr bool quirk_skylake_repmov(void) !(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) return false; - mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1)); + mc1_status = mce_rdmsrq(MSR_IA32_MCx_STATUS(1)); /* Check for a software-recoverable data fetch error. 
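On the "mask the reported address by the reported granularity" step in mce_read_aux() above: the hardware reports how many low address bits are not valid, and the handler clears exactly those bits. A small standalone illustration of the arithmetic (lsb here is just a parameter, not a kernel symbol):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Clear the low 'lsb' bits of a machine-check address, i.e. round it
 * down to the granularity the hardware reported as valid. */
static uint64_t mask_addr(uint64_t addr, unsigned int lsb)
{
	return addr & ~((1ULL << lsb) - 1);
}

int main(void)
{
	/* 4 KiB granularity (lsb = 12): 0x12345fff -> 0x12345000 */
	printf("%#" PRIx64 "\n", mask_addr(0x12345fffULL, 12));
	return 0;
}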
*/ if ((mc1_status & @@ -910,8 +910,8 @@ static noinstr bool quirk_skylake_repmov(void) MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_AR | MCI_STATUS_S)) { misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING; - mce_wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); - mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0); + mce_wrmsrq(MSR_IA32_MISC_ENABLE, misc_enable); + mce_wrmsrq(MSR_IA32_MCx_STATUS(1), 0); instrumentation_begin(); pr_err_once("Erratum detected, disable fast string copy instructions.\n"); @@ -955,7 +955,7 @@ static __always_inline int mce_no_way_out(struct mce_hw_err *err, char **msg, un int i; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { - m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS)); if (!(m->status & MCI_STATUS_VAL)) continue; @@ -1274,7 +1274,7 @@ static __always_inline void mce_clear_state(unsigned long *toclear) for (i = 0; i < this_cpu_read(mce_num_banks); i++) { if (arch_test_bit(i, toclear)) - mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); } } @@ -1298,7 +1298,7 @@ static noinstr bool mce_check_crashing_cpu(void) (crashing_cpu != -1 && crashing_cpu != cpu)) { u64 mcgstatus; - mcgstatus = __rdmsr(MSR_IA32_MCG_STATUS); + mcgstatus = native_rdmsrq(MSR_IA32_MCG_STATUS); if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) { if (mcgstatus & MCG_STATUS_LMCES) @@ -1306,7 +1306,7 @@ static noinstr bool mce_check_crashing_cpu(void) } if (mcgstatus & MCG_STATUS_RIPV) { - __wrmsr(MSR_IA32_MCG_STATUS, 0, 0); + native_wrmsrq(MSR_IA32_MCG_STATUS, 0); return true; } } @@ -1335,7 +1335,7 @@ __mc_scan_banks(struct mce_hw_err *err, struct pt_regs *regs, m->addr = 0; m->bank = i; - m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS)); if (!(m->status & MCI_STATUS_VAL)) continue; @@ -1693,7 +1693,7 @@ out: instrumentation_end(); clear: - mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); + mce_wrmsrq(MSR_IA32_MCG_STATUS, 0); } EXPORT_SYMBOL_GPL(do_machine_check); @@ -1822,7 +1822,7 @@ static void __mcheck_cpu_cap_init(void) u64 cap; u8 b; - rdmsrl(MSR_IA32_MCG_CAP, cap); + rdmsrq(MSR_IA32_MCG_CAP, cap); b = cap & MCG_BANKCNT_MASK; @@ -1863,7 +1863,7 @@ static void __mcheck_cpu_init_generic(void) cr4_set_bits(X86_CR4_MCE); - rdmsrl(MSR_IA32_MCG_CAP, cap); + rdmsrq(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); } @@ -1878,8 +1878,8 @@ static void __mcheck_cpu_init_clear_banks(void) if (!b->init) continue; - wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); - wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl); + wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); } } @@ -1905,7 +1905,7 @@ static void __mcheck_cpu_check_banks(void) if (!b->init) continue; - rdmsrl(mca_msr_reg(i, MCA_CTL), msrval); + rdmsrq(mca_msr_reg(i, MCA_CTL), msrval); b->init = !!msrval; } } @@ -2436,7 +2436,7 @@ static void mce_disable_error_reporting(void) struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(mca_msr_reg(i, MCA_CTL), 0); + wrmsrq(mca_msr_reg(i, MCA_CTL), 0); } return; } @@ -2786,7 +2786,7 @@ static void mce_reenable_cpu(void) struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); + wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl); } } diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 06e3cf7229ce..d02c4f556cd0 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -24,10 +24,11 @@ #include <linux/pci.h> #include 
<linux/uaccess.h> -#include <asm/amd_nb.h> +#include <asm/amd/nb.h> #include <asm/apic.h> #include <asm/irq_vectors.h> #include <asm/mce.h> +#include <asm/msr.h> #include <asm/nmi.h> #include <asm/smp.h> @@ -475,27 +476,27 @@ static void prepare_msrs(void *info) struct mce m = *(struct mce *)info; u8 b = m.bank; - wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + wrmsrq(MSR_IA32_MCG_STATUS, m.mcgstatus); if (boot_cpu_has(X86_FEATURE_SMCA)) { if (m.inject_flags == DFR_INT_INJ) { - wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status); - wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr); + wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status); + wrmsrq(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr); } else { - wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status); - wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr); + wrmsrq(MSR_AMD64_SMCA_MCx_STATUS(b), m.status); + wrmsrq(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr); } - wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd); + wrmsrq(MSR_AMD64_SMCA_MCx_SYND(b), m.synd); if (m.misc) - wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc); + wrmsrq(MSR_AMD64_SMCA_MCx_MISC(b), m.misc); } else { - wrmsrl(MSR_IA32_MCx_STATUS(b), m.status); - wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr); + wrmsrq(MSR_IA32_MCx_STATUS(b), m.status); + wrmsrq(MSR_IA32_MCx_ADDR(b), m.addr); if (m.misc) - wrmsrl(MSR_IA32_MCx_MISC(b), m.misc); + wrmsrq(MSR_IA32_MCx_MISC(b), m.misc); } } @@ -589,7 +590,7 @@ static int inj_bank_set(void *data, u64 val) u64 cap; /* Get bank count on target CPU so we can handle non-uniform values. */ - rdmsrl_on_cpu(m->extcpu, MSR_IA32_MCG_CAP, &cap); + rdmsrq_on_cpu(m->extcpu, MSR_IA32_MCG_CAP, &cap); n_banks = cap & MCG_BANKCNT_MASK; if (val >= n_banks) { @@ -613,7 +614,7 @@ static int inj_bank_set(void *data, u64 val) if (cpu_feature_enabled(X86_FEATURE_SMCA)) { u64 ipid; - if (rdmsrl_on_cpu(m->extcpu, MSR_AMD64_SMCA_MCx_IPID(val), &ipid)) { + if (rdmsrq_on_cpu(m->extcpu, MSR_AMD64_SMCA_MCx_IPID(val), &ipid)) { pr_err("Error reading IPID on CPU%d\n", m->extcpu); return -EINVAL; } @@ -741,15 +742,15 @@ static void check_hw_inj_possible(void) u64 status = MCI_STATUS_VAL, ipid; /* Check whether bank is populated */ - rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), ipid); + rdmsrq(MSR_AMD64_SMCA_MCx_IPID(bank), ipid); if (!ipid) continue; toggle_hw_mce_inject(cpu, true); - wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), status); - rdmsrl_safe(mca_msr_reg(bank, MCA_STATUS), &status); - wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), 0); + wrmsrq_safe(mca_msr_reg(bank, MCA_STATUS), status); + rdmsrq_safe(mca_msr_reg(bank, MCA_STATUS), &status); + wrmsrq_safe(mca_msr_reg(bank, MCA_STATUS), 0); if (!status) { hw_injection_possible = false; diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index f863df0ff42c..efcf21e9552e 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -94,7 +94,7 @@ static bool cmci_supported(int *banks) if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6) return false; - rdmsrl(MSR_IA32_MCG_CAP, cap); + rdmsrq(MSR_IA32_MCG_CAP, cap); *banks = min_t(unsigned, MAX_NR_BANKS, cap & MCG_BANKCNT_MASK); return !!(cap & MCG_CMCI_P); } @@ -106,7 +106,7 @@ static bool lmce_supported(void) if (mca_cfg.lmce_disabled) return false; - rdmsrl(MSR_IA32_MCG_CAP, tmp); + rdmsrq(MSR_IA32_MCG_CAP, tmp); /* * LMCE depends on recovery support in the processor. Hence both @@ -123,7 +123,7 @@ static bool lmce_supported(void) * WARN if the MSR isn't locked as init_ia32_feat_ctl() unconditionally * locks the MSR in the event that it wasn't already locked by BIOS. 
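Several of the CMCI hunks that follow (cmci_set_threshold(), intel_init_lmce()) are plain read-modify-write updates of a control MSR: read, clear a field or set a bit, write back. A generic sketch of that field update with made-up names and a hypothetical mask value; the real code takes the mask from constants such as MCI_CTL2_CMCI_THRESHOLD_MASK:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Generic read-modify-write of one field inside a 64-bit control register:
 * clear the field, then OR in the new value (truncated to the field). */
static uint64_t set_field(uint64_t reg, uint64_t field_mask, uint64_t val)
{
	return (reg & ~field_mask) | (val & field_mask);
}

int main(void)
{
	uint64_t ctl2 = 0x40000001;	/* hypothetical: enable bit + old threshold */
	uint64_t thresh_mask = 0x7fff;	/* hypothetical field mask for the sketch */

	ctl2 = set_field(ctl2, thresh_mask, 2);	/* new threshold of 2 */
	printf("%#" PRIx64 "\n", ctl2);		/* prints 0x40000002 */
	return 0;
}

The same shape appears wherever the diff reads an MSR, masks a field, and writes the result back.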
*/ - rdmsrl(MSR_IA32_FEAT_CTL, tmp); + rdmsrq(MSR_IA32_FEAT_CTL, tmp); if (WARN_ON_ONCE(!(tmp & FEAT_CTL_LOCKED))) return false; @@ -141,9 +141,9 @@ static void cmci_set_threshold(int bank, int thresh) u64 val; raw_spin_lock_irqsave(&cmci_discover_lock, flags); - rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + rdmsrq(MSR_IA32_MCx_CTL2(bank), val); val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; - wrmsrl(MSR_IA32_MCx_CTL2(bank), val | thresh); + wrmsrq(MSR_IA32_MCx_CTL2(bank), val | thresh); raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } @@ -184,7 +184,7 @@ static bool cmci_skip_bank(int bank, u64 *val) if (test_bit(bank, mce_banks_ce_disabled)) return true; - rdmsrl(MSR_IA32_MCx_CTL2(bank), *val); + rdmsrq(MSR_IA32_MCx_CTL2(bank), *val); /* Already owned by someone else? */ if (*val & MCI_CTL2_CMCI_EN) { @@ -232,8 +232,8 @@ static void cmci_claim_bank(int bank, u64 val, int bios_zero_thresh, int *bios_w struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); val |= MCI_CTL2_CMCI_EN; - wrmsrl(MSR_IA32_MCx_CTL2(bank), val); - rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + wrmsrq(MSR_IA32_MCx_CTL2(bank), val); + rdmsrq(MSR_IA32_MCx_CTL2(bank), val); /* If the enable bit did not stick, this bank should be polled. */ if (!(val & MCI_CTL2_CMCI_EN)) { @@ -324,9 +324,9 @@ static void __cmci_disable_bank(int bank) if (!test_bit(bank, this_cpu_ptr(mce_banks_owned))) return; - rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + rdmsrq(MSR_IA32_MCx_CTL2(bank), val); val &= ~MCI_CTL2_CMCI_EN; - wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + wrmsrq(MSR_IA32_MCx_CTL2(bank), val); __clear_bit(bank, this_cpu_ptr(mce_banks_owned)); if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD) @@ -430,10 +430,10 @@ void intel_init_lmce(void) if (!lmce_supported()) return; - rdmsrl(MSR_IA32_MCG_EXT_CTL, val); + rdmsrq(MSR_IA32_MCG_EXT_CTL, val); if (!(val & MCG_EXT_CTL_LMCE_EN)) - wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN); + wrmsrq(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN); } void intel_clear_lmce(void) @@ -443,9 +443,9 @@ void intel_clear_lmce(void) if (!lmce_supported()) return; - rdmsrl(MSR_IA32_MCG_EXT_CTL, val); + rdmsrq(MSR_IA32_MCG_EXT_CTL, val); val &= ~MCG_EXT_CTL_LMCE_EN; - wrmsrl(MSR_IA32_MCG_EXT_CTL, val); + wrmsrq(MSR_IA32_MCG_EXT_CTL, val); } /* @@ -460,10 +460,10 @@ static void intel_imc_init(struct cpuinfo_x86 *c) case INTEL_SANDYBRIDGE_X: case INTEL_IVYBRIDGE_X: case INTEL_HASWELL_X: - if (rdmsrl_safe(MSR_ERROR_CONTROL, &error_control)) + if (rdmsrq_safe(MSR_ERROR_CONTROL, &error_control)) return; error_control |= 2; - wrmsrl_safe(MSR_ERROR_CONTROL, error_control); + wrmsrq_safe(MSR_ERROR_CONTROL, error_control); break; } } diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 95a504ece43e..b5ba598e54cb 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -312,7 +312,7 @@ static __always_inline void pentium_machine_check(struct pt_regs *regs) {} static __always_inline void winchip_machine_check(struct pt_regs *regs) {} #endif -noinstr u64 mce_rdmsrl(u32 msr); +noinstr u64 mce_rdmsrq(u32 msr); static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg) { diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 96cb992d50ef..097e39327942 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -217,7 +217,6 @@ static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsi { struct patch_digest *pd = NULL; u8 
digest[SHA256_DIGEST_SIZE]; - struct sha256_state s; int i; if (x86_family(bsp_cpuid_1_eax) < 0x17) @@ -235,9 +234,7 @@ static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsi return false; } - sha256_init(&s); - sha256_update(&s, data, len); - sha256_final(&s, digest); + sha256(data, len, digest); if (memcmp(digest, pd->sha256, sizeof(digest))) { pr_err("Patch 0x%x SHA256 digest mismatch!\n", patch_id); @@ -607,7 +604,7 @@ static bool __apply_microcode_amd(struct microcode_amd *mc, u32 *cur_rev, if (!verify_sha256_digest(mc->hdr.patch_id, *cur_rev, (const u8 *)p_addr, psize)) return false; - native_wrmsrl(MSR_AMD64_PATCH_LOADER, p_addr); + native_wrmsrq(MSR_AMD64_PATCH_LOADER, p_addr); if (x86_family(bsp_cpuid_1_eax) == 0x17) { unsigned long p_addr_end = p_addr + psize - 1; @@ -1178,11 +1175,18 @@ static void microcode_fini_cpu_amd(int cpu) uci->mc = NULL; } +static void finalize_late_load_amd(int result) +{ + if (result) + cleanup(); +} + static struct microcode_ops microcode_amd_ops = { .request_microcode_fw = request_microcode_amd, .collect_cpu_info = collect_cpu_info_amd, .apply_microcode = apply_microcode_amd, .microcode_fini_cpu = microcode_fini_cpu_amd, + .finalize_late_load = finalize_late_load_amd, .nmi_safe = true, }; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 079f046ee26d..fe50eb5b7c4a 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -37,6 +37,7 @@ #include <asm/perf_event.h> #include <asm/processor.h> #include <asm/cmdline.h> +#include <asm/msr.h> #include <asm/setup.h> #include "internal.h" @@ -117,7 +118,7 @@ bool __init microcode_loader_disabled(void) * 3) Certain AMD patch levels are not allowed to be * overwritten. 
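The microcode/amd.c hunk above drops the sha256_init()/sha256_update()/sha256_final() sequence in favour of the kernel's one-shot sha256() helper; the digest and the memcmp() comparison are unchanged. A userspace analogue of the same "hash the whole buffer, compare against the expected digest" step, assuming OpenSSL's <openssl/sha.h> (link with -lcrypto); digest_matches() is a made-up name:

#include <openssl/sha.h>
#include <stdio.h>
#include <string.h>

/* Hash a buffer in one call and compare against an expected digest. */
static int digest_matches(const unsigned char *data, size_t len,
			  const unsigned char expected[SHA256_DIGEST_LENGTH])
{
	unsigned char digest[SHA256_DIGEST_LENGTH];

	SHA256(data, len, digest);	/* one-shot: no init/update/final */
	return memcmp(digest, expected, sizeof(digest)) == 0;
}

int main(void)
{
	const unsigned char msg[] = "abc";
	unsigned char expected[SHA256_DIGEST_LENGTH];

	/* Pretend 'expected' came from a table of known-good digests. */
	SHA256(msg, sizeof(msg) - 1, expected);

	printf("match: %d\n", digest_matches(msg, sizeof(msg) - 1, expected));
	return 0;
}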
*/ - if (!have_cpuid_p() || + if (!cpuid_feature() || native_cpuid_ecx(1) & BIT(31) || amd_check_current_patch_level()) dis_ucode_ldr = true; @@ -696,6 +697,8 @@ static int load_late_locked(void) return load_late_stop_cpus(true); case UCODE_NFOUND: return -ENOENT; + case UCODE_OK: + return 0; default: return -EBADFD; } diff --git a/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h new file mode 100644 index 000000000000..cb6e601701ab --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h @@ -0,0 +1,150 @@ +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x03, .steppings = 0x0004, .driver_data = 0x2 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0001, .driver_data = 0x45 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0002, .driver_data = 0x40 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0004, .driver_data = 0x2c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0008, .driver_data = 0x10 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0001, .driver_data = 0xa }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0020, .driver_data = 0x3 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0400, .driver_data = 0xd }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x2000, .driver_data = 0x7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0002, .driver_data = 0x14 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0004, .driver_data = 0x38 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0008, .driver_data = 0x2e }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0002, .driver_data = 0x11 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0008, .driver_data = 0x8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0040, .driver_data = 0xc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0400, .driver_data = 0x5 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x09, .steppings = 0x0020, .driver_data = 0x47 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0a, .steppings = 0x0001, .driver_data = 0x3 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0a, .steppings = 0x0002, .driver_data = 0x1 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0b, .steppings = 0x0002, .driver_data = 0x1d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0b, .steppings = 0x0010, 
.driver_data = 0x2 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0d, .steppings = 0x0040, .driver_data = 0x18 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0e, .steppings = 0x0100, .driver_data = 0x39 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0e, .steppings = 0x1000, .driver_data = 0x59 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0004, .driver_data = 0x5d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0040, .driver_data = 0xd2 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0080, .driver_data = 0x6b }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0400, .driver_data = 0x95 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0800, .driver_data = 0xbc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x2000, .driver_data = 0xa4 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x16, .steppings = 0x0002, .driver_data = 0x44 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0040, .driver_data = 0x60f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0080, .driver_data = 0x70a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0400, .driver_data = 0xa0b }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1a, .steppings = 0x0010, .driver_data = 0x12 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1a, .steppings = 0x0020, .driver_data = 0x1d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1c, .steppings = 0x0004, .driver_data = 0x219 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1c, .steppings = 0x0400, .driver_data = 0x107 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1d, .steppings = 0x0002, .driver_data = 0x29 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1e, .steppings = 0x0020, .driver_data = 0xa }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x25, .steppings = 0x0004, .driver_data = 0x11 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x25, .steppings = 0x0020, .driver_data = 0x7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x26, .steppings = 0x0002, .driver_data = 0x105 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2a, .steppings = 0x0080, .driver_data = 0x2f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2c, .steppings = 0x0004, .driver_data = 0x1f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 
0x2d, .steppings = 0x0040, .driver_data = 0x621 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2d, .steppings = 0x0080, .driver_data = 0x71a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2e, .steppings = 0x0040, .driver_data = 0xd }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2f, .steppings = 0x0004, .driver_data = 0x3b }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x37, .steppings = 0x0100, .driver_data = 0x838 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x37, .steppings = 0x0200, .driver_data = 0x90d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3a, .steppings = 0x0200, .driver_data = 0x21 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3c, .steppings = 0x0008, .driver_data = 0x28 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3d, .steppings = 0x0010, .driver_data = 0x2f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0010, .driver_data = 0x42e }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0040, .driver_data = 0x600 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0080, .driver_data = 0x715 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3f, .steppings = 0x0004, .driver_data = 0x49 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3f, .steppings = 0x0010, .driver_data = 0x1a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x45, .steppings = 0x0002, .driver_data = 0x26 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x46, .steppings = 0x0002, .driver_data = 0x1c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x47, .steppings = 0x0002, .driver_data = 0x22 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4c, .steppings = 0x0008, .driver_data = 0x368 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4c, .steppings = 0x0010, .driver_data = 0x411 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4d, .steppings = 0x0100, .driver_data = 0x12d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4e, .steppings = 0x0008, .driver_data = 0xf0 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0008, .driver_data = 0x1000191 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0010, .driver_data = 0x2007006 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0020, .driver_data = 0x3000010 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0040, .driver_data = 0x4003605 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, 
.vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0080, .driver_data = 0x5003707 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0800, .driver_data = 0x7002904 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0004, .driver_data = 0x1c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0008, .driver_data = 0x700001c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0010, .driver_data = 0xf00001a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0020, .driver_data = 0xe000015 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0004, .driver_data = 0x14 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0200, .driver_data = 0x48 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0400, .driver_data = 0x28 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5e, .steppings = 0x0008, .driver_data = 0xf0 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5f, .steppings = 0x0002, .driver_data = 0x3e }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x66, .steppings = 0x0008, .driver_data = 0x2a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0020, .driver_data = 0xc0002f0 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0040, .driver_data = 0xd0003e7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6c, .steppings = 0x0002, .driver_data = 0x10002b0 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0002, .driver_data = 0x42 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0100, .driver_data = 0x24 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7e, .steppings = 0x0020, .driver_data = 0xc6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8a, .steppings = 0x0002, .driver_data = 0x33 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0002, .driver_data = 0xb8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0004, .driver_data = 0x38 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8d, .steppings = 0x0002, .driver_data = 0x52 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0200, .driver_data = 0xf6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0400, .driver_data = 0xf6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0800, 
.driver_data = 0xf6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x1000, .driver_data = 0xfc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0100, .driver_data = 0x2c000390 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0080, .driver_data = 0x2b000603 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0040, .driver_data = 0x2c000390 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0020, .driver_data = 0x2c000390 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0010, .driver_data = 0x2c000390 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x96, .steppings = 0x0002, .driver_data = 0x1a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0004, .driver_data = 0x37 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0020, .driver_data = 0x37 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0004, .driver_data = 0x37 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0020, .driver_data = 0x37 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0008, .driver_data = 0x435 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0010, .driver_data = 0x435 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9c, .steppings = 0x0001, .driver_data = 0x24000026 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0200, .driver_data = 0xf8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0400, .driver_data = 0xf8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0800, .driver_data = 0xf6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x1000, .driver_data = 0xf8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x2000, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0004, .driver_data = 0xfc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0008, .driver_data = 0xfc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0020, .driver_data = 0xfc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0001, .driver_data = 0xfe }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0002, .driver_data = 0xfc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = 
X86_VENDOR_INTEL, .family = 0x6, .model = 0xa7, .steppings = 0x0002, .driver_data = 0x62 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaa, .steppings = 0x0010, .driver_data = 0x20 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0002, .driver_data = 0x12b }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0004, .driver_data = 0x4123 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0008, .driver_data = 0x4123 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0100, .driver_data = 0x4123 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbe, .steppings = 0x0001, .driver_data = 0x1a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0004, .driver_data = 0x21000283 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0002, .driver_data = 0x21000283 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0080, .driver_data = 0x12 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0400, .driver_data = 0x15 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x01, .steppings = 0x0004, .driver_data = 0x2e }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0010, .driver_data = 0x21 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0020, .driver_data = 0x2c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0040, .driver_data = 0x10 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0080, .driver_data = 0x39 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0200, .driver_data = 0x2f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0004, .driver_data = 0xa }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0008, .driver_data = 0xc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0010, .driver_data = 0x17 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0002, .driver_data = 0x17 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0008, .driver_data = 0x5 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0010, .driver_data = 0x6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0080, .driver_data = 0x3 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0100, .driver_data = 0xe }, +{ .flags = 
X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0200, .driver_data = 0x3 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0400, .driver_data = 0x4 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0004, .driver_data = 0xf }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0010, .driver_data = 0x4 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0020, .driver_data = 0x8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0100, .driver_data = 0x9 }, diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 2a397da43923..371ca6eac00e 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -320,7 +320,7 @@ static enum ucode_state __apply_microcode(struct ucode_cpu_info *uci, } /* write microcode via MSR 0x79 */ - native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); + native_wrmsrq(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); rev = intel_get_microcode_revision(); if (rev != mc->hdr.rev) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 3e2533954675..c78f860419d6 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -30,6 +30,7 @@ #include <asm/reboot.h> #include <asm/nmi.h> #include <clocksource/hyperv_timer.h> +#include <asm/msr.h> #include <asm/numa.h> #include <asm/svm.h> @@ -70,7 +71,7 @@ u64 hv_get_non_nested_msr(unsigned int reg) if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) hv_ivm_msr_read(reg, &value); else - rdmsrl(reg, value); + rdmsrq(reg, value); return value; } EXPORT_SYMBOL_GPL(hv_get_non_nested_msr); @@ -82,9 +83,9 @@ void hv_set_non_nested_msr(unsigned int reg, u64 value) /* Write proxy bit via wrmsl instruction */ if (hv_is_sint_msr(reg)) - wrmsrl(reg, value | 1 << 20); + wrmsrq(reg, value | 1 << 20); } else { - wrmsrl(reg, value); + wrmsrq(reg, value); } } EXPORT_SYMBOL_GPL(hv_set_non_nested_msr); @@ -345,7 +346,7 @@ static unsigned long hv_get_tsc_khz(void) { unsigned long freq; - rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq); + rdmsrq(HV_X64_MSR_TSC_FREQUENCY, freq); return freq / 1000; } @@ -541,7 +542,7 @@ static void __init ms_hyperv_init_platform(void) */ u64 hv_lapic_frequency; - rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); + rdmsrq(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ); lapic_timer_period = hv_lapic_frequency; pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n", @@ -574,7 +575,7 @@ static void __init ms_hyperv_init_platform(void) * setting of this MSR bit should happen before init_intel() * is called. 
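The entries above build a per-stepping table: each row pins down one (family, model, stepping) combination via a one-hot .steppings bitmask and records, in .driver_data, the minimum microcode revision expected for that stepping. A minimal sketch of how such a table can be consulted for the boot CPU follows; the table symbol name (ucode_min_rev_table) and the wrapper function are assumptions made for this illustration, not part of the patch.

#include <linux/types.h>
#include <asm/cpu_device_id.h>

/* Hypothetical name for a table shaped like the entries above. */
extern const struct x86_cpu_id ucode_min_rev_table[];

static u32 min_rev_for_boot_cpu(void)
{
	const struct x86_cpu_id *m;

	/*
	 * x86_match_cpu() walks the table while X86_CPU_ID_FLAG_ENTRY_VALID
	 * is set, compares vendor/family/model, and, for entries with a
	 * non-zero .steppings mask, also requires BIT(x86_stepping) to be
	 * set in that mask - which is why each row above sets exactly one
	 * steppings bit.
	 */
	m = x86_match_cpu(ucode_min_rev_table);

	return m ? (u32)m->driver_data : 0;
}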
*/ - wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_EXPOSE_INVARIANT_TSC); + wrmsrq(HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_EXPOSE_INVARIANT_TSC); setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); } diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index e2c6b471d230..8c18327eb10b 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -593,7 +593,7 @@ static void get_fixed_ranges(mtrr_type *frs) void mtrr_save_fixed_ranges(void *info) { - if (boot_cpu_has(X86_FEATURE_MTRR)) + if (mtrr_state.have_fixed) get_fixed_ranges(mtrr_state.fixed_ranges); } diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile index 0c13b0befd8a..d8a04b195da2 100644 --- a/arch/x86/kernel/cpu/resctrl/Makefile +++ b/arch/x86/kernel/cpu/resctrl/Makefile @@ -2,4 +2,6 @@ obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o + +# To allow define_trace.h's recursive include: CFLAGS_pseudo_lock.o = -I$(src) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index cf29681d01e0..7109cbfcad4f 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -22,6 +22,7 @@ #include <linux/cpuhotplug.h> #include <asm/cpu_device_id.h> +#include <asm/msr.h> #include <asm/resctrl.h> #include "internal.h" @@ -60,7 +61,6 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { [RDT_RESOURCE_L3] = { .r_resctrl = { - .rid = RDT_RESOURCE_L3, .name = "L3", .ctrl_scope = RESCTRL_L3_CACHE, .mon_scope = RESCTRL_L3_CACHE, @@ -74,7 +74,6 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { [RDT_RESOURCE_L2] = { .r_resctrl = { - .rid = RDT_RESOURCE_L2, .name = "L2", .ctrl_scope = RESCTRL_L2_CACHE, .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L2), @@ -86,7 +85,6 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { [RDT_RESOURCE_MBA] = { .r_resctrl = { - .rid = RDT_RESOURCE_MBA, .name = "MB", .ctrl_scope = RESCTRL_L3_CACHE, .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_MBA), @@ -96,7 +94,6 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { [RDT_RESOURCE_SMBA] = { .r_resctrl = { - .rid = RDT_RESOURCE_SMBA, .name = "SMBA", .ctrl_scope = RESCTRL_L3_CACHE, .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA), @@ -145,10 +142,10 @@ static inline void cache_alloc_hsw_probe(void) struct rdt_resource *r = &hw_res->r_resctrl; u64 max_cbm = BIT_ULL_MASK(20) - 1, l3_cbm_0; - if (wrmsrl_safe(MSR_IA32_L3_CBM_BASE, max_cbm)) + if (wrmsrq_safe(MSR_IA32_L3_CBM_BASE, max_cbm)) return; - rdmsrl(MSR_IA32_L3_CBM_BASE, l3_cbm_0); + rdmsrq(MSR_IA32_L3_CBM_BASE, l3_cbm_0); /* If all the bits were set in MSR, return success */ if (l3_cbm_0 != max_cbm) @@ -164,21 +161,6 @@ static inline void cache_alloc_hsw_probe(void) rdt_alloc_capable = true; } -bool is_mba_sc(struct rdt_resource *r) -{ - if (!r) - r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); - - /* - * The software controller support is only applicable to MBA resource. - * Make sure to check for resource type. - */ - if (r->rid != RDT_RESOURCE_MBA) - return false; - - return r->membw.mba_sc; -} - /* * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values * exposed to user interface and the h/w understandable delay values. 
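The change running through the hunks above (and many of those below) is a rename of the 64-bit ("quad word") MSR accessors from rdmsrl()/wrmsrl() to rdmsrq()/wrmsrq(). As a reminder of what these helpers boil down to, here is an illustrative sketch of a raw 64-bit MSR read and write; it is not the kernel's actual implementation, which layers tracing, paravirt hooks and exception handling on top of the instructions.

#include <linux/types.h>

static inline void wrmsrq_sketch(u32 msr, u64 val)
{
	/* WRMSR takes the MSR index in ECX and the value in EDX:EAX. */
	asm volatile("wrmsr"
		     : /* no outputs */
		     : "c" (msr), "a" ((u32)val), "d" ((u32)(val >> 32))
		     : "memory");
}

static inline u64 rdmsrq_sketch(u32 msr)
{
	u32 lo, hi;

	/* RDMSR returns the value in EDX:EAX for the index in ECX. */
	asm volatile("rdmsr"
		     : "=a" (lo), "=d" (hi)
		     : "c" (msr));

	return ((u64)hi << 32) | lo;
}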
@@ -309,7 +291,7 @@ static void mba_wrmsr_amd(struct msr_param *m) unsigned int i; for (i = m->low; i < m->high; i++) - wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]); + wrmsrq(hw_res->msr_base + i, hw_dom->ctrl_val[i]); } /* @@ -334,7 +316,7 @@ static void mba_wrmsr_intel(struct msr_param *m) /* Write the delay values for mba. */ for (i = m->low; i < m->high; i++) - wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], m->res)); + wrmsrq(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], m->res)); } static void cat_wrmsr(struct msr_param *m) @@ -344,7 +326,7 @@ static void cat_wrmsr(struct msr_param *m) unsigned int i; for (i = m->low; i < m->high; i++) - wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]); + wrmsrq(hw_res->msr_base + i, hw_dom->ctrl_val[i]); } u32 resctrl_arch_get_num_closid(struct rdt_resource *r) @@ -737,7 +719,7 @@ struct rdt_options { bool force_off, force_on; }; -static struct rdt_options rdt_options[] __initdata = { +static struct rdt_options rdt_options[] __ro_after_init = { RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC), RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL), RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL), @@ -777,7 +759,7 @@ static int __init set_rdt_options(char *str) } __setup("rdt", set_rdt_options); -bool __init rdt_cpu_has(int flag) +bool rdt_cpu_has(int flag) { bool ret = boot_cpu_has(flag); struct rdt_options *o; @@ -797,7 +779,7 @@ bool __init rdt_cpu_has(int flag) return ret; } -__init bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt) +bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt) { if (!rdt_cpu_has(X86_FEATURE_BMEC)) return false; @@ -1011,7 +993,11 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c) static int __init resctrl_arch_late_init(void) { struct rdt_resource *r; - int state, ret; + int state, ret, i; + + /* for_each_rdt_resource() requires all rid to be initialised. */ + for (i = 0; i < RDT_NUM_RESOURCES; i++) + rdt_resources_all[i].r_resctrl.rid = i; /* * Initialize functions(or definitions) that are different diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 0a0ac5f6112e..1189c0df4ad7 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -16,277 +16,9 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/cpu.h> -#include <linux/kernfs.h> -#include <linux/seq_file.h> -#include <linux/slab.h> -#include <linux/tick.h> #include "internal.h" -struct rdt_parse_data { - struct rdtgroup *rdtgrp; - char *buf; -}; - -typedef int (ctrlval_parser_t)(struct rdt_parse_data *data, - struct resctrl_schema *s, - struct rdt_ctrl_domain *d); - -/* - * Check whether MBA bandwidth percentage value is correct. The value is - * checked against the minimum and max bandwidth values specified by the - * hardware. The allocated bandwidth percentage is rounded to the next - * control step available on the hardware. - */ -static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) -{ - int ret; - u32 bw; - - /* - * Only linear delay values is supported for current Intel SKUs. - */ - if (!r->membw.delay_linear && r->membw.arch_needs_linear) { - rdt_last_cmd_puts("No support for non-linear MB domains\n"); - return false; - } - - ret = kstrtou32(buf, 10, &bw); - if (ret) { - rdt_last_cmd_printf("Invalid MB value %s\n", buf); - return false; - } - - /* Nothing else to do if software controller is enabled. 
*/ - if (is_mba_sc(r)) { - *data = bw; - return true; - } - - if (bw < r->membw.min_bw || bw > r->membw.max_bw) { - rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n", - bw, r->membw.min_bw, r->membw.max_bw); - return false; - } - - *data = roundup(bw, (unsigned long)r->membw.bw_gran); - return true; -} - -static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_ctrl_domain *d) -{ - struct resctrl_staged_config *cfg; - u32 closid = data->rdtgrp->closid; - struct rdt_resource *r = s->res; - u32 bw_val; - - cfg = &d->staged_config[s->conf_type]; - if (cfg->have_new_ctrl) { - rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); - return -EINVAL; - } - - if (!bw_validate(data->buf, &bw_val, r)) - return -EINVAL; - - if (is_mba_sc(r)) { - d->mbps_val[closid] = bw_val; - return 0; - } - - cfg->new_ctrl = bw_val; - cfg->have_new_ctrl = true; - - return 0; -} - -/* - * Check whether a cache bit mask is valid. - * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID: - * - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1 - * - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1 - * - * Haswell does not support a non-contiguous 1s value and additionally - * requires at least two bits set. - * AMD allows non-contiguous bitmasks. - */ -static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) -{ - u32 supported_bits = BIT_MASK(r->cache.cbm_len) - 1; - unsigned int cbm_len = r->cache.cbm_len; - unsigned long first_bit, zero_bit, val; - int ret; - - ret = kstrtoul(buf, 16, &val); - if (ret) { - rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf); - return false; - } - - if ((r->cache.min_cbm_bits > 0 && val == 0) || val > supported_bits) { - rdt_last_cmd_puts("Mask out of range\n"); - return false; - } - - first_bit = find_first_bit(&val, cbm_len); - zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - - /* Are non-contiguous bitmasks allowed? */ - if (!r->cache.arch_has_sparse_bitmasks && - (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) { - rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val); - return false; - } - - if ((zero_bit - first_bit) < r->cache.min_cbm_bits) { - rdt_last_cmd_printf("Need at least %d bits in the mask\n", - r->cache.min_cbm_bits); - return false; - } - - *data = val; - return true; -} - -/* - * Read one cache bit mask (hex). Check that it is valid for the current - * resource type. - */ -static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_ctrl_domain *d) -{ - struct rdtgroup *rdtgrp = data->rdtgrp; - struct resctrl_staged_config *cfg; - struct rdt_resource *r = s->res; - u32 cbm_val; - - cfg = &d->staged_config[s->conf_type]; - if (cfg->have_new_ctrl) { - rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); - return -EINVAL; - } - - /* - * Cannot set up more than one pseudo-locked region in a cache - * hierarchy. - */ - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && - rdtgroup_pseudo_locked_in_hierarchy(d)) { - rdt_last_cmd_puts("Pseudo-locked region in hierarchy\n"); - return -EINVAL; - } - - if (!cbm_validate(data->buf, &cbm_val, r)) - return -EINVAL; - - if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE || - rdtgrp->mode == RDT_MODE_SHAREABLE) && - rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) { - rdt_last_cmd_puts("CBM overlaps with pseudo-locked region\n"); - return -EINVAL; - } - - /* - * The CBM may not overlap with the CBM of another closid if - * either is exclusive. 
- */ - if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) { - rdt_last_cmd_puts("Overlaps with exclusive group\n"); - return -EINVAL; - } - - if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) { - if (rdtgrp->mode == RDT_MODE_EXCLUSIVE || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - rdt_last_cmd_puts("Overlaps with other group\n"); - return -EINVAL; - } - } - - cfg->new_ctrl = cbm_val; - cfg->have_new_ctrl = true; - - return 0; -} - -/* - * For each domain in this resource we expect to find a series of: - * id=mask - * separated by ";". The "id" is in decimal, and must match one of - * the "id"s for this resource. - */ -static int parse_line(char *line, struct resctrl_schema *s, - struct rdtgroup *rdtgrp) -{ - enum resctrl_conf_type t = s->conf_type; - ctrlval_parser_t *parse_ctrlval = NULL; - struct resctrl_staged_config *cfg; - struct rdt_resource *r = s->res; - struct rdt_parse_data data; - struct rdt_ctrl_domain *d; - char *dom = NULL, *id; - unsigned long dom_id; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - switch (r->schema_fmt) { - case RESCTRL_SCHEMA_BITMAP: - parse_ctrlval = &parse_cbm; - break; - case RESCTRL_SCHEMA_RANGE: - parse_ctrlval = &parse_bw; - break; - } - - if (WARN_ON_ONCE(!parse_ctrlval)) - return -EINVAL; - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && - (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) { - rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); - return -EINVAL; - } - -next: - if (!line || line[0] == '\0') - return 0; - dom = strsep(&line, ";"); - id = strsep(&dom, "="); - if (!dom || kstrtoul(id, 10, &dom_id)) { - rdt_last_cmd_puts("Missing '=' or non-numeric domain\n"); - return -EINVAL; - } - dom = strim(dom); - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - if (d->hdr.id == dom_id) { - data.buf = dom; - data.rdtgrp = rdtgrp; - if (parse_ctrlval(&data, s, d)) - return -EINVAL; - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - cfg = &d->staged_config[t]; - /* - * In pseudo-locking setup mode and just - * parsed a valid CBM that should be - * pseudo-locked. Only one locked region per - * resource group and domain so just do - * the required initialization for single - * region and return. 
- */ - rdtgrp->plr->s = s; - rdtgrp->plr->d = d; - rdtgrp->plr->cbm = cfg->new_ctrl; - d->plr = rdtgrp->plr; - return 0; - } - goto next; - } - } - return -EINVAL; -} - int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type t, u32 cfg_val) { @@ -351,100 +83,6 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) return 0; } -static int rdtgroup_parse_resource(char *resname, char *tok, - struct rdtgroup *rdtgrp) -{ - struct resctrl_schema *s; - - list_for_each_entry(s, &resctrl_schema_all, list) { - if (!strcmp(resname, s->name) && rdtgrp->closid < s->num_closid) - return parse_line(tok, s, rdtgrp); - } - rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname); - return -EINVAL; -} - -ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct resctrl_schema *s; - struct rdtgroup *rdtgrp; - struct rdt_resource *r; - char *tok, *resname; - int ret = 0; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - buf[nbytes - 1] = '\0'; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - rdt_last_cmd_clear(); - - /* - * No changes to pseudo-locked region allowed. It has to be removed - * and re-created instead. - */ - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - ret = -EINVAL; - rdt_last_cmd_puts("Resource group is pseudo-locked\n"); - goto out; - } - - rdt_staged_configs_clear(); - - while ((tok = strsep(&buf, "\n")) != NULL) { - resname = strim(strsep(&tok, ":")); - if (!tok) { - rdt_last_cmd_puts("Missing ':'\n"); - ret = -EINVAL; - goto out; - } - if (tok[0] == '\0') { - rdt_last_cmd_printf("Missing '%s' value\n", resname); - ret = -EINVAL; - goto out; - } - ret = rdtgroup_parse_resource(resname, tok, rdtgrp); - if (ret) - goto out; - } - - list_for_each_entry(s, &resctrl_schema_all, list) { - r = s->res; - - /* - * Writes to mba_sc resources update the software controller, - * not the control MSR. - */ - if (is_mba_sc(r)) - continue; - - ret = resctrl_arch_update_domains(r, rdtgrp->closid); - if (ret) - goto out; - } - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - /* - * If pseudo-locking fails we keep the resource group in - * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service - * active and updated for just the domain the pseudo-locked - * region was requested for. 
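The write path removed above parses schemata input of the form "RESOURCE:id=val;id=val" per line, with the resource name before ':' and decimal domain ids, exactly as the removed comments describe (the code itself is presumably relocated out of the arch directory by this series). A standalone userspace-style sketch of that tokenization is shown below; it mirrors the strsep()-based approach of the removed code but is not the kernel implementation.

#define _DEFAULT_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static void parse_schemata_line(char *line)
{
	char *resname, *dom, *id;

	resname = strsep(&line, ":");		/* e.g. "L3" or "MB" */
	if (!line)
		return;

	/* Each domain is "id=value", ';' separated, id in decimal. */
	while ((dom = strsep(&line, ";")) != NULL) {
		id = strsep(&dom, "=");
		if (!dom)
			continue;
		printf("%s: domain %lu -> value %s\n",
		       resname, strtoul(id, NULL, 10), dom);
	}
}

int main(void)
{
	char buf[] = "L3:0=7ff;1=3f";

	parse_schemata_line(buf);
	return 0;
}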
- */ - ret = rdtgroup_pseudo_lock_create(rdtgrp); - } - -out: - rdt_staged_configs_clear(); - rdtgroup_kn_unlock(of->kn); - return ret ?: nbytes; -} - u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type type) { @@ -453,276 +91,3 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, return hw_dom->ctrl_val[idx]; } - -static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) -{ - struct rdt_resource *r = schema->res; - struct rdt_ctrl_domain *dom; - bool sep = false; - u32 ctrl_val; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - seq_printf(s, "%*s:", max_name_width, schema->name); - list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { - if (sep) - seq_puts(s, ";"); - - if (is_mba_sc(r)) - ctrl_val = dom->mbps_val[closid]; - else - ctrl_val = resctrl_arch_get_config(r, dom, closid, - schema->conf_type); - - seq_printf(s, schema->fmt_str, dom->hdr.id, ctrl_val); - sep = true; - } - seq_puts(s, "\n"); -} - -int rdtgroup_schemata_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct resctrl_schema *schema; - struct rdtgroup *rdtgrp; - int ret = 0; - u32 closid; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - list_for_each_entry(schema, &resctrl_schema_all, list) { - seq_printf(s, "%s:uninitialized\n", schema->name); - } - } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - if (!rdtgrp->plr->d) { - rdt_last_cmd_clear(); - rdt_last_cmd_puts("Cache domain offline\n"); - ret = -ENODEV; - } else { - seq_printf(s, "%s:%d=%x\n", - rdtgrp->plr->s->res->name, - rdtgrp->plr->d->hdr.id, - rdtgrp->plr->cbm); - } - } else { - closid = rdtgrp->closid; - list_for_each_entry(schema, &resctrl_schema_all, list) { - if (closid < schema->num_closid) - show_doms(s, schema, closid); - } - } - } else { - ret = -ENOENT; - } - rdtgroup_kn_unlock(of->kn); - return ret; -} - -static int smp_mon_event_count(void *arg) -{ - mon_event_count(arg); - - return 0; -} - -ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct rdtgroup *rdtgrp; - int ret = 0; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - buf[nbytes - 1] = '\0'; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - rdt_last_cmd_clear(); - - if (!strcmp(buf, "mbm_local_bytes")) { - if (resctrl_arch_is_mbm_local_enabled()) - rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID; - else - ret = -EINVAL; - } else if (!strcmp(buf, "mbm_total_bytes")) { - if (resctrl_arch_is_mbm_total_enabled()) - rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID; - else - ret = -EINVAL; - } else { - ret = -EINVAL; - } - - if (ret) - rdt_last_cmd_printf("Unsupported event id '%s'\n", buf); - - rdtgroup_kn_unlock(of->kn); - - return ret ?: nbytes; -} - -int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - - if (rdtgrp) { - switch (rdtgrp->mba_mbps_event) { - case QOS_L3_MBM_LOCAL_EVENT_ID: - seq_puts(s, "mbm_local_bytes\n"); - break; - case QOS_L3_MBM_TOTAL_EVENT_ID: - seq_puts(s, "mbm_total_bytes\n"); - break; - default: - pr_warn_once("Bad event %d\n", rdtgrp->mba_mbps_event); - ret = -EINVAL; - break; - } - } else 
{ - ret = -ENOENT; - } - - rdtgroup_kn_unlock(of->kn); - - return ret; -} - -struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id, - struct list_head **pos) -{ - struct rdt_domain_hdr *d; - struct list_head *l; - - list_for_each(l, h) { - d = list_entry(l, struct rdt_domain_hdr, list); - /* When id is found, return its domain. */ - if (id == d->id) - return d; - /* Stop searching when finding id's position in sorted list. */ - if (id < d->id) - break; - } - - if (pos) - *pos = l; - - return NULL; -} - -void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, - struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, - cpumask_t *cpumask, int evtid, int first) -{ - int cpu; - - /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - /* - * Setup the parameters to pass to mon_event_count() to read the data. - */ - rr->rgrp = rdtgrp; - rr->evtid = evtid; - rr->r = r; - rr->d = d; - rr->first = first; - rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); - if (IS_ERR(rr->arch_mon_ctx)) { - rr->err = -EINVAL; - return; - } - - cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU); - - /* - * cpumask_any_housekeeping() prefers housekeeping CPUs, but - * are all the CPUs nohz_full? If yes, pick a CPU to IPI. - * MPAM's resctrl_arch_rmid_read() is unable to read the - * counters on some platforms if its called in IRQ context. - */ - if (tick_nohz_full_cpu(cpu)) - smp_call_function_any(cpumask, mon_event_count, rr, 1); - else - smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); - - resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); -} - -int rdtgroup_mondata_show(struct seq_file *m, void *arg) -{ - struct kernfs_open_file *of = m->private; - struct rdt_domain_hdr *hdr; - struct rmid_read rr = {0}; - struct rdt_mon_domain *d; - u32 resid, evtid, domid; - struct rdtgroup *rdtgrp; - struct rdt_resource *r; - union mon_data_bits md; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - ret = -ENOENT; - goto out; - } - - md.priv = of->kn->priv; - resid = md.u.rid; - domid = md.u.domid; - evtid = md.u.evtid; - r = resctrl_arch_get_resource(resid); - - if (md.u.sum) { - /* - * This file requires summing across all domains that share - * the L3 cache id that was provided in the "domid" field of the - * mon_data_bits union. Search all domains in the resource for - * one that matches this cache id. - */ - list_for_each_entry(d, &r->mon_domains, hdr.list) { - if (d->ci->id == domid) { - rr.ci = d->ci; - mon_event_read(&rr, r, NULL, rdtgrp, - &d->ci->shared_cpu_map, evtid, false); - goto checkresult; - } - } - ret = -ENOENT; - goto out; - } else { - /* - * This file provides data from a single domain. Search - * the resource to find the domain with "domid". 
- */ - hdr = resctrl_find_domain(&r->mon_domains, domid, NULL); - if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) { - ret = -ENOENT; - goto out; - } - d = container_of(hdr, struct rdt_mon_domain, hdr); - mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false); - } - -checkresult: - - if (rr.err == -EIO) - seq_puts(m, "Error\n"); - else if (rr.err == -EINVAL) - seq_puts(m, "Unavailable\n"); - else - seq_printf(m, "%llu\n", rr.val); - -out: - rdtgroup_kn_unlock(of->kn); - return ret; -} diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index eaae99602b61..5e3c41b36437 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -3,28 +3,21 @@ #define _ASM_X86_RESCTRL_INTERNAL_H #include <linux/resctrl.h> -#include <linux/sched.h> -#include <linux/kernfs.h> -#include <linux/fs_context.h> -#include <linux/jump_label.h> -#include <linux/tick.h> - -#include <asm/resctrl.h> #define L3_QOS_CDP_ENABLE 0x01ULL #define L2_QOS_CDP_ENABLE 0x01ULL -#define CQM_LIMBOCHECK_INTERVAL 1000 - #define MBM_CNTR_WIDTH_BASE 24 -#define MBM_OVERFLOW_INTERVAL 1000 -#define MAX_MBA_BW 100u + #define MBA_IS_LINEAR 0x4 + #define MBM_CNTR_WIDTH_OFFSET_AMD 20 #define RMID_VAL_ERROR BIT_ULL(63) + #define RMID_VAL_UNAVAIL BIT_ULL(62) + /* * With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for * data to be returned. The counter width is discovered from the hardware @@ -33,278 +26,6 @@ #define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE) /** - * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that - * aren't marked nohz_full - * @mask: The mask to pick a CPU from. - * @exclude_cpu:The CPU to avoid picking. - * - * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping - * CPUs that don't use nohz_full, these are preferred. Pass - * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs. - * - * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available. - */ -static inline unsigned int -cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu) -{ - unsigned int cpu, hk_cpu; - - if (exclude_cpu == RESCTRL_PICK_ANY_CPU) - cpu = cpumask_any(mask); - else - cpu = cpumask_any_but(mask, exclude_cpu); - - /* Only continue if tick_nohz_full_mask has been initialized. */ - if (!tick_nohz_full_enabled()) - return cpu; - - /* If the CPU picked isn't marked nohz_full nothing more needs doing. 
*/ - if (cpu < nr_cpu_ids && !tick_nohz_full_cpu(cpu)) - return cpu; - - /* Try to find a CPU that isn't nohz_full to use in preference */ - hk_cpu = cpumask_nth_andnot(0, mask, tick_nohz_full_mask); - if (hk_cpu == exclude_cpu) - hk_cpu = cpumask_nth_andnot(1, mask, tick_nohz_full_mask); - - if (hk_cpu < nr_cpu_ids) - cpu = hk_cpu; - - return cpu; -} - -struct rdt_fs_context { - struct kernfs_fs_context kfc; - bool enable_cdpl2; - bool enable_cdpl3; - bool enable_mba_mbps; - bool enable_debug; -}; - -static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) -{ - struct kernfs_fs_context *kfc = fc->fs_private; - - return container_of(kfc, struct rdt_fs_context, kfc); -} - -/** - * struct mon_evt - Entry in the event list of a resource - * @evtid: event id - * @name: name of the event - * @configurable: true if the event is configurable - * @list: entry in &rdt_resource->evt_list - */ -struct mon_evt { - enum resctrl_event_id evtid; - char *name; - bool configurable; - struct list_head list; -}; - -/** - * union mon_data_bits - Monitoring details for each event file. - * @priv: Used to store monitoring event data in @u - * as kernfs private data. - * @u.rid: Resource id associated with the event file. - * @u.evtid: Event id associated with the event file. - * @u.sum: Set when event must be summed across multiple - * domains. - * @u.domid: When @u.sum is zero this is the domain to which - * the event file belongs. When @sum is one this - * is the id of the L3 cache that all domains to be - * summed share. - * @u: Name of the bit fields struct. - */ -union mon_data_bits { - void *priv; - struct { - unsigned int rid : 10; - enum resctrl_event_id evtid : 7; - unsigned int sum : 1; - unsigned int domid : 14; - } u; -}; - -/** - * struct rmid_read - Data passed across smp_call*() to read event count. - * @rgrp: Resource group for which the counter is being read. If it is a parent - * resource group then its event count is summed with the count from all - * its child resource groups. - * @r: Resource describing the properties of the event being read. - * @d: Domain that the counter should be read from. If NULL then sum all - * domains in @r sharing L3 @ci.id - * @evtid: Which monitor event to read. - * @first: Initialize MBM counter when true. - * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. - * @err: Error encountered when reading counter. - * @val: Returned value of event counter. If @rgrp is a parent resource group, - * @val includes the sum of event counts from its child resource groups. - * If @d is NULL, @val includes the sum of all domains in @r sharing @ci.id, - * (summed across child resource groups if @rgrp is a parent resource group). - * @arch_mon_ctx: Hardware monitor allocated for this read request (MPAM only). 
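union mon_data_bits above packs four small identifiers (resource id, event id, a "sum across domains" flag and a domain/cache id) into a single pointer-sized value so it can be stored directly as kernfs private data for each mon_data file, with no extra allocation. The sketch below shows the encode/decode idea using the same field layout; the helper names and the standalone union are made up for illustration.

union mon_data_sketch {
	void *priv;
	struct {
		unsigned int rid	: 10;
		unsigned int evtid	: 7;
		unsigned int sum	: 1;
		unsigned int domid	: 14;
	} u;
};

static void *mon_data_pack(unsigned int rid, unsigned int evtid,
			   unsigned int sum, unsigned int domid)
{
	union mon_data_sketch md = { .priv = NULL };

	md.u.rid   = rid;
	md.u.evtid = evtid;
	md.u.sum   = sum;
	md.u.domid = domid;

	/* All 32 bits of packed state fit in the pointer-sized member. */
	return md.priv;
}

static void mon_data_unpack(void *priv, unsigned int *rid, unsigned int *domid)
{
	union mon_data_sketch md = { .priv = priv };

	*rid   = md.u.rid;
	*domid = md.u.domid;
}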
- */ -struct rmid_read { - struct rdtgroup *rgrp; - struct rdt_resource *r; - struct rdt_mon_domain *d; - enum resctrl_event_id evtid; - bool first; - struct cacheinfo *ci; - int err; - u64 val; - void *arch_mon_ctx; -}; - -extern struct list_head resctrl_schema_all; -extern bool resctrl_mounted; - -enum rdt_group_type { - RDTCTRL_GROUP = 0, - RDTMON_GROUP, - RDT_NUM_GROUP, -}; - -/** - * enum rdtgrp_mode - Mode of a RDT resource group - * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations - * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed - * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking - * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations - * allowed AND the allocations are Cache Pseudo-Locked - * @RDT_NUM_MODES: Total number of modes - * - * The mode of a resource group enables control over the allowed overlap - * between allocations associated with different resource groups (classes - * of service). User is able to modify the mode of a resource group by - * writing to the "mode" resctrl file associated with the resource group. - * - * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by - * writing the appropriate text to the "mode" file. A resource group enters - * "pseudo-locked" mode after the schemata is written while the resource - * group is in "pseudo-locksetup" mode. - */ -enum rdtgrp_mode { - RDT_MODE_SHAREABLE = 0, - RDT_MODE_EXCLUSIVE, - RDT_MODE_PSEUDO_LOCKSETUP, - RDT_MODE_PSEUDO_LOCKED, - - /* Must be last */ - RDT_NUM_MODES, -}; - -/** - * struct mongroup - store mon group's data in resctrl fs. - * @mon_data_kn: kernfs node for the mon_data directory - * @parent: parent rdtgrp - * @crdtgrp_list: child rdtgroup node list - * @rmid: rmid for this rdtgroup - */ -struct mongroup { - struct kernfs_node *mon_data_kn; - struct rdtgroup *parent; - struct list_head crdtgrp_list; - u32 rmid; -}; - -/** - * struct rdtgroup - store rdtgroup's data in resctrl file system. - * @kn: kernfs node - * @rdtgroup_list: linked list for all rdtgroups - * @closid: closid for this rdtgroup - * @cpu_mask: CPUs assigned to this rdtgroup - * @flags: status bits - * @waitcount: how many cpus expect to find this - * group when they acquire rdtgroup_mutex - * @type: indicates type of this rdtgroup - either - * monitor only or ctrl_mon group - * @mon: mongroup related data - * @mode: mode of resource group - * @mba_mbps_event: input monitoring event id when mba_sc is enabled - * @plr: pseudo-locked region - */ -struct rdtgroup { - struct kernfs_node *kn; - struct list_head rdtgroup_list; - u32 closid; - struct cpumask cpu_mask; - int flags; - atomic_t waitcount; - enum rdt_group_type type; - struct mongroup mon; - enum rdtgrp_mode mode; - enum resctrl_event_id mba_mbps_event; - struct pseudo_lock_region *plr; -}; - -/* rdtgroup.flags */ -#define RDT_DELETED 1 - -/* rftype.flags */ -#define RFTYPE_FLAGS_CPUS_LIST 1 - -/* - * Define the file type flags for base and info directories. 
- */ -#define RFTYPE_INFO BIT(0) -#define RFTYPE_BASE BIT(1) -#define RFTYPE_CTRL BIT(4) -#define RFTYPE_MON BIT(5) -#define RFTYPE_TOP BIT(6) -#define RFTYPE_RES_CACHE BIT(8) -#define RFTYPE_RES_MB BIT(9) -#define RFTYPE_DEBUG BIT(10) -#define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) -#define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) -#define RFTYPE_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) -#define RFTYPE_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) -#define RFTYPE_MON_BASE (RFTYPE_BASE | RFTYPE_MON) - -/* List of all resource groups */ -extern struct list_head rdt_all_groups; - -extern int max_name_width; - -/** - * struct rftype - describe each file in the resctrl file system - * @name: File name - * @mode: Access mode - * @kf_ops: File operations - * @flags: File specific RFTYPE_FLAGS_* flags - * @fflags: File specific RFTYPE_* flags - * @seq_show: Show content of the file - * @write: Write to the file - */ -struct rftype { - char *name; - umode_t mode; - const struct kernfs_ops *kf_ops; - unsigned long flags; - unsigned long fflags; - - int (*seq_show)(struct kernfs_open_file *of, - struct seq_file *sf, void *v); - /* - * write() is the generic write callback which maps directly to - * kernfs write operation and overrides all other operations. - * Maximum write size is determined by ->max_write_len. - */ - ssize_t (*write)(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -}; - -/** - * struct mbm_state - status for each MBM counter in each domain - * @prev_bw_bytes: Previous bytes value read for bandwidth calculation - * @prev_bw: The most recent bandwidth in MBps - */ -struct mbm_state { - u64 prev_bw_bytes; - u32 prev_bw; -}; - -/** * struct arch_mbm_state - values used to compute resctrl_arch_rmid_read()s * return value. * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) @@ -401,24 +122,7 @@ static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r return container_of(r, struct rdt_hw_resource, r_resctrl); } -extern struct mutex rdtgroup_mutex; - -static inline const char *rdt_kn_name(const struct kernfs_node *kn) -{ - return rcu_dereference_check(kn->name, lockdep_is_held(&rdtgroup_mutex)); -} - extern struct rdt_hw_resource rdt_resources_all[]; -extern struct rdtgroup rdtgroup_default; -extern struct dentry *debugfs_resctrl; -extern enum resctrl_event_id mba_mbps_default_event; - -static inline bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l) -{ - return rdt_resources_all[l].cdp_enabled; -} - -int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d); @@ -455,99 +159,14 @@ union cpuid_0x10_x_edx { unsigned int full; }; -void rdt_last_cmd_clear(void); -void rdt_last_cmd_puts(const char *s); -__printf(1, 2) -void rdt_last_cmd_printf(const char *fmt, ...); - void rdt_ctrl_update(void *arg); -struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); -void rdtgroup_kn_unlock(struct kernfs_node *kn); -int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name); -int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, - umode_t mask); -ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -int rdtgroup_schemata_show(struct kernfs_open_file *of, - struct seq_file *s, void *v); -ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, - struct 
seq_file *s, void *v); -bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, - unsigned long cbm, int closid, bool exclusive); -unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain *d, - unsigned long cbm); -enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); -int rdtgroup_tasks_assigned(struct rdtgroup *r); -int closids_supported(void); -void closid_free(int closid); -int alloc_rmid(u32 closid); -void free_rmid(u32 closid, u32 rmid); -int rdt_get_mon_l3_config(struct rdt_resource *r); -void resctrl_mon_resource_exit(void); -bool __init rdt_cpu_has(int flag); -void mon_event_count(void *info); -int rdtgroup_mondata_show(struct seq_file *m, void *arg); -void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, - struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, - cpumask_t *cpumask, int evtid, int first); -int __init resctrl_mon_resource_init(void); -void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, - unsigned long delay_ms, - int exclude_cpu); -void mbm_handle_overflow(struct work_struct *work); -void __init intel_rdt_mbm_apply_quirk(void); -bool is_mba_sc(struct rdt_resource *r); -void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, - int exclude_cpu); -void cqm_handle_limbo(struct work_struct *work); -bool has_busy_rmid(struct rdt_mon_domain *d); -void __check_limbo(struct rdt_mon_domain *d, bool force_free); -void rdt_domain_reconfigure_cdp(struct rdt_resource *r); -void resctrl_file_fflags_init(const char *config, unsigned long fflags); -void rdt_staged_configs_clear(void); -bool closid_allocated(unsigned int closid); -int resctrl_find_cleanest_closid(void); - -#ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK -int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); -int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp); -bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm); -bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d); -int rdt_pseudo_lock_init(void); -void rdt_pseudo_lock_release(void); -int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); -void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); -#else -static inline int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) -{ - return -EOPNOTSUPP; -} -static inline int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) -{ - return -EOPNOTSUPP; -} - -static inline bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm) -{ - return false; -} +int rdt_get_mon_l3_config(struct rdt_resource *r); -static inline bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d) -{ - return false; -} +bool rdt_cpu_has(int flag); -static inline int rdt_pseudo_lock_init(void) { return 0; } -static inline void rdt_pseudo_lock_release(void) { } -static inline int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) -{ - return -EOPNOTSUPP; -} +void __init intel_rdt_mbm_apply_quirk(void); -static inline void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) { } -#endif /* CONFIG_RESCTRL_FS_PSEUDO_LOCK */ +void rdt_domain_reconfigure_cdp(struct rdt_resource *r); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index a93ed7d2a160..c261558276cd 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -18,62 +18,12 @@ #define pr_fmt(fmt) "resctrl: " fmt #include <linux/cpu.h> -#include <linux/module.h> -#include <linux/sizes.h> -#include 
<linux/slab.h> +#include <linux/resctrl.h> #include <asm/cpu_device_id.h> -#include <asm/resctrl.h> +#include <asm/msr.h> #include "internal.h" -#include "trace.h" - -/** - * struct rmid_entry - dirty tracking for all RMID. - * @closid: The CLOSID for this entry. - * @rmid: The RMID for this entry. - * @busy: The number of domains with cached data using this RMID. - * @list: Member of the rmid_free_lru list when busy == 0. - * - * Depending on the architecture the correct monitor is accessed using - * both @closid and @rmid, or @rmid only. - * - * Take the rdtgroup_mutex when accessing. - */ -struct rmid_entry { - u32 closid; - u32 rmid; - int busy; - struct list_head list; -}; - -/* - * @rmid_free_lru - A least recently used list of free RMIDs - * These RMIDs are guaranteed to have an occupancy less than the - * threshold occupancy - */ -static LIST_HEAD(rmid_free_lru); - -/* - * @closid_num_dirty_rmid The number of dirty RMID each CLOSID has. - * Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined. - * Indexed by CLOSID. Protected by rdtgroup_mutex. - */ -static u32 *closid_num_dirty_rmid; - -/* - * @rmid_limbo_count - count of currently unused but (potentially) - * dirty RMIDs. - * This counts RMIDs that no one is currently using but that - * may have a occupancy value > resctrl_rmid_realloc_threshold. User can - * change the threshold occupancy value. - */ -static unsigned int rmid_limbo_count; - -/* - * @rmid_entry - The entry in the limbo and free lists. - */ -static struct rmid_entry *rmid_ptrs; /* * Global boolean for rdt_monitor which is true if any @@ -86,23 +36,12 @@ bool rdt_mon_capable; */ unsigned int rdt_mon_features; -/* - * This is the threshold cache occupancy in bytes at which we will consider an - * RMID available for re-allocation. - */ -unsigned int resctrl_rmid_realloc_threshold; - -/* - * This is the maximum value for the reallocation threshold, in bytes. - */ -unsigned int resctrl_rmid_realloc_limit; - #define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) static int snc_nodes_per_l3_cache = 1; /* - * The correction factor table is documented in Documentation/arch/x86/resctrl.rst. + * The correction factor table is documented in Documentation/filesystems/resctrl.rst. * If rmid > rmid threshold, MBM total and local values should be multiplied * by the correction factor. * @@ -151,6 +90,7 @@ static const struct mbm_correction_factor_table { }; static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX; + static u64 mbm_cf __read_mostly; static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) @@ -163,33 +103,6 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) } /* - * x86 and arm64 differ in their handling of monitoring. - * x86's RMID are independent numbers, there is only one source of traffic - * with an RMID value of '1'. - * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of - * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID - * value is no longer unique. - * To account for this, resctrl uses an index. On x86 this is just the RMID, - * on arm64 it encodes the CLOSID and RMID. This gives a unique number. - * - * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code - * must accept an attempt to read every index. 
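The comment above explains why monitor state is indexed rather than addressed by RMID directly: on x86 an RMID is globally unique, while an architecture whose monitor IDs only qualify a CLOSID (as described for arm64's PMG) needs both numbers folded into one index. The sketch below contrasts the two encodings; the combined closid * num_rmid + rmid form is an assumption for illustration, not the MPAM code.

#include <linux/types.h>

static inline u32 rmid_idx_encode_x86(u32 closid, u32 rmid)
{
	return rmid;			/* CLOSID plays no part on x86 */
}

static inline u32 rmid_idx_encode_combined(u32 closid, u32 rmid,
					   u32 num_rmid)
{
	return closid * num_rmid + rmid;
}

static inline void rmid_idx_decode_combined(u32 idx, u32 num_rmid,
					    u32 *closid, u32 *rmid)
{
	*closid = idx / num_rmid;
	*rmid   = idx % num_rmid;
}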
- */ -static inline struct rmid_entry *__rmid_entry(u32 idx) -{ - struct rmid_entry *entry; - u32 closid, rmid; - - entry = &rmid_ptrs[idx]; - resctrl_arch_rmid_idx_decode(idx, &closid, &rmid); - - WARN_ON_ONCE(entry->closid != closid); - WARN_ON_ONCE(entry->rmid != rmid); - - return entry; -} - -/* * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is * needed. The physical RMID is the same as the logical RMID. @@ -238,7 +151,7 @@ static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val) * are error bits. */ wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid); - rdmsrl(MSR_IA32_QM_CTR, msr_val); + rdmsrq(MSR_IA32_QM_CTR, msr_val); if (msr_val & RMID_VAL_ERROR) return -EIO; @@ -260,12 +173,11 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_do return &hw_dom->arch_mbm_total[rmid]; case QOS_L3_MBM_LOCAL_EVENT_ID: return &hw_dom->arch_mbm_local[rmid]; + default: + /* Never expect to get here */ + WARN_ON_ONCE(1); + return NULL; } - - /* Never expect to get here */ - WARN_ON_ONCE(1); - - return NULL; } void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, @@ -346,769 +258,6 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, return 0; } -static void limbo_release_entry(struct rmid_entry *entry) -{ - lockdep_assert_held(&rdtgroup_mutex); - - rmid_limbo_count--; - list_add_tail(&entry->list, &rmid_free_lru); - - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) - closid_num_dirty_rmid[entry->closid]--; -} - -/* - * Check the RMIDs that are marked as busy for this domain. If the - * reported LLC occupancy is below the threshold clear the busy bit and - * decrement the count. If the busy count gets to zero on an RMID, we - * free the RMID - */ -void __check_limbo(struct rdt_mon_domain *d, bool force_free) -{ - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - struct rmid_entry *entry; - u32 idx, cur_idx = 1; - void *arch_mon_ctx; - bool rmid_dirty; - u64 val = 0; - - arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID); - if (IS_ERR(arch_mon_ctx)) { - pr_warn_ratelimited("Failed to allocate monitor context: %ld", - PTR_ERR(arch_mon_ctx)); - return; - } - - /* - * Skip RMID 0 and start from RMID 1 and check all the RMIDs that - * are marked as busy for occupancy < threshold. If the occupancy - * is less than the threshold decrement the busy counter of the - * RMID and move it to the free list when the counter reaches 0. - */ - for (;;) { - idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx); - if (idx >= idx_limit) - break; - - entry = __rmid_entry(idx); - if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, - QOS_L3_OCCUP_EVENT_ID, &val, - arch_mon_ctx)) { - rmid_dirty = true; - } else { - rmid_dirty = (val >= resctrl_rmid_realloc_threshold); - - /* - * x86's CLOSID and RMID are independent numbers, so the entry's - * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the - * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't - * used to select the configuration. It is thus necessary to track both - * CLOSID and RMID because there may be dependencies between them - * on some architectures. 
- */ - trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val); - } - - if (force_free || !rmid_dirty) { - clear_bit(idx, d->rmid_busy_llc); - if (!--entry->busy) - limbo_release_entry(entry); - } - cur_idx = idx + 1; - } - - resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx); -} - -bool has_busy_rmid(struct rdt_mon_domain *d) -{ - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - - return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit; -} - -static struct rmid_entry *resctrl_find_free_rmid(u32 closid) -{ - struct rmid_entry *itr; - u32 itr_idx, cmp_idx; - - if (list_empty(&rmid_free_lru)) - return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC); - - list_for_each_entry(itr, &rmid_free_lru, list) { - /* - * Get the index of this free RMID, and the index it would need - * to be if it were used with this CLOSID. - * If the CLOSID is irrelevant on this architecture, the two - * index values are always the same on every entry and thus the - * very first entry will be returned. - */ - itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid); - cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid); - - if (itr_idx == cmp_idx) - return itr; - } - - return ERR_PTR(-ENOSPC); -} - -/** - * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated - * RMID are clean, or the CLOSID that has - * the most clean RMID. - * - * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID - * may not be able to allocate clean RMID. To avoid this the allocator will - * choose the CLOSID with the most clean RMID. - * - * When the CLOSID and RMID are independent numbers, the first free CLOSID will - * be returned. - */ -int resctrl_find_cleanest_closid(void) -{ - u32 cleanest_closid = ~0; - int i = 0; - - lockdep_assert_held(&rdtgroup_mutex); - - if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) - return -EIO; - - for (i = 0; i < closids_supported(); i++) { - int num_dirty; - - if (closid_allocated(i)) - continue; - - num_dirty = closid_num_dirty_rmid[i]; - if (num_dirty == 0) - return i; - - if (cleanest_closid == ~0) - cleanest_closid = i; - - if (num_dirty < closid_num_dirty_rmid[cleanest_closid]) - cleanest_closid = i; - } - - if (cleanest_closid == ~0) - return -ENOSPC; - - return cleanest_closid; -} - -/* - * For MPAM the RMID value is not unique, and has to be considered with - * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which - * allows all domains to be managed by a single free list. - * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler. - */ -int alloc_rmid(u32 closid) -{ - struct rmid_entry *entry; - - lockdep_assert_held(&rdtgroup_mutex); - - entry = resctrl_find_free_rmid(closid); - if (IS_ERR(entry)) - return PTR_ERR(entry); - - list_del(&entry->list); - return entry->rmid; -} - -static void add_rmid_to_limbo(struct rmid_entry *entry) -{ - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - struct rdt_mon_domain *d; - u32 idx; - - lockdep_assert_held(&rdtgroup_mutex); - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid); - - entry->busy = 0; - list_for_each_entry(d, &r->mon_domains, hdr.list) { - /* - * For the first limbo RMID in the domain, - * setup up the limbo worker. 
- */ - if (!has_busy_rmid(d)) - cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL, - RESCTRL_PICK_ANY_CPU); - set_bit(idx, d->rmid_busy_llc); - entry->busy++; - } - - rmid_limbo_count++; - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) - closid_num_dirty_rmid[entry->closid]++; -} - -void free_rmid(u32 closid, u32 rmid) -{ - u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); - struct rmid_entry *entry; - - lockdep_assert_held(&rdtgroup_mutex); - - /* - * Do not allow the default rmid to be free'd. Comparing by index - * allows architectures that ignore the closid parameter to avoid an - * unnecessary check. - */ - if (!resctrl_arch_mon_capable() || - idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, - RESCTRL_RESERVED_RMID)) - return; - - entry = __rmid_entry(idx); - - if (resctrl_arch_is_llc_occupancy_enabled()) - add_rmid_to_limbo(entry); - else - list_add_tail(&entry->list, &rmid_free_lru); -} - -static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, - u32 rmid, enum resctrl_event_id evtid) -{ - u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); - - switch (evtid) { - case QOS_L3_MBM_TOTAL_EVENT_ID: - return &d->mbm_total[idx]; - case QOS_L3_MBM_LOCAL_EVENT_ID: - return &d->mbm_local[idx]; - default: - return NULL; - } -} - -static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) -{ - int cpu = smp_processor_id(); - struct rdt_mon_domain *d; - struct mbm_state *m; - int err, ret; - u64 tval = 0; - - if (rr->first) { - resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); - m = get_mbm_state(rr->d, closid, rmid, rr->evtid); - if (m) - memset(m, 0, sizeof(struct mbm_state)); - return 0; - } - - if (rr->d) { - /* Reading a single domain, must be on a CPU in that domain. */ - if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask)) - return -EINVAL; - rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); - if (rr->err) - return rr->err; - - rr->val += tval; - - return 0; - } - - /* Summing domains that share a cache, must be on a CPU for that cache. */ - if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map)) - return -EINVAL; - - /* - * Legacy files must report the sum of an event across all - * domains that share the same L3 cache instance. - * Report success if a read from any domain succeeds, -EINVAL - * (translated to "Unavailable" for user space) if reading from - * all domains fail for any reason. - */ - ret = -EINVAL; - list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { - if (d->ci->id != rr->ci->id) - continue; - err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); - if (!err) { - rr->val += tval; - ret = 0; - } - } - - if (ret) - rr->err = ret; - - return ret; -} - -/* - * mbm_bw_count() - Update bw count from values previously read by - * __mon_event_count(). - * @closid: The closid used to identify the cached mbm_state. - * @rmid: The rmid used to identify the cached mbm_state. - * @rr: The struct rmid_read populated by __mon_event_count(). - * - * Supporting function to calculate the memory bandwidth - * and delta bandwidth in MBps. The chunks value previously read by - * __mon_event_count() is compared with the chunks value from the previous - * invocation. This must be called once per second to maintain values in MBps. 
- */ -static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) -{ - u64 cur_bw, bytes, cur_bytes; - struct mbm_state *m; - - m = get_mbm_state(rr->d, closid, rmid, rr->evtid); - if (WARN_ON_ONCE(!m)) - return; - - cur_bytes = rr->val; - bytes = cur_bytes - m->prev_bw_bytes; - m->prev_bw_bytes = cur_bytes; - - cur_bw = bytes / SZ_1M; - - m->prev_bw = cur_bw; -} - -/* - * This is scheduled by mon_event_read() to read the CQM/MBM counters - * on a domain. - */ -void mon_event_count(void *info) -{ - struct rdtgroup *rdtgrp, *entry; - struct rmid_read *rr = info; - struct list_head *head; - int ret; - - rdtgrp = rr->rgrp; - - ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr); - - /* - * For Ctrl groups read data from child monitor groups and - * add them together. Count events which are read successfully. - * Discard the rmid_read's reporting errors. - */ - head = &rdtgrp->mon.crdtgrp_list; - - if (rdtgrp->type == RDTCTRL_GROUP) { - list_for_each_entry(entry, head, mon.crdtgrp_list) { - if (__mon_event_count(entry->closid, entry->mon.rmid, - rr) == 0) - ret = 0; - } - } - - /* - * __mon_event_count() calls for newly created monitor groups may - * report -EINVAL/Unavailable if the monitor hasn't seen any traffic. - * Discard error if any of the monitor event reads succeeded. - */ - if (ret == 0) - rr->err = 0; -} - -static struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, - struct rdt_resource *r) -{ - struct rdt_ctrl_domain *d; - - lockdep_assert_cpus_held(); - - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) - return d; - } - - return NULL; -} - -/* - * Feedback loop for MBA software controller (mba_sc) - * - * mba_sc is a feedback loop where we periodically read MBM counters and - * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so - * that: - * - * current bandwidth(cur_bw) < user specified bandwidth(user_bw) - * - * This uses the MBM counters to measure the bandwidth and MBA throttle - * MSRs to control the bandwidth for a particular rdtgrp. It builds on the - * fact that resctrl rdtgroups have both monitoring and control. - * - * The frequency of the checks is 1s and we just tag along the MBM overflow - * timer. Having 1s interval makes the calculation of bandwidth simpler. - * - * Although MBA's goal is to restrict the bandwidth to a maximum, there may - * be a need to increase the bandwidth to avoid unnecessarily restricting - * the L2 <-> L3 traffic. - * - * Since MBA controls the L2 external bandwidth where as MBM measures the - * L3 external bandwidth the following sequence could lead to such a - * situation. - * - * Consider an rdtgroup which had high L3 <-> memory traffic in initial - * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but - * after some time rdtgroup has mostly L2 <-> L3 traffic. - * - * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its - * throttle MSRs already have low percentage values. To avoid - * unnecessarily restricting such rdtgroups, we also increase the bandwidth. 
- */ -static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) -{ - u32 closid, rmid, cur_msr_val, new_msr_val; - struct mbm_state *pmbm_data, *cmbm_data; - struct rdt_ctrl_domain *dom_mba; - enum resctrl_event_id evt_id; - struct rdt_resource *r_mba; - struct list_head *head; - struct rdtgroup *entry; - u32 cur_bw, user_bw; - - r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA); - evt_id = rgrp->mba_mbps_event; - - closid = rgrp->closid; - rmid = rgrp->mon.rmid; - pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id); - if (WARN_ON_ONCE(!pmbm_data)) - return; - - dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba); - if (!dom_mba) { - pr_warn_once("Failure to get domain for MBA update\n"); - return; - } - - cur_bw = pmbm_data->prev_bw; - user_bw = dom_mba->mbps_val[closid]; - - /* MBA resource doesn't support CDP */ - cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); - - /* - * For Ctrl groups read data from child monitor groups. - */ - head = &rgrp->mon.crdtgrp_list; - list_for_each_entry(entry, head, mon.crdtgrp_list) { - cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id); - if (WARN_ON_ONCE(!cmbm_data)) - return; - cur_bw += cmbm_data->prev_bw; - } - - /* - * Scale up/down the bandwidth linearly for the ctrl group. The - * bandwidth step is the bandwidth granularity specified by the - * hardware. - * Always increase throttling if current bandwidth is above the - * target set by user. - * But avoid thrashing up and down on every poll by checking - * whether a decrease in throttling is likely to push the group - * back over target. E.g. if currently throttling to 30% of bandwidth - * on a system with 10% granularity steps, check whether moving to - * 40% would go past the limit by multiplying current bandwidth by - * "(30 + 10) / 30". - */ - if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) { - new_msr_val = cur_msr_val - r_mba->membw.bw_gran; - } else if (cur_msr_val < MAX_MBA_BW && - (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) { - new_msr_val = cur_msr_val + r_mba->membw.bw_gran; - } else { - return; - } - - resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); -} - -static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d, - u32 closid, u32 rmid, enum resctrl_event_id evtid) -{ - struct rmid_read rr = {0}; - - rr.r = r; - rr.d = d; - rr.evtid = evtid; - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); - if (IS_ERR(rr.arch_mon_ctx)) { - pr_warn_ratelimited("Failed to allocate monitor context: %ld", - PTR_ERR(rr.arch_mon_ctx)); - return; - } - - __mon_event_count(closid, rmid, &rr); - - /* - * If the software controller is enabled, compute the - * bandwidth for this event id. - */ - if (is_mba_sc(NULL)) - mbm_bw_count(closid, rmid, &rr); - - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); -} - -static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, - u32 closid, u32 rmid) -{ - /* - * This is protected from concurrent reads from user as both - * the user and overflow handler hold the global mutex. - */ - if (resctrl_arch_is_mbm_total_enabled()) - mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID); - - if (resctrl_arch_is_mbm_local_enabled()) - mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID); -} - -/* - * Handler to scan the limbo list and move the RMIDs - * to free list whose occupancy < threshold_occupancy. 
- */ -void cqm_handle_limbo(struct work_struct *work) -{ - unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); - struct rdt_mon_domain *d; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - d = container_of(work, struct rdt_mon_domain, cqm_limbo.work); - - __check_limbo(d, false); - - if (has_busy_rmid(d)) { - d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, - RESCTRL_PICK_ANY_CPU); - schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo, - delay); - } - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); -} - -/** - * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this - * domain. - * @dom: The domain the limbo handler should run for. - * @delay_ms: How far in the future the handler should run. - * @exclude_cpu: Which CPU the handler should not run on, - * RESCTRL_PICK_ANY_CPU to pick any CPU. - */ -void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, - int exclude_cpu) -{ - unsigned long delay = msecs_to_jiffies(delay_ms); - int cpu; - - cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); - dom->cqm_work_cpu = cpu; - - if (cpu < nr_cpu_ids) - schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); -} - -void mbm_handle_overflow(struct work_struct *work) -{ - unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); - struct rdtgroup *prgrp, *crgrp; - struct rdt_mon_domain *d; - struct list_head *head; - struct rdt_resource *r; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - /* - * If the filesystem has been unmounted this work no longer needs to - * run. - */ - if (!resctrl_mounted || !resctrl_arch_mon_capable()) - goto out_unlock; - - r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - d = container_of(work, struct rdt_mon_domain, mbm_over.work); - - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mbm_update(r, d, prgrp->closid, prgrp->mon.rmid); - - head = &prgrp->mon.crdtgrp_list; - list_for_each_entry(crgrp, head, mon.crdtgrp_list) - mbm_update(r, d, crgrp->closid, crgrp->mon.rmid); - - if (is_mba_sc(NULL)) - update_mba_bw(prgrp, d); - } - - /* - * Re-check for housekeeping CPUs. This allows the overflow handler to - * move off a nohz_full CPU quickly. - */ - d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, - RESCTRL_PICK_ANY_CPU); - schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay); - -out_unlock: - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); -} - -/** - * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this - * domain. - * @dom: The domain the overflow handler should run for. - * @delay_ms: How far in the future the handler should run. - * @exclude_cpu: Which CPU the handler should not run on, - * RESCTRL_PICK_ANY_CPU to pick any CPU. - */ -void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, - int exclude_cpu) -{ - unsigned long delay = msecs_to_jiffies(delay_ms); - int cpu; - - /* - * When a domain comes online there is no guarantee the filesystem is - * mounted. If not, there is no need to catch counter overflow. 
- */ - if (!resctrl_mounted || !resctrl_arch_mon_capable()) - return; - cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); - dom->mbm_work_cpu = cpu; - - if (cpu < nr_cpu_ids) - schedule_delayed_work_on(cpu, &dom->mbm_over, delay); -} - -static int dom_data_init(struct rdt_resource *r) -{ - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - u32 num_closid = resctrl_arch_get_num_closid(r); - struct rmid_entry *entry = NULL; - int err = 0, i; - u32 idx; - - mutex_lock(&rdtgroup_mutex); - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - u32 *tmp; - - /* - * If the architecture hasn't provided a sanitised value here, - * this may result in larger arrays than necessary. Resctrl will - * use a smaller system wide value based on the resources in - * use. - */ - tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL); - if (!tmp) { - err = -ENOMEM; - goto out_unlock; - } - - closid_num_dirty_rmid = tmp; - } - - rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL); - if (!rmid_ptrs) { - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - kfree(closid_num_dirty_rmid); - closid_num_dirty_rmid = NULL; - } - err = -ENOMEM; - goto out_unlock; - } - - for (i = 0; i < idx_limit; i++) { - entry = &rmid_ptrs[i]; - INIT_LIST_HEAD(&entry->list); - - resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid); - list_add_tail(&entry->list, &rmid_free_lru); - } - - /* - * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and - * are always allocated. These are used for the rdtgroup_default - * control group, which will be setup later in resctrl_init(). - */ - idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, - RESCTRL_RESERVED_RMID); - entry = __rmid_entry(idx); - list_del(&entry->list); - -out_unlock: - mutex_unlock(&rdtgroup_mutex); - - return err; -} - -static void dom_data_exit(struct rdt_resource *r) -{ - mutex_lock(&rdtgroup_mutex); - - if (!r->mon_capable) - goto out_unlock; - - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - kfree(closid_num_dirty_rmid); - closid_num_dirty_rmid = NULL; - } - - kfree(rmid_ptrs); - rmid_ptrs = NULL; - -out_unlock: - mutex_unlock(&rdtgroup_mutex); -} - -static struct mon_evt llc_occupancy_event = { - .name = "llc_occupancy", - .evtid = QOS_L3_OCCUP_EVENT_ID, -}; - -static struct mon_evt mbm_total_event = { - .name = "mbm_total_bytes", - .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, -}; - -static struct mon_evt mbm_local_event = { - .name = "mbm_local_bytes", - .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, -}; - -/* - * Initialize the event list for the resource. - * - * Note that MBM events are also part of RDT_RESOURCE_L3 resource - * because as per the SDM the total and local memory bandwidth - * are enumerated as part of L3 monitoring. - */ -static void l3_mon_evt_init(struct rdt_resource *r) -{ - INIT_LIST_HEAD(&r->evt_list); - - if (resctrl_arch_is_llc_occupancy_enabled()) - list_add_tail(&llc_occupancy_event.list, &r->evt_list); - if (resctrl_arch_is_mbm_total_enabled()) - list_add_tail(&mbm_total_event.list, &r->evt_list); - if (resctrl_arch_is_mbm_local_enabled()) - list_add_tail(&mbm_local_event.list, &r->evt_list); -} - /* * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1 * which indicates that RMIDs are configured in legacy mode. @@ -1192,51 +341,6 @@ static __init int snc_get_config(void) return ret; } -/** - * resctrl_mon_resource_init() - Initialise global monitoring structures. - * - * Allocate and initialise global monitor resources that do not belong to a - * specific domain. i.e. 
the rmid_ptrs[] used for the limbo and free lists. - * Called once during boot after the struct rdt_resource's have been configured - * but before the filesystem is mounted. - * Resctrl's cpuhp callbacks may be called before this point to bring a domain - * online. - * - * Returns 0 for success, or -ENOMEM. - */ -int __init resctrl_mon_resource_init(void) -{ - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - int ret; - - if (!r->mon_capable) - return 0; - - ret = dom_data_init(r); - if (ret) - return ret; - - l3_mon_evt_init(r); - - if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) { - mbm_total_event.configurable = true; - resctrl_file_fflags_init("mbm_total_bytes_config", - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); - } - if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) { - mbm_local_event.configurable = true; - resctrl_file_fflags_init("mbm_local_bytes_config", - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); - } - - if (resctrl_arch_is_mbm_local_enabled()) - mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID; - else if (resctrl_arch_is_mbm_total_enabled()) - mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID; - - return 0; -} - int __init rdt_get_mon_l3_config(struct rdt_resource *r) { unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; @@ -1284,13 +388,6 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) return 0; } -void resctrl_mon_resource_exit(void) -{ - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - - dom_data_exit(r); -} - void __init intel_rdt_mbm_apply_quirk(void) { int cf_index; diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 92ea1472bde9..de580eca3363 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -11,26 +11,22 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/cacheflush.h> #include <linux/cpu.h> -#include <linux/cpumask.h> -#include <linux/debugfs.h> -#include <linux/kthread.h> -#include <linux/mman.h> #include <linux/perf_event.h> #include <linux/pm_qos.h> -#include <linux/slab.h> -#include <linux/uaccess.h> +#include <linux/resctrl.h> -#include <asm/cacheflush.h> #include <asm/cpu_device_id.h> -#include <asm/resctrl.h> #include <asm/perf_event.h> +#include <asm/msr.h> #include "../../events/perf_event.h" /* For X86_CONFIG() */ #include "internal.h" #define CREATE_TRACE_POINTS -#include "trace.h" + +#include "pseudo_lock_trace.h" /* * The bits needed to disable hardware prefetching varies based on the @@ -38,29 +34,6 @@ */ static u64 prefetch_disable_bits; -/* - * Major number assigned to and shared by all devices exposing - * pseudo-locked regions. 
- */ -static unsigned int pseudo_lock_major; -static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0); - -static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode) -{ - const struct rdtgroup *rdtgrp; - - rdtgrp = dev_get_drvdata(dev); - if (mode) - *mode = 0600; - guard(mutex)(&rdtgroup_mutex); - return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdt_kn_name(rdtgrp->kn)); -} - -static const struct class pseudo_lock_class = { - .name = "pseudo_lock", - .devnode = pseudo_lock_devnode, -}; - /** * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported * platforms @@ -122,298 +95,6 @@ u64 resctrl_arch_get_prefetch_disable_bits(void) } /** - * pseudo_lock_minor_get - Obtain available minor number - * @minor: Pointer to where new minor number will be stored - * - * A bitmask is used to track available minor numbers. Here the next free - * minor number is marked as unavailable and returned. - * - * Return: 0 on success, <0 on failure. - */ -static int pseudo_lock_minor_get(unsigned int *minor) -{ - unsigned long first_bit; - - first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS); - - if (first_bit == MINORBITS) - return -ENOSPC; - - __clear_bit(first_bit, &pseudo_lock_minor_avail); - *minor = first_bit; - - return 0; -} - -/** - * pseudo_lock_minor_release - Return minor number to available - * @minor: The minor number made available - */ -static void pseudo_lock_minor_release(unsigned int minor) -{ - __set_bit(minor, &pseudo_lock_minor_avail); -} - -/** - * region_find_by_minor - Locate a pseudo-lock region by inode minor number - * @minor: The minor number of the device representing pseudo-locked region - * - * When the character device is accessed we need to determine which - * pseudo-locked region it belongs to. This is done by matching the minor - * number of the device to the pseudo-locked region it belongs. - * - * Minor numbers are assigned at the time a pseudo-locked region is associated - * with a cache instance. - * - * Return: On success return pointer to resource group owning the pseudo-locked - * region, NULL on failure. - */ -static struct rdtgroup *region_find_by_minor(unsigned int minor) -{ - struct rdtgroup *rdtgrp, *rdtgrp_match = NULL; - - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (rdtgrp->plr && rdtgrp->plr->minor == minor) { - rdtgrp_match = rdtgrp; - break; - } - } - return rdtgrp_match; -} - -/** - * struct pseudo_lock_pm_req - A power management QoS request list entry - * @list: Entry within the @pm_reqs list for a pseudo-locked region - * @req: PM QoS request - */ -struct pseudo_lock_pm_req { - struct list_head list; - struct dev_pm_qos_request req; -}; - -static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) -{ - struct pseudo_lock_pm_req *pm_req, *next; - - list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) { - dev_pm_qos_remove_request(&pm_req->req); - list_del(&pm_req->list); - kfree(pm_req); - } -} - -/** - * pseudo_lock_cstates_constrain - Restrict cores from entering C6 - * @plr: Pseudo-locked region - * - * To prevent the cache from being affected by power management entering - * C6 has to be avoided. This is accomplished by requesting a latency - * requirement lower than lowest C6 exit latency of all supported - * platforms as found in the cpuidle state tables in the intel_idle driver. - * At this time it is possible to do so with a single latency requirement - * for all supported platforms. 
- * - * Since Goldmont is supported, which is affected by X86_BUG_MONITOR, - * the ACPI latencies need to be considered while keeping in mind that C2 - * may be set to map to deeper sleep states. In this case the latency - * requirement needs to prevent entering C2 also. - * - * Return: 0 on success, <0 on failure - */ -static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr) -{ - struct pseudo_lock_pm_req *pm_req; - int cpu; - int ret; - - for_each_cpu(cpu, &plr->d->hdr.cpu_mask) { - pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL); - if (!pm_req) { - rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n"); - ret = -ENOMEM; - goto out_err; - } - ret = dev_pm_qos_add_request(get_cpu_device(cpu), - &pm_req->req, - DEV_PM_QOS_RESUME_LATENCY, - 30); - if (ret < 0) { - rdt_last_cmd_printf("Failed to add latency req CPU%d\n", - cpu); - kfree(pm_req); - ret = -1; - goto out_err; - } - list_add(&pm_req->list, &plr->pm_reqs); - } - - return 0; - -out_err: - pseudo_lock_cstates_relax(plr); - return ret; -} - -/** - * pseudo_lock_region_clear - Reset pseudo-lock region data - * @plr: pseudo-lock region - * - * All content of the pseudo-locked region is reset - any memory allocated - * freed. - * - * Return: void - */ -static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) -{ - plr->size = 0; - plr->line_size = 0; - kfree(plr->kmem); - plr->kmem = NULL; - plr->s = NULL; - if (plr->d) - plr->d->plr = NULL; - plr->d = NULL; - plr->cbm = 0; - plr->debugfs_dir = NULL; -} - -/** - * pseudo_lock_region_init - Initialize pseudo-lock region information - * @plr: pseudo-lock region - * - * Called after user provided a schemata to be pseudo-locked. From the - * schemata the &struct pseudo_lock_region is on entry already initialized - * with the resource, domain, and capacity bitmask. Here the information - * required for pseudo-locking is deduced from this data and &struct - * pseudo_lock_region initialized further. This information includes: - * - size in bytes of the region to be pseudo-locked - * - cache line size to know the stride with which data needs to be accessed - * to be pseudo-locked - * - a cpu associated with the cache instance on which the pseudo-locking - * flow can be executed - * - * Return: 0 on success, <0 on failure. Descriptive error will be written - * to last_cmd_status buffer. - */ -static int pseudo_lock_region_init(struct pseudo_lock_region *plr) -{ - enum resctrl_scope scope = plr->s->res->ctrl_scope; - struct cacheinfo *ci; - int ret; - - if (WARN_ON_ONCE(scope != RESCTRL_L2_CACHE && scope != RESCTRL_L3_CACHE)) - return -ENODEV; - - /* Pick the first cpu we find that is associated with the cache. */ - plr->cpu = cpumask_first(&plr->d->hdr.cpu_mask); - - if (!cpu_online(plr->cpu)) { - rdt_last_cmd_printf("CPU %u associated with cache not online\n", - plr->cpu); - ret = -ENODEV; - goto out_region; - } - - ci = get_cpu_cacheinfo_level(plr->cpu, scope); - if (ci) { - plr->line_size = ci->coherency_line_size; - plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm); - return 0; - } - - ret = -1; - rdt_last_cmd_puts("Unable to determine cache line size\n"); -out_region: - pseudo_lock_region_clear(plr); - return ret; -} - -/** - * pseudo_lock_init - Initialize a pseudo-lock region - * @rdtgrp: resource group to which new pseudo-locked region will belong - * - * A pseudo-locked region is associated with a resource group. When this - * association is created the pseudo-locked region is initialized. 
The - * details of the pseudo-locked region are not known at this time so only - * allocation is done and association established. - * - * Return: 0 on success, <0 on failure - */ -static int pseudo_lock_init(struct rdtgroup *rdtgrp) -{ - struct pseudo_lock_region *plr; - - plr = kzalloc(sizeof(*plr), GFP_KERNEL); - if (!plr) - return -ENOMEM; - - init_waitqueue_head(&plr->lock_thread_wq); - INIT_LIST_HEAD(&plr->pm_reqs); - rdtgrp->plr = plr; - return 0; -} - -/** - * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked - * @plr: pseudo-lock region - * - * Initialize the details required to set up the pseudo-locked region and - * allocate the contiguous memory that will be pseudo-locked to the cache. - * - * Return: 0 on success, <0 on failure. Descriptive error will be written - * to last_cmd_status buffer. - */ -static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr) -{ - int ret; - - ret = pseudo_lock_region_init(plr); - if (ret < 0) - return ret; - - /* - * We do not yet support contiguous regions larger than - * KMALLOC_MAX_SIZE. - */ - if (plr->size > KMALLOC_MAX_SIZE) { - rdt_last_cmd_puts("Requested region exceeds maximum size\n"); - ret = -E2BIG; - goto out_region; - } - - plr->kmem = kzalloc(plr->size, GFP_KERNEL); - if (!plr->kmem) { - rdt_last_cmd_puts("Unable to allocate memory\n"); - ret = -ENOMEM; - goto out_region; - } - - ret = 0; - goto out; -out_region: - pseudo_lock_region_clear(plr); -out: - return ret; -} - -/** - * pseudo_lock_free - Free a pseudo-locked region - * @rdtgrp: resource group to which pseudo-locked region belonged - * - * The pseudo-locked region's resources have already been released, or not - * yet created at this point. Now it can be freed and disassociated from the - * resource group. - * - * Return: void - */ -static void pseudo_lock_free(struct rdtgroup *rdtgrp) -{ - pseudo_lock_region_clear(rdtgrp->plr); - kfree(rdtgrp->plr); - rdtgrp->plr = NULL; -} - -/** * resctrl_arch_pseudo_lock_fn - Load kernel memory into cache * @_plr: the pseudo-lock region descriptor * @@ -481,8 +162,8 @@ int resctrl_arch_pseudo_lock_fn(void *_plr) * the buffer and evict pseudo-locked memory read earlier from the * cache. */ - saved_msr = __rdmsr(MSR_MISC_FEATURE_CONTROL); - __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + saved_msr = native_rdmsrq(MSR_MISC_FEATURE_CONTROL); + native_wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits); closid_p = this_cpu_read(pqr_state.cur_closid); rmid_p = this_cpu_read(pqr_state.cur_rmid); mem_r = plr->kmem; @@ -494,7 +175,7 @@ int resctrl_arch_pseudo_lock_fn(void *_plr) * pseudo-locked followed by reading of kernel memory to load it * into the cache. */ - __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, plr->closid); + native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, plr->closid); /* * Cache was flushed earlier. Now access kernel memory to read it @@ -531,10 +212,10 @@ int resctrl_arch_pseudo_lock_fn(void *_plr) * Critical section end: restore closid with capacity bitmask that * does not overlap with pseudo-locked region. 
*/ - __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p); + native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p); /* Re-enable the hardware prefetcher(s) */ - wrmsrl(MSR_MISC_FEATURE_CONTROL, saved_msr); + wrmsrq(MSR_MISC_FEATURE_CONTROL, saved_msr); local_irq_enable(); plr->thread_done = 1; @@ -543,340 +224,6 @@ int resctrl_arch_pseudo_lock_fn(void *_plr) } /** - * rdtgroup_monitor_in_progress - Test if monitoring in progress - * @rdtgrp: resource group being queried - * - * Return: 1 if monitor groups have been created for this resource - * group, 0 otherwise. - */ -static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp) -{ - return !list_empty(&rdtgrp->mon.crdtgrp_list); -} - -/** - * rdtgroup_locksetup_user_restrict - Restrict user access to group - * @rdtgrp: resource group needing access restricted - * - * A resource group used for cache pseudo-locking cannot have cpus or tasks - * assigned to it. This is communicated to the user by restricting access - * to all the files that can be used to make such changes. - * - * Permissions restored with rdtgroup_locksetup_user_restore() - * - * Return: 0 on success, <0 on failure. If a failure occurs during the - * restriction of access an attempt will be made to restore permissions but - * the state of the mode of these files will be uncertain when a failure - * occurs. - */ -static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) -{ - int ret; - - ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); - if (ret) - return ret; - - ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); - if (ret) - goto err_tasks; - - ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); - if (ret) - goto err_cpus; - - if (resctrl_arch_mon_capable()) { - ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups"); - if (ret) - goto err_cpus_list; - } - - ret = 0; - goto out; - -err_cpus_list: - rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); -err_cpus: - rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); -err_tasks: - rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); -out: - return ret; -} - -/** - * rdtgroup_locksetup_user_restore - Restore user access to group - * @rdtgrp: resource group needing access restored - * - * Restore all file access previously removed using - * rdtgroup_locksetup_user_restrict() - * - * Return: 0 on success, <0 on failure. If a failure occurs during the - * restoration of access an attempt will be made to restrict permissions - * again but the state of the mode of these files will be uncertain when - * a failure occurs. - */ -static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) -{ - int ret; - - ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); - if (ret) - return ret; - - ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); - if (ret) - goto err_tasks; - - ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); - if (ret) - goto err_cpus; - - if (resctrl_arch_mon_capable()) { - ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777); - if (ret) - goto err_cpus_list; - } - - ret = 0; - goto out; - -err_cpus_list: - rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); -err_cpus: - rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); -err_tasks: - rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); -out: - return ret; -} - -/** - * rdtgroup_locksetup_enter - Resource group enters locksetup mode - * @rdtgrp: resource group requested to enter locksetup mode - * - * A resource group enters locksetup mode to reflect that it would be used - * to represent a pseudo-locked region and is in the process of being set - * up to do so. 
A resource group used for a pseudo-locked region would - * lose the closid associated with it so we cannot allow it to have any - * tasks or cpus assigned nor permit tasks or cpus to be assigned in the - * future. Monitoring of a pseudo-locked region is not allowed either. - * - * The above and more restrictions on a pseudo-locked region are checked - * for and enforced before the resource group enters the locksetup mode. - * - * Returns: 0 if the resource group successfully entered locksetup mode, <0 - * on failure. On failure the last_cmd_status buffer is updated with text to - * communicate details of failure to the user. - */ -int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) -{ - int ret; - - /* - * The default resource group can neither be removed nor lose the - * default closid associated with it. - */ - if (rdtgrp == &rdtgroup_default) { - rdt_last_cmd_puts("Cannot pseudo-lock default group\n"); - return -EINVAL; - } - - /* - * Cache Pseudo-locking not supported when CDP is enabled. - * - * Some things to consider if you would like to enable this - * support (using L3 CDP as example): - * - When CDP is enabled two separate resources are exposed, - * L3DATA and L3CODE, but they are actually on the same cache. - * The implication for pseudo-locking is that if a - * pseudo-locked region is created on a domain of one - * resource (eg. L3CODE), then a pseudo-locked region cannot - * be created on that same domain of the other resource - * (eg. L3DATA). This is because the creation of a - * pseudo-locked region involves a call to wbinvd that will - * affect all cache allocations on particular domain. - * - Considering the previous, it may be possible to only - * expose one of the CDP resources to pseudo-locking and - * hide the other. For example, we could consider to only - * expose L3DATA and since the L3 cache is unified it is - * still possible to place instructions there are execute it. - * - If only one region is exposed to pseudo-locking we should - * still keep in mind that availability of a portion of cache - * for pseudo-locking should take into account both resources. - * Similarly, if a pseudo-locked region is created in one - * resource, the portion of cache used by it should be made - * unavailable to all future allocations from both resources. - */ - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) || - resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) { - rdt_last_cmd_puts("CDP enabled\n"); - return -EINVAL; - } - - /* - * Not knowing the bits to disable prefetching implies that this - * platform does not support Cache Pseudo-Locking. - */ - if (resctrl_arch_get_prefetch_disable_bits() == 0) { - rdt_last_cmd_puts("Pseudo-locking not supported\n"); - return -EINVAL; - } - - if (rdtgroup_monitor_in_progress(rdtgrp)) { - rdt_last_cmd_puts("Monitoring in progress\n"); - return -EINVAL; - } - - if (rdtgroup_tasks_assigned(rdtgrp)) { - rdt_last_cmd_puts("Tasks assigned to resource group\n"); - return -EINVAL; - } - - if (!cpumask_empty(&rdtgrp->cpu_mask)) { - rdt_last_cmd_puts("CPUs assigned to resource group\n"); - return -EINVAL; - } - - if (rdtgroup_locksetup_user_restrict(rdtgrp)) { - rdt_last_cmd_puts("Unable to modify resctrl permissions\n"); - return -EIO; - } - - ret = pseudo_lock_init(rdtgrp); - if (ret) { - rdt_last_cmd_puts("Unable to init pseudo-lock region\n"); - goto out_release; - } - - /* - * If this system is capable of monitoring a rmid would have been - * allocated when the control group was created. 
This is not needed - * anymore when this group would be used for pseudo-locking. This - * is safe to call on platforms not capable of monitoring. - */ - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - - ret = 0; - goto out; - -out_release: - rdtgroup_locksetup_user_restore(rdtgrp); -out: - return ret; -} - -/** - * rdtgroup_locksetup_exit - resource group exist locksetup mode - * @rdtgrp: resource group - * - * When a resource group exits locksetup mode the earlier restrictions are - * lifted. - * - * Return: 0 on success, <0 on failure - */ -int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) -{ - int ret; - - if (resctrl_arch_mon_capable()) { - ret = alloc_rmid(rdtgrp->closid); - if (ret < 0) { - rdt_last_cmd_puts("Out of RMIDs\n"); - return ret; - } - rdtgrp->mon.rmid = ret; - } - - ret = rdtgroup_locksetup_user_restore(rdtgrp); - if (ret) { - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - return ret; - } - - pseudo_lock_free(rdtgrp); - return 0; -} - -/** - * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked - * @d: RDT domain - * @cbm: CBM to test - * - * @d represents a cache instance and @cbm a capacity bitmask that is - * considered for it. Determine if @cbm overlaps with any existing - * pseudo-locked region on @d. - * - * @cbm is unsigned long, even if only 32 bits are used, to make the - * bitmap functions work correctly. - * - * Return: true if @cbm overlaps with pseudo-locked region on @d, false - * otherwise. - */ -bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm) -{ - unsigned int cbm_len; - unsigned long cbm_b; - - if (d->plr) { - cbm_len = d->plr->s->res->cache.cbm_len; - cbm_b = d->plr->cbm; - if (bitmap_intersects(&cbm, &cbm_b, cbm_len)) - return true; - } - return false; -} - -/** - * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy - * @d: RDT domain under test - * - * The setup of a pseudo-locked region affects all cache instances within - * the hierarchy of the region. It is thus essential to know if any - * pseudo-locked regions exist within a cache hierarchy to prevent any - * attempts to create new pseudo-locked regions in the same hierarchy. - * - * Return: true if a pseudo-locked region exists in the hierarchy of @d or - * if it is not possible to test due to memory allocation issue, - * false otherwise. - */ -bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d) -{ - struct rdt_ctrl_domain *d_i; - cpumask_var_t cpu_with_psl; - struct rdt_resource *r; - bool ret = false; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL)) - return true; - - /* - * First determine which cpus have pseudo-locked regions - * associated with them. - */ - for_each_alloc_capable_rdt_resource(r) { - list_for_each_entry(d_i, &r->ctrl_domains, hdr.list) { - if (d_i->plr) - cpumask_or(cpu_with_psl, cpu_with_psl, - &d_i->hdr.cpu_mask); - } - } - - /* - * Next test if new pseudo-locked region would intersect with - * existing region. - */ - if (cpumask_intersects(&d->hdr.cpu_mask, cpu_with_psl)) - ret = true; - - free_cpumask_var(cpu_with_psl); - return ret; -} - -/** * resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read * pseudo-locked memory * @_plr: pseudo-lock region to measure @@ -904,7 +251,7 @@ int resctrl_arch_measure_cycles_lat_fn(void *_plr) * Disable hardware prefetchers. 
*/ rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); - wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits); mem_r = READ_ONCE(plr->kmem); /* * Dummy execute of the time measurement to load the needed @@ -1000,7 +347,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, * Disable hardware prefetchers. */ rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); - wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits); /* Initialize rest of local variables */ /* @@ -1018,8 +365,8 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, * used in L1 cache, second to capture accurate value that does not * include cache misses incurred because of instruction loads. */ - rdpmcl(hit_pmcnum, hits_before); - rdpmcl(miss_pmcnum, miss_before); + hits_before = rdpmc(hit_pmcnum); + miss_before = rdpmc(miss_pmcnum); /* * From SDM: Performing back-to-back fast reads are not guaranteed * to be monotonic. @@ -1027,8 +374,8 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, * before proceeding. */ rmb(); - rdpmcl(hit_pmcnum, hits_before); - rdpmcl(miss_pmcnum, miss_before); + hits_before = rdpmc(hit_pmcnum); + miss_before = rdpmc(miss_pmcnum); /* * Use LFENCE to ensure all previous instructions are retired * before proceeding. @@ -1050,8 +397,8 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, * before proceeding. */ rmb(); - rdpmcl(hit_pmcnum, hits_after); - rdpmcl(miss_pmcnum, miss_after); + hits_after = rdpmc(hit_pmcnum); + miss_after = rdpmc(miss_pmcnum); /* * Use LFENCE to ensure all previous instructions are retired * before proceeding. @@ -1168,433 +515,3 @@ out: wake_up_interruptible(&plr->lock_thread_wq); return 0; } - -/** - * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region - * @rdtgrp: Resource group to which the pseudo-locked region belongs. - * @sel: Selector of which measurement to perform on a pseudo-locked region. - * - * The measurement of latency to access a pseudo-locked region should be - * done from a cpu that is associated with that pseudo-locked region. - * Determine which cpu is associated with this region and start a thread on - * that cpu to perform the measurement, wait for that thread to complete. 
- * - * Return: 0 on success, <0 on failure - */ -static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) -{ - struct pseudo_lock_region *plr = rdtgrp->plr; - struct task_struct *thread; - unsigned int cpu; - int ret = -1; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - if (rdtgrp->flags & RDT_DELETED) { - ret = -ENODEV; - goto out; - } - - if (!plr->d) { - ret = -ENODEV; - goto out; - } - - plr->thread_done = 0; - cpu = cpumask_first(&plr->d->hdr.cpu_mask); - if (!cpu_online(cpu)) { - ret = -ENODEV; - goto out; - } - - plr->cpu = cpu; - - if (sel == 1) - thread = kthread_run_on_cpu(resctrl_arch_measure_cycles_lat_fn, - plr, cpu, "pseudo_lock_measure/%u"); - else if (sel == 2) - thread = kthread_run_on_cpu(resctrl_arch_measure_l2_residency, - plr, cpu, "pseudo_lock_measure/%u"); - else if (sel == 3) - thread = kthread_run_on_cpu(resctrl_arch_measure_l3_residency, - plr, cpu, "pseudo_lock_measure/%u"); - else - goto out; - - if (IS_ERR(thread)) { - ret = PTR_ERR(thread); - goto out; - } - - ret = wait_event_interruptible(plr->lock_thread_wq, - plr->thread_done == 1); - if (ret < 0) - goto out; - - ret = 0; - -out: - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - return ret; -} - -static ssize_t pseudo_lock_measure_trigger(struct file *file, - const char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct rdtgroup *rdtgrp = file->private_data; - size_t buf_size; - char buf[32]; - int ret; - int sel; - - buf_size = min(count, (sizeof(buf) - 1)); - if (copy_from_user(buf, user_buf, buf_size)) - return -EFAULT; - - buf[buf_size] = '\0'; - ret = kstrtoint(buf, 10, &sel); - if (ret == 0) { - if (sel != 1 && sel != 2 && sel != 3) - return -EINVAL; - ret = debugfs_file_get(file->f_path.dentry); - if (ret) - return ret; - ret = pseudo_lock_measure_cycles(rdtgrp, sel); - if (ret == 0) - ret = count; - debugfs_file_put(file->f_path.dentry); - } - - return ret; -} - -static const struct file_operations pseudo_measure_fops = { - .write = pseudo_lock_measure_trigger, - .open = simple_open, - .llseek = default_llseek, -}; - -/** - * rdtgroup_pseudo_lock_create - Create a pseudo-locked region - * @rdtgrp: resource group to which pseudo-lock region belongs - * - * Called when a resource group in the pseudo-locksetup mode receives a - * valid schemata that should be pseudo-locked. Since the resource group is - * in pseudo-locksetup mode the &struct pseudo_lock_region has already been - * allocated and initialized with the essential information. If a failure - * occurs the resource group remains in the pseudo-locksetup mode with the - * &struct pseudo_lock_region associated with it, but cleared from all - * information and ready for the user to re-attempt pseudo-locking by - * writing the schemata again. - * - * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0 - * on failure. Descriptive error will be written to last_cmd_status buffer. 
- */ -int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) -{ - struct pseudo_lock_region *plr = rdtgrp->plr; - struct task_struct *thread; - unsigned int new_minor; - struct device *dev; - char *kn_name __free(kfree) = NULL; - int ret; - - ret = pseudo_lock_region_alloc(plr); - if (ret < 0) - return ret; - - ret = pseudo_lock_cstates_constrain(plr); - if (ret < 0) { - ret = -EINVAL; - goto out_region; - } - kn_name = kstrdup(rdt_kn_name(rdtgrp->kn), GFP_KERNEL); - if (!kn_name) { - ret = -ENOMEM; - goto out_cstates; - } - - plr->thread_done = 0; - - thread = kthread_run_on_cpu(resctrl_arch_pseudo_lock_fn, plr, - plr->cpu, "pseudo_lock/%u"); - if (IS_ERR(thread)) { - ret = PTR_ERR(thread); - rdt_last_cmd_printf("Locking thread returned error %d\n", ret); - goto out_cstates; - } - - ret = wait_event_interruptible(plr->lock_thread_wq, - plr->thread_done == 1); - if (ret < 0) { - /* - * If the thread does not get on the CPU for whatever - * reason and the process which sets up the region is - * interrupted then this will leave the thread in runnable - * state and once it gets on the CPU it will dereference - * the cleared, but not freed, plr struct resulting in an - * empty pseudo-locking loop. - */ - rdt_last_cmd_puts("Locking thread interrupted\n"); - goto out_cstates; - } - - ret = pseudo_lock_minor_get(&new_minor); - if (ret < 0) { - rdt_last_cmd_puts("Unable to obtain a new minor number\n"); - goto out_cstates; - } - - /* - * Unlock access but do not release the reference. The - * pseudo-locked region will still be here on return. - * - * The mutex has to be released temporarily to avoid a potential - * deadlock with the mm->mmap_lock which is obtained in the - * device_create() and debugfs_create_dir() callpath below as well as - * before the mmap() callback is called. - */ - mutex_unlock(&rdtgroup_mutex); - - if (!IS_ERR_OR_NULL(debugfs_resctrl)) { - plr->debugfs_dir = debugfs_create_dir(kn_name, debugfs_resctrl); - if (!IS_ERR_OR_NULL(plr->debugfs_dir)) - debugfs_create_file("pseudo_lock_measure", 0200, - plr->debugfs_dir, rdtgrp, - &pseudo_measure_fops); - } - - dev = device_create(&pseudo_lock_class, NULL, - MKDEV(pseudo_lock_major, new_minor), - rdtgrp, "%s", kn_name); - - mutex_lock(&rdtgroup_mutex); - - if (IS_ERR(dev)) { - ret = PTR_ERR(dev); - rdt_last_cmd_printf("Failed to create character device: %d\n", - ret); - goto out_debugfs; - } - - /* We released the mutex - check if group was removed while we did so */ - if (rdtgrp->flags & RDT_DELETED) { - ret = -ENODEV; - goto out_device; - } - - plr->minor = new_minor; - - rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED; - closid_free(rdtgrp->closid); - rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444); - rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444); - - ret = 0; - goto out; - -out_device: - device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor)); -out_debugfs: - debugfs_remove_recursive(plr->debugfs_dir); - pseudo_lock_minor_release(new_minor); -out_cstates: - pseudo_lock_cstates_relax(plr); -out_region: - pseudo_lock_region_clear(plr); -out: - return ret; -} - -/** - * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region - * @rdtgrp: resource group to which the pseudo-locked region belongs - * - * The removal of a pseudo-locked region can be initiated when the resource - * group is removed from user space via a "rmdir" from userspace or the - * unmount of the resctrl filesystem. 
On removal the resource group does - * not go back to pseudo-locksetup mode before it is removed, instead it is - * removed directly. There is thus asymmetry with the creation where the - * &struct pseudo_lock_region is removed here while it was not created in - * rdtgroup_pseudo_lock_create(). - * - * Return: void - */ -void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) -{ - struct pseudo_lock_region *plr = rdtgrp->plr; - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - /* - * Default group cannot be a pseudo-locked region so we can - * free closid here. - */ - closid_free(rdtgrp->closid); - goto free; - } - - pseudo_lock_cstates_relax(plr); - debugfs_remove_recursive(rdtgrp->plr->debugfs_dir); - device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor)); - pseudo_lock_minor_release(plr->minor); - -free: - pseudo_lock_free(rdtgrp); -} - -static int pseudo_lock_dev_open(struct inode *inode, struct file *filp) -{ - struct rdtgroup *rdtgrp; - - mutex_lock(&rdtgroup_mutex); - - rdtgrp = region_find_by_minor(iminor(inode)); - if (!rdtgrp) { - mutex_unlock(&rdtgroup_mutex); - return -ENODEV; - } - - filp->private_data = rdtgrp; - atomic_inc(&rdtgrp->waitcount); - /* Perform a non-seekable open - llseek is not supported */ - filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); - - mutex_unlock(&rdtgroup_mutex); - - return 0; -} - -static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) -{ - struct rdtgroup *rdtgrp; - - mutex_lock(&rdtgroup_mutex); - rdtgrp = filp->private_data; - WARN_ON(!rdtgrp); - if (!rdtgrp) { - mutex_unlock(&rdtgroup_mutex); - return -ENODEV; - } - filp->private_data = NULL; - atomic_dec(&rdtgrp->waitcount); - mutex_unlock(&rdtgroup_mutex); - return 0; -} - -static int pseudo_lock_dev_mremap(struct vm_area_struct *area) -{ - /* Not supported */ - return -EINVAL; -} - -static const struct vm_operations_struct pseudo_mmap_ops = { - .mremap = pseudo_lock_dev_mremap, -}; - -static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) -{ - unsigned long vsize = vma->vm_end - vma->vm_start; - unsigned long off = vma->vm_pgoff << PAGE_SHIFT; - struct pseudo_lock_region *plr; - struct rdtgroup *rdtgrp; - unsigned long physical; - unsigned long psize; - - mutex_lock(&rdtgroup_mutex); - - rdtgrp = filp->private_data; - WARN_ON(!rdtgrp); - if (!rdtgrp) { - mutex_unlock(&rdtgroup_mutex); - return -ENODEV; - } - - plr = rdtgrp->plr; - - if (!plr->d) { - mutex_unlock(&rdtgroup_mutex); - return -ENODEV; - } - - /* - * Task is required to run with affinity to the cpus associated - * with the pseudo-locked region. If this is not the case the task - * may be scheduled elsewhere and invalidate entries in the - * pseudo-locked region. - */ - if (!cpumask_subset(current->cpus_ptr, &plr->d->hdr.cpu_mask)) { - mutex_unlock(&rdtgroup_mutex); - return -EINVAL; - } - - physical = __pa(plr->kmem) >> PAGE_SHIFT; - psize = plr->size - off; - - if (off > plr->size) { - mutex_unlock(&rdtgroup_mutex); - return -ENOSPC; - } - - /* - * Ensure changes are carried directly to the memory being mapped, - * do not allow copy-on-write mapping. 
- */ - if (!(vma->vm_flags & VM_SHARED)) { - mutex_unlock(&rdtgroup_mutex); - return -EINVAL; - } - - if (vsize > psize) { - mutex_unlock(&rdtgroup_mutex); - return -ENOSPC; - } - - memset(plr->kmem + off, 0, vsize); - - if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff, - vsize, vma->vm_page_prot)) { - mutex_unlock(&rdtgroup_mutex); - return -EAGAIN; - } - vma->vm_ops = &pseudo_mmap_ops; - mutex_unlock(&rdtgroup_mutex); - return 0; -} - -static const struct file_operations pseudo_lock_dev_fops = { - .owner = THIS_MODULE, - .read = NULL, - .write = NULL, - .open = pseudo_lock_dev_open, - .release = pseudo_lock_dev_release, - .mmap = pseudo_lock_dev_mmap, -}; - -int rdt_pseudo_lock_init(void) -{ - int ret; - - ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops); - if (ret < 0) - return ret; - - pseudo_lock_major = ret; - - ret = class_register(&pseudo_lock_class); - if (ret) { - unregister_chrdev(pseudo_lock_major, "pseudo_lock"); - return ret; - } - - return 0; -} - -void rdt_pseudo_lock_release(void) -{ - class_unregister(&pseudo_lock_class); - unregister_chrdev(pseudo_lock_major, "pseudo_lock"); - pseudo_lock_major = 0; -} diff --git a/arch/x86/kernel/cpu/resctrl/trace.h b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h index 2a506316b303..7c8aef08010f 100644 --- a/arch/x86/kernel/cpu/resctrl/trace.h +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h @@ -2,8 +2,8 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM resctrl -#if !defined(_TRACE_RESCTRL_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_RESCTRL_H +#if !defined(_X86_RESCTRL_PSEUDO_LOCK_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _X86_RESCTRL_PSEUDO_LOCK_TRACE_H #include <linux/tracepoint.h> @@ -35,25 +35,11 @@ TRACE_EVENT(pseudo_lock_l3, TP_printk("hits=%llu miss=%llu", __entry->l3_hits, __entry->l3_miss)); -TRACE_EVENT(mon_llc_occupancy_limbo, - TP_PROTO(u32 ctrl_hw_id, u32 mon_hw_id, int domain_id, u64 llc_occupancy_bytes), - TP_ARGS(ctrl_hw_id, mon_hw_id, domain_id, llc_occupancy_bytes), - TP_STRUCT__entry(__field(u32, ctrl_hw_id) - __field(u32, mon_hw_id) - __field(int, domain_id) - __field(u64, llc_occupancy_bytes)), - TP_fast_assign(__entry->ctrl_hw_id = ctrl_hw_id; - __entry->mon_hw_id = mon_hw_id; - __entry->domain_id = domain_id; - __entry->llc_occupancy_bytes = llc_occupancy_bytes;), - TP_printk("ctrl_hw_id=%u mon_hw_id=%u domain_id=%d llc_occupancy_bytes=%llu", - __entry->ctrl_hw_id, __entry->mon_hw_id, __entry->domain_id, - __entry->llc_occupancy_bytes) - ); - -#endif /* _TRACE_RESCTRL_H */ +#endif /* _X86_RESCTRL_PSEUDO_LOCK_TRACE_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . -#define TRACE_INCLUDE_FILE trace + +#define TRACE_INCLUDE_FILE pseudo_lock_trace + #include <trace/define_trace.h> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index cc4a54145c83..885026468440 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -18,6 +18,7 @@ #include <linux/fs_parser.h> #include <linux/sysfs.h> #include <linux/kernfs.h> +#include <linux/resctrl.h> #include <linux/seq_buf.h> #include <linux/seq_file.h> #include <linux/sched/signal.h> @@ -28,341 +29,17 @@ #include <uapi/linux/magic.h> -#include <asm/resctrl.h> +#include <asm/msr.h> #include "internal.h" DEFINE_STATIC_KEY_FALSE(rdt_enable_key); -DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); -DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); - -/* Mutex to protect rdtgroup access. 
*/ -DEFINE_MUTEX(rdtgroup_mutex); - -static struct kernfs_root *rdt_root; -struct rdtgroup rdtgroup_default; -LIST_HEAD(rdt_all_groups); - -/* list of entries for the schemata file */ -LIST_HEAD(resctrl_schema_all); - -/* The filesystem can only be mounted once. */ -bool resctrl_mounted; - -/* Kernel fs node for "info" directory under root */ -static struct kernfs_node *kn_info; - -/* Kernel fs node for "mon_groups" directory under root */ -static struct kernfs_node *kn_mongrp; - -/* Kernel fs node for "mon_data" directory under root */ -static struct kernfs_node *kn_mondata; - -/* - * Used to store the max resource name width to display the schemata names in - * a tabular format. - */ -int max_name_width; - -static struct seq_buf last_cmd_status; -static char last_cmd_status_buf[512]; - -static int rdtgroup_setup_root(struct rdt_fs_context *ctx); -static void rdtgroup_destroy_root(void); - -struct dentry *debugfs_resctrl; - -/* - * Memory bandwidth monitoring event to use for the default CTRL_MON group - * and each new CTRL_MON group created by the user. Only relevant when - * the filesystem is mounted with the "mba_MBps" option so it does not - * matter that it remains uninitialized on systems that do not support - * the "mba_MBps" option. - */ -enum resctrl_event_id mba_mbps_default_event; - -static bool resctrl_debug; - -void rdt_last_cmd_clear(void) -{ - lockdep_assert_held(&rdtgroup_mutex); - seq_buf_clear(&last_cmd_status); -} - -void rdt_last_cmd_puts(const char *s) -{ - lockdep_assert_held(&rdtgroup_mutex); - seq_buf_puts(&last_cmd_status, s); -} - -void rdt_last_cmd_printf(const char *fmt, ...) -{ - va_list ap; - - va_start(ap, fmt); - lockdep_assert_held(&rdtgroup_mutex); - seq_buf_vprintf(&last_cmd_status, fmt, ap); - va_end(ap); -} - -void rdt_staged_configs_clear(void) -{ - struct rdt_ctrl_domain *dom; - struct rdt_resource *r; - - lockdep_assert_held(&rdtgroup_mutex); - - for_each_alloc_capable_rdt_resource(r) { - list_for_each_entry(dom, &r->ctrl_domains, hdr.list) - memset(dom->staged_config, 0, sizeof(dom->staged_config)); - } -} - -static bool resctrl_is_mbm_enabled(void) -{ - return (resctrl_arch_is_mbm_total_enabled() || - resctrl_arch_is_mbm_local_enabled()); -} - -static bool resctrl_is_mbm_event(int e) -{ - return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && - e <= QOS_L3_MBM_LOCAL_EVENT_ID); -} - -/* - * Trivial allocator for CLOSIDs. Since h/w only supports a small number, - * we can keep a bitmap of free CLOSIDs in a single integer. - * - * Using a global CLOSID across all resources has some advantages and - * some drawbacks: - * + We can simply set current's closid to assign a task to a resource - * group. - * + Context switch code can avoid extra memory references deciding which - * CLOSID to load into the PQR_ASSOC MSR - * - We give up some options in configuring resource groups across multi-socket - * systems. - * - Our choices on how to configure each resource become progressively more - * limited as the number of resources grows. 
- */ -static unsigned long closid_free_map; -static int closid_free_map_len; - -int closids_supported(void) -{ - return closid_free_map_len; -} - -static void closid_init(void) -{ - struct resctrl_schema *s; - u32 rdt_min_closid = 32; - - /* Compute rdt_min_closid across all resources */ - list_for_each_entry(s, &resctrl_schema_all, list) - rdt_min_closid = min(rdt_min_closid, s->num_closid); - - closid_free_map = BIT_MASK(rdt_min_closid) - 1; - - /* RESCTRL_RESERVED_CLOSID is always reserved for the default group */ - __clear_bit(RESCTRL_RESERVED_CLOSID, &closid_free_map); - closid_free_map_len = rdt_min_closid; -} - -static int closid_alloc(void) -{ - int cleanest_closid; - u32 closid; - - lockdep_assert_held(&rdtgroup_mutex); - - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) && - resctrl_arch_is_llc_occupancy_enabled()) { - cleanest_closid = resctrl_find_cleanest_closid(); - if (cleanest_closid < 0) - return cleanest_closid; - closid = cleanest_closid; - } else { - closid = ffs(closid_free_map); - if (closid == 0) - return -ENOSPC; - closid--; - } - __clear_bit(closid, &closid_free_map); - - return closid; -} - -void closid_free(int closid) -{ - lockdep_assert_held(&rdtgroup_mutex); - - __set_bit(closid, &closid_free_map); -} - -/** - * closid_allocated - test if provided closid is in use - * @closid: closid to be tested - * - * Return: true if @closid is currently associated with a resource group, - * false if @closid is free - */ -bool closid_allocated(unsigned int closid) -{ - lockdep_assert_held(&rdtgroup_mutex); - - return !test_bit(closid, &closid_free_map); -} - -/** - * rdtgroup_mode_by_closid - Return mode of resource group with closid - * @closid: closid if the resource group - * - * Each resource group is associated with a @closid. Here the mode - * of a resource group can be queried by searching for it using its closid. 
- * - * Return: mode as &enum rdtgrp_mode of resource group with closid @closid - */ -enum rdtgrp_mode rdtgroup_mode_by_closid(int closid) -{ - struct rdtgroup *rdtgrp; - - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (rdtgrp->closid == closid) - return rdtgrp->mode; - } - - return RDT_NUM_MODES; -} - -static const char * const rdt_mode_str[] = { - [RDT_MODE_SHAREABLE] = "shareable", - [RDT_MODE_EXCLUSIVE] = "exclusive", - [RDT_MODE_PSEUDO_LOCKSETUP] = "pseudo-locksetup", - [RDT_MODE_PSEUDO_LOCKED] = "pseudo-locked", -}; - -/** - * rdtgroup_mode_str - Return the string representation of mode - * @mode: the resource group mode as &enum rdtgroup_mode - * - * Return: string representation of valid mode, "unknown" otherwise - */ -static const char *rdtgroup_mode_str(enum rdtgrp_mode mode) -{ - if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES) - return "unknown"; - - return rdt_mode_str[mode]; -} -/* set uid and gid of rdtgroup dirs and files to that of the creator */ -static int rdtgroup_kn_set_ugid(struct kernfs_node *kn) -{ - struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, - .ia_uid = current_fsuid(), - .ia_gid = current_fsgid(), }; - - if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && - gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) - return 0; - - return kernfs_setattr(kn, &iattr); -} - -static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) -{ - struct kernfs_node *kn; - int ret; - - kn = __kernfs_create_file(parent_kn, rft->name, rft->mode, - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, - 0, rft->kf_ops, rft, NULL, NULL); - if (IS_ERR(kn)) - return PTR_ERR(kn); - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) { - kernfs_remove(kn); - return ret; - } - - return 0; -} - -static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) -{ - struct kernfs_open_file *of = m->private; - struct rftype *rft = of->kn->priv; - - if (rft->seq_show) - return rft->seq_show(of, m, arg); - return 0; -} - -static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off) -{ - struct rftype *rft = of->kn->priv; - - if (rft->write) - return rft->write(of, buf, nbytes, off); - - return -EINVAL; -} - -static const struct kernfs_ops rdtgroup_kf_single_ops = { - .atomic_write_len = PAGE_SIZE, - .write = rdtgroup_file_write, - .seq_show = rdtgroup_seqfile_show, -}; - -static const struct kernfs_ops kf_mondata_ops = { - .atomic_write_len = PAGE_SIZE, - .seq_show = rdtgroup_mondata_show, -}; - -static bool is_cpu_list(struct kernfs_open_file *of) -{ - struct rftype *rft = of->kn->priv; - - return rft->flags & RFTYPE_FLAGS_CPUS_LIST; -} - -static int rdtgroup_cpus_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - struct cpumask *mask; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - - if (rdtgrp) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - if (!rdtgrp->plr->d) { - rdt_last_cmd_clear(); - rdt_last_cmd_puts("Cache domain offline\n"); - ret = -ENODEV; - } else { - mask = &rdtgrp->plr->d->hdr.cpu_mask; - seq_printf(s, is_cpu_list(of) ? - "%*pbl\n" : "%*pb\n", - cpumask_pr_args(mask)); - } - } else { - seq_printf(s, is_cpu_list(of) ? 
"%*pbl\n" : "%*pb\n", - cpumask_pr_args(&rdtgrp->cpu_mask)); - } - } else { - ret = -ENOENT; - } - rdtgroup_kn_unlock(of->kn); +DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); - return ret; -} +DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); /* - * This is safe against resctrl_sched_in() called from __switch_to() + * This is safe against resctrl_arch_sched_in() called from __switch_to() * because __switch_to() is executed with interrupts disabled. A local call * from update_closid_rmid() is protected against __switch_to() because * preemption is disabled. @@ -381,1223 +58,7 @@ void resctrl_arch_sync_cpu_closid_rmid(void *info) * executing task might have its own closid selected. Just reuse * the context switch code. */ - resctrl_sched_in(current); -} - -/* - * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, - * - * Per task closids/rmids must have been set up before calling this function. - * @r may be NULL. - */ -static void -update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) -{ - struct resctrl_cpu_defaults defaults, *p = NULL; - - if (r) { - defaults.closid = r->closid; - defaults.rmid = r->mon.rmid; - p = &defaults; - } - - on_each_cpu_mask(cpu_mask, resctrl_arch_sync_cpu_closid_rmid, p, 1); -} - -static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, - cpumask_var_t tmpmask) -{ - struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp; - struct list_head *head; - - /* Check whether cpus belong to parent ctrl group */ - cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); - if (!cpumask_empty(tmpmask)) { - rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n"); - return -EINVAL; - } - - /* Check whether cpus are dropped from this group */ - cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (!cpumask_empty(tmpmask)) { - /* Give any dropped cpus to parent rdtgroup */ - cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); - update_closid_rmid(tmpmask, prgrp); - } - - /* - * If we added cpus, remove them from previous group that owned them - * and update per-cpu rmid - */ - cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (!cpumask_empty(tmpmask)) { - head = &prgrp->mon.crdtgrp_list; - list_for_each_entry(crgrp, head, mon.crdtgrp_list) { - if (crgrp == rdtgrp) - continue; - cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask, - tmpmask); - } - update_closid_rmid(tmpmask, rdtgrp); - } - - /* Done pushing/pulling - update this group with new mask */ - cpumask_copy(&rdtgrp->cpu_mask, newmask); - - return 0; -} - -static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m) -{ - struct rdtgroup *crgrp; - - cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m); - /* update the child mon group masks as well*/ - list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list) - cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask); -} - -static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, - cpumask_var_t tmpmask, cpumask_var_t tmpmask1) -{ - struct rdtgroup *r, *crgrp; - struct list_head *head; - - /* Check whether cpus are dropped from this group */ - cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (!cpumask_empty(tmpmask)) { - /* Can't drop from default group */ - if (rdtgrp == &rdtgroup_default) { - rdt_last_cmd_puts("Can't drop CPUs from default group\n"); - return -EINVAL; - } - - /* Give any dropped cpus to rdtgroup_default */ - cpumask_or(&rdtgroup_default.cpu_mask, - &rdtgroup_default.cpu_mask, tmpmask); - update_closid_rmid(tmpmask, &rdtgroup_default); - } - - /* - * If 
we added cpus, remove them from previous group and - * the prev group's child groups that owned them - * and update per-cpu closid/rmid. - */ - cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (!cpumask_empty(tmpmask)) { - list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { - if (r == rdtgrp) - continue; - cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); - if (!cpumask_empty(tmpmask1)) - cpumask_rdtgrp_clear(r, tmpmask1); - } - update_closid_rmid(tmpmask, rdtgrp); - } - - /* Done pushing/pulling - update this group with new mask */ - cpumask_copy(&rdtgrp->cpu_mask, newmask); - - /* - * Clear child mon group masks since there is a new parent mask - * now and update the rmid for the cpus the child lost. - */ - head = &rdtgrp->mon.crdtgrp_list; - list_for_each_entry(crgrp, head, mon.crdtgrp_list) { - cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask); - update_closid_rmid(tmpmask, rdtgrp); - cpumask_clear(&crgrp->cpu_mask); - } - - return 0; -} - -static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - cpumask_var_t tmpmask, newmask, tmpmask1; - struct rdtgroup *rdtgrp; - int ret; - - if (!buf) - return -EINVAL; - - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) - return -ENOMEM; - if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) { - free_cpumask_var(tmpmask); - return -ENOMEM; - } - if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) { - free_cpumask_var(tmpmask); - free_cpumask_var(newmask); - return -ENOMEM; - } - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - ret = -ENOENT; - goto unlock; - } - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - ret = -EINVAL; - rdt_last_cmd_puts("Pseudo-locking in progress\n"); - goto unlock; - } - - if (is_cpu_list(of)) - ret = cpulist_parse(buf, newmask); - else - ret = cpumask_parse(buf, newmask); - - if (ret) { - rdt_last_cmd_puts("Bad CPU list/mask\n"); - goto unlock; - } - - /* check that user didn't specify any offline cpus */ - cpumask_andnot(tmpmask, newmask, cpu_online_mask); - if (!cpumask_empty(tmpmask)) { - ret = -EINVAL; - rdt_last_cmd_puts("Can only assign online CPUs\n"); - goto unlock; - } - - if (rdtgrp->type == RDTCTRL_GROUP) - ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1); - else if (rdtgrp->type == RDTMON_GROUP) - ret = cpus_mon_write(rdtgrp, newmask, tmpmask); - else - ret = -EINVAL; - -unlock: - rdtgroup_kn_unlock(of->kn); - free_cpumask_var(tmpmask); - free_cpumask_var(newmask); - free_cpumask_var(tmpmask1); - - return ret ?: nbytes; -} - -/** - * rdtgroup_remove - the helper to remove resource group safely - * @rdtgrp: resource group to remove - * - * On resource group creation via a mkdir, an extra kernfs_node reference is - * taken to ensure that the rdtgroup structure remains accessible for the - * rdtgroup_kn_unlock() calls where it is removed. - * - * Drop the extra reference here, then free the rdtgroup structure. - * - * Return: void - */ -static void rdtgroup_remove(struct rdtgroup *rdtgrp) -{ - kernfs_put(rdtgrp->kn); - kfree(rdtgrp); -} - -static void _update_task_closid_rmid(void *task) -{ - /* - * If the task is still current on this CPU, update PQR_ASSOC MSR. - * Otherwise, the MSR is updated when the task is scheduled in. 
- */ - if (task == current) - resctrl_sched_in(task); -} - -static void update_task_closid_rmid(struct task_struct *t) -{ - if (IS_ENABLED(CONFIG_SMP) && task_curr(t)) - smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1); - else - _update_task_closid_rmid(t); -} - -static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp) -{ - u32 closid, rmid = rdtgrp->mon.rmid; - - if (rdtgrp->type == RDTCTRL_GROUP) - closid = rdtgrp->closid; - else if (rdtgrp->type == RDTMON_GROUP) - closid = rdtgrp->mon.parent->closid; - else - return false; - - return resctrl_arch_match_closid(tsk, closid) && - resctrl_arch_match_rmid(tsk, closid, rmid); -} - -static int __rdtgroup_move_task(struct task_struct *tsk, - struct rdtgroup *rdtgrp) -{ - /* If the task is already in rdtgrp, no need to move the task. */ - if (task_in_rdtgroup(tsk, rdtgrp)) - return 0; - - /* - * Set the task's closid/rmid before the PQR_ASSOC MSR can be - * updated by them. - * - * For ctrl_mon groups, move both closid and rmid. - * For monitor groups, can move the tasks only from - * their parent CTRL group. - */ - if (rdtgrp->type == RDTMON_GROUP && - !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) { - rdt_last_cmd_puts("Can't move task to different control group\n"); - return -EINVAL; - } - - if (rdtgrp->type == RDTMON_GROUP) - resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid, - rdtgrp->mon.rmid); - else - resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid, - rdtgrp->mon.rmid); - - /* - * Ensure the task's closid and rmid are written before determining if - * the task is current that will decide if it will be interrupted. - * This pairs with the full barrier between the rq->curr update and - * resctrl_sched_in() during context switch. - */ - smp_mb(); - - /* - * By now, the task's closid and rmid are set. If the task is current - * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource - * group go into effect. If the task is not current, the MSR will be - * updated when the task is scheduled in. - */ - update_task_closid_rmid(tsk); - - return 0; -} - -static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) -{ - return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) && - resctrl_arch_match_closid(t, r->closid)); -} - -static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) -{ - return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) && - resctrl_arch_match_rmid(t, r->mon.parent->closid, - r->mon.rmid)); -} - -/** - * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group - * @r: Resource group - * - * Return: 1 if tasks have been assigned to @r, 0 otherwise - */ -int rdtgroup_tasks_assigned(struct rdtgroup *r) -{ - struct task_struct *p, *t; - int ret = 0; - - lockdep_assert_held(&rdtgroup_mutex); - - rcu_read_lock(); - for_each_process_thread(p, t) { - if (is_closid_match(t, r) || is_rmid_match(t, r)) { - ret = 1; - break; - } - } - rcu_read_unlock(); - - return ret; -} - -static int rdtgroup_task_write_permission(struct task_struct *task, - struct kernfs_open_file *of) -{ - const struct cred *tcred = get_task_cred(task); - const struct cred *cred = current_cred(); - int ret = 0; - - /* - * Even if we're attaching all tasks in the thread group, we only - * need to check permissions on one of them. 
- */ - if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && - !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) { - rdt_last_cmd_printf("No permission to move task %d\n", task->pid); - ret = -EPERM; - } - - put_cred(tcred); - return ret; -} - -static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp, - struct kernfs_open_file *of) -{ - struct task_struct *tsk; - int ret; - - rcu_read_lock(); - if (pid) { - tsk = find_task_by_vpid(pid); - if (!tsk) { - rcu_read_unlock(); - rdt_last_cmd_printf("No task %d\n", pid); - return -ESRCH; - } - } else { - tsk = current; - } - - get_task_struct(tsk); - rcu_read_unlock(); - - ret = rdtgroup_task_write_permission(tsk, of); - if (!ret) - ret = __rdtgroup_move_task(tsk, rdtgrp); - - put_task_struct(tsk); - return ret; -} - -static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct rdtgroup *rdtgrp; - char *pid_str; - int ret = 0; - pid_t pid; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - rdt_last_cmd_clear(); - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - ret = -EINVAL; - rdt_last_cmd_puts("Pseudo-locking in progress\n"); - goto unlock; - } - - while (buf && buf[0] != '\0' && buf[0] != '\n') { - pid_str = strim(strsep(&buf, ",")); - - if (kstrtoint(pid_str, 0, &pid)) { - rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str); - ret = -EINVAL; - break; - } - - if (pid < 0) { - rdt_last_cmd_printf("Invalid pid %d\n", pid); - ret = -EINVAL; - break; - } - - ret = rdtgroup_move_task(pid, rdtgrp, of); - if (ret) { - rdt_last_cmd_printf("Error while processing task %d\n", pid); - break; - } - } - -unlock: - rdtgroup_kn_unlock(of->kn); - - return ret ?: nbytes; -} - -static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) -{ - struct task_struct *p, *t; - pid_t pid; - - rcu_read_lock(); - for_each_process_thread(p, t) { - if (is_closid_match(t, r) || is_rmid_match(t, r)) { - pid = task_pid_vnr(t); - if (pid) - seq_printf(s, "%d\n", pid); - } - } - rcu_read_unlock(); -} - -static int rdtgroup_tasks_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) - show_rdt_tasks(rdtgrp, s); - else - ret = -ENOENT; - rdtgroup_kn_unlock(of->kn); - - return ret; -} - -static int rdtgroup_closid_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) - seq_printf(s, "%u\n", rdtgrp->closid); - else - ret = -ENOENT; - rdtgroup_kn_unlock(of->kn); - - return ret; -} - -static int rdtgroup_rmid_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) - seq_printf(s, "%u\n", rdtgrp->mon.rmid); - else - ret = -ENOENT; - rdtgroup_kn_unlock(of->kn); - - return ret; -} - -#ifdef CONFIG_PROC_CPU_RESCTRL - -/* - * A task can only be part of one resctrl control group and of one monitor - * group which is associated to that control group. - * - * 1) res: - * mon: - * - * resctrl is not available. - * - * 2) res:/ - * mon: - * - * Task is part of the root resctrl control group, and it is not associated - * to any monitor group. - * - * 3) res:/ - * mon:mon0 - * - * Task is part of the root resctrl control group and monitor group mon0. 
- * - * 4) res:group0 - * mon: - * - * Task is part of resctrl control group group0, and it is not associated - * to any monitor group. - * - * 5) res:group0 - * mon:mon1 - * - * Task is part of resctrl control group group0 and monitor group mon1. - */ -int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, - struct pid *pid, struct task_struct *tsk) -{ - struct rdtgroup *rdtg; - int ret = 0; - - mutex_lock(&rdtgroup_mutex); - - /* Return empty if resctrl has not been mounted. */ - if (!resctrl_mounted) { - seq_puts(s, "res:\nmon:\n"); - goto unlock; - } - - list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) { - struct rdtgroup *crg; - - /* - * Task information is only relevant for shareable - * and exclusive groups. - */ - if (rdtg->mode != RDT_MODE_SHAREABLE && - rdtg->mode != RDT_MODE_EXCLUSIVE) - continue; - - if (!resctrl_arch_match_closid(tsk, rdtg->closid)) - continue; - - seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "", - rdt_kn_name(rdtg->kn)); - seq_puts(s, "mon:"); - list_for_each_entry(crg, &rdtg->mon.crdtgrp_list, - mon.crdtgrp_list) { - if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid, - crg->mon.rmid)) - continue; - seq_printf(s, "%s", rdt_kn_name(crg->kn)); - break; - } - seq_putc(s, '\n'); - goto unlock; - } - /* - * The above search should succeed. Otherwise return - * with an error. - */ - ret = -ENOENT; -unlock: - mutex_unlock(&rdtgroup_mutex); - - return ret; -} -#endif - -static int rdt_last_cmd_status_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - int len; - - mutex_lock(&rdtgroup_mutex); - len = seq_buf_used(&last_cmd_status); - if (len) - seq_printf(seq, "%.*s", len, last_cmd_status_buf); - else - seq_puts(seq, "ok\n"); - mutex_unlock(&rdtgroup_mutex); - return 0; -} - -static void *rdt_kn_parent_priv(struct kernfs_node *kn) -{ - /* - * The parent pointer is only valid within RCU section since it can be - * replaced. - */ - guard(rcu)(); - return rcu_dereference(kn->__parent)->priv; -} - -static int rdt_num_closids_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - - seq_printf(seq, "%u\n", s->num_closid); - return 0; -} - -static int rdt_default_ctrl_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - seq_printf(seq, "%x\n", resctrl_get_default_ctrl(r)); - return 0; -} - -static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->cache.min_cbm_bits); - return 0; -} - -static int rdt_shareable_bits_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - seq_printf(seq, "%x\n", r->cache.shareable_bits); - return 0; -} - -/* - * rdt_bit_usage_show - Display current usage of resources - * - * A domain is a shared resource that can now be allocated differently. Here - * we display the current regions of the domain as an annotated bitmask. 
- * For each domain of this resource its allocation bitmask - * is annotated as below to indicate the current usage of the corresponding bit: - * 0 - currently unused - * X - currently available for sharing and used by software and hardware - * H - currently used by hardware only but available for software use - * S - currently used and shareable by software only - * E - currently used exclusively by one resource group - * P - currently pseudo-locked by one resource group - */ -static int rdt_bit_usage_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - /* - * Use unsigned long even though only 32 bits are used to ensure - * test_bit() is used safely. - */ - unsigned long sw_shareable = 0, hw_shareable = 0; - unsigned long exclusive = 0, pseudo_locked = 0; - struct rdt_resource *r = s->res; - struct rdt_ctrl_domain *dom; - int i, hwb, swb, excl, psl; - enum rdtgrp_mode mode; - bool sep = false; - u32 ctrl_val; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - hw_shareable = r->cache.shareable_bits; - list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { - if (sep) - seq_putc(seq, ';'); - sw_shareable = 0; - exclusive = 0; - seq_printf(seq, "%d=", dom->hdr.id); - for (i = 0; i < closids_supported(); i++) { - if (!closid_allocated(i)) - continue; - ctrl_val = resctrl_arch_get_config(r, dom, i, - s->conf_type); - mode = rdtgroup_mode_by_closid(i); - switch (mode) { - case RDT_MODE_SHAREABLE: - sw_shareable |= ctrl_val; - break; - case RDT_MODE_EXCLUSIVE: - exclusive |= ctrl_val; - break; - case RDT_MODE_PSEUDO_LOCKSETUP: - /* - * RDT_MODE_PSEUDO_LOCKSETUP is possible - * here but not included since the CBM - * associated with this CLOSID in this mode - * is not initialized and no task or cpu can be - * assigned this CLOSID. - */ - break; - case RDT_MODE_PSEUDO_LOCKED: - case RDT_NUM_MODES: - WARN(1, - "invalid mode for closid %d\n", i); - break; - } - } - for (i = r->cache.cbm_len - 1; i >= 0; i--) { - pseudo_locked = dom->plr ? 
dom->plr->cbm : 0; - hwb = test_bit(i, &hw_shareable); - swb = test_bit(i, &sw_shareable); - excl = test_bit(i, &exclusive); - psl = test_bit(i, &pseudo_locked); - if (hwb && swb) - seq_putc(seq, 'X'); - else if (hwb && !swb) - seq_putc(seq, 'H'); - else if (!hwb && swb) - seq_putc(seq, 'S'); - else if (excl) - seq_putc(seq, 'E'); - else if (psl) - seq_putc(seq, 'P'); - else /* Unused bits remain */ - seq_putc(seq, '0'); - } - sep = true; - } - seq_putc(seq, '\n'); - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - return 0; -} - -static int rdt_min_bw_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->membw.min_bw); - return 0; -} - -static int rdt_num_rmids_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - - seq_printf(seq, "%d\n", r->num_rmid); - - return 0; -} - -static int rdt_mon_features_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - struct mon_evt *mevt; - - list_for_each_entry(mevt, &r->evt_list, list) { - seq_printf(seq, "%s\n", mevt->name); - if (mevt->configurable) - seq_printf(seq, "%s_config\n", mevt->name); - } - - return 0; -} - -static int rdt_bw_gran_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->membw.bw_gran); - return 0; -} - -static int rdt_delay_linear_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->membw.delay_linear); - return 0; -} - -static int max_threshold_occ_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold); - - return 0; -} - -static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - switch (r->membw.throttle_mode) { - case THREAD_THROTTLE_PER_THREAD: - seq_puts(seq, "per-thread\n"); - return 0; - case THREAD_THROTTLE_MAX: - seq_puts(seq, "max\n"); - return 0; - case THREAD_THROTTLE_UNDEFINED: - seq_puts(seq, "undefined\n"); - return 0; - } - - WARN_ON_ONCE(1); - - return 0; -} - -static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - unsigned int bytes; - int ret; - - ret = kstrtouint(buf, 0, &bytes); - if (ret) - return ret; - - if (bytes > resctrl_rmid_realloc_limit) - return -EINVAL; - - resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes); - - return nbytes; -} - -/* - * rdtgroup_mode_show - Display mode of this resource group - */ -static int rdtgroup_mode_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - - seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode)); - - rdtgroup_kn_unlock(of->kn); - return 0; -} - -static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type) -{ - switch (my_type) { - case CDP_CODE: - return CDP_DATA; - case CDP_DATA: - return CDP_CODE; - default: - case CDP_NONE: - return 
CDP_NONE; - } -} - -static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks); - - return 0; -} - -/** - * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other - * @r: Resource to which domain instance @d belongs. - * @d: The domain instance for which @closid is being tested. - * @cbm: Capacity bitmask being tested. - * @closid: Intended closid for @cbm. - * @type: CDP type of @r. - * @exclusive: Only check if overlaps with exclusive resource groups - * - * Checks if provided @cbm intended to be used for @closid on domain - * @d overlaps with any other closids or other hardware usage associated - * with this domain. If @exclusive is true then only overlaps with - * resource groups in exclusive mode will be considered. If @exclusive - * is false then overlaps with any resource group or hardware entities - * will be considered. - * - * @cbm is unsigned long, even if only 32 bits are used, to make the - * bitmap functions work correctly. - * - * Return: false if CBM does not overlap, true if it does. - */ -static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_ctrl_domain *d, - unsigned long cbm, int closid, - enum resctrl_conf_type type, bool exclusive) -{ - enum rdtgrp_mode mode; - unsigned long ctrl_b; - int i; - - /* Check for any overlap with regions used by hardware directly */ - if (!exclusive) { - ctrl_b = r->cache.shareable_bits; - if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) - return true; - } - - /* Check for overlap with other resource groups */ - for (i = 0; i < closids_supported(); i++) { - ctrl_b = resctrl_arch_get_config(r, d, i, type); - mode = rdtgroup_mode_by_closid(i); - if (closid_allocated(i) && i != closid && - mode != RDT_MODE_PSEUDO_LOCKSETUP) { - if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) { - if (exclusive) { - if (mode == RDT_MODE_EXCLUSIVE) - return true; - continue; - } - return true; - } - } - } - - return false; -} - -/** - * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware - * @s: Schema for the resource to which domain instance @d belongs. - * @d: The domain instance for which @closid is being tested. - * @cbm: Capacity bitmask being tested. - * @closid: Intended closid for @cbm. - * @exclusive: Only check if overlaps with exclusive resource groups - * - * Resources that can be allocated using a CBM can use the CBM to control - * the overlap of these allocations. rdtgroup_cmb_overlaps() is the test - * for overlap. Overlap test is not limited to the specific resource for - * which the CBM is intended though - when dealing with CDP resources that - * share the underlying hardware the overlap check should be performed on - * the CDP resource sharing the hardware also. - * - * Refer to description of __rdtgroup_cbm_overlaps() for the details of the - * overlap test. 
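The overlap test removed here reduces to a bitmap intersection between the candidate CBM and every other allocated control value (plus the hardware-shareable bits). A reduced, single-word user-space sketch of that check with hypothetical example values; the kernel code uses bitmap_intersects() over r->cache.cbm_len bits and walks all allocated CLOSIDs.

#include <stdbool.h>
#include <stdio.h>

/* Do two capacity bitmasks share any bit inside the first cbm_len bits? */
static bool cbm_overlaps(unsigned long cbm, unsigned long other,
                         unsigned int cbm_len)
{
        unsigned long mask = (cbm_len >= sizeof(unsigned long) * 8) ?
                             ~0UL : (1UL << cbm_len) - 1;

        return (cbm & other & mask) != 0;
}

int main(void)
{
        /* bits 2-5 vs bits 0-3: bits 2 and 3 overlap */
        printf("%d\n", cbm_overlaps(0x3c, 0x0f, 11));   /* 1 */
        /* bits 4-5 vs bits 0-3: disjoint */
        printf("%d\n", cbm_overlaps(0x30, 0x0f, 11));   /* 0 */
        return 0;
}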
- * - * Return: true if CBM overlap detected, false if there is no overlap - */ -bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, - unsigned long cbm, int closid, bool exclusive) -{ - enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); - struct rdt_resource *r = s->res; - - if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type, - exclusive)) - return true; - - if (!resctrl_arch_get_cdp_enabled(r->rid)) - return false; - return __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive); -} - -/** - * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive - * @rdtgrp: Resource group identified through its closid. - * - * An exclusive resource group implies that there should be no sharing of - * its allocated resources. At the time this group is considered to be - * exclusive this test can determine if its current schemata supports this - * setting by testing for overlap with all other resource groups. - * - * Return: true if resource group can be exclusive, false if there is overlap - * with allocations of other resource groups and thus this resource group - * cannot be exclusive. - */ -static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) -{ - int closid = rdtgrp->closid; - struct rdt_ctrl_domain *d; - struct resctrl_schema *s; - struct rdt_resource *r; - bool has_cache = false; - u32 ctrl; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - list_for_each_entry(s, &resctrl_schema_all, list) { - r = s->res; - if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA) - continue; - has_cache = true; - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - ctrl = resctrl_arch_get_config(r, d, closid, - s->conf_type); - if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) { - rdt_last_cmd_puts("Schemata overlaps\n"); - return false; - } - } - } - - if (!has_cache) { - rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n"); - return false; - } - - return true; -} - -/* - * rdtgroup_mode_write - Modify the resource group's mode - */ -static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct rdtgroup *rdtgrp; - enum rdtgrp_mode mode; - int ret = 0; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - buf[nbytes - 1] = '\0'; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - - rdt_last_cmd_clear(); - - mode = rdtgrp->mode; - - if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) || - (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) || - (!strcmp(buf, "pseudo-locksetup") && - mode == RDT_MODE_PSEUDO_LOCKSETUP) || - (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED)) - goto out; - - if (mode == RDT_MODE_PSEUDO_LOCKED) { - rdt_last_cmd_puts("Cannot change pseudo-locked group\n"); - ret = -EINVAL; - goto out; - } - - if (!strcmp(buf, "shareable")) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - ret = rdtgroup_locksetup_exit(rdtgrp); - if (ret) - goto out; - } - rdtgrp->mode = RDT_MODE_SHAREABLE; - } else if (!strcmp(buf, "exclusive")) { - if (!rdtgroup_mode_test_exclusive(rdtgrp)) { - ret = -EINVAL; - goto out; - } - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - ret = rdtgroup_locksetup_exit(rdtgrp); - if (ret) - goto out; - } - rdtgrp->mode = RDT_MODE_EXCLUSIVE; - } else if (IS_ENABLED(CONFIG_RESCTRL_FS_PSEUDO_LOCK) && - !strcmp(buf, 
"pseudo-locksetup")) { - ret = rdtgroup_locksetup_enter(rdtgrp); - if (ret) - goto out; - rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP; - } else { - rdt_last_cmd_puts("Unknown or unsupported mode\n"); - ret = -EINVAL; - } - -out: - rdtgroup_kn_unlock(of->kn); - return ret ?: nbytes; -} - -/** - * rdtgroup_cbm_to_size - Translate CBM to size in bytes - * @r: RDT resource to which @d belongs. - * @d: RDT domain instance. - * @cbm: bitmask for which the size should be computed. - * - * The bitmask provided associated with the RDT domain instance @d will be - * translated into how many bytes it represents. The size in bytes is - * computed by first dividing the total cache size by the CBM length to - * determine how many bytes each bit in the bitmask represents. The result - * is multiplied with the number of bits set in the bitmask. - * - * @cbm is unsigned long, even if only 32 bits are used to make the - * bitmap functions work correctly. - */ -unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, - struct rdt_ctrl_domain *d, unsigned long cbm) -{ - unsigned int size = 0; - struct cacheinfo *ci; - int num_b; - - if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE)) - return size; - - num_b = bitmap_weight(&cbm, r->cache.cbm_len); - ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->ctrl_scope); - if (ci) - size = ci->size / r->cache.cbm_len * num_b; - - return size; -} - -/* - * rdtgroup_size_show - Display size in bytes of allocated regions - * - * The "size" file mirrors the layout of the "schemata" file, printing the - * size in bytes of each region instead of the capacity bitmask. - */ -static int rdtgroup_size_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct resctrl_schema *schema; - enum resctrl_conf_type type; - struct rdt_ctrl_domain *d; - struct rdtgroup *rdtgrp; - struct rdt_resource *r; - unsigned int size; - int ret = 0; - u32 closid; - bool sep; - u32 ctrl; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - if (!rdtgrp->plr->d) { - rdt_last_cmd_clear(); - rdt_last_cmd_puts("Cache domain offline\n"); - ret = -ENODEV; - } else { - seq_printf(s, "%*s:", max_name_width, - rdtgrp->plr->s->name); - size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res, - rdtgrp->plr->d, - rdtgrp->plr->cbm); - seq_printf(s, "%d=%u\n", rdtgrp->plr->d->hdr.id, size); - } - goto out; - } - - closid = rdtgrp->closid; - - list_for_each_entry(schema, &resctrl_schema_all, list) { - r = schema->res; - type = schema->conf_type; - sep = false; - seq_printf(s, "%*s:", max_name_width, schema->name); - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - if (sep) - seq_putc(s, ';'); - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - size = 0; - } else { - if (is_mba_sc(r)) - ctrl = d->mbps_val[closid]; - else - ctrl = resctrl_arch_get_config(r, d, - closid, - type); - if (r->rid == RDT_RESOURCE_MBA || - r->rid == RDT_RESOURCE_SMBA) - size = ctrl; - else - size = rdtgroup_cbm_to_size(r, d, ctrl); - } - seq_printf(s, "%d=%u", d->hdr.id, size); - sep = true; - } - seq_putc(s, '\n'); - } - -out: - rdtgroup_kn_unlock(of->kn); - - return ret; + resctrl_arch_sched_in(current); } #define INVALID_CONFIG_INDEX UINT_MAX @@ -1635,68 +96,12 @@ void resctrl_arch_mon_event_config_read(void *_config_info) pr_warn_once("Invalid event id %d\n", config_info->evtid); return; } - rdmsrl(MSR_IA32_EVT_CFG_BASE + index, msrval); + 
rdmsrq(MSR_IA32_EVT_CFG_BASE + index, msrval); /* Report only the valid event configuration bits */ config_info->mon_config = msrval & MAX_EVT_CONFIG_BITS; } -static void mondata_config_read(struct resctrl_mon_config_info *mon_info) -{ - smp_call_function_any(&mon_info->d->hdr.cpu_mask, - resctrl_arch_mon_event_config_read, mon_info, 1); -} - -static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid) -{ - struct resctrl_mon_config_info mon_info; - struct rdt_mon_domain *dom; - bool sep = false; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - list_for_each_entry(dom, &r->mon_domains, hdr.list) { - if (sep) - seq_puts(s, ";"); - - memset(&mon_info, 0, sizeof(struct resctrl_mon_config_info)); - mon_info.r = r; - mon_info.d = dom; - mon_info.evtid = evtid; - mondata_config_read(&mon_info); - - seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config); - sep = true; - } - seq_puts(s, "\n"); - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - - return 0; -} - -static int mbm_total_bytes_config_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - - mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID); - - return 0; -} - -static int mbm_local_bytes_config_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - - mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID); - - return 0; -} - void resctrl_arch_mon_event_config_write(void *_config_info) { struct resctrl_mon_config_info *config_info = _config_info; @@ -1707,638 +112,21 @@ void resctrl_arch_mon_event_config_write(void *_config_info) pr_warn_once("Invalid event id %d\n", config_info->evtid); return; } - wrmsr(MSR_IA32_EVT_CFG_BASE + index, config_info->mon_config, 0); -} - -static void mbm_config_write_domain(struct rdt_resource *r, - struct rdt_mon_domain *d, u32 evtid, u32 val) -{ - struct resctrl_mon_config_info mon_info = {0}; - - /* - * Read the current config value first. If both are the same then - * no need to write it again. - */ - mon_info.r = r; - mon_info.d = d; - mon_info.evtid = evtid; - mondata_config_read(&mon_info); - if (mon_info.mon_config == val) - return; - - mon_info.mon_config = val; - - /* - * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the - * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE - * are scoped at the domain level. Writing any of these MSRs - * on one CPU is observed by all the CPUs in the domain. - */ - smp_call_function_any(&d->hdr.cpu_mask, resctrl_arch_mon_event_config_write, - &mon_info, 1); - - /* - * When an Event Configuration is changed, the bandwidth counters - * for all RMIDs and Events will be cleared by the hardware. The - * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for - * every RMID on the next read to any event for every RMID. - * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62) - * cleared while it is tracked by the hardware. Clear the - * mbm_local and mbm_total counts for all the RMIDs. 
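mbm_config_write_domain(), removed above, reads the current event configuration first and skips the write when nothing changes, because any write resets the bandwidth counters for every RMID in the domain. A small user-space sketch of that read-compare-write guard, with made-up config values and printf standing in for the domain-scoped MSR access.

#include <stdio.h>

static unsigned int current_cfg = 0x33;          /* pretend MSR content */

static unsigned int config_read(void)
{
        return current_cfg;
}

static void config_write(unsigned int val)
{
        printf("write 0x%x (hardware clears all RMID counters)\n", val);
        current_cfg = val;
}

/* Skip the write, and the counter reset it triggers, when nothing changes. */
static void config_update(unsigned int val)
{
        if (config_read() == val)
                return;
        config_write(val);
}

int main(void)
{
        config_update(0x33);        /* no-op */
        config_update(0x7f);        /* really writes */
        return 0;
}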
- */ - resctrl_arch_reset_rmid_all(r, d); -} - -static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) -{ - char *dom_str = NULL, *id_str; - unsigned long dom_id, val; - struct rdt_mon_domain *d; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - -next: - if (!tok || tok[0] == '\0') - return 0; - - /* Start processing the strings for each domain */ - dom_str = strim(strsep(&tok, ";")); - id_str = strsep(&dom_str, "="); - - if (!id_str || kstrtoul(id_str, 10, &dom_id)) { - rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n"); - return -EINVAL; - } - - if (!dom_str || kstrtoul(dom_str, 16, &val)) { - rdt_last_cmd_puts("Non-numeric event configuration value\n"); - return -EINVAL; - } - - /* Value from user cannot be more than the supported set of events */ - if ((val & r->mbm_cfg_mask) != val) { - rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n", - r->mbm_cfg_mask); - return -EINVAL; - } - - list_for_each_entry(d, &r->mon_domains, hdr.list) { - if (d->hdr.id == dom_id) { - mbm_config_write_domain(r, d, evtid, val); - goto next; - } - } - - return -EINVAL; -} - -static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, - loff_t off) -{ - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - int ret; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - rdt_last_cmd_clear(); - - buf[nbytes - 1] = '\0'; - - ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID); - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - - return ret ?: nbytes; -} - -static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, - loff_t off) -{ - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - int ret; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - rdt_last_cmd_clear(); - - buf[nbytes - 1] = '\0'; - - ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID); - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - - return ret ?: nbytes; -} - -/* rdtgroup information files for one cache resource. 
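mon_config_write(), removed above, walks a "domain_id=value" list separated by ';', the format accepted by the mbm_*_bytes_config files. A stand-alone sketch of that tokenising loop using glibc's strsep(); parse_mon_config() and the sample buffer are hypothetical, and the kernel additionally trims whitespace and validates the value against r->mbm_cfg_mask.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Walk "domain_id=value;domain_id=value;..." as mon_config_write() does. */
static int parse_mon_config(char *tok)
{
        while (tok && tok[0] != '\0') {
                char *dom_str = strsep(&tok, ";");
                char *id_str = strsep(&dom_str, "=");

                if (!id_str || !dom_str)
                        return -1;          /* missing '=' */
                printf("domain %lu -> config 0x%lx\n",
                       strtoul(id_str, NULL, 10),
                       strtoul(dom_str, NULL, 16));
        }
        return 0;
}

int main(void)
{
        char buf[] = "0=0x7f;1=0x3f";       /* example user input */

        return parse_mon_config(buf) ? 1 : 0;
}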
*/ -static struct rftype res_common_files[] = { - { - .name = "last_cmd_status", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_last_cmd_status_show, - .fflags = RFTYPE_TOP_INFO, - }, - { - .name = "num_closids", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_num_closids_show, - .fflags = RFTYPE_CTRL_INFO, - }, - { - .name = "mon_features", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_mon_features_show, - .fflags = RFTYPE_MON_INFO, - }, - { - .name = "num_rmids", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_num_rmids_show, - .fflags = RFTYPE_MON_INFO, - }, - { - .name = "cbm_mask", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_default_ctrl_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "min_cbm_bits", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_min_cbm_bits_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "shareable_bits", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_shareable_bits_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "bit_usage", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_bit_usage_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "min_bandwidth", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_min_bw_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, - }, - { - .name = "bandwidth_gran", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_bw_gran_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, - }, - { - .name = "delay_linear", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_delay_linear_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, - }, - /* - * Platform specific which (if any) capabilities are provided by - * thread_throttle_mode. Defer "fflags" initialization to platform - * discovery. 
- */ - { - .name = "thread_throttle_mode", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_thread_throttle_mode_show, - }, - { - .name = "max_threshold_occupancy", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = max_threshold_occ_write, - .seq_show = max_threshold_occ_show, - .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "mbm_total_bytes_config", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = mbm_total_bytes_config_show, - .write = mbm_total_bytes_config_write, - }, - { - .name = "mbm_local_bytes_config", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = mbm_local_bytes_config_show, - .write = mbm_local_bytes_config_write, - }, - { - .name = "cpus", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - .fflags = RFTYPE_BASE, - }, - { - .name = "cpus_list", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - .flags = RFTYPE_FLAGS_CPUS_LIST, - .fflags = RFTYPE_BASE, - }, - { - .name = "tasks", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_tasks_write, - .seq_show = rdtgroup_tasks_show, - .fflags = RFTYPE_BASE, - }, - { - .name = "mon_hw_id", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdtgroup_rmid_show, - .fflags = RFTYPE_MON_BASE | RFTYPE_DEBUG, - }, - { - .name = "schemata", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_schemata_write, - .seq_show = rdtgroup_schemata_show, - .fflags = RFTYPE_CTRL_BASE, - }, - { - .name = "mba_MBps_event", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_mba_mbps_event_write, - .seq_show = rdtgroup_mba_mbps_event_show, - }, - { - .name = "mode", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_mode_write, - .seq_show = rdtgroup_mode_show, - .fflags = RFTYPE_CTRL_BASE, - }, - { - .name = "size", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdtgroup_size_show, - .fflags = RFTYPE_CTRL_BASE, - }, - { - .name = "sparse_masks", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_has_sparse_bitmasks_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "ctrl_hw_id", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdtgroup_closid_show, - .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG, - }, - -}; - -static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) -{ - struct rftype *rfts, *rft; - int ret, len; - - rfts = res_common_files; - len = ARRAY_SIZE(res_common_files); - - lockdep_assert_held(&rdtgroup_mutex); - - if (resctrl_debug) - fflags |= RFTYPE_DEBUG; - - for (rft = rfts; rft < rfts + len; rft++) { - if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) { - ret = rdtgroup_add_file(kn, rft); - if (ret) - goto error; - } - } - - return 0; -error: - pr_warn("Failed to add %s, err=%d\n", rft->name, ret); - while (--rft >= rfts) { - if ((fflags & rft->fflags) == rft->fflags) - kernfs_remove_by_name(kn, rft->name); - } - return ret; -} - -static struct rftype *rdtgroup_get_rftype_by_name(const char *name) -{ - struct rftype *rfts, *rft; - int len; - - rfts = res_common_files; - len = ARRAY_SIZE(res_common_files); - - for (rft = rfts; rft < rfts + len; rft++) { - if (!strcmp(rft->name, name)) - return rft; - } - - return NULL; -} - -static void thread_throttle_mode_init(void) -{ - enum membw_throttle_mode throttle_mode 
= THREAD_THROTTLE_UNDEFINED; - struct rdt_resource *r_mba, *r_smba; - - r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA); - if (r_mba->alloc_capable && - r_mba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) - throttle_mode = r_mba->membw.throttle_mode; - - r_smba = resctrl_arch_get_resource(RDT_RESOURCE_SMBA); - if (r_smba->alloc_capable && - r_smba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) - throttle_mode = r_smba->membw.throttle_mode; - - if (throttle_mode == THREAD_THROTTLE_UNDEFINED) - return; - - resctrl_file_fflags_init("thread_throttle_mode", - RFTYPE_CTRL_INFO | RFTYPE_RES_MB); -} - -void resctrl_file_fflags_init(const char *config, unsigned long fflags) -{ - struct rftype *rft; - - rft = rdtgroup_get_rftype_by_name(config); - if (rft) - rft->fflags = fflags; -} - -/** - * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file - * @r: The resource group with which the file is associated. - * @name: Name of the file - * - * The permissions of named resctrl file, directory, or link are modified - * to not allow read, write, or execute by any user. - * - * WARNING: This function is intended to communicate to the user that the - * resctrl file has been locked down - that it is not relevant to the - * particular state the system finds itself in. It should not be relied - * on to protect from user access because after the file's permissions - * are restricted the user can still change the permissions using chmod - * from the command line. - * - * Return: 0 on success, <0 on failure. - */ -int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name) -{ - struct iattr iattr = {.ia_valid = ATTR_MODE,}; - struct kernfs_node *kn; - int ret = 0; - - kn = kernfs_find_and_get_ns(r->kn, name, NULL); - if (!kn) - return -ENOENT; - - switch (kernfs_type(kn)) { - case KERNFS_DIR: - iattr.ia_mode = S_IFDIR; - break; - case KERNFS_FILE: - iattr.ia_mode = S_IFREG; - break; - case KERNFS_LINK: - iattr.ia_mode = S_IFLNK; - break; - } - - ret = kernfs_setattr(kn, &iattr); - kernfs_put(kn); - return ret; -} - -/** - * rdtgroup_kn_mode_restore - Restore user access to named resctrl file - * @r: The resource group with which the file is associated. - * @name: Name of the file - * @mask: Mask of permissions that should be restored - * - * Restore the permissions of the named file. If @name is a directory the - * permissions of its parent will be used. - * - * Return: 0 on success, <0 on failure. 
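rdtgroup_kn_mode_restrict() and rdtgroup_kn_mode_restore(), removed here, only ever juggle mode bits: restricting keeps just the file-type bit so no user has rwx access, restoring ORs the table's default mode, filtered by the caller's mask, back in. A tiny user-space illustration of that bit arithmetic; the octal values are examples, and the real code applies the result through kernfs_setattr().

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
        mode_t rft_mode = 0644;        /* default mode from the file table */

        /* "Restrict": keep only the file-type bit, drop every permission. */
        mode_t restricted = S_IFREG;

        /* "Restore": table mode filtered by the caller's mask, plus type. */
        mode_t restored = S_IFREG | (rft_mode & 0444);

        printf("restricted: %06o\n", (unsigned int)restricted);   /* 100000 */
        printf("restored:   %06o\n", (unsigned int)restored);     /* 100444 */
        return 0;
}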
- */ -int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, - umode_t mask) -{ - struct iattr iattr = {.ia_valid = ATTR_MODE,}; - struct kernfs_node *kn, *parent; - struct rftype *rfts, *rft; - int ret, len; - - rfts = res_common_files; - len = ARRAY_SIZE(res_common_files); - - for (rft = rfts; rft < rfts + len; rft++) { - if (!strcmp(rft->name, name)) - iattr.ia_mode = rft->mode & mask; - } - - kn = kernfs_find_and_get_ns(r->kn, name, NULL); - if (!kn) - return -ENOENT; - - switch (kernfs_type(kn)) { - case KERNFS_DIR: - parent = kernfs_get_parent(kn); - if (parent) { - iattr.ia_mode |= parent->mode; - kernfs_put(parent); - } - iattr.ia_mode |= S_IFDIR; - break; - case KERNFS_FILE: - iattr.ia_mode |= S_IFREG; - break; - case KERNFS_LINK: - iattr.ia_mode |= S_IFLNK; - break; - } - - ret = kernfs_setattr(kn, &iattr); - kernfs_put(kn); - return ret; -} - -static int rdtgroup_mkdir_info_resdir(void *priv, char *name, - unsigned long fflags) -{ - struct kernfs_node *kn_subdir; - int ret; - - kn_subdir = kernfs_create_dir(kn_info, name, - kn_info->mode, priv); - if (IS_ERR(kn_subdir)) - return PTR_ERR(kn_subdir); - - ret = rdtgroup_kn_set_ugid(kn_subdir); - if (ret) - return ret; - - ret = rdtgroup_add_files(kn_subdir, fflags); - if (!ret) - kernfs_activate(kn_subdir); - - return ret; -} - -static unsigned long fflags_from_resource(struct rdt_resource *r) -{ - switch (r->rid) { - case RDT_RESOURCE_L3: - case RDT_RESOURCE_L2: - return RFTYPE_RES_CACHE; - case RDT_RESOURCE_MBA: - case RDT_RESOURCE_SMBA: - return RFTYPE_RES_MB; - } - - return WARN_ON_ONCE(1); -} - -static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) -{ - struct resctrl_schema *s; - struct rdt_resource *r; - unsigned long fflags; - char name[32]; - int ret; - - /* create the directory */ - kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); - if (IS_ERR(kn_info)) - return PTR_ERR(kn_info); - - ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO); - if (ret) - goto out_destroy; - - /* loop over enabled controls, these are all alloc_capable */ - list_for_each_entry(s, &resctrl_schema_all, list) { - r = s->res; - fflags = fflags_from_resource(r) | RFTYPE_CTRL_INFO; - ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags); - if (ret) - goto out_destroy; - } - - for_each_mon_capable_rdt_resource(r) { - fflags = fflags_from_resource(r) | RFTYPE_MON_INFO; - sprintf(name, "%s_MON", r->name); - ret = rdtgroup_mkdir_info_resdir(r, name, fflags); - if (ret) - goto out_destroy; - } - - ret = rdtgroup_kn_set_ugid(kn_info); - if (ret) - goto out_destroy; - - kernfs_activate(kn_info); - - return 0; - -out_destroy: - kernfs_remove(kn_info); - return ret; -} - -static int -mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, - char *name, struct kernfs_node **dest_kn) -{ - struct kernfs_node *kn; - int ret; - - /* create the directory */ - kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); - if (IS_ERR(kn)) - return PTR_ERR(kn); - - if (dest_kn) - *dest_kn = kn; - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) - goto out_destroy; - - kernfs_activate(kn); - - return 0; - -out_destroy: - kernfs_remove(kn); - return ret; + wrmsrq(MSR_IA32_EVT_CFG_BASE + index, config_info->mon_config); } static void l3_qos_cfg_update(void *arg) { bool *enable = arg; - wrmsrl(MSR_IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL); + wrmsrq(MSR_IA32_L3_QOS_CFG, *enable ? 
L3_QOS_CDP_ENABLE : 0ULL); } static void l2_qos_cfg_update(void *arg) { bool *enable = arg; - wrmsrl(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL); -} - -static inline bool is_mba_linear(void) -{ - return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->membw.delay_linear; + wrmsrq(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL); } static int set_cache_qos_cfg(int level, bool enable) @@ -2396,76 +184,6 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r) l3_qos_cfg_update(&hw_res->cdp_enabled); } -static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d) -{ - u32 num_closid = resctrl_arch_get_num_closid(r); - int cpu = cpumask_any(&d->hdr.cpu_mask); - int i; - - d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val), - GFP_KERNEL, cpu_to_node(cpu)); - if (!d->mbps_val) - return -ENOMEM; - - for (i = 0; i < num_closid; i++) - d->mbps_val[i] = MBA_MAX_MBPS; - - return 0; -} - -static void mba_sc_domain_destroy(struct rdt_resource *r, - struct rdt_ctrl_domain *d) -{ - kfree(d->mbps_val); - d->mbps_val = NULL; -} - -/* - * MBA software controller is supported only if - * MBM is supported and MBA is in linear scale, - * and the MBM monitor scope is the same as MBA - * control scope. - */ -static bool supports_mba_mbps(void) -{ - struct rdt_resource *rmbm = resctrl_arch_get_resource(RDT_RESOURCE_L3); - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); - - return (resctrl_is_mbm_enabled() && - r->alloc_capable && is_mba_linear() && - r->ctrl_scope == rmbm->mon_scope); -} - -/* - * Enable or disable the MBA software controller - * which helps user specify bandwidth in MBps. - */ -static int set_mba_sc(bool mba_sc) -{ - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); - u32 num_closid = resctrl_arch_get_num_closid(r); - struct rdt_ctrl_domain *d; - unsigned long fflags; - int i; - - if (!supports_mba_mbps() || mba_sc == is_mba_sc(r)) - return -EINVAL; - - r->membw.mba_sc = mba_sc; - - rdtgroup_default.mba_mbps_event = mba_mbps_default_event; - - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - for (i = 0; i < num_closid; i++) - d->mbps_val[i] = MBA_MAX_MBPS; - } - - fflags = mba_sc ? RFTYPE_CTRL_BASE | RFTYPE_MON_BASE : 0; - resctrl_file_fflags_init("mba_MBps_event", fflags); - - return 0; -} - static int cdp_enable(int level) { struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl; @@ -2506,419 +224,9 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable) return 0; } -/* - * We don't allow rdtgroup directories to be created anywhere - * except the root directory. Thus when looking for the rdtgroup - * structure for a kernfs node we are either looking at a directory, - * in which case the rdtgroup structure is pointed at by the "priv" - * field, otherwise we have a file, and need only look to the parent - * to find the rdtgroup. - */ -static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn) -{ - if (kernfs_type(kn) == KERNFS_DIR) { - /* - * All the resource directories use "kn->priv" - * to point to the "struct rdtgroup" for the - * resource. "info" and its subdirectories don't - * have rdtgroup structures, so return NULL here. 
- */ - if (kn == kn_info || - rcu_access_pointer(kn->__parent) == kn_info) - return NULL; - else - return kn->priv; - } else { - return rdt_kn_parent_priv(kn); - } -} - -static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn) -{ - atomic_inc(&rdtgrp->waitcount); - kernfs_break_active_protection(kn); -} - -static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn) -{ - if (atomic_dec_and_test(&rdtgrp->waitcount) && - (rdtgrp->flags & RDT_DELETED)) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) - rdtgroup_pseudo_lock_remove(rdtgrp); - kernfs_unbreak_active_protection(kn); - rdtgroup_remove(rdtgrp); - } else { - kernfs_unbreak_active_protection(kn); - } -} - -struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) -{ - struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); - - if (!rdtgrp) - return NULL; - - rdtgroup_kn_get(rdtgrp, kn); - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - /* Was this group deleted while we waited? */ - if (rdtgrp->flags & RDT_DELETED) - return NULL; - - return rdtgrp; -} - -void rdtgroup_kn_unlock(struct kernfs_node *kn) -{ - struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); - - if (!rdtgrp) - return; - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - - rdtgroup_kn_put(rdtgrp, kn); -} - -static int mkdir_mondata_all(struct kernfs_node *parent_kn, - struct rdtgroup *prgrp, - struct kernfs_node **mon_data_kn); - -static void rdt_disable_ctx(void) -{ - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); - set_mba_sc(false); - - resctrl_debug = false; -} - -static int rdt_enable_ctx(struct rdt_fs_context *ctx) -{ - int ret = 0; - - if (ctx->enable_cdpl2) { - ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true); - if (ret) - goto out_done; - } - - if (ctx->enable_cdpl3) { - ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true); - if (ret) - goto out_cdpl2; - } - - if (ctx->enable_mba_mbps) { - ret = set_mba_sc(true); - if (ret) - goto out_cdpl3; - } - - if (ctx->enable_debug) - resctrl_debug = true; - - return 0; - -out_cdpl3: - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); -out_cdpl2: - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); -out_done: - return ret; -} - -static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type) -{ - struct resctrl_schema *s; - const char *suffix = ""; - int ret, cl; - - s = kzalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; - - s->res = r; - s->num_closid = resctrl_arch_get_num_closid(r); - if (resctrl_arch_get_cdp_enabled(r->rid)) - s->num_closid /= 2; - - s->conf_type = type; - switch (type) { - case CDP_CODE: - suffix = "CODE"; - break; - case CDP_DATA: - suffix = "DATA"; - break; - case CDP_NONE: - suffix = ""; - break; - } - - ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix); - if (ret >= sizeof(s->name)) { - kfree(s); - return -EINVAL; - } - - cl = strlen(s->name); - - /* - * If CDP is supported by this resource, but not enabled, - * include the suffix. This ensures the tabular format of the - * schemata file does not change between mounts of the filesystem. 
- */ - if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid)) - cl += 4; - - if (cl > max_name_width) - max_name_width = cl; - - switch (r->schema_fmt) { - case RESCTRL_SCHEMA_BITMAP: - s->fmt_str = "%d=%x"; - break; - case RESCTRL_SCHEMA_RANGE: - s->fmt_str = "%d=%u"; - break; - } - - if (WARN_ON_ONCE(!s->fmt_str)) { - kfree(s); - return -EINVAL; - } - - INIT_LIST_HEAD(&s->list); - list_add(&s->list, &resctrl_schema_all); - - return 0; -} - -static int schemata_list_create(void) -{ - struct rdt_resource *r; - int ret = 0; - - for_each_alloc_capable_rdt_resource(r) { - if (resctrl_arch_get_cdp_enabled(r->rid)) { - ret = schemata_list_add(r, CDP_CODE); - if (ret) - break; - - ret = schemata_list_add(r, CDP_DATA); - } else { - ret = schemata_list_add(r, CDP_NONE); - } - - if (ret) - break; - } - - return ret; -} - -static void schemata_list_destroy(void) -{ - struct resctrl_schema *s, *tmp; - - list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) { - list_del(&s->list); - kfree(s); - } -} - -static int rdt_get_tree(struct fs_context *fc) -{ - struct rdt_fs_context *ctx = rdt_fc2context(fc); - unsigned long flags = RFTYPE_CTRL_BASE; - struct rdt_mon_domain *dom; - struct rdt_resource *r; - int ret; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - /* - * resctrl file system can only be mounted once. - */ - if (resctrl_mounted) { - ret = -EBUSY; - goto out; - } - - ret = rdtgroup_setup_root(ctx); - if (ret) - goto out; - - ret = rdt_enable_ctx(ctx); - if (ret) - goto out_root; - - ret = schemata_list_create(); - if (ret) { - schemata_list_destroy(); - goto out_ctx; - } - - closid_init(); - - if (resctrl_arch_mon_capable()) - flags |= RFTYPE_MON; - - ret = rdtgroup_add_files(rdtgroup_default.kn, flags); - if (ret) - goto out_schemata_free; - - kernfs_activate(rdtgroup_default.kn); - - ret = rdtgroup_create_info_dir(rdtgroup_default.kn); - if (ret < 0) - goto out_schemata_free; - - if (resctrl_arch_mon_capable()) { - ret = mongroup_create_dir(rdtgroup_default.kn, - &rdtgroup_default, "mon_groups", - &kn_mongrp); - if (ret < 0) - goto out_info; - - ret = mkdir_mondata_all(rdtgroup_default.kn, - &rdtgroup_default, &kn_mondata); - if (ret < 0) - goto out_mongrp; - rdtgroup_default.mon.mon_data_kn = kn_mondata; - } - - ret = rdt_pseudo_lock_init(); - if (ret) - goto out_mondata; - - ret = kernfs_get_tree(fc); - if (ret < 0) - goto out_psl; - - if (resctrl_arch_alloc_capable()) - resctrl_arch_enable_alloc(); - if (resctrl_arch_mon_capable()) - resctrl_arch_enable_mon(); - - if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable()) - resctrl_mounted = true; - - if (resctrl_is_mbm_enabled()) { - r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - list_for_each_entry(dom, &r->mon_domains, hdr.list) - mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL, - RESCTRL_PICK_ANY_CPU); - } - - goto out; - -out_psl: - rdt_pseudo_lock_release(); -out_mondata: - if (resctrl_arch_mon_capable()) - kernfs_remove(kn_mondata); -out_mongrp: - if (resctrl_arch_mon_capable()) - kernfs_remove(kn_mongrp); -out_info: - kernfs_remove(kn_info); -out_schemata_free: - schemata_list_destroy(); -out_ctx: - rdt_disable_ctx(); -out_root: - rdtgroup_destroy_root(); -out: - rdt_last_cmd_clear(); - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - return ret; -} - -enum rdt_param { - Opt_cdp, - Opt_cdpl2, - Opt_mba_mbps, - Opt_debug, - nr__rdt_params -}; - -static const struct fs_parameter_spec rdt_fs_parameters[] = { - fsparam_flag("cdp", Opt_cdp), - fsparam_flag("cdpl2", Opt_cdpl2), - 
fsparam_flag("mba_MBps", Opt_mba_mbps), - fsparam_flag("debug", Opt_debug), - {} -}; - -static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) -{ - struct rdt_fs_context *ctx = rdt_fc2context(fc); - struct fs_parse_result result; - const char *msg; - int opt; - - opt = fs_parse(fc, rdt_fs_parameters, param, &result); - if (opt < 0) - return opt; - - switch (opt) { - case Opt_cdp: - ctx->enable_cdpl3 = true; - return 0; - case Opt_cdpl2: - ctx->enable_cdpl2 = true; - return 0; - case Opt_mba_mbps: - msg = "mba_MBps requires MBM and linear scale MBA at L3 scope"; - if (!supports_mba_mbps()) - return invalfc(fc, msg); - ctx->enable_mba_mbps = true; - return 0; - case Opt_debug: - ctx->enable_debug = true; - return 0; - } - - return -EINVAL; -} - -static void rdt_fs_context_free(struct fs_context *fc) +bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l) { - struct rdt_fs_context *ctx = rdt_fc2context(fc); - - kernfs_free_fs_context(fc); - kfree(ctx); -} - -static const struct fs_context_operations rdt_fs_context_ops = { - .free = rdt_fs_context_free, - .parse_param = rdt_parse_param, - .get_tree = rdt_get_tree, -}; - -static int rdt_init_fs_context(struct fs_context *fc) -{ - struct rdt_fs_context *ctx; - - ctx = kzalloc(sizeof(struct rdt_fs_context), GFP_KERNEL); - if (!ctx) - return -ENOMEM; - - ctx->kfc.magic = RDTGROUP_SUPER_MAGIC; - fc->fs_private = &ctx->kfc; - fc->ops = &rdt_fs_context_ops; - put_user_ns(fc->user_ns); - fc->user_ns = get_user_ns(&init_user_ns); - fc->global = true; - return 0; + return rdt_resources_all[l].cdp_enabled; } void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) @@ -2952,1460 +260,3 @@ void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) return; } - -/* - * Move tasks from one to the other group. If @from is NULL, then all tasks - * in the systems are moved unconditionally (used for teardown). - * - * If @mask is not NULL the cpus on which moved tasks are running are set - * in that mask so the update smp function call is restricted to affected - * cpus. - */ -static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, - struct cpumask *mask) -{ - struct task_struct *p, *t; - - read_lock(&tasklist_lock); - for_each_process_thread(p, t) { - if (!from || is_closid_match(t, from) || - is_rmid_match(t, from)) { - resctrl_arch_set_closid_rmid(t, to->closid, - to->mon.rmid); - - /* - * Order the closid/rmid stores above before the loads - * in task_curr(). This pairs with the full barrier - * between the rq->curr update and resctrl_sched_in() - * during context switch. - */ - smp_mb(); - - /* - * If the task is on a CPU, set the CPU in the mask. - * The detection is inaccurate as tasks might move or - * schedule before the smp function call takes place. - * In such a case the function call is pointless, but - * there is no other side effect. - */ - if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t)) - cpumask_set_cpu(task_cpu(t), mask); - } - } - read_unlock(&tasklist_lock); -} - -static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) -{ - struct rdtgroup *sentry, *stmp; - struct list_head *head; - - head = &rdtgrp->mon.crdtgrp_list; - list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { - free_rmid(sentry->closid, sentry->mon.rmid); - list_del(&sentry->mon.crdtgrp_list); - - if (atomic_read(&sentry->waitcount) != 0) - sentry->flags = RDT_DELETED; - else - rdtgroup_remove(sentry); - } -} - -/* - * Forcibly remove all of subdirectories under root. 
- */ -static void rmdir_all_sub(void) -{ - struct rdtgroup *rdtgrp, *tmp; - - /* Move all tasks to the default resource group */ - rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); - - list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { - /* Free any child rmids */ - free_all_child_rdtgrp(rdtgrp); - - /* Remove each rdtgroup other than root */ - if (rdtgrp == &rdtgroup_default) - continue; - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) - rdtgroup_pseudo_lock_remove(rdtgrp); - - /* - * Give any CPUs back to the default group. We cannot copy - * cpu_online_mask because a CPU might have executed the - * offline callback already, but is still marked online. - */ - cpumask_or(&rdtgroup_default.cpu_mask, - &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - - kernfs_remove(rdtgrp->kn); - list_del(&rdtgrp->rdtgroup_list); - - if (atomic_read(&rdtgrp->waitcount) != 0) - rdtgrp->flags = RDT_DELETED; - else - rdtgroup_remove(rdtgrp); - } - /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ - update_closid_rmid(cpu_online_mask, &rdtgroup_default); - - kernfs_remove(kn_info); - kernfs_remove(kn_mongrp); - kernfs_remove(kn_mondata); -} - -static void rdt_kill_sb(struct super_block *sb) -{ - struct rdt_resource *r; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - rdt_disable_ctx(); - - /* Put everything back to default values. */ - for_each_alloc_capable_rdt_resource(r) - resctrl_arch_reset_all_ctrls(r); - - rmdir_all_sub(); - rdt_pseudo_lock_release(); - rdtgroup_default.mode = RDT_MODE_SHAREABLE; - schemata_list_destroy(); - rdtgroup_destroy_root(); - if (resctrl_arch_alloc_capable()) - resctrl_arch_disable_alloc(); - if (resctrl_arch_mon_capable()) - resctrl_arch_disable_mon(); - resctrl_mounted = false; - kernfs_kill_sb(sb); - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); -} - -static struct file_system_type rdt_fs_type = { - .name = "resctrl", - .init_fs_context = rdt_init_fs_context, - .parameters = rdt_fs_parameters, - .kill_sb = rdt_kill_sb, -}; - -static int mon_addfile(struct kernfs_node *parent_kn, const char *name, - void *priv) -{ - struct kernfs_node *kn; - int ret = 0; - - kn = __kernfs_create_file(parent_kn, name, 0444, - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, - &kf_mondata_ops, priv, NULL, NULL); - if (IS_ERR(kn)) - return PTR_ERR(kn); - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) { - kernfs_remove(kn); - return ret; - } - - return ret; -} - -static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subname) -{ - struct kernfs_node *kn; - - kn = kernfs_find_and_get(pkn, name); - if (!kn) - return; - kernfs_put(kn); - - if (kn->dir.subdirs <= 1) - kernfs_remove(kn); - else - kernfs_remove_by_name(kn, subname); -} - -/* - * Remove all subdirectories of mon_data of ctrl_mon groups - * and monitor groups for the given domain. - * Remove files and directories containing "sum" of domain data - * when last domain being summed is removed. - */ -static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_mon_domain *d) -{ - struct rdtgroup *prgrp, *crgrp; - char subname[32]; - bool snc_mode; - char name[32]; - - snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? 
d->ci->id : d->hdr.id); - if (snc_mode) - sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id); - - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname); - - list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) - mon_rmdir_one_subdir(crgrp->mon.mon_data_kn, name, subname); - } -} - -static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, - struct rdt_resource *r, struct rdtgroup *prgrp, - bool do_sum) -{ - struct rmid_read rr = {0}; - union mon_data_bits priv; - struct mon_evt *mevt; - int ret; - - if (WARN_ON(list_empty(&r->evt_list))) - return -EPERM; - - priv.u.rid = r->rid; - priv.u.domid = do_sum ? d->ci->id : d->hdr.id; - priv.u.sum = do_sum; - list_for_each_entry(mevt, &r->evt_list, list) { - priv.u.evtid = mevt->evtid; - ret = mon_addfile(kn, mevt->name, priv.priv); - if (ret) - return ret; - - if (!do_sum && resctrl_is_mbm_event(mevt->evtid)) - mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true); - } - - return 0; -} - -static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, - struct rdt_mon_domain *d, - struct rdt_resource *r, struct rdtgroup *prgrp) -{ - struct kernfs_node *kn, *ckn; - char name[32]; - bool snc_mode; - int ret = 0; - - lockdep_assert_held(&rdtgroup_mutex); - - snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); - kn = kernfs_find_and_get(parent_kn, name); - if (kn) { - /* - * rdtgroup_mutex will prevent this directory from being - * removed. No need to keep this hold. - */ - kernfs_put(kn); - } else { - kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); - if (IS_ERR(kn)) - return PTR_ERR(kn); - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) - goto out_destroy; - ret = mon_add_all_files(kn, d, r, prgrp, snc_mode); - if (ret) - goto out_destroy; - } - - if (snc_mode) { - sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id); - ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp); - if (IS_ERR(ckn)) { - ret = -EINVAL; - goto out_destroy; - } - - ret = rdtgroup_kn_set_ugid(ckn); - if (ret) - goto out_destroy; - - ret = mon_add_all_files(ckn, d, r, prgrp, false); - if (ret) - goto out_destroy; - } - - kernfs_activate(kn); - return 0; - -out_destroy: - kernfs_remove(kn); - return ret; -} - -/* - * Add all subdirectories of mon_data for "ctrl_mon" groups - * and "monitor" groups with given domain id. - */ -static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_mon_domain *d) -{ - struct kernfs_node *parent_kn; - struct rdtgroup *prgrp, *crgrp; - struct list_head *head; - - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - parent_kn = prgrp->mon.mon_data_kn; - mkdir_mondata_subdir(parent_kn, d, r, prgrp); - - head = &prgrp->mon.crdtgrp_list; - list_for_each_entry(crgrp, head, mon.crdtgrp_list) { - parent_kn = crgrp->mon.mon_data_kn; - mkdir_mondata_subdir(parent_kn, d, r, crgrp); - } - } -} - -static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, - struct rdt_resource *r, - struct rdtgroup *prgrp) -{ - struct rdt_mon_domain *dom; - int ret; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - list_for_each_entry(dom, &r->mon_domains, hdr.list) { - ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); - if (ret) - return ret; - } - - return 0; -} - -/* - * This creates a directory mon_data which contains the monitored data. 
- * - * mon_data has one directory for each domain which are named - * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data - * with L3 domain looks as below: - * ./mon_data: - * mon_L3_00 - * mon_L3_01 - * mon_L3_02 - * ... - * - * Each domain directory has one file per event: - * ./mon_L3_00/: - * llc_occupancy - * - */ -static int mkdir_mondata_all(struct kernfs_node *parent_kn, - struct rdtgroup *prgrp, - struct kernfs_node **dest_kn) -{ - struct rdt_resource *r; - struct kernfs_node *kn; - int ret; - - /* - * Create the mon_data directory first. - */ - ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn); - if (ret) - return ret; - - if (dest_kn) - *dest_kn = kn; - - /* - * Create the subdirectories for each domain. Note that all events - * in a domain like L3 are grouped into a resource whose domain is L3 - */ - for_each_mon_capable_rdt_resource(r) { - ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); - if (ret) - goto out_destroy; - } - - return 0; - -out_destroy: - kernfs_remove(kn); - return ret; -} - -/** - * cbm_ensure_valid - Enforce validity on provided CBM - * @_val: Candidate CBM - * @r: RDT resource to which the CBM belongs - * - * The provided CBM represents all cache portions available for use. This - * may be represented by a bitmap that does not consist of contiguous ones - * and thus be an invalid CBM. - * Here the provided CBM is forced to be a valid CBM by only considering - * the first set of contiguous bits as valid and clearing all bits. - * The intention here is to provide a valid default CBM with which a new - * resource group is initialized. The user can follow this with a - * modification to the CBM if the default does not satisfy the - * requirements. - */ -static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r) -{ - unsigned int cbm_len = r->cache.cbm_len; - unsigned long first_bit, zero_bit; - unsigned long val = _val; - - if (!val) - return 0; - - first_bit = find_first_bit(&val, cbm_len); - zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - - /* Clear any remaining bits to ensure contiguous region */ - bitmap_clear(&val, zero_bit, cbm_len - zero_bit); - return (u32)val; -} - -/* - * Initialize cache resources per RDT domain - * - * Set the RDT domain up to start off with all usable allocations. That is, - * all shareable and unused bits. All-zero CBM is invalid. - */ -static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schema *s, - u32 closid) -{ - enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); - enum resctrl_conf_type t = s->conf_type; - struct resctrl_staged_config *cfg; - struct rdt_resource *r = s->res; - u32 used_b = 0, unused_b = 0; - unsigned long tmp_cbm; - enum rdtgrp_mode mode; - u32 peer_ctl, ctrl_val; - int i; - - cfg = &d->staged_config[t]; - cfg->have_new_ctrl = false; - cfg->new_ctrl = r->cache.shareable_bits; - used_b = r->cache.shareable_bits; - for (i = 0; i < closids_supported(); i++) { - if (closid_allocated(i) && i != closid) { - mode = rdtgroup_mode_by_closid(i); - if (mode == RDT_MODE_PSEUDO_LOCKSETUP) - /* - * ctrl values for locksetup aren't relevant - * until the schemata is written, and the mode - * becomes RDT_MODE_PSEUDO_LOCKED. - */ - continue; - /* - * If CDP is active include peer domain's - * usage to ensure there is no overlap - * with an exclusive group. 
- */ - if (resctrl_arch_get_cdp_enabled(r->rid)) - peer_ctl = resctrl_arch_get_config(r, d, i, - peer_type); - else - peer_ctl = 0; - ctrl_val = resctrl_arch_get_config(r, d, i, - s->conf_type); - used_b |= ctrl_val | peer_ctl; - if (mode == RDT_MODE_SHAREABLE) - cfg->new_ctrl |= ctrl_val | peer_ctl; - } - } - if (d->plr && d->plr->cbm > 0) - used_b |= d->plr->cbm; - unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); - unused_b &= BIT_MASK(r->cache.cbm_len) - 1; - cfg->new_ctrl |= unused_b; - /* - * Force the initial CBM to be valid, user can - * modify the CBM based on system availability. - */ - cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r); - /* - * Assign the u32 CBM to an unsigned long to ensure that - * bitmap_weight() does not access out-of-bound memory. - */ - tmp_cbm = cfg->new_ctrl; - if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) { - rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->hdr.id); - return -ENOSPC; - } - cfg->have_new_ctrl = true; - - return 0; -} - -/* - * Initialize cache resources with default values. - * - * A new RDT group is being created on an allocation capable (CAT) - * supporting system. Set this group up to start off with all usable - * allocations. - * - * If there are no more shareable bits available on any domain then - * the entire allocation will fail. - */ -static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) -{ - struct rdt_ctrl_domain *d; - int ret; - - list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) { - ret = __init_one_rdt_domain(d, s, closid); - if (ret < 0) - return ret; - } - - return 0; -} - -/* Initialize MBA resource with default values. */ -static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) -{ - struct resctrl_staged_config *cfg; - struct rdt_ctrl_domain *d; - - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - if (is_mba_sc(r)) { - d->mbps_val[closid] = MBA_MAX_MBPS; - continue; - } - - cfg = &d->staged_config[CDP_NONE]; - cfg->new_ctrl = resctrl_get_default_ctrl(r); - cfg->have_new_ctrl = true; - } -} - -/* Initialize the RDT group's allocations. 
*/ -static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) -{ - struct resctrl_schema *s; - struct rdt_resource *r; - int ret = 0; - - rdt_staged_configs_clear(); - - list_for_each_entry(s, &resctrl_schema_all, list) { - r = s->res; - if (r->rid == RDT_RESOURCE_MBA || - r->rid == RDT_RESOURCE_SMBA) { - rdtgroup_init_mba(r, rdtgrp->closid); - if (is_mba_sc(r)) - continue; - } else { - ret = rdtgroup_init_cat(s, rdtgrp->closid); - if (ret < 0) - goto out; - } - - ret = resctrl_arch_update_domains(r, rdtgrp->closid); - if (ret < 0) { - rdt_last_cmd_puts("Failed to initialize allocations\n"); - goto out; - } - - } - - rdtgrp->mode = RDT_MODE_SHAREABLE; - -out: - rdt_staged_configs_clear(); - return ret; -} - -static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) -{ - int ret; - - if (!resctrl_arch_mon_capable()) - return 0; - - ret = alloc_rmid(rdtgrp->closid); - if (ret < 0) { - rdt_last_cmd_puts("Out of RMIDs\n"); - return ret; - } - rdtgrp->mon.rmid = ret; - - ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn); - if (ret) { - rdt_last_cmd_puts("kernfs subdir error\n"); - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - return ret; - } - - return 0; -} - -static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) -{ - if (resctrl_arch_mon_capable()) - free_rmid(rgrp->closid, rgrp->mon.rmid); -} - -/* - * We allow creating mon groups only with in a directory called "mon_groups" - * which is present in every ctrl_mon group. Check if this is a valid - * "mon_groups" directory. - * - * 1. The directory should be named "mon_groups". - * 2. The mon group itself should "not" be named "mon_groups". - * This makes sure "mon_groups" directory always has a ctrl_mon group - * as parent. - */ -static bool is_mon_groups(struct kernfs_node *kn, const char *name) -{ - return (!strcmp(rdt_kn_name(kn), "mon_groups") && - strcmp(name, "mon_groups")); -} - -static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, - const char *name, umode_t mode, - enum rdt_group_type rtype, struct rdtgroup **r) -{ - struct rdtgroup *prdtgrp, *rdtgrp; - unsigned long files = 0; - struct kernfs_node *kn; - int ret; - - prdtgrp = rdtgroup_kn_lock_live(parent_kn); - if (!prdtgrp) { - ret = -ENODEV; - goto out_unlock; - } - - /* - * Check that the parent directory for a monitor group is a "mon_groups" - * directory. - */ - if (rtype == RDTMON_GROUP && !is_mon_groups(parent_kn, name)) { - ret = -EPERM; - goto out_unlock; - } - - if (rtype == RDTMON_GROUP && - (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || - prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) { - ret = -EINVAL; - rdt_last_cmd_puts("Pseudo-locking in progress\n"); - goto out_unlock; - } - - /* allocate the rdtgroup. */ - rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); - if (!rdtgrp) { - ret = -ENOSPC; - rdt_last_cmd_puts("Kernel out of memory\n"); - goto out_unlock; - } - *r = rdtgrp; - rdtgrp->mon.parent = prdtgrp; - rdtgrp->type = rtype; - INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list); - - /* kernfs creates the directory for rdtgrp */ - kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); - if (IS_ERR(kn)) { - ret = PTR_ERR(kn); - rdt_last_cmd_puts("kernfs create error\n"); - goto out_free_rgrp; - } - rdtgrp->kn = kn; - - /* - * kernfs_remove() will drop the reference count on "kn" which - * will free it. But we still need it to stick around for the - * rdtgroup_kn_unlock(kn) call. Take one extra reference here, - * which will be dropped by kernfs_put() in rdtgroup_remove(). 
- */ - kernfs_get(kn); - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) { - rdt_last_cmd_puts("kernfs perm error\n"); - goto out_destroy; - } - - if (rtype == RDTCTRL_GROUP) { - files = RFTYPE_BASE | RFTYPE_CTRL; - if (resctrl_arch_mon_capable()) - files |= RFTYPE_MON; - } else { - files = RFTYPE_BASE | RFTYPE_MON; - } - - ret = rdtgroup_add_files(kn, files); - if (ret) { - rdt_last_cmd_puts("kernfs fill error\n"); - goto out_destroy; - } - - /* - * The caller unlocks the parent_kn upon success. - */ - return 0; - -out_destroy: - kernfs_put(rdtgrp->kn); - kernfs_remove(rdtgrp->kn); -out_free_rgrp: - kfree(rdtgrp); -out_unlock: - rdtgroup_kn_unlock(parent_kn); - return ret; -} - -static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) -{ - kernfs_remove(rgrp->kn); - rdtgroup_remove(rgrp); -} - -/* - * Create a monitor group under "mon_groups" directory of a control - * and monitor group(ctrl_mon). This is a resource group - * to monitor a subset of tasks and cpus in its parent ctrl_mon group. - */ -static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, - const char *name, umode_t mode) -{ - struct rdtgroup *rdtgrp, *prgrp; - int ret; - - ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp); - if (ret) - return ret; - - prgrp = rdtgrp->mon.parent; - rdtgrp->closid = prgrp->closid; - - ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); - if (ret) { - mkdir_rdt_prepare_clean(rdtgrp); - goto out_unlock; - } - - kernfs_activate(rdtgrp->kn); - - /* - * Add the rdtgrp to the list of rdtgrps the parent - * ctrl_mon group has to track. - */ - list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); - -out_unlock: - rdtgroup_kn_unlock(parent_kn); - return ret; -} - -/* - * These are rdtgroups created under the root directory. Can be used - * to allocate and monitor resources. - */ -static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, - const char *name, umode_t mode) -{ - struct rdtgroup *rdtgrp; - struct kernfs_node *kn; - u32 closid; - int ret; - - ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp); - if (ret) - return ret; - - kn = rdtgrp->kn; - ret = closid_alloc(); - if (ret < 0) { - rdt_last_cmd_puts("Out of CLOSIDs\n"); - goto out_common_fail; - } - closid = ret; - ret = 0; - - rdtgrp->closid = closid; - - ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); - if (ret) - goto out_closid_free; - - kernfs_activate(rdtgrp->kn); - - ret = rdtgroup_init_alloc(rdtgrp); - if (ret < 0) - goto out_rmid_free; - - list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); - - if (resctrl_arch_mon_capable()) { - /* - * Create an empty mon_groups directory to hold the subset - * of tasks and cpus to monitor. - */ - ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL); - if (ret) { - rdt_last_cmd_puts("kernfs subdir error\n"); - goto out_del_list; - } - if (is_mba_sc(NULL)) - rdtgrp->mba_mbps_event = mba_mbps_default_event; - } - - goto out_unlock; - -out_del_list: - list_del(&rdtgrp->rdtgroup_list); -out_rmid_free: - mkdir_rdt_prepare_rmid_free(rdtgrp); -out_closid_free: - closid_free(closid); -out_common_fail: - mkdir_rdt_prepare_clean(rdtgrp); -out_unlock: - rdtgroup_kn_unlock(parent_kn); - return ret; -} - -static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) -{ - /* Do not accept '\n' to avoid unparsable situation. 
*/ - if (strchr(name, '\n')) - return -EINVAL; - - /* - * If the parent directory is the root directory and RDT - * allocation is supported, add a control and monitoring - * subdirectory - */ - if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn) - return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode); - - /* Else, attempt to add a monitoring subdirectory. */ - if (resctrl_arch_mon_capable()) - return rdtgroup_mkdir_mon(parent_kn, name, mode); - - return -EPERM; -} - -static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) -{ - struct rdtgroup *prdtgrp = rdtgrp->mon.parent; - u32 closid, rmid; - int cpu; - - /* Give any tasks back to the parent group */ - rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask); - - /* - * Update per cpu closid/rmid of the moved CPUs first. - * Note: the closid will not change, but the arch code still needs it. - */ - closid = prdtgrp->closid; - rmid = prdtgrp->mon.rmid; - for_each_cpu(cpu, &rdtgrp->cpu_mask) - resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid); - - /* - * Update the MSR on moved CPUs and CPUs which have moved - * task running on them. - */ - cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); - update_closid_rmid(tmpmask, NULL); - - rdtgrp->flags = RDT_DELETED; - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - - /* - * Remove the rdtgrp from the parent ctrl_mon group's list - */ - WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); - list_del(&rdtgrp->mon.crdtgrp_list); - - kernfs_remove(rdtgrp->kn); - - return 0; -} - -static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp) -{ - rdtgrp->flags = RDT_DELETED; - list_del(&rdtgrp->rdtgroup_list); - - kernfs_remove(rdtgrp->kn); - return 0; -} - -static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) -{ - u32 closid, rmid; - int cpu; - - /* Give any tasks back to the default group */ - rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); - - /* Give any CPUs back to the default group */ - cpumask_or(&rdtgroup_default.cpu_mask, - &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - - /* Update per cpu closid and rmid of the moved CPUs first */ - closid = rdtgroup_default.closid; - rmid = rdtgroup_default.mon.rmid; - for_each_cpu(cpu, &rdtgrp->cpu_mask) - resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid); - - /* - * Update the MSR on moved CPUs and CPUs which have moved - * task running on them. - */ - cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); - update_closid_rmid(tmpmask, NULL); - - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - closid_free(rdtgrp->closid); - - rdtgroup_ctrl_remove(rdtgrp); - - /* - * Free all the child monitor group rmids. - */ - free_all_child_rdtgrp(rdtgrp); - - return 0; -} - -static struct kernfs_node *rdt_kn_parent(struct kernfs_node *kn) -{ - /* - * Valid within the RCU section it was obtained or while rdtgroup_mutex - * is held. - */ - return rcu_dereference_check(kn->__parent, lockdep_is_held(&rdtgroup_mutex)); -} - -static int rdtgroup_rmdir(struct kernfs_node *kn) -{ - struct kernfs_node *parent_kn; - struct rdtgroup *rdtgrp; - cpumask_var_t tmpmask; - int ret = 0; - - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) - return -ENOMEM; - - rdtgrp = rdtgroup_kn_lock_live(kn); - if (!rdtgrp) { - ret = -EPERM; - goto out; - } - parent_kn = rdt_kn_parent(kn); - - /* - * If the rdtgroup is a ctrl_mon group and parent directory - * is the root directory, remove the ctrl_mon group. - * - * If the rdtgroup is a mon group and parent directory - * is a valid "mon_groups" directory, remove the mon group. 
- */ - if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn && - rdtgrp != &rdtgroup_default) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - ret = rdtgroup_ctrl_remove(rdtgrp); - } else { - ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask); - } - } else if (rdtgrp->type == RDTMON_GROUP && - is_mon_groups(parent_kn, rdt_kn_name(kn))) { - ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask); - } else { - ret = -EPERM; - } - -out: - rdtgroup_kn_unlock(kn); - free_cpumask_var(tmpmask); - return ret; -} - -/** - * mongrp_reparent() - replace parent CTRL_MON group of a MON group - * @rdtgrp: the MON group whose parent should be replaced - * @new_prdtgrp: replacement parent CTRL_MON group for @rdtgrp - * @cpus: cpumask provided by the caller for use during this call - * - * Replaces the parent CTRL_MON group for a MON group, resulting in all member - * tasks' CLOSID immediately changing to that of the new parent group. - * Monitoring data for the group is unaffected by this operation. - */ -static void mongrp_reparent(struct rdtgroup *rdtgrp, - struct rdtgroup *new_prdtgrp, - cpumask_var_t cpus) -{ - struct rdtgroup *prdtgrp = rdtgrp->mon.parent; - - WARN_ON(rdtgrp->type != RDTMON_GROUP); - WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP); - - /* Nothing to do when simply renaming a MON group. */ - if (prdtgrp == new_prdtgrp) - return; - - WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); - list_move_tail(&rdtgrp->mon.crdtgrp_list, - &new_prdtgrp->mon.crdtgrp_list); - - rdtgrp->mon.parent = new_prdtgrp; - rdtgrp->closid = new_prdtgrp->closid; - - /* Propagate updated closid to all tasks in this group. */ - rdt_move_group_tasks(rdtgrp, rdtgrp, cpus); - - update_closid_rmid(cpus, NULL); -} - -static int rdtgroup_rename(struct kernfs_node *kn, - struct kernfs_node *new_parent, const char *new_name) -{ - struct kernfs_node *kn_parent; - struct rdtgroup *new_prdtgrp; - struct rdtgroup *rdtgrp; - cpumask_var_t tmpmask; - int ret; - - rdtgrp = kernfs_to_rdtgroup(kn); - new_prdtgrp = kernfs_to_rdtgroup(new_parent); - if (!rdtgrp || !new_prdtgrp) - return -ENOENT; - - /* Release both kernfs active_refs before obtaining rdtgroup mutex. */ - rdtgroup_kn_get(rdtgrp, kn); - rdtgroup_kn_get(new_prdtgrp, new_parent); - - mutex_lock(&rdtgroup_mutex); - - rdt_last_cmd_clear(); - - /* - * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if - * either kernfs_node is a file. - */ - if (kernfs_type(kn) != KERNFS_DIR || - kernfs_type(new_parent) != KERNFS_DIR) { - rdt_last_cmd_puts("Source and destination must be directories"); - ret = -EPERM; - goto out; - } - - if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) { - ret = -ENOENT; - goto out; - } - - kn_parent = rdt_kn_parent(kn); - if (rdtgrp->type != RDTMON_GROUP || !kn_parent || - !is_mon_groups(kn_parent, rdt_kn_name(kn))) { - rdt_last_cmd_puts("Source must be a MON group\n"); - ret = -EPERM; - goto out; - } - - if (!is_mon_groups(new_parent, new_name)) { - rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n"); - ret = -EPERM; - goto out; - } - - /* - * If the MON group is monitoring CPUs, the CPUs must be assigned to the - * current parent CTRL_MON group and therefore cannot be assigned to - * the new parent, making the move illegal. 
- */ - if (!cpumask_empty(&rdtgrp->cpu_mask) && - rdtgrp->mon.parent != new_prdtgrp) { - rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n"); - ret = -EPERM; - goto out; - } - - /* - * Allocate the cpumask for use in mongrp_reparent() to avoid the - * possibility of failing to allocate it after kernfs_rename() has - * succeeded. - */ - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) { - ret = -ENOMEM; - goto out; - } - - /* - * Perform all input validation and allocations needed to ensure - * mongrp_reparent() will succeed before calling kernfs_rename(), - * otherwise it would be necessary to revert this call if - * mongrp_reparent() failed. - */ - ret = kernfs_rename(kn, new_parent, new_name); - if (!ret) - mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask); - - free_cpumask_var(tmpmask); - -out: - mutex_unlock(&rdtgroup_mutex); - rdtgroup_kn_put(rdtgrp, kn); - rdtgroup_kn_put(new_prdtgrp, new_parent); - return ret; -} - -static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) -{ - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) - seq_puts(seq, ",cdp"); - - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) - seq_puts(seq, ",cdpl2"); - - if (is_mba_sc(resctrl_arch_get_resource(RDT_RESOURCE_MBA))) - seq_puts(seq, ",mba_MBps"); - - if (resctrl_debug) - seq_puts(seq, ",debug"); - - return 0; -} - -static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { - .mkdir = rdtgroup_mkdir, - .rmdir = rdtgroup_rmdir, - .rename = rdtgroup_rename, - .show_options = rdtgroup_show_options, -}; - -static int rdtgroup_setup_root(struct rdt_fs_context *ctx) -{ - rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, - KERNFS_ROOT_CREATE_DEACTIVATED | - KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, - &rdtgroup_default); - if (IS_ERR(rdt_root)) - return PTR_ERR(rdt_root); - - ctx->kfc.root = rdt_root; - rdtgroup_default.kn = kernfs_root_to_node(rdt_root); - - return 0; -} - -static void rdtgroup_destroy_root(void) -{ - kernfs_destroy_root(rdt_root); - rdtgroup_default.kn = NULL; -} - -static void __init rdtgroup_setup_default(void) -{ - mutex_lock(&rdtgroup_mutex); - - rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID; - rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID; - rdtgroup_default.type = RDTCTRL_GROUP; - INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); - - list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); - - mutex_unlock(&rdtgroup_mutex); -} - -static void domain_destroy_mon_state(struct rdt_mon_domain *d) -{ - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - kfree(d->mbm_local); -} - -void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) -{ - mutex_lock(&rdtgroup_mutex); - - if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) - mba_sc_domain_destroy(r, d); - - mutex_unlock(&rdtgroup_mutex); -} - -void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) -{ - mutex_lock(&rdtgroup_mutex); - - /* - * If resctrl is mounted, remove all the - * per domain monitor data directories. - */ - if (resctrl_mounted && resctrl_arch_mon_capable()) - rmdir_mondata_subdir_allrdtgrp(r, d); - - if (resctrl_is_mbm_enabled()) - cancel_delayed_work(&d->mbm_over); - if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) { - /* - * When a package is going down, forcefully - * decrement rmid->ebusy. There is no way to know - * that the L3 was flushed and hence may lead to - * incorrect counts in rare scenarios, but leaving - * the RMID as busy creates RMID leaks if the - * package never comes back. 
- */ - __check_limbo(d, true); - cancel_delayed_work(&d->cqm_limbo); - } - - domain_destroy_mon_state(d); - - mutex_unlock(&rdtgroup_mutex); -} - -/** - * domain_setup_mon_state() - Initialise domain monitoring structures. - * @r: The resource for the newly online domain. - * @d: The newly online domain. - * - * Allocate monitor resources that belong to this domain. - * Called when the first CPU of a domain comes online, regardless of whether - * the filesystem is mounted. - * During boot this may be called before global allocations have been made by - * resctrl_mon_resource_init(). - * - * Returns 0 for success, or -ENOMEM. - */ -static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d) -{ - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - size_t tsize; - - if (resctrl_arch_is_llc_occupancy_enabled()) { - d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); - if (!d->rmid_busy_llc) - return -ENOMEM; - } - if (resctrl_arch_is_mbm_total_enabled()) { - tsize = sizeof(*d->mbm_total); - d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_total) { - bitmap_free(d->rmid_busy_llc); - return -ENOMEM; - } - } - if (resctrl_arch_is_mbm_local_enabled()) { - tsize = sizeof(*d->mbm_local); - d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_local) { - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - return -ENOMEM; - } - } - - return 0; -} - -int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) -{ - int err = 0; - - mutex_lock(&rdtgroup_mutex); - - if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) { - /* RDT_RESOURCE_MBA is never mon_capable */ - err = mba_sc_domain_allocate(r, d); - } - - mutex_unlock(&rdtgroup_mutex); - - return err; -} - -int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) -{ - int err; - - mutex_lock(&rdtgroup_mutex); - - err = domain_setup_mon_state(r, d); - if (err) - goto out_unlock; - - if (resctrl_is_mbm_enabled()) { - INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); - mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL, - RESCTRL_PICK_ANY_CPU); - } - - if (resctrl_arch_is_llc_occupancy_enabled()) - INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); - - /* - * If the filesystem is not mounted then only the default resource group - * exists. Creation of its directories is deferred until mount time - * by rdt_get_tree() calling mkdir_mondata_all(). - * If resctrl is mounted, add per domain monitor data directories. - */ - if (resctrl_mounted && resctrl_arch_mon_capable()) - mkdir_mondata_subdir_allrdtgrp(r, d); - -out_unlock: - mutex_unlock(&rdtgroup_mutex); - - return err; -} - -void resctrl_online_cpu(unsigned int cpu) -{ - mutex_lock(&rdtgroup_mutex); - /* The CPU is set in default rdtgroup after online. 
*/ - cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); - mutex_unlock(&rdtgroup_mutex); -} - -static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) -{ - struct rdtgroup *cr; - - list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { - if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) - break; - } -} - -static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, - struct rdt_resource *r) -{ - struct rdt_mon_domain *d; - - lockdep_assert_cpus_held(); - - list_for_each_entry(d, &r->mon_domains, hdr.list) { - /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) - return d; - } - - return NULL; -} - -void resctrl_offline_cpu(unsigned int cpu) -{ - struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3); - struct rdt_mon_domain *d; - struct rdtgroup *rdtgrp; - - mutex_lock(&rdtgroup_mutex); - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { - clear_childcpus(rdtgrp, cpu); - break; - } - } - - if (!l3->mon_capable) - goto out_unlock; - - d = get_mon_domain_from_cpu(cpu, l3); - if (d) { - if (resctrl_is_mbm_enabled() && cpu == d->mbm_work_cpu) { - cancel_delayed_work(&d->mbm_over); - mbm_setup_overflow_handler(d, 0, cpu); - } - if (resctrl_arch_is_llc_occupancy_enabled() && - cpu == d->cqm_work_cpu && has_busy_rmid(d)) { - cancel_delayed_work(&d->cqm_limbo); - cqm_setup_limbo_handler(d, 0, cpu); - } - } - -out_unlock: - mutex_unlock(&rdtgroup_mutex); -} - -/* - * resctrl_init - resctrl filesystem initialization - * - * Setup resctrl file system including set up root, create mount point, - * register resctrl filesystem, and initialize files under root directory. - * - * Return: 0 on success or -errno - */ -int __init resctrl_init(void) -{ - int ret = 0; - - seq_buf_init(&last_cmd_status, last_cmd_status_buf, - sizeof(last_cmd_status_buf)); - - rdtgroup_setup_default(); - - thread_throttle_mode_init(); - - ret = resctrl_mon_resource_init(); - if (ret) - return ret; - - ret = sysfs_create_mount_point(fs_kobj, "resctrl"); - if (ret) { - resctrl_mon_resource_exit(); - return ret; - } - - ret = register_filesystem(&rdt_fs_type); - if (ret) - goto cleanup_mountpoint; - - /* - * Adding the resctrl debugfs directory here may not be ideal since - * it would let the resctrl debugfs directory appear on the debugfs - * filesystem before the resctrl filesystem is mounted. - * It may also be ok since that would enable debugging of RDT before - * resctrl is mounted. - * The reason why the debugfs directory is created here and not in - * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and - * during the debugfs directory creation also &sb->s_type->i_mutex_key - * (the lockdep class of inode->i_rwsem). Other filesystem - * interactions (eg. SyS_getdents) have the lock ordering: - * &sb->s_type->i_mutex_key --> &mm->mmap_lock - * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex - * is taken, thus creating dependency: - * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause - * issues considering the other two lock dependencies. - * By creating the debugfs directory here we avoid a dependency - * that may cause deadlock (even though file operations cannot - * occur until the filesystem is mounted, but I do not know how to - * tell lockdep that). 
- */ - debugfs_resctrl = debugfs_create_dir("resctrl", NULL); - - return 0; - -cleanup_mountpoint: - sysfs_remove_mount_point(fs_kobj, "resctrl"); - resctrl_mon_resource_exit(); - - return ret; -} - -void __exit resctrl_exit(void) -{ - debugfs_remove_recursive(debugfs_resctrl); - unregister_filesystem(&rdt_fs_type); - sysfs_remove_mount_point(fs_kobj, "resctrl"); - - resctrl_mon_resource_exit(); -} diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 16f3ca30626a..dbf6d71bdf18 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -27,6 +27,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, + { X86_FEATURE_APX, CPUID_EDX, 21, 0x00000007, 1 }, { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, { X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 }, { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, @@ -53,7 +54,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, { X86_FEATURE_AMD_LBR_PMC_FREEZE, CPUID_EAX, 2, 0x80000022, 0 }, - { X86_FEATURE_AMD_HETEROGENEOUS_CORES, CPUID_EAX, 30, 0x80000026, 0 }, + { X86_FEATURE_AMD_HTR_CORES, CPUID_EAX, 30, 0x80000026, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/sgx/driver.h b/arch/x86/kernel/cpu/sgx/driver.h index 4eddb4d571ef..30f39f92c98f 100644 --- a/arch/x86/kernel/cpu/sgx/driver.h +++ b/arch/x86/kernel/cpu/sgx/driver.h @@ -2,7 +2,6 @@ #ifndef __ARCH_SGX_DRIVER_H__ #define __ARCH_SGX_DRIVER_H__ -#include <crypto/hash.h> #include <linux/kref.h> #include <linux/mmu_notifier.h> #include <linux/radix-tree.h> diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 776a20172867..66f1efa16fbb 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -3,6 +3,7 @@ #include <asm/mman.h> #include <asm/sgx.h> +#include <crypto/sha2.h> #include <linux/mman.h> #include <linux/delay.h> #include <linux/file.h> @@ -463,31 +464,6 @@ static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg) return ret; } -static int __sgx_get_key_hash(struct crypto_shash *tfm, const void *modulus, - void *hash) -{ - SHASH_DESC_ON_STACK(shash, tfm); - - shash->tfm = tfm; - - return crypto_shash_digest(shash, modulus, SGX_MODULUS_SIZE, hash); -} - -static int sgx_get_key_hash(const void *modulus, void *hash) -{ - struct crypto_shash *tfm; - int ret; - - tfm = crypto_alloc_shash("sha256", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - ret = __sgx_get_key_hash(tfm, modulus, hash); - - crypto_free_shash(tfm); - return ret; -} - static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, void *token) { @@ -523,9 +499,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, sgx_xfrm_reserved_mask) return -EINVAL; - ret = sgx_get_key_hash(sigstruct->modulus, mrsigner); - if (ret) - return ret; + sha256(sigstruct->modulus, SGX_MODULUS_SIZE, (u8 *)mrsigner); mutex_lock(&encl->lock); diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 8ce352fc72ac..2de01b379aa3 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -14,6 +14,7 @@ #include <linux/slab.h> #include <linux/sysfs.h> #include <linux/vmalloc.h> +#include <asm/msr.h> 
#include <asm/sgx.h> #include "driver.h" #include "encl.h" @@ -719,6 +720,8 @@ int arch_memory_failure(unsigned long pfn, int flags) goto out; } + sgx_unmark_page_reclaimable(page); + /* * TBD: Add additional plumbing to enable pre-emptive * action for asynchronous poison notification. Until @@ -871,7 +874,7 @@ void sgx_update_lepubkeyhash(u64 *lepubkeyhash) WARN_ON_ONCE(preemptible()); for (i = 0; i < 4; i++) - wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]); + wrmsrq(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]); } const struct file_operations sgx_provision_fops = { diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 01456236a6dd..e35ccdc84910 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -30,6 +30,7 @@ #include <asm/hypervisor.h> #include <asm/io_apic.h> #include <asm/mpspec.h> +#include <asm/msr.h> #include <asm/smp.h> #include "cpu.h" @@ -154,7 +155,7 @@ static __init bool check_for_real_bsp(u32 apic_id) * kernel must rely on the firmware enumeration order. */ if (has_apic_base) { - rdmsrl(MSR_IA32_APICBASE, msr); + rdmsrq(MSR_IA32_APICBASE, msr); is_bsp = !!(msr & MSR_IA32_APICBASE_BSP); } diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index 03b3c9c3a45e..843b1655ab45 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -3,6 +3,7 @@ #include <asm/apic.h> #include <asm/memtype.h> +#include <asm/msr.h> #include <asm/processor.h> #include "cpu.h" @@ -133,7 +134,7 @@ static void parse_fam10h_node_id(struct topo_scan *tscan) if (!boot_cpu_has(X86_FEATURE_NODEID_MSR)) return; - rdmsrl(MSR_FAM10H_NODE_ID, nid.msr); + rdmsrq(MSR_FAM10H_NODE_ID, nid.msr); store_node(tscan, nid.nodes_per_pkg + 1, nid.node_id); tscan->c->topo.llc_id = nid.node_id; } @@ -160,7 +161,7 @@ static void topoext_fixup(struct topo_scan *tscan) if (msr_set_bit(0xc0011005, 54) <= 0) return; - rdmsrl(0xc0011005, msrval); + rdmsrq(0xc0011005, msrval); if (msrval & BIT_64(54)) { set_cpu_cap(c, X86_FEATURE_TOPOEXT); pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); @@ -182,7 +183,7 @@ static void parse_topology_amd(struct topo_scan *tscan) if (cpu_feature_enabled(X86_FEATURE_TOPOEXT)) has_topoext = cpu_parse_topology_ext(tscan); - if (cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES)) + if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES)) tscan->c->topo.cpu_type = cpuid_ebx(0x80000026); if (!has_topoext && !parse_8000_0008(tscan)) diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c index b31ee4f1657a..49782724a943 100644 --- a/arch/x86/kernel/cpu/tsx.c +++ b/arch/x86/kernel/cpu/tsx.c @@ -12,6 +12,7 @@ #include <asm/cmdline.h> #include <asm/cpu.h> +#include <asm/msr.h> #include "cpu.h" @@ -24,7 +25,7 @@ static void tsx_disable(void) { u64 tsx; - rdmsrl(MSR_IA32_TSX_CTRL, tsx); + rdmsrq(MSR_IA32_TSX_CTRL, tsx); /* Force all transactions to immediately abort */ tsx |= TSX_CTRL_RTM_DISABLE; @@ -37,14 +38,14 @@ static void tsx_disable(void) */ tsx |= TSX_CTRL_CPUID_CLEAR; - wrmsrl(MSR_IA32_TSX_CTRL, tsx); + wrmsrq(MSR_IA32_TSX_CTRL, tsx); } static void tsx_enable(void) { u64 tsx; - rdmsrl(MSR_IA32_TSX_CTRL, tsx); + rdmsrq(MSR_IA32_TSX_CTRL, tsx); /* Enable the RTM feature in the cpu */ tsx &= ~TSX_CTRL_RTM_DISABLE; @@ -56,7 +57,7 @@ static void tsx_enable(void) */ tsx &= ~TSX_CTRL_CPUID_CLEAR; - wrmsrl(MSR_IA32_TSX_CTRL, tsx); + wrmsrq(MSR_IA32_TSX_CTRL, tsx); } static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) @@ 
-115,13 +116,13 @@ static void tsx_clear_cpuid(void) */ if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) && boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) { - rdmsrl(MSR_TSX_FORCE_ABORT, msr); + rdmsrq(MSR_TSX_FORCE_ABORT, msr); msr |= MSR_TFA_TSX_CPUID_CLEAR; - wrmsrl(MSR_TSX_FORCE_ABORT, msr); + wrmsrq(MSR_TSX_FORCE_ABORT, msr); } else if (cpu_feature_enabled(X86_FEATURE_MSR_TSX_CTRL)) { - rdmsrl(MSR_IA32_TSX_CTRL, msr); + rdmsrq(MSR_IA32_TSX_CTRL, msr); msr |= TSX_CTRL_CPUID_CLEAR; - wrmsrl(MSR_IA32_TSX_CTRL, msr); + wrmsrq(MSR_IA32_TSX_CTRL, msr); } } @@ -146,11 +147,11 @@ static void tsx_dev_mode_disable(void) !cpu_feature_enabled(X86_FEATURE_SRBDS_CTRL)) return; - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl); if (mcu_opt_ctrl & RTM_ALLOW) { mcu_opt_ctrl &= ~RTM_ALLOW; - wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl); + wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl); setup_force_cpu_cap(X86_FEATURE_RTM_ALWAYS_ABORT); } } diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c index 2293efd6ffa6..933fcd7ff250 100644 --- a/arch/x86/kernel/cpu/umwait.c +++ b/arch/x86/kernel/cpu/umwait.c @@ -33,7 +33,7 @@ static DEFINE_MUTEX(umwait_lock); static void umwait_update_control_msr(void * unused) { lockdep_assert_irqs_disabled(); - wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0); + wrmsrq(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached)); } /* @@ -71,7 +71,7 @@ static int umwait_cpu_offline(unsigned int cpu) * the original control MSR value in umwait_init(). So there * is no race condition here. */ - wrmsr(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached, 0); + wrmsrq(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached); return 0; } @@ -214,7 +214,7 @@ static int __init umwait_init(void) * changed. This is the only place where orig_umwait_control_cached * is modified. 
*/ - rdmsrl(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached); + rdmsrq(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached); ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online", umwait_cpu_online, umwait_cpu_offline); diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c index 90eba7eb5335..89b1c8a70fe8 100644 --- a/arch/x86/kernel/cpu/zhaoxin.c +++ b/arch/x86/kernel/cpu/zhaoxin.c @@ -4,6 +4,7 @@ #include <asm/cpu.h> #include <asm/cpufeature.h> +#include <asm/msr.h> #include "cpu.h" diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index c6fefd4585f8..71ee20102a8a 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -23,8 +23,6 @@ #include <asm/stacktrace.h> #include <asm/unwind.h> -int panic_on_unrecovered_nmi; -int panic_on_io_nmi; static int die_counter; static struct pt_regs exec_summary_regs; diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index 3aad78bfcb26..cba75306e5b6 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/console.h> #include <linux/kernel.h> +#include <linux/kexec.h> #include <linux/init.h> #include <linux/string.h> #include <linux/screen_info.h> @@ -144,6 +145,11 @@ static __init void early_serial_hw_init(unsigned divisor) static_call(serial_out)(early_serial_base, DLL, divisor & 0xff); static_call(serial_out)(early_serial_base, DLH, (divisor >> 8) & 0xff); static_call(serial_out)(early_serial_base, LCR, c & ~DLAB); + +#if defined(CONFIG_KEXEC_CORE) && defined(CONFIG_X86_64) + if (static_call_query(serial_in) == io_serial_in) + kexec_debug_8250_port = early_serial_base; +#endif } #define DEFAULT_BAUD 9600 @@ -327,6 +333,9 @@ static __init void early_pci_serial_init(char *s) /* WARNING! assuming the address is always in the first 4G */ early_serial_base = (unsigned long)early_ioremap(bar0 & PCI_BASE_ADDRESS_MEM_MASK, 0x10); +#if defined(CONFIG_KEXEC_CORE) && defined(CONFIG_X86_64) + kexec_debug_8250_mmio32 = bar0 & PCI_BASE_ADDRESS_MEM_MASK; +#endif write_pci_config(bus, slot, func, PCI_COMMAND, cmdreg|PCI_COMMAND_MEMORY); } diff --git a/arch/x86/kernel/fpu/context.h b/arch/x86/kernel/fpu/context.h index f6d856bd50bc..10d0a720659c 100644 --- a/arch/x86/kernel/fpu/context.h +++ b/arch/x86/kernel/fpu/context.h @@ -53,7 +53,7 @@ static inline void fpregs_activate(struct fpu *fpu) /* Internal helper for switch_fpu_return() and signal frame setup */ static inline void fpregs_restore_userregs(void) { - struct fpu *fpu = &current->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); int cpu = smp_processor_id(); if (WARN_ON_ONCE(current->flags & (PF_KTHREAD | PF_USER_WORKER))) @@ -67,7 +67,7 @@ static inline void fpregs_restore_userregs(void) * If PKRU is enabled, then the PKRU value is already * correct because it was either set in switch_to() or in * flush_thread(). So it is excluded because it might be - * not up to date in current->thread.fpu.xsave state. + * not up to date in current->thread.fpu->xsave state. * * XFD state is handled in restore_fpregs_from_fpstate(). 
*/ diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 91d6341f281f..ea138583dd92 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -11,6 +11,7 @@ #include <asm/fpu/sched.h> #include <asm/fpu/signal.h> #include <asm/fpu/types.h> +#include <asm/msr.h> #include <asm/traps.h> #include <asm/irq_regs.h> @@ -43,14 +44,27 @@ struct fpu_state_config fpu_user_cfg __ro_after_init; */ struct fpstate init_fpstate __ro_after_init; -/* Track in-kernel FPU usage */ -static DEFINE_PER_CPU(bool, in_kernel_fpu); +/* + * Track FPU initialization and kernel-mode usage. 'true' means the FPU is + * initialized and is not currently being used by the kernel: + */ +DEFINE_PER_CPU(bool, kernel_fpu_allowed); /* * Track which context is using the FPU on the CPU: */ DEFINE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx); +#ifdef CONFIG_X86_DEBUG_FPU +struct fpu *x86_task_fpu(struct task_struct *task) +{ + if (WARN_ON_ONCE(task->flags & PF_KTHREAD)) + return NULL; + + return (void *)task + sizeof(*task); +} +#endif + /* * Can we use the FPU in kernel mode with the * whole "kernel_fpu_begin/end()" sequence? @@ -61,15 +75,18 @@ bool irq_fpu_usable(void) return false; /* - * In kernel FPU usage already active? This detects any explicitly - * nested usage in task or softirq context, which is unsupported. It - * also detects attempted usage in a hardirq that has interrupted a - * kernel-mode FPU section. + * Return false in the following cases: + * + * - FPU is not yet initialized. This can happen only when the call is + * coming from CPU onlining, for example for microcode checksumming. + * - The kernel is already using the FPU, either because of explicit + * nesting (which should never be done), or because of implicit + * nesting when a hardirq interrupted a kernel-mode FPU section. + * + * The single boolean check below handles both cases: */ - if (this_cpu_read(in_kernel_fpu)) { - WARN_ON_FPU(!in_hardirq()); + if (!this_cpu_read(kernel_fpu_allowed)) return false; - } /* * When not in NMI or hard interrupt context, FPU can be used in: @@ -202,7 +219,7 @@ void fpu_reset_from_exception_fixup(void) #if IS_ENABLED(CONFIG_KVM) static void __fpstate_reset(struct fpstate *fpstate, u64 xfd); -static void fpu_init_guest_permissions(struct fpu_guest *gfpu) +static void fpu_lock_guest_permissions(void) { struct fpu_state_perm *fpuperm; u64 perm; @@ -211,15 +228,13 @@ static void fpu_init_guest_permissions(struct fpu_guest *gfpu) return; spin_lock_irq(¤t->sighand->siglock); - fpuperm = ¤t->group_leader->thread.fpu.guest_perm; + fpuperm = &x86_task_fpu(current->group_leader)->guest_perm; perm = fpuperm->__state_perm; /* First fpstate allocation locks down permissions. 
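The fpu/core.c hunk here stops embedding struct fpu in thread_struct: the FPU block now sits directly behind task_struct in the same allocation, and x86_task_fpu() recovers it with pointer arithmetic. A self-contained sketch of that layout trick, using mock_task/mock_fpu as stand-ins for the real task_struct and struct fpu:

    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    struct mock_task { long state; char comm[16]; };   /* stand-in for task_struct */
    struct mock_fpu  { unsigned long long regs[8]; };  /* stand-in for struct fpu  */

    /* The FPU area lives immediately after the task structure. */
    static struct mock_fpu *mock_task_fpu(struct mock_task *t)
    {
            return (struct mock_fpu *)((char *)t + sizeof(*t));
    }

    int main(void)
    {
            /* One allocation covers both objects, as arch_task_struct_size does. */
            struct mock_task *t = calloc(1, sizeof(*t) + sizeof(struct mock_fpu));

            if (!t)
                    return 1;
            strcpy(t->comm, "demo");
            mock_task_fpu(t)->regs[0] = 0xdeadbeef;
            printf("%s: fpu at offset %zu, regs[0]=%llx\n",
                   t->comm, sizeof(*t), mock_task_fpu(t)->regs[0]);
            free(t);
            return 0;
    }

The fpu_clone() hunk further down relies on task_struct being a multiple of the cacheline size, so the trailing FPU area stays cacheline aligned.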
*/ WRITE_ONCE(fpuperm->__state_perm, perm | FPU_GUEST_PERM_LOCKED); spin_unlock_irq(¤t->sighand->siglock); - - gfpu->perm = perm & ~FPU_GUEST_PERM_LOCKED; } bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) @@ -240,7 +255,6 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) gfpu->fpstate = fpstate; gfpu->xfeatures = fpu_kernel_cfg.default_features; - gfpu->perm = fpu_kernel_cfg.default_features; /* * KVM sets the FP+SSE bits in the XSAVE header when copying FPU state @@ -255,7 +269,7 @@ bool fpu_alloc_guest_fpstate(struct fpu_guest *gfpu) if (WARN_ON_ONCE(fpu_user_cfg.default_size > gfpu->uabi_size)) gfpu->uabi_size = fpu_user_cfg.default_size; - fpu_init_guest_permissions(gfpu); + fpu_lock_guest_permissions(); return true; } @@ -263,16 +277,16 @@ EXPORT_SYMBOL_GPL(fpu_alloc_guest_fpstate); void fpu_free_guest_fpstate(struct fpu_guest *gfpu) { - struct fpstate *fps = gfpu->fpstate; + struct fpstate *fpstate = gfpu->fpstate; - if (!fps) + if (!fpstate) return; - if (WARN_ON_ONCE(!fps->is_valloc || !fps->is_guest || fps->in_use)) + if (WARN_ON_ONCE(!fpstate->is_valloc || !fpstate->is_guest || fpstate->in_use)) return; gfpu->fpstate = NULL; - vfree(fps); + vfree(fpstate); } EXPORT_SYMBOL_GPL(fpu_free_guest_fpstate); @@ -323,12 +337,12 @@ EXPORT_SYMBOL_GPL(fpu_update_guest_xfd); */ void fpu_sync_guest_vmexit_xfd_state(void) { - struct fpstate *fps = current->thread.fpu.fpstate; + struct fpstate *fpstate = x86_task_fpu(current)->fpstate; lockdep_assert_irqs_disabled(); if (fpu_state_size_dynamic()) { - rdmsrl(MSR_IA32_XFD, fps->xfd); - __this_cpu_write(xfd_state, fps->xfd); + rdmsrq(MSR_IA32_XFD, fpstate->xfd); + __this_cpu_write(xfd_state, fpstate->xfd); } } EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state); @@ -337,7 +351,7 @@ EXPORT_SYMBOL_GPL(fpu_sync_guest_vmexit_xfd_state); int fpu_swap_kvm_fpstate(struct fpu_guest *guest_fpu, bool enter_guest) { struct fpstate *guest_fps = guest_fpu->fpstate; - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); struct fpstate *cur_fps = fpu->fpstate; fpregs_lock(); @@ -431,14 +445,15 @@ void kernel_fpu_begin_mask(unsigned int kfpu_mask) fpregs_lock(); WARN_ON_FPU(!irq_fpu_usable()); - WARN_ON_FPU(this_cpu_read(in_kernel_fpu)); - this_cpu_write(in_kernel_fpu, true); + /* Toggle kernel_fpu_allowed to false: */ + WARN_ON_FPU(!this_cpu_read(kernel_fpu_allowed)); + this_cpu_write(kernel_fpu_allowed, false); if (!(current->flags & (PF_KTHREAD | PF_USER_WORKER)) && !test_thread_flag(TIF_NEED_FPU_LOAD)) { set_thread_flag(TIF_NEED_FPU_LOAD); - save_fpregs_to_fpstate(¤t->thread.fpu); + save_fpregs_to_fpstate(x86_task_fpu(current)); } __cpu_invalidate_fpregs_state(); @@ -453,9 +468,10 @@ EXPORT_SYMBOL_GPL(kernel_fpu_begin_mask); void kernel_fpu_end(void) { - WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); + /* Toggle kernel_fpu_allowed back to true: */ + WARN_ON_FPU(this_cpu_read(kernel_fpu_allowed)); + this_cpu_write(kernel_fpu_allowed, true); - this_cpu_write(in_kernel_fpu, false); if (!irqs_disabled()) fpregs_unlock(); } @@ -467,7 +483,7 @@ EXPORT_SYMBOL_GPL(kernel_fpu_end); */ void fpu_sync_fpstate(struct fpu *fpu) { - WARN_ON_FPU(fpu != ¤t->thread.fpu); + WARN_ON_FPU(fpu != x86_task_fpu(current)); fpregs_lock(); trace_x86_fpu_before_save(fpu); @@ -552,7 +568,7 @@ void fpstate_reset(struct fpu *fpu) static inline void fpu_inherit_perms(struct fpu *dst_fpu) { if (fpu_state_size_dynamic()) { - struct fpu *src_fpu = ¤t->group_leader->thread.fpu; + struct fpu *src_fpu = x86_task_fpu(current->group_leader); 
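kernel_fpu_begin_mask()/kernel_fpu_end() here invert the old in_kernel_fpu flag into kernel_fpu_allowed, so a single boolean test covers both "FPU not initialized yet" and "already inside a kernel-FPU section"; fpu__init_cpu() (further down) flips it to true once at CPU bring-up. A rough single-threaded sketch of that guard, with a plain global standing in for the per-CPU variable and no preemption or IRQ semantics:

    #include <assert.h>
    #include <stdbool.h>
    #include <stdio.h>

    static bool fpu_allowed;        /* starts false: "not initialized yet" */

    static bool demo_irq_fpu_usable(void) { return fpu_allowed; }

    static void demo_kernel_fpu_begin(void)
    {
            assert(fpu_allowed);    /* nesting or too-early use is a bug */
            fpu_allowed = false;
    }

    static void demo_kernel_fpu_end(void)
    {
            assert(!fpu_allowed);
            fpu_allowed = true;
    }

    int main(void)
    {
            printf("before init: usable=%d\n", demo_irq_fpu_usable());
            fpu_allowed = true;                    /* "CPU init" */
            demo_kernel_fpu_begin();
            printf("inside section: usable=%d\n", demo_irq_fpu_usable());
            demo_kernel_fpu_end();
            printf("after section:  usable=%d\n", demo_irq_fpu_usable());
            return 0;
    }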
spin_lock_irq(¤t->sighand->siglock); /* Fork also inherits the permissions of the parent */ @@ -572,7 +588,7 @@ static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp) if (!ssp) return 0; - xstate = get_xsave_addr(&dst->thread.fpu.fpstate->regs.xsave, + xstate = get_xsave_addr(&x86_task_fpu(dst)->fpstate->regs.xsave, XFEATURE_CET_USER); /* @@ -593,8 +609,16 @@ static int update_fpu_shstk(struct task_struct *dst, unsigned long ssp) int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, unsigned long ssp) { - struct fpu *src_fpu = ¤t->thread.fpu; - struct fpu *dst_fpu = &dst->thread.fpu; + /* + * We allocate the new FPU structure right after the end of the task struct. + * task allocation size already took this into account. + * + * This is safe because task_struct size is a multiple of cacheline size, + * thus x86_task_fpu() will always be cacheline aligned as well. + */ + struct fpu *dst_fpu = (void *)dst + sizeof(*dst); + + BUILD_BUG_ON(sizeof(*dst) % SMP_CACHE_BYTES != 0); /* The new task's FPU state cannot be valid in the hardware. */ dst_fpu->last_cpu = -1; @@ -657,19 +681,22 @@ int fpu_clone(struct task_struct *dst, unsigned long clone_flags, bool minimal, if (update_fpu_shstk(dst, ssp)) return 1; - trace_x86_fpu_copy_src(src_fpu); trace_x86_fpu_copy_dst(dst_fpu); return 0; } /* - * Whitelist the FPU register state embedded into task_struct for hardened - * usercopy. + * While struct fpu is no longer part of struct thread_struct, it is still + * allocated after struct task_struct in the "task_struct" kmem cache. But + * since FPU is expected to be part of struct thread_struct, we have to + * adjust for it here. */ void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size) { - *offset = offsetof(struct thread_struct, fpu.__fpstate.regs); + /* The allocation follows struct task_struct. */ + *offset = sizeof(struct task_struct) - offsetof(struct task_struct, thread); + *offset += offsetof(struct fpu, __fpstate.regs); *size = fpu_kernel_cfg.default_size; } @@ -682,11 +709,18 @@ void fpu_thread_struct_whitelist(unsigned long *offset, unsigned long *size) * a state-restore is coming: either an explicit one, * or a reschedule. */ -void fpu__drop(struct fpu *fpu) +void fpu__drop(struct task_struct *tsk) { + struct fpu *fpu; + + if (test_tsk_thread_flag(tsk, TIF_NEED_FPU_LOAD)) + return; + + fpu = x86_task_fpu(tsk); + preempt_disable(); - if (fpu == ¤t->thread.fpu) { + if (fpu == x86_task_fpu(current)) { /* Ignore delayed exceptions from user space */ asm volatile("1: fwait\n" "2:\n" @@ -718,9 +752,9 @@ static inline void restore_fpregs_from_init_fpstate(u64 features_mask) /* * Reset current->fpu memory state to the init values. */ -static void fpu_reset_fpregs(void) +static void fpu_reset_fpstate_regs(void) { - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); fpregs_lock(); __fpu_invalidate_fpregs_state(fpu); @@ -749,11 +783,11 @@ static void fpu_reset_fpregs(void) */ void fpu__clear_user_states(struct fpu *fpu) { - WARN_ON_FPU(fpu != ¤t->thread.fpu); + WARN_ON_FPU(fpu != x86_task_fpu(current)); fpregs_lock(); if (!cpu_feature_enabled(X86_FEATURE_FPU)) { - fpu_reset_fpregs(); + fpu_reset_fpstate_regs(); fpregs_unlock(); return; } @@ -782,8 +816,8 @@ void fpu__clear_user_states(struct fpu *fpu) void fpu_flush_thread(void) { - fpstate_reset(¤t->thread.fpu); - fpu_reset_fpregs(); + fpstate_reset(x86_task_fpu(current)); + fpu_reset_fpstate_regs(); } /* * Load FPU context before returning to userspace. 
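fpu_thread_struct_whitelist() here has to express the hardened-usercopy window relative to 'thread' even though the register image now lives in the FPU block that trails task_struct. A small demo of that offset arithmetic with mock types (the real code uses task_struct, thread_struct and struct fpu's __fpstate.regs):

    #include <stddef.h>
    #include <stdio.h>

    /* Mock layout: 'thread' is the last member of the task structure and
     * the FPU block is allocated right behind the task structure. */
    struct mock_thread { unsigned long sp, flags; };
    struct mock_task   { long state; struct mock_thread thread; };
    struct mock_fpu    { int last_cpu; unsigned long long regs[8]; };

    int main(void)
    {
            /* Distance from 'thread' to the end of the task structure,
             * plus the offset of the register image in the FPU block. */
            size_t offset = sizeof(struct mock_task) -
                            offsetof(struct mock_task, thread);

            offset += offsetof(struct mock_fpu, regs);
            printf("whitelist starts %zu bytes after 'thread'\n", offset);
            return 0;
    }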
@@ -823,7 +857,7 @@ void fpregs_lock_and_load(void) */ void fpregs_assert_state_consistent(void) { - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); if (test_thread_flag(TIF_NEED_FPU_LOAD)) return; @@ -835,7 +869,7 @@ EXPORT_SYMBOL_GPL(fpregs_assert_state_consistent); void fpregs_mark_activate(void) { - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); fpregs_activate(fpu); fpu->last_cpu = smp_processor_id(); diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c index 998a08f17e33..99db41bf9fa6 100644 --- a/arch/x86/kernel/fpu/init.c +++ b/arch/x86/kernel/fpu/init.c @@ -38,7 +38,7 @@ static void fpu__init_cpu_generic(void) /* Flush out any pending x87 state: */ #ifdef CONFIG_MATH_EMULATION if (!boot_cpu_has(X86_FEATURE_FPU)) - fpstate_init_soft(¤t->thread.fpu.fpstate->regs.soft); + ; else #endif asm volatile ("fninit"); @@ -51,6 +51,9 @@ void fpu__init_cpu(void) { fpu__init_cpu_generic(); fpu__init_cpu_xstate(); + + /* Start allowing kernel-mode FPU: */ + this_cpu_write(kernel_fpu_allowed, true); } static bool __init fpu__probe_without_cpuid(void) @@ -73,6 +76,8 @@ static bool __init fpu__probe_without_cpuid(void) static void __init fpu__init_system_early_generic(void) { + set_thread_flag(TIF_NEED_FPU_LOAD); + if (!boot_cpu_has(X86_FEATURE_CPUID) && !test_bit(X86_FEATURE_FPU, (unsigned long *)cpu_caps_cleared)) { if (fpu__probe_without_cpuid()) @@ -94,7 +99,6 @@ static void __init fpu__init_system_early_generic(void) * Boot time FPU feature detection code: */ unsigned int mxcsr_feature_mask __ro_after_init = 0xffffffffu; -EXPORT_SYMBOL_GPL(mxcsr_feature_mask); static void __init fpu__init_system_mxcsr(void) { @@ -150,11 +154,13 @@ static void __init fpu__init_task_struct_size(void) { int task_size = sizeof(struct task_struct); + task_size += sizeof(struct fpu); + /* * Subtract off the static size of the register state. * It potentially has a bunch of padding. */ - task_size -= sizeof(current->thread.fpu.__fpstate.regs); + task_size -= sizeof(union fpregs_state); /* * Add back the dynamically-calculated register state @@ -164,14 +170,9 @@ static void __init fpu__init_task_struct_size(void) /* * We dynamically size 'struct fpu', so we require that - * it be at the end of 'thread_struct' and that - * 'thread_struct' be at the end of 'task_struct'. If - * you hit a compile error here, check the structure to - * see if something got added to the end. 
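fpu__init_task_struct_size() here adds space for the trailing struct fpu, drops the static union fpregs_state placeholder, and adds back the runtime-sized register image. A sketch of that arithmetic with invented sizes (the real numbers depend on the CPU's enabled xstate features):

    #include <stdio.h>

    int main(void)
    {
            unsigned int task_struct_size  = 9792;   /* invented numbers */
            unsigned int fpu_size          = 4416;
            unsigned int fpregs_state_size = 4096;   /* static placeholder   */
            unsigned int xstate_size       = 8192;   /* actual XSAVE size    */
            unsigned int total = task_struct_size;

            total += fpu_size;           /* trailing struct fpu              */
            total -= fpregs_state_size;  /* drop the static register area    */
            total += xstate_size;        /* add the real, CPU-dependent size */

            printf("arch_task_struct_size would be %u\n", total);
            return 0;
    }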
+ * 'state' be at the end of 'it: */ CHECK_MEMBER_AT_END_OF(struct fpu, __fpstate); - CHECK_MEMBER_AT_END_OF(struct thread_struct, fpu); - CHECK_MEMBER_AT_END_OF(struct task_struct, thread); arch_task_struct_size = task_size; } @@ -204,7 +205,6 @@ static void __init fpu__init_system_xstate_size_legacy(void) fpu_kernel_cfg.default_size = size; fpu_user_cfg.max_size = size; fpu_user_cfg.default_size = size; - fpstate_reset(¤t->thread.fpu); } /* @@ -213,7 +213,6 @@ static void __init fpu__init_system_xstate_size_legacy(void) */ void __init fpu__init_system(void) { - fpstate_reset(¤t->thread.fpu); fpu__init_system_early_generic(); /* diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 887b0b8e21e3..0986c2200adc 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -45,7 +45,7 @@ int regset_xregset_fpregs_active(struct task_struct *target, const struct user_r */ static void sync_fpstate(struct fpu *fpu) { - if (fpu == ¤t->thread.fpu) + if (fpu == x86_task_fpu(current)) fpu_sync_fpstate(fpu); } @@ -63,7 +63,7 @@ static void fpu_force_restore(struct fpu *fpu) * Only stopped child tasks can be used to modify the FPU * state in the fpstate buffer: */ - WARN_ON_FPU(fpu == ¤t->thread.fpu); + WARN_ON_FPU(fpu == x86_task_fpu(current)); __fpu_invalidate_fpregs_state(fpu); } @@ -71,7 +71,7 @@ static void fpu_force_restore(struct fpu *fpu) int xfpregs_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); if (!cpu_feature_enabled(X86_FEATURE_FXSR)) return -ENODEV; @@ -91,7 +91,7 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); struct fxregs_state newstate; int ret; @@ -133,7 +133,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, if (!cpu_feature_enabled(X86_FEATURE_XSAVE)) return -ENODEV; - sync_fpstate(&target->thread.fpu); + sync_fpstate(x86_task_fpu(target)); copy_xstate_to_uabi_buf(to, target, XSTATE_COPY_XSAVE); return 0; @@ -143,7 +143,7 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); struct xregs_state *tmpbuf = NULL; int ret; @@ -187,7 +187,7 @@ int ssp_active(struct task_struct *target, const struct user_regset *regset) int ssp_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); struct cet_user_state *cetregs; if (!cpu_feature_enabled(X86_FEATURE_USER_SHSTK) || @@ -214,7 +214,7 @@ int ssp_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); struct xregs_state *xsave = &fpu->fpstate->regs.xsave; struct cet_user_state *cetregs; unsigned long user_ssp; @@ -368,7 +368,7 @@ static void __convert_from_fxsr(struct user_i387_ia32_struct *env, void convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) { - __convert_from_fxsr(env, tsk, &tsk->thread.fpu.fpstate->regs.fxsave); + __convert_from_fxsr(env, 
tsk, &x86_task_fpu(tsk)->fpstate->regs.fxsave); } void convert_to_fxsr(struct fxregs_state *fxsave, @@ -401,7 +401,7 @@ void convert_to_fxsr(struct fxregs_state *fxsave, int fpregs_get(struct task_struct *target, const struct user_regset *regset, struct membuf to) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); struct user_i387_ia32_struct env; struct fxregs_state fxsave, *fx; @@ -433,7 +433,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { - struct fpu *fpu = &target->thread.fpu; + struct fpu *fpu = x86_task_fpu(target); struct user_i387_ia32_struct env; int ret; diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 6c69cb28b298..c3ec2512f2bb 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -43,13 +43,13 @@ static inline bool check_xstate_in_sigframe(struct fxregs_state __user *fxbuf, * fpstate layout with out copying the extended state information * in the memory layout. */ - if (__get_user(magic2, (__u32 __user *)(fpstate + current->thread.fpu.fpstate->user_size))) + if (__get_user(magic2, (__u32 __user *)(fpstate + x86_task_fpu(current)->fpstate->user_size))) return false; if (likely(magic2 == FP_XSTATE_MAGIC2)) return true; setfx: - trace_x86_fpu_xstate_check_failed(¤t->thread.fpu); + trace_x86_fpu_xstate_check_failed(x86_task_fpu(current)); /* Set the parameters for fx only state */ fx_sw->magic1 = 0; @@ -64,13 +64,13 @@ setfx: static inline bool save_fsave_header(struct task_struct *tsk, void __user *buf) { if (use_fxsr()) { - struct xregs_state *xsave = &tsk->thread.fpu.fpstate->regs.xsave; + struct xregs_state *xsave = &x86_task_fpu(tsk)->fpstate->regs.xsave; struct user_i387_ia32_struct env; struct _fpstate_32 __user *fp = buf; fpregs_lock(); if (!test_thread_flag(TIF_NEED_FPU_LOAD)) - fxsave(&tsk->thread.fpu.fpstate->regs.fxsave); + fxsave(&x86_task_fpu(tsk)->fpstate->regs.fxsave); fpregs_unlock(); convert_from_fxsr(&env, tsk); @@ -114,7 +114,6 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, { struct xregs_state __user *x = buf; struct _fpx_sw_bytes sw_bytes = {}; - u32 xfeatures; int err; /* Setup the bytes not touched by the [f]xsave and reserved for SW. */ @@ -128,12 +127,6 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, (__u32 __user *)(buf + fpstate->user_size)); /* - * Read the xfeatures which we copied (directly from the cpu or - * from the state in task struct) to the user buffers. - */ - err |= __get_user(xfeatures, (__u32 __user *)&x->header.xfeatures); - - /* * For legacy compatible, we always set FP/SSE bits in the bit * vector while saving the state to the user context. This will * enable us capturing any changes(during sigreturn) to @@ -144,9 +137,7 @@ static inline bool save_xstate_epilog(void __user *buf, int ia32_frame, * header as well as change any contents in the memory layout. * xrestore as part of sigreturn will capture all the changes. 
*/ - xfeatures |= XFEATURE_MASK_FPSSE; - - err |= __put_user(xfeatures, (__u32 __user *)&x->header.xfeatures); + err |= set_xfeature_in_sigframe(x, XFEATURE_MASK_FPSSE); return !err; } @@ -184,7 +175,7 @@ static inline int copy_fpregs_to_sigframe(struct xregs_state __user *buf, u32 pk bool copy_fpstate_to_sigframe(void __user *buf, void __user *buf_fx, int size, u32 pkru) { struct task_struct *tsk = current; - struct fpstate *fpstate = tsk->thread.fpu.fpstate; + struct fpstate *fpstate = x86_task_fpu(tsk)->fpstate; bool ia32_fxstate = (buf != buf_fx); int ret; @@ -272,7 +263,7 @@ static int __restore_fpregs_from_user(void __user *buf, u64 ufeatures, */ static bool restore_fpregs_from_user(void __user *buf, u64 xrestore, bool fx_only) { - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); int ret; /* Restore enabled features only. */ @@ -332,7 +323,7 @@ static bool __fpu_restore_sig(void __user *buf, void __user *buf_fx, bool ia32_fxstate) { struct task_struct *tsk = current; - struct fpu *fpu = &tsk->thread.fpu; + struct fpu *fpu = x86_task_fpu(tsk); struct user_i387_ia32_struct env; bool success, fx_only = false; union fpregs_state *fpregs; @@ -452,7 +443,7 @@ static inline unsigned int xstate_sigframe_size(struct fpstate *fpstate) */ bool fpu__restore_sig(void __user *buf, int ia32_frame) { - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); void __user *buf_fx = buf; bool ia32_fxstate = false; bool success = false; @@ -499,7 +490,7 @@ unsigned long fpu__alloc_mathframe(unsigned long sp, int ia32_frame, unsigned long *buf_fx, unsigned long *size) { - unsigned long frame_size = xstate_sigframe_size(current->thread.fpu.fpstate); + unsigned long frame_size = xstate_sigframe_size(x86_task_fpu(current)->fpstate); *buf_fx = sp = round_down(sp - frame_size, 64); if (ia32_frame && use_fxsr()) { diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 6a41d1610d8b..9aa9ac8399ae 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -14,13 +14,15 @@ #include <linux/proc_fs.h> #include <linux/vmalloc.h> #include <linux/coredump.h> +#include <linux/sort.h> #include <asm/fpu/api.h> #include <asm/fpu/regset.h> #include <asm/fpu/signal.h> #include <asm/fpu/xcr.h> -#include <asm/cpuid.h> +#include <asm/cpuid/api.h> +#include <asm/msr.h> #include <asm/tlbflush.h> #include <asm/prctl.h> #include <asm/elf.h> @@ -62,6 +64,7 @@ static const char *xfeature_names[] = "unknown xstate feature", "AMX Tile config", "AMX Tile data", + "APX registers", "unknown xstate feature", }; @@ -80,6 +83,7 @@ static unsigned short xsave_cpuid_features[] __initdata = { [XFEATURE_CET_USER] = X86_FEATURE_SHSTK, [XFEATURE_XTILE_CFG] = X86_FEATURE_AMX_TILE, [XFEATURE_XTILE_DATA] = X86_FEATURE_AMX_TILE, + [XFEATURE_APX] = X86_FEATURE_APX, }; static unsigned int xstate_offsets[XFEATURE_MAX] __ro_after_init = @@ -88,6 +92,31 @@ static unsigned int xstate_sizes[XFEATURE_MAX] __ro_after_init = { [ 0 ... XFEATURE_MAX - 1] = -1}; static unsigned int xstate_flags[XFEATURE_MAX] __ro_after_init; +/* + * Ordering of xstate components in uncompacted format: The xfeature + * number does not necessarily indicate its position in the XSAVE buffer. + * This array defines the traversal order of xstate features. + */ +static unsigned int xfeature_uncompact_order[XFEATURE_MAX] __ro_after_init = + { [ 0 ... 
XFEATURE_MAX - 1] = -1}; + +static inline unsigned int next_xfeature_order(unsigned int i, u64 mask) +{ + for (; xfeature_uncompact_order[i] != -1; i++) { + if (mask & BIT_ULL(xfeature_uncompact_order[i])) + break; + } + + return i; +} + +/* Iterate xstate features in uncompacted order: */ +#define for_each_extended_xfeature_in_order(i, mask) \ + for (i = 0; \ + i = next_xfeature_order(i, mask), \ + xfeature_uncompact_order[i] != -1; \ + i++) + #define XSTATE_FLAG_SUPERVISOR BIT(0) #define XSTATE_FLAG_ALIGNED64 BIT(1) @@ -199,7 +228,7 @@ void fpu__init_cpu_xstate(void) * MSR_IA32_XSS sets supervisor states managed by XSAVES. */ if (boot_cpu_has(X86_FEATURE_XSAVES)) { - wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | + wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | xfeatures_mask_independent()); } } @@ -209,16 +238,20 @@ static bool xfeature_enabled(enum xfeature xfeature) return fpu_kernel_cfg.max_features & BIT_ULL(xfeature); } +static int compare_xstate_offsets(const void *xfeature1, const void *xfeature2) +{ + return xstate_offsets[*(unsigned int *)xfeature1] - + xstate_offsets[*(unsigned int *)xfeature2]; +} + /* * Record the offsets and sizes of various xstates contained - * in the XSAVE state memory layout. + * in the XSAVE state memory layout. Also, create an ordered + * list of xfeatures for handling out-of-order offsets. */ static void __init setup_xstate_cache(void) { - u32 eax, ebx, ecx, edx, i; - /* start at the beginning of the "extended state" */ - unsigned int last_good_offset = offsetof(struct xregs_state, - extended_state_area); + u32 eax, ebx, ecx, edx, xfeature, i = 0; /* * The FP xstates and SSE xstates are legacy states. They are always * in the fixed offsets in the xsave area in either compacted form @@ -232,31 +265,30 @@ static void __init setup_xstate_cache(void) xstate_sizes[XFEATURE_SSE] = sizeof_field(struct fxregs_state, xmm_space); - for_each_extended_xfeature(i, fpu_kernel_cfg.max_features) { - cpuid_count(CPUID_LEAF_XSTATE, i, &eax, &ebx, &ecx, &edx); + for_each_extended_xfeature(xfeature, fpu_kernel_cfg.max_features) { + cpuid_count(CPUID_LEAF_XSTATE, xfeature, &eax, &ebx, &ecx, &edx); - xstate_sizes[i] = eax; - xstate_flags[i] = ecx; + xstate_sizes[xfeature] = eax; + xstate_flags[xfeature] = ecx; /* * If an xfeature is supervisor state, the offset in EBX is * invalid, leave it to -1. */ - if (xfeature_is_supervisor(i)) + if (xfeature_is_supervisor(xfeature)) continue; - xstate_offsets[i] = ebx; + xstate_offsets[xfeature] = ebx; - /* - * In our xstate size checks, we assume that the highest-numbered - * xstate feature has the highest offset in the buffer. Ensure - * it does. - */ - WARN_ONCE(last_good_offset > xstate_offsets[i], - "x86/fpu: misordered xstate at %d\n", last_good_offset); - - last_good_offset = xstate_offsets[i]; + /* Populate the list of xfeatures before sorting */ + xfeature_uncompact_order[i++] = xfeature; } + + /* + * Sort xfeatures by their offsets to support out-of-order + * offsets in the uncompacted format. 
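setup_xstate_cache() now records every extended feature and then sorts the traversal list by buffer offset (the sort() call just below), because in the uncompacted XSAVE layout a higher feature number no longer implies a higher offset; for_each_extended_xfeature_in_order() above then visits only the features present in a mask, in that sorted order. A standalone sketch of both steps with invented offsets (feature 4 deliberately sits below feature 3):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Invented offsets: feature 4 starts below feature 3 in the buffer. */
    static unsigned int offsets[] = { 0, 0, 832, 1472, 1216 };
    static int order[] = { 2, 3, 4, -1 };   /* extended features + sentinel */

    static int cmp_by_offset(const void *a, const void *b)
    {
            return (int)offsets[*(const int *)a] - (int)offsets[*(const int *)b];
    }

    static int next_in_order(int i, uint64_t mask)
    {
            for (; order[i] != -1; i++)
                    if (mask & (1ULL << order[i]))
                            break;
            return i;
    }

    int main(void)
    {
            uint64_t mask = (1ULL << 2) | (1ULL << 4);  /* features 2 and 4 present */

            /* Sort the extended features by their offset in the buffer. */
            qsort(order, 3, sizeof(order[0]), cmp_by_offset);

            /* Visit only the features in 'mask', lowest offset first. */
            for (int i = 0; i = next_in_order(i, mask), order[i] != -1; i++)
                    printf("feature %d at offset %u\n", order[i], offsets[order[i]]);
            return 0;
    }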
+ */ + sort(xfeature_uncompact_order, i, sizeof(unsigned int), compare_xstate_offsets, NULL); } /* @@ -340,7 +372,8 @@ static __init void os_xrstor_booting(struct xregs_state *xstate) XFEATURE_MASK_BNDCSR | \ XFEATURE_MASK_PASID | \ XFEATURE_MASK_CET_USER | \ - XFEATURE_MASK_XTILE) + XFEATURE_MASK_XTILE | \ + XFEATURE_MASK_APX) /* * setup the xstate image representing the init state @@ -540,6 +573,7 @@ static bool __init check_xstate_against_struct(int nr) case XFEATURE_PASID: return XCHECK_SZ(sz, nr, struct ia32_pasid_state); case XFEATURE_XTILE_CFG: return XCHECK_SZ(sz, nr, struct xtile_cfg); case XFEATURE_CET_USER: return XCHECK_SZ(sz, nr, struct cet_user_state); + case XFEATURE_APX: return XCHECK_SZ(sz, nr, struct apx_state); case XFEATURE_XTILE_DATA: check_xtile_data_against_struct(sz); return true; default: XSTATE_WARN_ON(1, "No structure for xstate: %d\n", nr); @@ -552,13 +586,20 @@ static bool __init check_xstate_against_struct(int nr) static unsigned int xstate_calculate_size(u64 xfeatures, bool compacted) { unsigned int topmost = fls64(xfeatures) - 1; - unsigned int offset = xstate_offsets[topmost]; + unsigned int offset, i; if (topmost <= XFEATURE_SSE) return sizeof(struct xregs_state); - if (compacted) + if (compacted) { offset = xfeature_get_offset(xfeatures, topmost); + } else { + /* Walk through the xfeature order to pick the last */ + for_each_extended_xfeature_in_order(i, xfeatures) + topmost = xfeature_uncompact_order[i]; + offset = xstate_offsets[topmost]; + } + return offset + xstate_sizes[topmost]; } @@ -639,7 +680,7 @@ static unsigned int __init get_xsave_compacted_size(void) return get_compacted_size(); /* Disable independent features. */ - wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor()); + wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor()); /* * Ask the hardware what size is required of the buffer. @@ -648,7 +689,7 @@ static unsigned int __init get_xsave_compacted_size(void) size = get_compacted_size(); /* Re-enable independent features so XSAVES will work on them again. */ - wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); + wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | mask); return size; } @@ -711,6 +752,8 @@ static int __init init_xstate_size(void) */ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) { + pr_info("x86/fpu: XSAVE disabled\n"); + fpu_kernel_cfg.max_features = 0; cr4_clear_bits(X86_CR4_OSXSAVE); setup_clear_cpu_cap(X86_FEATURE_XSAVE); @@ -727,7 +770,7 @@ static void __init fpu__init_disable_system_xstate(unsigned int legacy_size) */ init_fpstate.xfd = 0; - fpstate_reset(¤t->thread.fpu); + fpstate_reset(x86_task_fpu(current)); } /* @@ -775,6 +818,17 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) goto out_disable; } + if (fpu_kernel_cfg.max_features & XFEATURE_MASK_APX && + fpu_kernel_cfg.max_features & (XFEATURE_MASK_BNDREGS | XFEATURE_MASK_BNDCSR)) { + /* + * This is a problematic CPU configuration where two + * conflicting state components are both enumerated. 
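xstate_calculate_size() here handles the uncompacted format by walking the offset-sorted list and ending the buffer after the last enabled feature, since the highest-numbered feature no longer necessarily ends the buffer. An equivalent max()-style formulation as a standalone sketch, with invented offsets and sizes:

    #include <stdint.h>
    #include <stdio.h>

    /* Invented layout: feature 4 has a higher number than feature 3 but
     * ends earlier in the buffer. */
    static const unsigned int offsets[] = { 0, 576, 832, 1216, 1088 };
    static const unsigned int sizes[]   = { 576, 256, 384, 64, 128 };

    static unsigned int uncompacted_size(uint64_t mask)
    {
            unsigned int end = offsets[0] + sizes[0];   /* legacy area + header */

            for (int i = 1; i <= 4; i++) {
                    if (!(mask & (1ULL << i)))
                            continue;
                    if (offsets[i] + sizes[i] > end)
                            end = offsets[i] + sizes[i];
            }
            return end;
    }

    int main(void)
    {
            printf("%u\n", uncompacted_size((1ULL << 1) | (1ULL << 3)));  /* 1280 */
            printf("%u\n", uncompacted_size((1ULL << 1) | (1ULL << 2)));  /* 1216 */
            return 0;
    }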
+ */ + pr_err("x86/fpu: Both APX/MPX present in the CPU's xstate features: 0x%llx.\n", + fpu_kernel_cfg.max_features); + goto out_disable; + } + fpu_kernel_cfg.independent_features = fpu_kernel_cfg.max_features & XFEATURE_MASK_INDEPENDENT; @@ -834,9 +888,6 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) if (err) goto out_disable; - /* Reset the state for the current task */ - fpstate_reset(¤t->thread.fpu); - /* * Update info used for ptrace frames; use standard-format size and no * supervisor xstates: @@ -852,7 +903,7 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) init_fpstate.xfeatures = fpu_kernel_cfg.default_features; if (init_fpstate.size > sizeof(init_fpstate.regs)) { - pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d), disabling XSAVE\n", + pr_warn("x86/fpu: init_fpstate buffer too small (%zu < %d)\n", sizeof(init_fpstate.regs), init_fpstate.size); goto out_disable; } @@ -864,7 +915,7 @@ void __init fpu__init_system_xstate(unsigned int legacy_size) * xfeatures mask. */ if (xfeatures != fpu_kernel_cfg.max_features) { - pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init, disabling XSAVE\n", + pr_err("x86/fpu: xfeatures modified from 0x%016llx to 0x%016llx during init\n", xfeatures, fpu_kernel_cfg.max_features); goto out_disable; } @@ -904,12 +955,12 @@ void fpu__resume_cpu(void) * of XSAVES and MSR_IA32_XSS. */ if (cpu_feature_enabled(X86_FEATURE_XSAVES)) { - wrmsrl(MSR_IA32_XSS, xfeatures_mask_supervisor() | + wrmsrq(MSR_IA32_XSS, xfeatures_mask_supervisor() | xfeatures_mask_independent()); } if (fpu_state_size_dynamic()) - wrmsrl(MSR_IA32_XFD, current->thread.fpu.fpstate->xfd); + wrmsrq(MSR_IA32_XFD, x86_task_fpu(current)->fpstate->xfd); } /* @@ -1071,10 +1122,9 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, const unsigned int off_mxcsr = offsetof(struct fxregs_state, mxcsr); struct xregs_state *xinit = &init_fpstate.regs.xsave; struct xregs_state *xsave = &fpstate->regs.xsave; + unsigned int zerofrom, i, xfeature; struct xstate_header header; - unsigned int zerofrom; u64 mask; - int i; memset(&header, 0, sizeof(header)); header.xfeatures = xsave->header.xfeatures; @@ -1143,15 +1193,16 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, */ mask = header.xfeatures; - for_each_extended_xfeature(i, mask) { + for_each_extended_xfeature_in_order(i, mask) { + xfeature = xfeature_uncompact_order[i]; /* * If there was a feature or alignment gap, zero the space * in the destination buffer. */ - if (zerofrom < xstate_offsets[i]) - membuf_zero(&to, xstate_offsets[i] - zerofrom); + if (zerofrom < xstate_offsets[xfeature]) + membuf_zero(&to, xstate_offsets[xfeature] - zerofrom); - if (i == XFEATURE_PKRU) { + if (xfeature == XFEATURE_PKRU) { struct pkru_state pkru = {0}; /* * PKRU is not necessarily up to date in the @@ -1161,14 +1212,14 @@ void __copy_xstate_to_uabi_buf(struct membuf to, struct fpstate *fpstate, membuf_write(&to, &pkru, sizeof(pkru)); } else { membuf_write(&to, - __raw_xsave_addr(xsave, i), - xstate_sizes[i]); + __raw_xsave_addr(xsave, xfeature), + xstate_sizes[xfeature]); } /* * Keep track of the last copied state in the non-compacted * target buffer for gap zeroing. 
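The __copy_xstate_to_uabi_buf() changes here walk the features in offset order and zero-fill any hole between the end of one feature and the start of the next, so the user-visible buffer contains no uninitialized gaps. A standalone sketch of that zerofrom bookkeeping, with memset() standing in for membuf_zero()/membuf_write() and an invented two-feature layout:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    static const unsigned int offsets[] = { 16, 48 };   /* invented layout     */
    static const unsigned int sizes[]   = { 8, 8 };     /* with a hole between */

    int main(void)
    {
            uint8_t out[64];
            unsigned int zerofrom = 0;

            memset(out, 0xff, sizeof(out));   /* pretend the buffer is dirty */

            for (int i = 0; i < 2; i++) {
                    if (zerofrom < offsets[i])   /* zero the gap (membuf_zero) */
                            memset(out + zerofrom, 0x00, offsets[i] - zerofrom);
                    /* "copy" the feature (membuf_write) */
                    memset(out + offsets[i], 0xaa, sizes[i]);
                    zerofrom = offsets[i] + sizes[i];
            }
            memset(out + zerofrom, 0x00, sizeof(out) - zerofrom);  /* tail gap */

            for (unsigned int i = 0; i < sizeof(out); i++)
                    printf("%02x%c", out[i], (i % 16 == 15) ? '\n' : ' ');
            return 0;
    }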
*/ - zerofrom = xstate_offsets[i] + xstate_sizes[i]; + zerofrom = xstate_offsets[xfeature] + xstate_sizes[xfeature]; } out: @@ -1191,8 +1242,8 @@ out: void copy_xstate_to_uabi_buf(struct membuf to, struct task_struct *tsk, enum xstate_copy_mode copy_mode) { - __copy_xstate_to_uabi_buf(to, tsk->thread.fpu.fpstate, - tsk->thread.fpu.fpstate->user_xfeatures, + __copy_xstate_to_uabi_buf(to, x86_task_fpu(tsk)->fpstate, + x86_task_fpu(tsk)->fpstate->user_xfeatures, tsk->thread.pkru, copy_mode); } @@ -1332,7 +1383,7 @@ int copy_uabi_from_kernel_to_xstate(struct fpstate *fpstate, const void *kbuf, u int copy_sigframe_from_user_to_xstate(struct task_struct *tsk, const void __user *ubuf) { - return copy_uabi_to_xstate(tsk->thread.fpu.fpstate, NULL, ubuf, &tsk->thread.pkru); + return copy_uabi_to_xstate(x86_task_fpu(tsk)->fpstate, NULL, ubuf, &tsk->thread.pkru); } static bool validate_independent_components(u64 mask) @@ -1398,9 +1449,9 @@ void xrstors(struct xregs_state *xstate, u64 mask) } #if IS_ENABLED(CONFIG_KVM) -void fpstate_clear_xstate_component(struct fpstate *fps, unsigned int xfeature) +void fpstate_clear_xstate_component(struct fpstate *fpstate, unsigned int xfeature) { - void *addr = get_xsave_addr(&fps->regs.xsave, xfeature); + void *addr = get_xsave_addr(&fpstate->regs.xsave, xfeature); if (addr) memset(addr, 0, xstate_sizes[xfeature]); @@ -1426,7 +1477,7 @@ static bool xstate_op_valid(struct fpstate *fpstate, u64 mask, bool rstor) * The XFD MSR does not match fpstate->xfd. That's invalid when * the passed in fpstate is current's fpstate. */ - if (fpstate->xfd == current->thread.fpu.fpstate->xfd) + if (fpstate->xfd == x86_task_fpu(current)->fpstate->xfd) return false; /* @@ -1503,7 +1554,7 @@ void fpstate_free(struct fpu *fpu) static int fpstate_realloc(u64 xfeatures, unsigned int ksize, unsigned int usize, struct fpu_guest *guest_fpu) { - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); struct fpstate *curfps, *newfps = NULL; unsigned int fpsize; bool in_use; @@ -1596,7 +1647,7 @@ static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) * AVX512. */ bool compacted = cpu_feature_enabled(X86_FEATURE_XCOMPACTED); - struct fpu *fpu = ¤t->group_leader->thread.fpu; + struct fpu *fpu = x86_task_fpu(current->group_leader); struct fpu_state_perm *perm; unsigned int ksize, usize; u64 mask; @@ -1606,16 +1657,20 @@ static int __xstate_request_perm(u64 permitted, u64 requested, bool guest) if ((permitted & requested) == requested) return 0; - /* Calculate the resulting kernel state size */ + /* + * Calculate the resulting kernel state size. Note, @permitted also + * contains supervisor xfeatures even though supervisor are always + * permitted for kernel and guest FPUs, and never permitted for user + * FPUs. + */ mask = permitted | requested; - /* Take supervisor states into account on the host */ - if (!guest) - mask |= xfeatures_mask_supervisor(); ksize = xstate_calculate_size(mask, compacted); - /* Calculate the resulting user state size */ - mask &= XFEATURE_MASK_USER_SUPPORTED; - usize = xstate_calculate_size(mask, false); + /* + * Calculate the resulting user state size. Take care not to clobber + * the supervisor xfeatures in the new mask! 
+ */ + usize = xstate_calculate_size(mask & XFEATURE_MASK_USER_SUPPORTED, false); if (!guest) { ret = validate_sigaltstack(usize); @@ -1699,7 +1754,7 @@ int __xfd_enable_feature(u64 xfd_err, struct fpu_guest *guest_fpu) return -EPERM; } - fpu = ¤t->group_leader->thread.fpu; + fpu = x86_task_fpu(current->group_leader); perm = guest_fpu ? &fpu->guest_perm : &fpu->perm; ksize = perm->__state_size; usize = perm->__user_state_size; @@ -1804,7 +1859,7 @@ long fpu_xstate_prctl(int option, unsigned long arg2) */ static void avx512_status(struct seq_file *m, struct task_struct *task) { - unsigned long timestamp = READ_ONCE(task->thread.fpu.avx512_timestamp); + unsigned long timestamp = READ_ONCE(x86_task_fpu(task)->avx512_timestamp); long delta; if (!timestamp) { diff --git a/arch/x86/kernel/fpu/xstate.h b/arch/x86/kernel/fpu/xstate.h index 0fd34f53f025..52ce19289989 100644 --- a/arch/x86/kernel/fpu/xstate.h +++ b/arch/x86/kernel/fpu/xstate.h @@ -5,6 +5,7 @@ #include <asm/cpufeature.h> #include <asm/fpu/xstate.h> #include <asm/fpu/xcr.h> +#include <asm/msr.h> #ifdef CONFIG_X86_64 DECLARE_PER_CPU(u64, xfd_state); @@ -22,7 +23,7 @@ static inline void xstate_init_xcomp_bv(struct xregs_state *xsave, u64 mask) static inline u64 xstate_get_group_perm(bool guest) { - struct fpu *fpu = ¤t->group_leader->thread.fpu; + struct fpu *fpu = x86_task_fpu(current->group_leader); struct fpu_state_perm *perm; /* Pairs with WRITE_ONCE() in xstate_request_perm() */ @@ -69,21 +70,31 @@ static inline u64 xfeatures_mask_independent(void) return fpu_kernel_cfg.independent_features; } +static inline int set_xfeature_in_sigframe(struct xregs_state __user *xbuf, u64 mask) +{ + u64 xfeatures; + int err; + + /* Read the xfeatures value already saved in the user buffer */ + err = __get_user(xfeatures, &xbuf->header.xfeatures); + xfeatures |= mask; + err |= __put_user(xfeatures, &xbuf->header.xfeatures); + + return err; +} + /* * Update the value of PKRU register that was already pushed onto the signal frame. */ -static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u64 mask, u32 pkru) +static inline int update_pkru_in_sigframe(struct xregs_state __user *buf, u32 pkru) { - u64 xstate_bv; int err; if (unlikely(!cpu_feature_enabled(X86_FEATURE_OSPKE))) return 0; /* Mark PKRU as in-use so that it is restored correctly. */ - xstate_bv = (mask & xfeatures_in_use()) | XFEATURE_MASK_PKRU; - - err = __put_user(xstate_bv, &buf->header.xfeatures); + err = set_xfeature_in_sigframe(buf, XFEATURE_MASK_PKRU); if (err) return err; @@ -171,7 +182,7 @@ static inline void xfd_validate_state(struct fpstate *fpstate, u64 mask, bool rs #ifdef CONFIG_X86_64 static inline void xfd_set_state(u64 xfd) { - wrmsrl(MSR_IA32_XFD, xfd); + wrmsrq(MSR_IA32_XFD, xfd); __this_cpu_write(xfd_state, xfd); } @@ -288,7 +299,7 @@ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf, u32 pkr * internally, e.g. PKRU. That's user space ABI and also required * to allow the signal handler to modify PKRU. 
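set_xfeature_in_sigframe() here turns the sigframe update into a read-modify-write: it ORs the requested bits into whatever xfeatures value XSAVE already stored in the user buffer, instead of recomputing and overwriting the whole word as update_pkru_in_sigframe() used to. A userspace analog of that helper, with a plain struct standing in for the user-space frame (the real code goes through __get_user()/__put_user()):

    #include <stdint.h>
    #include <stdio.h>

    struct sigframe_header { uint64_t xfeatures; };   /* mock xsave header */

    /* Read-modify-write of the xfeatures word, preserving whatever bits
     * were already recorded there. */
    static int set_feature_bits(struct sigframe_header *hdr, uint64_t mask)
    {
            uint64_t xfeatures = hdr->xfeatures;   /* __get_user() */

            xfeatures |= mask;
            hdr->xfeatures = xfeatures;            /* __put_user() */
            return 0;
    }

    int main(void)
    {
            struct sigframe_header hdr = { .xfeatures = 0x3 };   /* FP | SSE */

            set_feature_bits(&hdr, 1ULL << 9);                   /* e.g. PKRU */
            printf("xfeatures = %#llx\n", (unsigned long long)hdr.xfeatures);
            return 0;
    }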
*/ - struct fpstate *fpstate = current->thread.fpu.fpstate; + struct fpstate *fpstate = x86_task_fpu(current)->fpstate; u64 mask = fpstate->user_xfeatures; u32 lmask; u32 hmask; @@ -307,7 +318,7 @@ static inline int xsave_to_user_sigframe(struct xregs_state __user *buf, u32 pkr clac(); if (!err) - err = update_pkru_in_sigframe(buf, mask, pkru); + err = update_pkru_in_sigframe(buf, pkru); return err; } @@ -322,7 +333,7 @@ static inline int xrstor_from_user_sigframe(struct xregs_state __user *buf, u64 u32 hmask = mask >> 32; int err; - xfd_validate_state(current->thread.fpu.fpstate, mask, true); + xfd_validate_state(x86_task_fpu(current)->fpstate, mask, true); stac(); XSTATE_OP(XRSTOR, xstate, lmask, hmask, err); diff --git a/arch/x86/kernel/fred.c b/arch/x86/kernel/fred.c index 5e2cd1004980..816187da3a47 100644 --- a/arch/x86/kernel/fred.c +++ b/arch/x86/kernel/fred.c @@ -3,6 +3,7 @@ #include <asm/desc.h> #include <asm/fred.h> +#include <asm/msr.h> #include <asm/tlbflush.h> #include <asm/traps.h> @@ -43,23 +44,23 @@ void cpu_init_fred_exceptions(void) */ loadsegment(ss, __KERNEL_DS); - wrmsrl(MSR_IA32_FRED_CONFIG, + wrmsrq(MSR_IA32_FRED_CONFIG, /* Reserve for CALL emulation */ FRED_CONFIG_REDZONE | FRED_CONFIG_INT_STKLVL(0) | FRED_CONFIG_ENTRYPOINT(asm_fred_entrypoint_user)); - wrmsrl(MSR_IA32_FRED_STKLVLS, 0); + wrmsrq(MSR_IA32_FRED_STKLVLS, 0); /* * Ater a CPU offline/online cycle, the FRED RSP0 MSR should be * resynchronized with its per-CPU cache. */ - wrmsrl(MSR_IA32_FRED_RSP0, __this_cpu_read(fred_rsp0)); + wrmsrq(MSR_IA32_FRED_RSP0, __this_cpu_read(fred_rsp0)); - wrmsrl(MSR_IA32_FRED_RSP1, 0); - wrmsrl(MSR_IA32_FRED_RSP2, 0); - wrmsrl(MSR_IA32_FRED_RSP3, 0); + wrmsrq(MSR_IA32_FRED_RSP1, 0); + wrmsrq(MSR_IA32_FRED_RSP2, 0); + wrmsrq(MSR_IA32_FRED_RSP3, 0); /* Enable FRED */ cr4_set_bits(X86_CR4_FRED); @@ -79,14 +80,14 @@ void cpu_init_fred_rsps(void) * (remember that user space faults are always taken on stack level 0) * is to avoid overflowing the kernel stack. */ - wrmsrl(MSR_IA32_FRED_STKLVLS, + wrmsrq(MSR_IA32_FRED_STKLVLS, FRED_STKLVL(X86_TRAP_DB, FRED_DB_STACK_LEVEL) | FRED_STKLVL(X86_TRAP_NMI, FRED_NMI_STACK_LEVEL) | FRED_STKLVL(X86_TRAP_MC, FRED_MC_STACK_LEVEL) | FRED_STKLVL(X86_TRAP_DF, FRED_DF_STACK_LEVEL)); /* The FRED equivalents to IST stacks... */ - wrmsrl(MSR_IA32_FRED_RSP1, __this_cpu_ist_top_va(DB)); - wrmsrl(MSR_IA32_FRED_RSP2, __this_cpu_ist_top_va(NMI)); - wrmsrl(MSR_IA32_FRED_RSP3, __this_cpu_ist_top_va(DF)); + wrmsrq(MSR_IA32_FRED_RSP1, __this_cpu_ist_top_va(DB)); + wrmsrq(MSR_IA32_FRED_RSP2, __this_cpu_ist_top_va(NMI)); + wrmsrq(MSR_IA32_FRED_RSP3, __this_cpu_ist_top_va(DF)); } diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 5eb1514af559..252e82bcfd2f 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -55,10 +55,10 @@ void ftrace_arch_code_modify_post_process(void) { /* * ftrace_make_{call,nop}() may be called during - * module load, and we need to finish the text_poke_queue() + * module load, and we need to finish the smp_text_poke_batch_add() * that they do, here. 
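The ftrace hunk here (and the jump-label and kprobes hunks further down) rename text_poke_queue()/text_poke_finish()/text_poke_bp() to smp_text_poke_batch_add()/smp_text_poke_batch_finish()/smp_text_poke_single(); the pattern is unchanged: queue many patch sites, then flush them with one round of cross-CPU synchronization. A rough userspace analog of that queue-then-flush shape (not the kernel implementation; the flush-when-full behaviour is an assumption of this sketch):

    #include <stdio.h>

    #define BATCH_MAX 4

    struct poke { void *addr; int len; };

    static struct poke batch[BATCH_MAX];
    static int batch_len;

    /* Analog of smp_text_poke_batch_finish(): apply everything queued. */
    static void batch_finish(void)
    {
            if (!batch_len)
                    return;
            printf("applying %d queued pokes with one sync\n", batch_len);
            batch_len = 0;
    }

    /* Analog of smp_text_poke_batch_add(): queue one site, flush if full. */
    static void batch_add(void *addr, int len)
    {
            if (batch_len == BATCH_MAX)
                    batch_finish();
            batch[batch_len].addr = addr;
            batch[batch_len].len  = len;
            batch_len++;
    }

    int main(void)
    {
            static char site[6][5];

            for (int i = 0; i < 6; i++)
                    batch_add(site[i], 5);
            batch_finish();
            return 0;
    }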
*/ - text_poke_finish(); + smp_text_poke_batch_finish(); ftrace_poke_late = 0; mutex_unlock(&text_mutex); } @@ -119,7 +119,7 @@ ftrace_modify_code_direct(unsigned long ip, const char *old_code, /* replace the text with the new text */ if (ftrace_poke_late) - text_poke_queue((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_batch_add((void *)ip, new_code, MCOUNT_INSN_SIZE, NULL); else text_poke_early((void *)ip, new_code, MCOUNT_INSN_SIZE); return 0; @@ -186,11 +186,11 @@ int ftrace_update_ftrace_func(ftrace_func_t func) ip = (unsigned long)(&ftrace_call); new = ftrace_call_replace(ip, (unsigned long)func); - text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL); ip = (unsigned long)(&ftrace_regs_call); new = ftrace_call_replace(ip, (unsigned long)func); - text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL); return 0; } @@ -247,10 +247,10 @@ void ftrace_replace_code(int enable) break; } - text_poke_queue((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_batch_add((void *)rec->ip, new, MCOUNT_INSN_SIZE, NULL); ftrace_update_record(rec, enable); } - text_poke_finish(); + smp_text_poke_batch_finish(); } void arch_ftrace_update_code(int command) @@ -492,7 +492,7 @@ void arch_ftrace_update_trampoline(struct ftrace_ops *ops) mutex_lock(&text_mutex); /* Do a safe modify in case the trampoline is executing */ new = ftrace_call_replace(ip, (unsigned long)func); - text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL); mutex_unlock(&text_mutex); } @@ -586,7 +586,7 @@ static int ftrace_mod_jmp(unsigned long ip, void *func) const char *new; new = ftrace_jmp_replace(ip, (unsigned long)func); - text_poke_bp((void *)ip, new, MCOUNT_INSN_SIZE, NULL); + smp_text_poke_single((void *)ip, new, MCOUNT_INSN_SIZE, NULL); return 0; } diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index fa9b6339975f..533fcf5636fc 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -47,234 +47,22 @@ * Manage page tables very early on. */ extern pmd_t early_dynamic_pgts[EARLY_DYNAMIC_PAGE_TABLES][PTRS_PER_PMD]; -static unsigned int __initdata next_early_pgt; +unsigned int __initdata next_early_pgt; +SYM_PIC_ALIAS(next_early_pgt); pmdval_t early_pmd_flags = __PAGE_KERNEL_LARGE & ~(_PAGE_GLOBAL | _PAGE_NX); -#ifdef CONFIG_X86_5LEVEL unsigned int __pgtable_l5_enabled __ro_after_init; unsigned int pgdir_shift __ro_after_init = 39; EXPORT_SYMBOL(pgdir_shift); unsigned int ptrs_per_p4d __ro_after_init = 1; EXPORT_SYMBOL(ptrs_per_p4d); -#endif -#ifdef CONFIG_DYNAMIC_MEMORY_LAYOUT unsigned long page_offset_base __ro_after_init = __PAGE_OFFSET_BASE_L4; EXPORT_SYMBOL(page_offset_base); unsigned long vmalloc_base __ro_after_init = __VMALLOC_BASE_L4; EXPORT_SYMBOL(vmalloc_base); unsigned long vmemmap_base __ro_after_init = __VMEMMAP_BASE_L4; EXPORT_SYMBOL(vmemmap_base); -#endif - -static inline bool check_la57_support(void) -{ - if (!IS_ENABLED(CONFIG_X86_5LEVEL)) - return false; - - /* - * 5-level paging is detected and enabled at kernel decompression - * stage. Only check if it has been enabled there. 
- */ - if (!(native_read_cr4() & X86_CR4_LA57)) - return false; - - RIP_REL_REF(__pgtable_l5_enabled) = 1; - RIP_REL_REF(pgdir_shift) = 48; - RIP_REL_REF(ptrs_per_p4d) = 512; - RIP_REL_REF(page_offset_base) = __PAGE_OFFSET_BASE_L5; - RIP_REL_REF(vmalloc_base) = __VMALLOC_BASE_L5; - RIP_REL_REF(vmemmap_base) = __VMEMMAP_BASE_L5; - - return true; -} - -static unsigned long __head sme_postprocess_startup(struct boot_params *bp, - pmdval_t *pmd, - unsigned long p2v_offset) -{ - unsigned long paddr, paddr_end; - int i; - - /* Encrypt the kernel and related (if SME is active) */ - sme_encrypt_kernel(bp); - - /* - * Clear the memory encryption mask from the .bss..decrypted section. - * The bss section will be memset to zero later in the initialization so - * there is no need to zero it after changing the memory encryption - * attribute. - */ - if (sme_get_me_mask()) { - paddr = (unsigned long)&RIP_REL_REF(__start_bss_decrypted); - paddr_end = (unsigned long)&RIP_REL_REF(__end_bss_decrypted); - - for (; paddr < paddr_end; paddr += PMD_SIZE) { - /* - * On SNP, transition the page to shared in the RMP table so that - * it is consistent with the page table attribute change. - * - * __start_bss_decrypted has a virtual address in the high range - * mapping (kernel .text). PVALIDATE, by way of - * early_snp_set_memory_shared(), requires a valid virtual - * address but the kernel is currently running off of the identity - * mapping so use the PA to get a *currently* valid virtual address. - */ - early_snp_set_memory_shared(paddr, paddr, PTRS_PER_PMD); - - i = pmd_index(paddr - p2v_offset); - pmd[i] -= sme_get_me_mask(); - } - } - - /* - * Return the SME encryption mask (if SME is active) to be used as a - * modifier for the initial pgdir entry programmed into CR3. - */ - return sme_get_me_mask(); -} - -/* Code in __startup_64() can be relocated during execution, but the compiler - * doesn't have to generate PC-relative relocations when accessing globals from - * that function. Clang actually does not generate them, which leads to - * boot-time crashes. To work around this problem, every global pointer must - * be accessed using RIP_REL_REF(). Kernel virtual addresses can be determined - * by subtracting p2v_offset from the RIP-relative address. - */ -unsigned long __head __startup_64(unsigned long p2v_offset, - struct boot_params *bp) -{ - pmd_t (*early_pgts)[PTRS_PER_PMD] = RIP_REL_REF(early_dynamic_pgts); - unsigned long physaddr = (unsigned long)&RIP_REL_REF(_text); - unsigned long va_text, va_end; - unsigned long pgtable_flags; - unsigned long load_delta; - pgdval_t *pgd; - p4dval_t *p4d; - pudval_t *pud; - pmdval_t *pmd, pmd_entry; - bool la57; - int i; - - la57 = check_la57_support(); - - /* Is the address too large? */ - if (physaddr >> MAX_PHYSMEM_BITS) - for (;;); - - /* - * Compute the delta between the address I am compiled to run at - * and the address I am actually running at. - */ - load_delta = __START_KERNEL_map + p2v_offset; - RIP_REL_REF(phys_base) = load_delta; - - /* Is the address not 2M aligned? 
*/ - if (load_delta & ~PMD_MASK) - for (;;); - - va_text = physaddr - p2v_offset; - va_end = (unsigned long)&RIP_REL_REF(_end) - p2v_offset; - - /* Include the SME encryption mask in the fixup value */ - load_delta += sme_get_me_mask(); - - /* Fixup the physical addresses in the page table */ - - pgd = &RIP_REL_REF(early_top_pgt)->pgd; - pgd[pgd_index(__START_KERNEL_map)] += load_delta; - - if (IS_ENABLED(CONFIG_X86_5LEVEL) && la57) { - p4d = (p4dval_t *)&RIP_REL_REF(level4_kernel_pgt); - p4d[MAX_PTRS_PER_P4D - 1] += load_delta; - - pgd[pgd_index(__START_KERNEL_map)] = (pgdval_t)p4d | _PAGE_TABLE; - } - - RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 2].pud += load_delta; - RIP_REL_REF(level3_kernel_pgt)[PTRS_PER_PUD - 1].pud += load_delta; - - for (i = FIXMAP_PMD_TOP; i > FIXMAP_PMD_TOP - FIXMAP_PMD_NUM; i--) - RIP_REL_REF(level2_fixmap_pgt)[i].pmd += load_delta; - - /* - * Set up the identity mapping for the switchover. These - * entries should *NOT* have the global bit set! This also - * creates a bunch of nonsense entries but that is fine -- - * it avoids problems around wraparound. - */ - - pud = &early_pgts[0]->pmd; - pmd = &early_pgts[1]->pmd; - RIP_REL_REF(next_early_pgt) = 2; - - pgtable_flags = _KERNPG_TABLE_NOENC + sme_get_me_mask(); - - if (la57) { - p4d = &early_pgts[RIP_REL_REF(next_early_pgt)++]->pmd; - - i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; - pgd[i + 0] = (pgdval_t)p4d + pgtable_flags; - pgd[i + 1] = (pgdval_t)p4d + pgtable_flags; - - i = physaddr >> P4D_SHIFT; - p4d[(i + 0) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags; - p4d[(i + 1) % PTRS_PER_P4D] = (pgdval_t)pud + pgtable_flags; - } else { - i = (physaddr >> PGDIR_SHIFT) % PTRS_PER_PGD; - pgd[i + 0] = (pgdval_t)pud + pgtable_flags; - pgd[i + 1] = (pgdval_t)pud + pgtable_flags; - } - - i = physaddr >> PUD_SHIFT; - pud[(i + 0) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags; - pud[(i + 1) % PTRS_PER_PUD] = (pudval_t)pmd + pgtable_flags; - - pmd_entry = __PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL; - /* Filter out unsupported __PAGE_KERNEL_* bits: */ - pmd_entry &= RIP_REL_REF(__supported_pte_mask); - pmd_entry += sme_get_me_mask(); - pmd_entry += physaddr; - - for (i = 0; i < DIV_ROUND_UP(va_end - va_text, PMD_SIZE); i++) { - int idx = i + (physaddr >> PMD_SHIFT); - - pmd[idx % PTRS_PER_PMD] = pmd_entry + i * PMD_SIZE; - } - - /* - * Fixup the kernel text+data virtual addresses. Note that - * we might write invalid pmds, when the kernel is relocated - * cleanup_highmap() fixes this up along with the mappings - * beyond _end. - * - * Only the region occupied by the kernel image has so far - * been checked against the table of usable memory regions - * provided by the firmware, so invalidate pages outside that - * region. A page table entry that maps to a reserved area of - * memory would allow processor speculation into that area, - * and on some hardware (particularly the UV platform) even - * speculative access to some reserved areas is caught as an - * error, causing the BIOS to halt the system. 
- */ - - pmd = &RIP_REL_REF(level2_kernel_pgt)->pmd; - - /* invalidate pages before the kernel image */ - for (i = 0; i < pmd_index(va_text); i++) - pmd[i] &= ~_PAGE_PRESENT; - - /* fixup pages that are part of the kernel image */ - for (; i <= pmd_index(va_end); i++) - if (pmd[i] & _PAGE_PRESENT) - pmd[i] += load_delta; - - /* invalidate pages after the kernel image */ - for (; i < PTRS_PER_PMD; i++) - pmd[i] &= ~_PAGE_PRESENT; - - return sme_postprocess_startup(bp, pmd, p2v_offset); -} /* Wipe all early page tables except for the kernel symbol map */ static void __init reset_early_page_tables(void) @@ -449,6 +237,12 @@ asmlinkage __visible void __init __noreturn x86_64_start_kernel(char * real_mode /* Kill off the identity-map trampoline */ reset_early_page_tables(); + if (pgtable_l5_enabled()) { + page_offset_base = __PAGE_OFFSET_BASE_L5; + vmalloc_base = __VMALLOC_BASE_L5; + vmemmap_base = __VMEMMAP_BASE_L5; + } + clear_bss(); /* @@ -513,41 +307,6 @@ void __init __noreturn x86_64_start_reservations(char *real_mode_data) start_kernel(); } -/* - * Data structures and code used for IDT setup in head_64.S. The bringup-IDT is - * used until the idt_table takes over. On the boot CPU this happens in - * x86_64_start_kernel(), on secondary CPUs in start_secondary(). In both cases - * this happens in the functions called from head_64.S. - * - * The idt_table can't be used that early because all the code modifying it is - * in idt.c and can be instrumented by tracing or KASAN, which both don't work - * during early CPU bringup. Also the idt_table has the runtime vectors - * configured which require certain CPU state to be setup already (like TSS), - * which also hasn't happened yet in early CPU bringup. - */ -static gate_desc bringup_idt_table[NUM_EXCEPTION_VECTORS] __page_aligned_data; - -/* This may run while still in the direct mapping */ -static void __head startup_64_load_idt(void *vc_handler) -{ - struct desc_ptr desc = { - .address = (unsigned long)&RIP_REL_REF(bringup_idt_table), - .size = sizeof(bringup_idt_table) - 1, - }; - struct idt_data data; - gate_desc idt_desc; - - /* @vc_handler is set only for a VMM Communication Exception */ - if (vc_handler) { - init_idt_data(&data, X86_TRAP_VC, vc_handler); - idt_init_desc(&idt_desc, &data); - native_write_idt_entry((gate_desc *)desc.address, X86_TRAP_VC, &idt_desc); - } - - native_load_idt(&desc); -} - -/* This is used when running on kernel addresses */ void early_setup_idt(void) { void *handler = NULL; @@ -559,30 +318,3 @@ void early_setup_idt(void) startup_64_load_idt(handler); } - -/* - * Setup boot CPU state needed before kernel switches to virtual addresses. 
- */ -void __head startup_64_setup_gdt_idt(void) -{ - struct desc_struct *gdt = (void *)(__force unsigned long)gdt_page.gdt; - void *handler = NULL; - - struct desc_ptr startup_gdt_descr = { - .address = (unsigned long)&RIP_REL_REF(*gdt), - .size = GDT_SIZE - 1, - }; - - /* Load GDT */ - native_load_gdt(&startup_gdt_descr); - - /* New GDT is live - reload data segment registers */ - asm volatile("movl %%eax, %%ds\n" - "movl %%eax, %%ss\n" - "movl %%eax, %%es\n" : : "a"(__KERNEL_DS) : "memory"); - - if (IS_ENABLED(CONFIG_AMD_MEM_ENCRYPT)) - handler = &RIP_REL_REF(vc_no_ghcb); - - startup_64_load_idt(handler); -} diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 2e42056d2306..76743dfad6ab 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -86,7 +86,7 @@ SYM_CODE_START(startup_32) movl $pa(__bss_stop),%ecx subl %edi,%ecx shrl $2,%ecx - rep ; stosl + rep stosl /* * Copy bootup parameters out of the way. * Note: %esi still has the pointer to the real-mode data. @@ -98,15 +98,13 @@ SYM_CODE_START(startup_32) movl $pa(boot_params),%edi movl $(PARAM_SIZE/4),%ecx cld - rep - movsl + rep movsl movl pa(boot_params) + NEW_CL_POINTER,%esi andl %esi,%esi jz 1f # No command line movl $pa(boot_command_line),%edi movl $(COMMAND_LINE_SIZE/4),%ecx - rep - movsl + rep movsl 1: #ifdef CONFIG_OLPC diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index fefe2a25cf02..3e9b3a3bd039 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -573,6 +573,7 @@ SYM_CODE_START_NOALIGN(vc_no_ghcb) /* Pure iret required here - don't use INTERRUPT_RETURN */ iretq SYM_CODE_END(vc_no_ghcb) +SYM_PIC_ALIAS(vc_no_ghcb); #endif #ifdef CONFIG_MITIGATION_PAGE_TABLE_ISOLATION @@ -604,10 +605,12 @@ SYM_DATA_START_PTI_ALIGNED(early_top_pgt) .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC .fill PTI_USER_PGD_FILL,8,0 SYM_DATA_END(early_top_pgt) +SYM_PIC_ALIAS(early_top_pgt) SYM_DATA_START_PAGE_ALIGNED(early_dynamic_pgts) .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 SYM_DATA_END(early_dynamic_pgts) +SYM_PIC_ALIAS(early_dynamic_pgts); SYM_DATA(early_recursion_flag, .long 0) @@ -646,12 +649,11 @@ SYM_DATA_START_PTI_ALIGNED(init_top_pgt) SYM_DATA_END(init_top_pgt) #endif -#ifdef CONFIG_X86_5LEVEL SYM_DATA_START_PAGE_ALIGNED(level4_kernel_pgt) .fill 511,8,0 .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC SYM_DATA_END(level4_kernel_pgt) -#endif +SYM_PIC_ALIAS(level4_kernel_pgt) SYM_DATA_START_PAGE_ALIGNED(level3_kernel_pgt) .fill L3_START_KERNEL,8,0 @@ -659,6 +661,7 @@ SYM_DATA_START_PAGE_ALIGNED(level3_kernel_pgt) .quad level2_kernel_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC .quad level2_fixmap_pgt - __START_KERNEL_map + _PAGE_TABLE_NOENC SYM_DATA_END(level3_kernel_pgt) +SYM_PIC_ALIAS(level3_kernel_pgt) SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt) /* @@ -676,6 +679,7 @@ SYM_DATA_START_PAGE_ALIGNED(level2_kernel_pgt) */ PMDS(0, __PAGE_KERNEL_LARGE_EXEC, KERNEL_IMAGE_SIZE/PMD_SIZE) SYM_DATA_END(level2_kernel_pgt) +SYM_PIC_ALIAS(level2_kernel_pgt) SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt) .fill (512 - 4 - FIXMAP_PMD_NUM),8,0 @@ -688,6 +692,7 @@ SYM_DATA_START_PAGE_ALIGNED(level2_fixmap_pgt) /* 6 MB reserved space + a 2MB hole */ .fill 4,8,0 SYM_DATA_END(level2_fixmap_pgt) +SYM_PIC_ALIAS(level2_fixmap_pgt) SYM_DATA_START_PAGE_ALIGNED(level1_fixmap_pgt) .rept (FIXMAP_PMD_NUM) @@ -703,6 +708,7 @@ SYM_DATA(smpboot_control, .long 0) .align 16 /* This must match the first entry in level2_kernel_pgt */ SYM_DATA(phys_base, .quad 0x0) 
+SYM_PIC_ALIAS(phys_base); EXPORT_SYMBOL(phys_base) #include "../xen/xen-head.S" diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 7f4b2966e15c..d6387dde3ff9 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -7,11 +7,12 @@ #include <linux/cpu.h> #include <linux/irq.h> -#include <asm/cpuid.h> +#include <asm/cpuid/api.h> #include <asm/irq_remapping.h> #include <asm/hpet.h> #include <asm/time.h> #include <asm/mwait.h> +#include <asm/msr.h> #undef pr_fmt #define pr_fmt(fmt) "hpet: " fmt @@ -970,7 +971,7 @@ static bool __init hpet_is_pc10_damaged(void) return false; /* Check whether PC10 is enabled in PKG C-state limit */ - rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, pcfg); + rdmsrq(MSR_PKG_CST_CONFIG_CONTROL, pcfg); if ((pcfg & 0xF) < 8) return false; diff --git a/arch/x86/kernel/jailhouse.c b/arch/x86/kernel/jailhouse.c index cd8ed1edbf9e..9e9a591a5fec 100644 --- a/arch/x86/kernel/jailhouse.c +++ b/arch/x86/kernel/jailhouse.c @@ -49,7 +49,7 @@ static uint32_t jailhouse_cpuid_base(void) !boot_cpu_has(X86_FEATURE_HYPERVISOR)) return 0; - return hypervisor_cpuid_base("Jailhouse\0\0\0", 0); + return cpuid_base_hypervisor("Jailhouse\0\0\0", 0); } static uint32_t __init jailhouse_detect(void) diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index f5b8ef02d172..a7949a54a0ff 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -102,7 +102,7 @@ __jump_label_transform(struct jump_entry *entry, return; } - text_poke_bp((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); + smp_text_poke_single((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); } static void __ref jump_label_transform(struct jump_entry *entry, @@ -135,7 +135,7 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry, mutex_lock(&text_mutex); jlp = __jump_label_patch(entry, type); - text_poke_queue((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); + smp_text_poke_batch_add((void *)jump_entry_code(entry), jlp.code, jlp.size, NULL); mutex_unlock(&text_mutex); return true; } @@ -143,6 +143,6 @@ bool arch_jump_label_transform_queue(struct jump_entry *entry, void arch_jump_label_transform_apply(void) { mutex_lock(&text_mutex); - text_poke_finish(); + smp_text_poke_batch_finish(); mutex_unlock(&text_mutex); } diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 09608fd93687..47cb8eb138ba 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -808,7 +808,7 @@ void arch_arm_kprobe(struct kprobe *p) u8 int3 = INT3_INSN_OPCODE; text_poke(p->addr, &int3, 1); - text_poke_sync(); + smp_text_poke_sync_each_cpu(); perf_event_text_poke(p->addr, &p->opcode, 1, &int3, 1); } @@ -818,7 +818,7 @@ void arch_disarm_kprobe(struct kprobe *p) perf_event_text_poke(p->addr, &int3, 1, &p->opcode, 1); text_poke(p->addr, &p->opcode, 1); - text_poke_sync(); + smp_text_poke_sync_each_cpu(); } void arch_remove_kprobe(struct kprobe *p) diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 36d6809c6c9e..0aabd4c4e2c4 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -488,7 +488,7 @@ void arch_optimize_kprobes(struct list_head *oplist) insn_buff[0] = JMP32_INSN_OPCODE; *(s32 *)(&insn_buff[1]) = rel; - text_poke_bp(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL); + smp_text_poke_single(op->kp.addr, insn_buff, JMP32_INSN_SIZE, NULL); list_del_init(&op->list); } @@ -513,11 +513,11 @@ void arch_unoptimize_kprobe(struct optimized_kprobe *op) JMP32_INSN_SIZE - 
INT3_INSN_SIZE); text_poke(addr, new, INT3_INSN_SIZE); - text_poke_sync(); + smp_text_poke_sync_each_cpu(); text_poke(addr + INT3_INSN_SIZE, new + INT3_INSN_SIZE, JMP32_INSN_SIZE - INT3_INSN_SIZE); - text_poke_sync(); + smp_text_poke_sync_each_cpu(); perf_event_text_poke(op->kp.addr, old, JMP32_INSN_SIZE, new, JMP32_INSN_SIZE); } diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 3be9b3342c67..921c1c783bc1 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -40,6 +40,7 @@ #include <asm/mtrr.h> #include <asm/tlb.h> #include <asm/cpuidle_haltpoll.h> +#include <asm/msr.h> #include <asm/ptrace.h> #include <asm/reboot.h> #include <asm/svm.h> @@ -301,7 +302,7 @@ DEFINE_IDTENTRY_SYSVEC(sysvec_kvm_asyncpf_interrupt) token = __this_cpu_read(apf_reason.token); kvm_async_pf_task_wake(token); __this_cpu_write(apf_reason.token, 0); - wrmsrl(MSR_KVM_ASYNC_PF_ACK, 1); + wrmsrq(MSR_KVM_ASYNC_PF_ACK, 1); } set_irq_regs(old_regs); @@ -327,7 +328,7 @@ static void kvm_register_steal_time(void) if (!has_steal_clock) return; - wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); + wrmsrq(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED)); pr_debug("stealtime: cpu %d, msr %llx\n", cpu, (unsigned long long) slow_virt_to_phys(st)); } @@ -361,9 +362,9 @@ static void kvm_guest_cpu_init(void) if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT)) pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; - wrmsrl(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR); + wrmsrq(MSR_KVM_ASYNC_PF_INT, HYPERVISOR_CALLBACK_VECTOR); - wrmsrl(MSR_KVM_ASYNC_PF_EN, pa); + wrmsrq(MSR_KVM_ASYNC_PF_EN, pa); __this_cpu_write(async_pf_enabled, true); pr_debug("setup async PF for cpu %d\n", smp_processor_id()); } @@ -376,7 +377,7 @@ static void kvm_guest_cpu_init(void) __this_cpu_write(kvm_apic_eoi, 0); pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi)) | KVM_MSR_ENABLED; - wrmsrl(MSR_KVM_PV_EOI_EN, pa); + wrmsrq(MSR_KVM_PV_EOI_EN, pa); } if (has_steal_clock) @@ -388,7 +389,7 @@ static void kvm_pv_disable_apf(void) if (!__this_cpu_read(async_pf_enabled)) return; - wrmsrl(MSR_KVM_ASYNC_PF_EN, 0); + wrmsrq(MSR_KVM_ASYNC_PF_EN, 0); __this_cpu_write(async_pf_enabled, false); pr_debug("disable async PF for cpu %d\n", smp_processor_id()); @@ -399,7 +400,7 @@ static void kvm_disable_steal_time(void) if (!has_steal_clock) return; - wrmsr(MSR_KVM_STEAL_TIME, 0, 0); + wrmsrq(MSR_KVM_STEAL_TIME, 0); } static u64 kvm_steal_clock(int cpu) @@ -451,9 +452,9 @@ static void kvm_guest_cpu_offline(bool shutdown) { kvm_disable_steal_time(); if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) - wrmsrl(MSR_KVM_PV_EOI_EN, 0); + wrmsrq(MSR_KVM_PV_EOI_EN, 0); if (kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) - wrmsrl(MSR_KVM_MIGRATION_CONTROL, 0); + wrmsrq(MSR_KVM_MIGRATION_CONTROL, 0); kvm_pv_disable_apf(); if (!shutdown) apf_task_wake_all(); @@ -615,7 +616,7 @@ static int __init setup_efi_kvm_sev_migration(void) } pr_info("%s : live migration enabled in EFI\n", __func__); - wrmsrl(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY); + wrmsrq(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY); return 1; } @@ -728,7 +729,7 @@ static int kvm_suspend(void) #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) - rdmsrl(MSR_KVM_POLL_CONTROL, val); + rdmsrq(MSR_KVM_POLL_CONTROL, val); has_guest_poll = !(val & 1); #endif return 0; @@ -740,7 +741,7 @@ static void kvm_resume(void) #ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL if (kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL) && has_guest_poll) - 
wrmsrl(MSR_KVM_POLL_CONTROL, 0); + wrmsrq(MSR_KVM_POLL_CONTROL, 0); #endif } @@ -874,7 +875,7 @@ static noinline uint32_t __kvm_cpuid_base(void) return 0; /* So we don't blow up on old processors */ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) - return hypervisor_cpuid_base(KVM_SIGNATURE, 0); + return cpuid_base_hypervisor(KVM_SIGNATURE, 0); return 0; } @@ -975,7 +976,7 @@ static void __init kvm_init_platform(void) * If not booted using EFI, enable Live migration support. */ if (!efi_enabled(EFI_BOOT)) - wrmsrl(MSR_KVM_MIGRATION_CONTROL, + wrmsrq(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY); } kvmclock_init(); @@ -1124,12 +1125,12 @@ out: static void kvm_disable_host_haltpoll(void *i) { - wrmsrl(MSR_KVM_POLL_CONTROL, 0); + wrmsrq(MSR_KVM_POLL_CONTROL, 0); } static void kvm_enable_host_haltpoll(void *i) { - wrmsrl(MSR_KVM_POLL_CONTROL, 1); + wrmsrq(MSR_KVM_POLL_CONTROL, 1); } void arch_haltpoll_enable(unsigned int cpu) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 5b2c15214a6b..ca0a49eeac4a 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -60,7 +60,7 @@ EXPORT_PER_CPU_SYMBOL_GPL(hv_clock_per_cpu); */ static void kvm_get_wallclock(struct timespec64 *now) { - wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock)); + wrmsrq(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock)); preempt_disable(); pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now); preempt_enable(); @@ -173,7 +173,7 @@ static void kvm_register_clock(char *txt) return; pa = slow_virt_to_phys(&src->pvti) | 0x01ULL; - wrmsrl(msr_kvm_system_time, pa); + wrmsrq(msr_kvm_system_time, pa); pr_debug("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt); } @@ -196,7 +196,7 @@ static void kvm_setup_secondary_clock(void) void kvmclock_disable(void) { if (msr_kvm_system_time) - native_write_msr(msr_kvm_system_time, 0, 0); + native_write_msr(msr_kvm_system_time, 0); } static void __init kvmclock_init_mem(void) diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index a68f5a0a9f37..949c9e4bfad2 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -76,6 +76,19 @@ map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) static int map_acpi_tables(struct x86_mapping_info *info, pgd_t *level4p) { return 0; } #endif +static int map_mmio_serial(struct x86_mapping_info *info, pgd_t *level4p) +{ + unsigned long mstart, mend; + + if (!kexec_debug_8250_mmio32) + return 0; + + mstart = kexec_debug_8250_mmio32 & PAGE_MASK; + mend = (kexec_debug_8250_mmio32 + PAGE_SIZE + 23) & PAGE_MASK; + pr_info("Map PCI serial at %lx - %lx\n", mstart, mend); + return kernel_ident_mapping_init(info, level4p, mstart, mend); +} + #ifdef CONFIG_KEXEC_FILE const struct kexec_file_ops * const kexec_file_loaders[] = { &kexec_bzImage64_ops, @@ -285,6 +298,10 @@ static int init_pgtable(struct kimage *image, unsigned long control_page) if (result) return result; + result = map_mmio_serial(&info, image->arch.pgd); + if (result) + return result; + /* * This must be last because the intermediate page table pages it * allocates will not be control pages and may overlap the image. 
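
An aside on the map_mmio_serial() hunk above: the helper only appears to need the few 8250 registers that the kexec debug output path touches later in this diff, so it rounds the MMIO window out to whole pages before handing it to kernel_ident_mapping_init(). A minimal standalone sketch of that rounding, assuming a hypothetical UART base address and 4 KiB pages — only the masking and the "+ 23" constant come from the hunk, everything else here is illustrative:

```c
/*
 * Illustrative sketch only (not part of the patch): the page rounding used by
 * map_mmio_serial() above.  Keeping bytes base..base+23 mapped is enough to
 * reach the TXR (offset 0) and LSR (offset 5*4 = 20) registers that the MMIO
 * debug routine uses, even when the window straddles a page boundary.
 */
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096ULL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
	/* Hypothetical 32-bit-register 8250 base, deliberately near a page end. */
	uint64_t mmio32 = 0xfe041ff0ULL;

	uint64_t mstart = mmio32 & PAGE_MASK;
	uint64_t mend   = (mmio32 + PAGE_SIZE + 23) & PAGE_MASK;

	printf("Map PCI serial at %#llx - %#llx (%llu page(s))\n",
	       (unsigned long long)mstart, (unsigned long long)mend,
	       (unsigned long long)((mend - mstart) / PAGE_SIZE));
	return 0;
}
```

With the example base above this maps two pages, since the register window crosses into the following page.
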
@@ -304,6 +321,24 @@ static void load_segments(void) ); } +static void prepare_debug_idt(unsigned long control_page, unsigned long vec_ofs) +{ + gate_desc idtentry = { 0 }; + int i; + + idtentry.bits.p = 1; + idtentry.bits.type = GATE_TRAP; + idtentry.segment = __KERNEL_CS; + idtentry.offset_low = (control_page & 0xFFFF) + vec_ofs; + idtentry.offset_middle = (control_page >> 16) & 0xFFFF; + idtentry.offset_high = control_page >> 32; + + for (i = 0; i < 16; i++) { + kexec_debug_idt[i] = idtentry; + idtentry.offset_low += KEXEC_DEBUG_EXC_HANDLER_SIZE; + } +} + int machine_kexec_prepare(struct kimage *image) { void *control_page = page_address(image->control_code_page); @@ -321,6 +356,9 @@ int machine_kexec_prepare(struct kimage *image) if (image->type == KEXEC_TYPE_DEFAULT) kexec_pa_swap_page = page_to_pfn(image->swap_page) << PAGE_SHIFT; + prepare_debug_idt((unsigned long)__pa(control_page), + (unsigned long)kexec_debug_exc_vectors - reloc_start); + __memcpy(control_page, __relocate_kernel_start, reloc_end - reloc_start); set_memory_rox((unsigned long)control_page, 1); @@ -396,16 +434,10 @@ void __nocfi machine_kexec(struct kimage *image) * with from a table in memory. At no other time is the * descriptor table in memory accessed. * - * I take advantage of this here by force loading the - * segments, before I zap the gdt with an invalid value. + * Take advantage of this here by force loading the segments, + * before the GDT is zapped with an invalid value. */ load_segments(); - /* - * The gdt & idt are now invalid. - * If you want to load them you must set up your own idt & gdt. - */ - native_idt_invalidate(); - native_gdt_invalidate(); /* now call it */ image->start = relocate_kernel_ptr((unsigned long)image->head, diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c index 1f54eedc3015..ef6104e7cc72 100644 --- a/arch/x86/kernel/mmconf-fam10h_64.c +++ b/arch/x86/kernel/mmconf-fam10h_64.c @@ -97,7 +97,7 @@ static void get_fam10h_pci_mmconf_base(void) /* SYS_CFG */ address = MSR_AMD64_SYSCFG; - rdmsrl(address, val); + rdmsrq(address, val); /* TOP_MEM2 is not enabled? 
*/ if (!(val & (1<<21))) { @@ -105,7 +105,7 @@ static void get_fam10h_pci_mmconf_base(void) } else { /* TOP_MEM2 */ address = MSR_K8_TOP_MEM2; - rdmsrl(address, val); + rdmsrq(address, val); tom2 = max(val & 0xffffff800000ULL, 1ULL << 32); } @@ -177,7 +177,7 @@ void fam10h_check_enable_mmcfg(void) return; address = MSR_FAM10H_MMIO_CONF_BASE; - rdmsrl(address, val); + rdmsrq(address, val); /* try to make sure that AP's setting is identical to BSP setting */ if (val & FAM10H_MMIO_CONF_ENABLE) { @@ -212,7 +212,7 @@ void fam10h_check_enable_mmcfg(void) (FAM10H_MMIO_CONF_BUSRANGE_MASK<<FAM10H_MMIO_CONF_BUSRANGE_SHIFT)); val |= fam10h_pci_mmconf_base | (8 << FAM10H_MMIO_CONF_BUSRANGE_SHIFT) | FAM10H_MMIO_CONF_ENABLE; - wrmsrl(address, val); + wrmsrq(address, val); } static int __init set_check_enable_amd_mmconf(const struct dmi_system_id *d) diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index ff07558b7ebc..0ffbae902e2f 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -206,7 +206,7 @@ static int write_relocate_add(Elf64_Shdr *sechdrs, write, apply); if (!early) { - text_poke_sync(); + smp_text_poke_sync_each_cpu(); mutex_unlock(&text_mutex); } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 9a95d00f1423..be93ec7255bf 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -49,27 +49,20 @@ struct nmi_desc { struct list_head head; }; -static struct nmi_desc nmi_desc[NMI_MAX] = -{ - { - .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[0].lock), - .head = LIST_HEAD_INIT(nmi_desc[0].head), - }, - { - .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[1].lock), - .head = LIST_HEAD_INIT(nmi_desc[1].head), - }, - { - .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[2].lock), - .head = LIST_HEAD_INIT(nmi_desc[2].head), - }, - { - .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[3].lock), - .head = LIST_HEAD_INIT(nmi_desc[3].head), - }, +#define NMI_DESC_INIT(type) { \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(&nmi_desc[type].lock), \ + .head = LIST_HEAD_INIT(nmi_desc[type].head), \ +} +static struct nmi_desc nmi_desc[NMI_MAX] = { + NMI_DESC_INIT(NMI_LOCAL), + NMI_DESC_INIT(NMI_UNKNOWN), + NMI_DESC_INIT(NMI_SERR), + NMI_DESC_INIT(NMI_IO_CHECK), }; +#define nmi_to_desc(type) (&nmi_desc[type]) + struct nmi_stats { unsigned int normal; unsigned int unknown; @@ -91,6 +84,9 @@ static DEFINE_PER_CPU(struct nmi_stats, nmi_stats); static int ignore_nmis __read_mostly; int unknown_nmi_panic; +int panic_on_unrecovered_nmi; +int panic_on_io_nmi; + /* * Prevent NMI reason port (0x61) being accessed simultaneously, can * only be used in NMI handler. 
@@ -104,8 +100,6 @@ static int __init setup_unknown_nmi_panic(char *str) } __setup("unknown_nmi_panic", setup_unknown_nmi_panic); -#define nmi_to_desc(type) (&nmi_desc[type]) - static u64 nmi_longest_ns = 1 * NSEC_PER_MSEC; static int __init nmi_warning_debugfs(void) @@ -125,12 +119,12 @@ static void nmi_check_duration(struct nmiaction *action, u64 duration) action->max_duration = duration; - remainder_ns = do_div(duration, (1000 * 1000)); - decimal_msecs = remainder_ns / 1000; + /* Convert duration from nsec to msec */ + remainder_ns = do_div(duration, NSEC_PER_MSEC); + decimal_msecs = remainder_ns / NSEC_PER_USEC; - printk_ratelimited(KERN_INFO - "INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n", - action->handler, duration, decimal_msecs); + pr_info_ratelimited("INFO: NMI handler (%ps) took too long to run: %lld.%03d msecs\n", + action->handler, duration, decimal_msecs); } static int nmi_handle(unsigned int type, struct pt_regs *regs) @@ -333,10 +327,9 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs) int handled; /* - * Use 'false' as back-to-back NMIs are dealt with one level up. - * Of course this makes having multiple 'unknown' handlers useless - * as only the first one is ever run (unless it can actually determine - * if it caused the NMI) + * As a last resort, let the "unknown" handlers make a + * best-effort attempt to figure out if they can claim + * responsibility for this Unknown NMI. */ handled = nmi_handle(NMI_UNKNOWN, regs); if (handled) { @@ -366,17 +359,18 @@ static noinstr void default_do_nmi(struct pt_regs *regs) bool b2b = false; /* - * CPU-specific NMI must be processed before non-CPU-specific - * NMI, otherwise we may lose it, because the CPU-specific - * NMI can not be detected/processed on other CPUs. - */ - - /* - * Back-to-back NMIs are interesting because they can either - * be two NMI or more than two NMIs (any thing over two is dropped - * due to NMI being edge-triggered). If this is the second half - * of the back-to-back NMI, assume we dropped things and process - * more handlers. Otherwise reset the 'swallow' NMI behaviour + * Back-to-back NMIs are detected by comparing the RIP of the + * current NMI with that of the previous NMI. If it is the same, + * it is assumed that the CPU did not have a chance to jump back + * into a non-NMI context and execute code in between the two + * NMIs. + * + * They are interesting because even if there are more than two, + * only a maximum of two can be detected (anything over two is + * dropped due to NMI being edge-triggered). If this is the + * second half of the back-to-back NMI, assume we dropped things + * and process more handlers. Otherwise, reset the 'swallow' NMI + * behavior. */ if (regs->ip == __this_cpu_read(last_nmi_rip)) b2b = true; @@ -390,6 +384,11 @@ static noinstr void default_do_nmi(struct pt_regs *regs) if (microcode_nmi_handler_enabled() && microcode_nmi_handler()) goto out; + /* + * CPU-specific NMI must be processed before non-CPU-specific + * NMI, otherwise we may lose it, because the CPU-specific + * NMI can not be detected/processed on other CPUs. 
+ */ handled = nmi_handle(NMI_LOCAL, regs); __this_cpu_add(nmi_stats.normal, handled); if (handled) { @@ -426,13 +425,14 @@ static noinstr void default_do_nmi(struct pt_regs *regs) pci_serr_error(reason, regs); else if (reason & NMI_REASON_IOCHK) io_check_error(reason, regs); -#ifdef CONFIG_X86_32 + /* * Reassert NMI in case it became active * meanwhile as it's edge-triggered: */ - reassert_nmi(); -#endif + if (IS_ENABLED(CONFIG_X86_32)) + reassert_nmi(); + __this_cpu_add(nmi_stats.external, 1); raw_spin_unlock(&nmi_reason_lock); goto out; @@ -751,4 +751,3 @@ void local_touch_nmi(void) { __this_cpu_write(last_nmi_rip, 0); } -EXPORT_SYMBOL_GPL(local_touch_nmi); diff --git a/arch/x86/kernel/nmi_selftest.c b/arch/x86/kernel/nmi_selftest.c index e93a8545c74d..a010e9d062bf 100644 --- a/arch/x86/kernel/nmi_selftest.c +++ b/arch/x86/kernel/nmi_selftest.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 /* - * arch/x86/kernel/nmi-selftest.c - * * Testsuite for NMI: IPIs * * Started by Don Zickus: @@ -30,7 +28,6 @@ static DECLARE_BITMAP(nmi_ipi_mask, NR_CPUS) __initdata; static int __initdata testcase_total; static int __initdata testcase_successes; -static int __initdata expected_testcase_failures; static int __initdata unexpected_testcase_failures; static int __initdata unexpected_testcase_unknowns; @@ -120,26 +117,22 @@ static void __init dotest(void (*testcase_fn)(void), int expected) unexpected_testcase_failures++; if (nmi_fail == FAILURE) - printk(KERN_CONT "FAILED |"); + pr_cont("FAILED |"); else if (nmi_fail == TIMEOUT) - printk(KERN_CONT "TIMEOUT|"); + pr_cont("TIMEOUT|"); else - printk(KERN_CONT "ERROR |"); + pr_cont("ERROR |"); dump_stack(); } else { testcase_successes++; - printk(KERN_CONT " ok |"); + pr_cont(" ok |"); } - testcase_total++; + pr_cont("\n"); + testcase_total++; reset_nmi(); } -static inline void __init print_testname(const char *testname) -{ - printk("%12s:", testname); -} - void __init nmi_selftest(void) { init_nmi_testsuite(); @@ -147,38 +140,25 @@ void __init nmi_selftest(void) /* * Run the testsuite: */ - printk("----------------\n"); - printk("| NMI testsuite:\n"); - printk("--------------------\n"); + pr_info("----------------\n"); + pr_info("| NMI testsuite:\n"); + pr_info("--------------------\n"); - print_testname("remote IPI"); + pr_info("%12s:", "remote IPI"); dotest(remote_ipi, SUCCESS); - printk(KERN_CONT "\n"); - print_testname("local IPI"); + + pr_info("%12s:", "local IPI"); dotest(local_ipi, SUCCESS); - printk(KERN_CONT "\n"); cleanup_nmi_testsuite(); + pr_info("--------------------\n"); if (unexpected_testcase_failures) { - printk("--------------------\n"); - printk("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n", + pr_info("BUG: %3d unexpected failures (out of %3d) - debugging disabled! |\n", unexpected_testcase_failures, testcase_total); - printk("-----------------------------------------------------------------\n"); - } else if (expected_testcase_failures && testcase_successes) { - printk("--------------------\n"); - printk("%3d out of %3d testcases failed, as expected. |\n", - expected_testcase_failures, testcase_total); - printk("----------------------------------------------------\n"); - } else if (expected_testcase_failures && !testcase_successes) { - printk("--------------------\n"); - printk("All %3d testcases failed, as expected. |\n", - expected_testcase_failures); - printk("----------------------------------------\n"); } else { - printk("--------------------\n"); - printk("Good, all %3d testcases passed! 
|\n", + pr_info("Good, all %3d testcases passed! |\n", testcase_successes); - printk("---------------------------------\n"); } + pr_info("-----------------------------------------------------------------\n"); } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 1ccd05d8999f..ab3e172dcc69 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -33,6 +33,7 @@ #include <asm/tlb.h> #include <asm/io_bitmap.h> #include <asm/gsseg.h> +#include <asm/msr.h> /* stub always returning 0. */ DEFINE_ASM_FUNC(paravirt_ret0, "xor %eax,%eax", .entry.text); @@ -210,12 +211,10 @@ struct paravirt_patch_template pv_ops = { .mmu.set_p4d = native_set_p4d, -#if CONFIG_PGTABLE_LEVELS >= 5 .mmu.p4d_val = PTE_IDENT, .mmu.make_p4d = PTE_IDENT, .mmu.set_pgd = native_set_pgd, -#endif /* CONFIG_PGTABLE_LEVELS >= 5 */ .mmu.pte_val = PTE_IDENT, .mmu.pgd_val = PTE_IDENT, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 962c3ce39323..c1d2dac72b9c 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -30,7 +30,7 @@ #include <linux/hw_breakpoint.h> #include <linux/entry-common.h> #include <asm/cpu.h> -#include <asm/cpuid.h> +#include <asm/cpuid/api.h> #include <asm/apic.h> #include <linux/uaccess.h> #include <asm/mwait.h> @@ -52,6 +52,7 @@ #include <asm/unwind.h> #include <asm/tdx.h> #include <asm/mmu_context.h> +#include <asm/msr.h> #include <asm/shstk.h> #include "process.h" @@ -93,17 +94,12 @@ EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); */ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { - /* init_task is not dynamically sized (incomplete FPU state) */ - if (unlikely(src == &init_task)) - memcpy_and_pad(dst, arch_task_struct_size, src, sizeof(init_task), 0); - else - memcpy(dst, src, arch_task_struct_size); + /* fpu_clone() will initialize the "dst_fpu" memory */ + memcpy_and_pad(dst, arch_task_struct_size, src, sizeof(*dst), 0); #ifdef CONFIG_VM86 dst->thread.vm86 = NULL; #endif - /* Drop the copied pointer to current's fpstate */ - dst->thread.fpu.fpstate = NULL; return 0; } @@ -111,8 +107,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) #ifdef CONFIG_X86_64 void arch_release_task_struct(struct task_struct *tsk) { - if (fpu_state_size_dynamic()) - fpstate_free(&tsk->thread.fpu); + if (fpu_state_size_dynamic() && !(tsk->flags & (PF_KTHREAD | PF_USER_WORKER))) + fpstate_free(x86_task_fpu(tsk)); } #endif @@ -122,7 +118,6 @@ void arch_release_task_struct(struct task_struct *tsk) void exit_thread(struct task_struct *tsk) { struct thread_struct *t = &tsk->thread; - struct fpu *fpu = &t->fpu; if (test_thread_flag(TIF_IO_BITMAP)) io_bitmap_exit(tsk); @@ -130,7 +125,7 @@ void exit_thread(struct task_struct *tsk) free_vm86(t); shstk_free(tsk); - fpu__drop(fpu); + fpu__drop(tsk); } static int set_new_tls(struct task_struct *p, unsigned long tls) @@ -344,7 +339,7 @@ static void set_cpuid_faulting(bool on) msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT; msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT); this_cpu_write(msr_misc_features_shadow, msrval); - wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval); + wrmsrq(MSR_MISC_FEATURES_ENABLES, msrval); } static void disable_cpuid(void) @@ -561,7 +556,7 @@ static __always_inline void amd_set_core_ssb_state(unsigned long tifn) if (!static_cpu_has(X86_FEATURE_ZEN)) { msr |= ssbd_tif_to_amd_ls_cfg(tifn); - wrmsrl(MSR_AMD64_LS_CFG, msr); + wrmsrq(MSR_AMD64_LS_CFG, msr); return; } @@ -578,7 +573,7 @@ static __always_inline void 
amd_set_core_ssb_state(unsigned long tifn) raw_spin_lock(&st->shared_state->lock); /* First sibling enables SSBD: */ if (!st->shared_state->disable_state) - wrmsrl(MSR_AMD64_LS_CFG, msr); + wrmsrq(MSR_AMD64_LS_CFG, msr); st->shared_state->disable_state++; raw_spin_unlock(&st->shared_state->lock); } else { @@ -588,7 +583,7 @@ static __always_inline void amd_set_core_ssb_state(unsigned long tifn) raw_spin_lock(&st->shared_state->lock); st->shared_state->disable_state--; if (!st->shared_state->disable_state) - wrmsrl(MSR_AMD64_LS_CFG, msr); + wrmsrq(MSR_AMD64_LS_CFG, msr); raw_spin_unlock(&st->shared_state->lock); } } @@ -597,7 +592,7 @@ static __always_inline void amd_set_core_ssb_state(unsigned long tifn) { u64 msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn); - wrmsrl(MSR_AMD64_LS_CFG, msr); + wrmsrq(MSR_AMD64_LS_CFG, msr); } #endif @@ -607,7 +602,7 @@ static __always_inline void amd_set_ssb_virt_state(unsigned long tifn) * SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL, * so ssbd_tif_to_spec_ctrl() just works. */ - wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn)); + wrmsrq(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn)); } /* @@ -710,11 +705,11 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p) arch_has_block_step()) { unsigned long debugctl, msk; - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); debugctl &= ~DEBUGCTLMSR_BTF; msk = tifn & _TIF_BLOCKSTEP; debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT; - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + wrmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); } if ((tifp ^ tifn) & _TIF_NOTSC) @@ -907,13 +902,10 @@ static __init bool prefer_mwait_c1_over_halt(void) static __cpuidle void mwait_idle(void) { if (!current_set_polling_and_test()) { - if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) { - mb(); /* quirk */ - clflush((void *)¤t_thread_info()->flags); - mb(); /* quirk */ - } + const void *addr = ¤t_thread_info()->flags; - __monitor((void *)¤t_thread_info()->flags, 0, 0); + alternative_input("", "clflush (%[addr])", X86_BUG_CLFLUSH_MONITOR, [addr] "a" (addr)); + __monitor(addr, 0, 0); if (!need_resched()) { __sti_mwait(0, 0); raw_local_irq_disable(); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 4636ef359973..a10e180cbf23 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -160,8 +160,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - if (!test_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD)) - switch_fpu_prepare(prev_p, cpu); + switch_fpu(prev_p, cpu); /* * Save away %gs. No need to save %fs, as it was saved on the @@ -208,10 +207,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) raw_cpu_write(current_task, next_p); - switch_fpu_finish(next_p); - /* Load the Intel cache allocation PQR MSR. 
*/ - resctrl_sched_in(next_p); + resctrl_arch_sched_in(next_p); return prev_p; } diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 7196ca7048be..8d6cf25127aa 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -57,6 +57,7 @@ #include <asm/unistd.h> #include <asm/fsgsbase.h> #include <asm/fred.h> +#include <asm/msr.h> #ifdef CONFIG_IA32_EMULATION /* Not included via unistd.h */ #include <asm/unistd_32_ia32.h> @@ -95,8 +96,8 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, return; if (mode == SHOW_REGS_USER) { - rdmsrl(MSR_FS_BASE, fs); - rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + rdmsrq(MSR_FS_BASE, fs); + rdmsrq(MSR_KERNEL_GS_BASE, shadowgs); printk("%sFS: %016lx GS: %016lx\n", log_lvl, fs, shadowgs); return; @@ -107,9 +108,9 @@ void __show_regs(struct pt_regs *regs, enum show_regs_mode mode, asm("movl %%fs,%0" : "=r" (fsindex)); asm("movl %%gs,%0" : "=r" (gsindex)); - rdmsrl(MSR_FS_BASE, fs); - rdmsrl(MSR_GS_BASE, gs); - rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); + rdmsrq(MSR_FS_BASE, fs); + rdmsrq(MSR_GS_BASE, gs); + rdmsrq(MSR_KERNEL_GS_BASE, shadowgs); cr0 = read_cr0(); cr2 = read_cr2(); @@ -195,7 +196,7 @@ static noinstr unsigned long __rdgsbase_inactive(void) native_swapgs(); } else { instrumentation_begin(); - rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + rdmsrq(MSR_KERNEL_GS_BASE, gsbase); instrumentation_end(); } @@ -221,7 +222,7 @@ static noinstr void __wrgsbase_inactive(unsigned long gsbase) native_swapgs(); } else { instrumentation_begin(); - wrmsrl(MSR_KERNEL_GS_BASE, gsbase); + wrmsrq(MSR_KERNEL_GS_BASE, gsbase); instrumentation_end(); } } @@ -353,7 +354,7 @@ static __always_inline void load_seg_legacy(unsigned short prev_index, } else { if (prev_index != next_index) loadseg(which, next_index); - wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, + wrmsrq(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, next_base); } } else { @@ -463,7 +464,7 @@ unsigned long x86_gsbase_read_cpu_inactive(void) gsbase = __rdgsbase_inactive(); local_irq_restore(flags); } else { - rdmsrl(MSR_KERNEL_GS_BASE, gsbase); + rdmsrq(MSR_KERNEL_GS_BASE, gsbase); } return gsbase; @@ -478,7 +479,7 @@ void x86_gsbase_write_cpu_inactive(unsigned long gsbase) __wrgsbase_inactive(gsbase); local_irq_restore(flags); } else { - wrmsrl(MSR_KERNEL_GS_BASE, gsbase); + wrmsrq(MSR_KERNEL_GS_BASE, gsbase); } } @@ -616,8 +617,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && this_cpu_read(hardirq_stack_inuse)); - if (!test_tsk_thread_flag(prev_p, TIF_NEED_FPU_LOAD)) - switch_fpu_prepare(prev_p, cpu); + switch_fpu(prev_p, cpu); /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). @@ -671,8 +671,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) raw_cpu_write(current_task, next_p); raw_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); - switch_fpu_finish(next_p); - /* Reload sp0. */ update_task_stack(next_p); @@ -707,7 +705,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) } /* Load the Intel cache allocation PQR MSR. 
*/ - resctrl_sched_in(next_p); + resctrl_arch_sched_in(next_p); return prev_p; } diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c index b7c0f142d026..4679ac0a03eb 100644 --- a/arch/x86/kernel/reboot_fixups_32.c +++ b/arch/x86/kernel/reboot_fixups_32.c @@ -27,7 +27,7 @@ static void cs5530a_warm_reset(struct pci_dev *dev) static void cs5536_warm_reset(struct pci_dev *dev) { /* writing 1 to the LSB of this MSR causes a hard reset */ - wrmsrl(MSR_DIVIL_SOFT_RESET, 1ULL); + wrmsrq(MSR_DIVIL_SOFT_RESET, 1ULL); udelay(50); /* shouldn't get here but be safe and spin a while */ } diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index c7c4b1917336..57276f134d12 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -263,17 +263,17 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) movl %edx, %edi movl $1024, %ecx - rep ; movsl + rep movsl movl %ebp, %edi movl %eax, %esi movl $1024, %ecx - rep ; movsl + rep movsl movl %eax, %edi movl %edx, %esi movl $1024, %ecx - rep ; movsl + rep movsl lea PAGE_SIZE(%ebp), %esi jmp 0b diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index ac058971a382..ea604f4d0b52 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -39,6 +39,8 @@ SYM_DATA(kexec_va_control_page, .quad 0) SYM_DATA(kexec_pa_table_page, .quad 0) SYM_DATA(kexec_pa_swap_page, .quad 0) SYM_DATA_LOCAL(pa_backup_pages_map, .quad 0) +SYM_DATA(kexec_debug_8250_mmio32, .quad 0) +SYM_DATA(kexec_debug_8250_port, .word 0) .balign 16 SYM_DATA_START_LOCAL(kexec_debug_gdt) @@ -50,6 +52,11 @@ SYM_DATA_START_LOCAL(kexec_debug_gdt) .quad 0x00cf92000000ffff /* __KERNEL_DS */ SYM_DATA_END_LABEL(kexec_debug_gdt, SYM_L_LOCAL, kexec_debug_gdt_end) + .balign 8 +SYM_DATA_START(kexec_debug_idt) + .skip 0x100, 0x00 +SYM_DATA_END(kexec_debug_idt) + .section .text..relocate_kernel,"ax"; .code64 SYM_CODE_START_NOALIGN(relocate_kernel) @@ -72,8 +79,13 @@ SYM_CODE_START_NOALIGN(relocate_kernel) pushq %r15 pushf - /* zero out flags, and disable interrupts */ - pushq $0 + /* Invalidate GDT/IDT, zero out flags */ + pushq $0 + pushq $0 + + lidt (%rsp) + lgdt (%rsp) + addq $8, %rsp popfq /* Switch to the identity mapped page tables */ @@ -139,6 +151,15 @@ SYM_CODE_START_LOCAL_NOALIGN(identity_mapped) movq %ds, %rax movq %rax, %ds + /* Now an IDTR on the stack to load the IDT the kernel created */ + leaq kexec_debug_idt(%rip), %rsi + pushq %rsi + pushw $0xff + lidt (%rsp) + addq $10, %rsp + + //int3 + /* * Clear X86_CR4_CET (if it was set) such that we can clear CR0_WP * below. @@ -342,20 +363,20 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) /* copy source page to swap page */ movq kexec_pa_swap_page(%rip), %rdi movl $512, %ecx - rep ; movsq + rep movsq /* copy destination page to source page */ movq %rax, %rdi movq %rdx, %rsi movl $512, %ecx - rep ; movsq + rep movsq /* copy swap page to destination page */ movq %rdx, %rdi movq kexec_pa_swap_page(%rip), %rsi .Lnoswap: movl $512, %ecx - rep ; movsq + rep movsq lea PAGE_SIZE(%rax), %rsi jmp .Lloop @@ -364,3 +385,222 @@ SYM_CODE_START_LOCAL_NOALIGN(swap_pages) ret int3 SYM_CODE_END(swap_pages) + +/* + * Generic 'print character' routine + * - %al: Character to be printed (may clobber %rax) + * - %rdx: MMIO address or port. 
+ */ +#define XMTRDY 0x20 + +#define TXR 0 /* Transmit register (WRITE) */ +#define LSR 5 /* Line Status */ + +SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250) + UNWIND_HINT_FUNC + ANNOTATE_NOENDBR + addw $LSR, %dx + xchg %al, %ah +.Lxmtrdy_loop: + inb %dx, %al + testb $XMTRDY, %al + jnz .Lready + pause + jmp .Lxmtrdy_loop + +.Lready: + subw $LSR, %dx + xchg %al, %ah + outb %al, %dx +pr_char_null: + ANNOTATE_NOENDBR + + ANNOTATE_UNRET_SAFE + ret +SYM_CODE_END(pr_char_8250) + +SYM_CODE_START_LOCAL_NOALIGN(pr_char_8250_mmio32) + UNWIND_HINT_FUNC + ANNOTATE_NOENDBR +.Lxmtrdy_loop_mmio: + movb (LSR*4)(%rdx), %ah + testb $XMTRDY, %ah + jnz .Lready_mmio + pause + jmp .Lxmtrdy_loop_mmio + +.Lready_mmio: + movb %al, (%rdx) + ANNOTATE_UNRET_SAFE + ret +SYM_CODE_END(pr_char_8250_mmio32) + +/* + * Load pr_char function pointer into %rsi and load %rdx with whatever + * that function wants to see there (typically port/MMIO address). + */ +.macro pr_setup + leaq pr_char_8250(%rip), %rsi + movw kexec_debug_8250_port(%rip), %dx + testw %dx, %dx + jnz 1f + + leaq pr_char_8250_mmio32(%rip), %rsi + movq kexec_debug_8250_mmio32(%rip), %rdx + testq %rdx, %rdx + jnz 1f + + leaq pr_char_null(%rip), %rsi +1: +.endm + +/* Print the nybble in %bl, clobber %rax */ +SYM_CODE_START_LOCAL_NOALIGN(pr_nybble) + UNWIND_HINT_FUNC + movb %bl, %al + nop + andb $0x0f, %al + addb $0x30, %al + cmpb $0x3a, %al + jb 1f + addb $('a' - '0' - 10), %al + ANNOTATE_RETPOLINE_SAFE +1: jmp *%rsi +SYM_CODE_END(pr_nybble) + +SYM_CODE_START_LOCAL_NOALIGN(pr_qword) + UNWIND_HINT_FUNC + movq $16, %rcx +1: rolq $4, %rbx + call pr_nybble + loop 1b + movb $'\n', %al + ANNOTATE_RETPOLINE_SAFE + jmp *%rsi +SYM_CODE_END(pr_qword) + +.macro print_reg a, b, c, d, r + movb $\a, %al + ANNOTATE_RETPOLINE_SAFE + call *%rsi + movb $\b, %al + ANNOTATE_RETPOLINE_SAFE + call *%rsi + movb $\c, %al + ANNOTATE_RETPOLINE_SAFE + call *%rsi + movb $\d, %al + ANNOTATE_RETPOLINE_SAFE + call *%rsi + movq \r, %rbx + call pr_qword +.endm + +SYM_CODE_START_NOALIGN(kexec_debug_exc_vectors) + /* Each of these is 6 bytes. */ +.macro vec_err exc + UNWIND_HINT_ENTRY + . = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE) + nop + nop + pushq $\exc + jmp exc_handler +.endm + +.macro vec_noerr exc + UNWIND_HINT_ENTRY + . = kexec_debug_exc_vectors + (\exc * KEXEC_DEBUG_EXC_HANDLER_SIZE) + pushq $0 + pushq $\exc + jmp exc_handler +.endm + + ANNOTATE_NOENDBR + vec_noerr 0 // #DE + vec_noerr 1 // #DB + vec_noerr 2 // #NMI + vec_noerr 3 // #BP + vec_noerr 4 // #OF + vec_noerr 5 // #BR + vec_noerr 6 // #UD + vec_noerr 7 // #NM + vec_err 8 // #DF + vec_noerr 9 + vec_err 10 // #TS + vec_err 11 // #NP + vec_err 12 // #SS + vec_err 13 // #GP + vec_err 14 // #PF + vec_noerr 15 +SYM_CODE_END(kexec_debug_exc_vectors) + +SYM_CODE_START_LOCAL_NOALIGN(exc_handler) + /* No need for RET mitigations during kexec */ + VALIDATE_UNRET_END + + pushq %rax + pushq %rbx + pushq %rcx + pushq %rdx + pushq %rsi + + /* Stack frame */ +#define EXC_SS 0x58 /* Architectural... 
*/ +#define EXC_RSP 0x50 +#define EXC_EFLAGS 0x48 +#define EXC_CS 0x40 +#define EXC_RIP 0x38 +#define EXC_ERRORCODE 0x30 /* Either architectural or zero pushed by handler */ +#define EXC_EXCEPTION 0x28 /* Pushed by handler entry point */ +#define EXC_RAX 0x20 /* Pushed just above in exc_handler */ +#define EXC_RBX 0x18 +#define EXC_RCX 0x10 +#define EXC_RDX 0x08 +#define EXC_RSI 0x00 + + /* Set up %rdx/%rsi for debug output */ + pr_setup + + /* rip and exception info */ + print_reg 'E', 'x', 'c', ':', EXC_EXCEPTION(%rsp) + print_reg 'E', 'r', 'r', ':', EXC_ERRORCODE(%rsp) + print_reg 'r', 'i', 'p', ':', EXC_RIP(%rsp) + print_reg 'r', 's', 'p', ':', EXC_RSP(%rsp) + + /* We spilled these to the stack */ + print_reg 'r', 'a', 'x', ':', EXC_RAX(%rsp) + print_reg 'r', 'b', 'x', ':', EXC_RBX(%rsp) + print_reg 'r', 'c', 'x', ':', EXC_RCX(%rsp) + print_reg 'r', 'd', 'x', ':', EXC_RDX(%rsp) + print_reg 'r', 's', 'i', ':', EXC_RSI(%rsp) + + /* Other registers untouched */ + print_reg 'r', 'd', 'i', ':', %rdi + print_reg 'r', '8', ' ', ':', %r8 + print_reg 'r', '9', ' ', ':', %r9 + print_reg 'r', '1', '0', ':', %r10 + print_reg 'r', '1', '1', ':', %r11 + print_reg 'r', '1', '2', ':', %r12 + print_reg 'r', '1', '3', ':', %r13 + print_reg 'r', '1', '4', ':', %r14 + print_reg 'r', '1', '5', ':', %r15 + print_reg 'c', 'r', '2', ':', %cr2 + + /* Only return from INT3 */ + cmpq $3, EXC_EXCEPTION(%rsp) + jne .Ldie + + popq %rsi + popq %rdx + popq %rcx + popq %rbx + popq %rax + + addq $16, %rsp + iretq + +.Ldie: + hlt + jmp .Ldie + +SYM_CODE_END(exc_handler) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 9d2a13b37833..7d9ed79a93c0 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -11,6 +11,7 @@ #include <linux/crash_dump.h> #include <linux/dma-map-ops.h> #include <linux/efi.h> +#include <linux/hugetlb.h> #include <linux/ima.h> #include <linux/init_ohci1394_dma.h> #include <linux/initrd.h> @@ -18,21 +19,19 @@ #include <linux/memblock.h> #include <linux/panic_notifier.h> #include <linux/pci.h> +#include <linux/random.h> #include <linux/root_dev.h> -#include <linux/hugetlb.h> -#include <linux/tboot.h> -#include <linux/usb/xhci-dbgp.h> #include <linux/static_call.h> #include <linux/swiotlb.h> -#include <linux/random.h> +#include <linux/tboot.h> +#include <linux/usb/xhci-dbgp.h> +#include <linux/vmalloc.h> #include <uapi/linux/mount.h> #include <xen/xen.h> #include <asm/apic.h> -#include <asm/efi.h> -#include <asm/numa.h> #include <asm/bios_ebda.h> #include <asm/bugs.h> #include <asm/cacheinfo.h> @@ -47,18 +46,16 @@ #include <asm/mce.h> #include <asm/memtype.h> #include <asm/mtrr.h> -#include <asm/realmode.h> +#include <asm/nmi.h> +#include <asm/numa.h> #include <asm/olpc_ofw.h> #include <asm/pci-direct.h> #include <asm/prom.h> #include <asm/proto.h> +#include <asm/realmode.h> #include <asm/thermal.h> #include <asm/unwind.h> #include <asm/vsyscall.h> -#include <linux/vmalloc.h> -#if defined(CONFIG_X86_LOCAL_APIC) -#include <asm/nmi.h> -#endif /* * max_low_pfn_mapped: highest directly mapped pfn < 4 GB @@ -134,6 +131,7 @@ struct ist_info ist_info; struct cpuinfo_x86 boot_cpu_data __read_mostly; EXPORT_SYMBOL(boot_cpu_data); +SYM_PIC_ALIAS(boot_cpu_data); #if !defined(CONFIG_X86_PAE) || defined(CONFIG_X86_64) __visible unsigned long mmu_cr4_features __ro_after_init; @@ -151,6 +149,13 @@ int bootloader_type, bootloader_version; static const struct ctl_table x86_sysctl_table[] = { { + .procname = "unknown_nmi_panic", + .data = &unknown_nmi_panic, + .maxlen = sizeof(int), + 
.mode = 0644, + .proc_handler = proc_dointvec, + }, + { .procname = "panic_on_unrecovered_nmi", .data = &panic_on_unrecovered_nmi, .maxlen = sizeof(int), @@ -185,15 +190,6 @@ static const struct ctl_table x86_sysctl_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, -#if defined(CONFIG_X86_LOCAL_APIC) - { - .procname = "unknown_nmi_panic", - .data = &unknown_nmi_panic, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, -#endif #if defined(CONFIG_ACPI_SLEEP) { .procname = "acpi_video_flags", diff --git a/arch/x86/kernel/shstk.c b/arch/x86/kernel/shstk.c index 059685612362..2ddf23387c7e 100644 --- a/arch/x86/kernel/shstk.c +++ b/arch/x86/kernel/shstk.c @@ -173,8 +173,8 @@ static int shstk_setup(void) return PTR_ERR((void *)addr); fpregs_lock_and_load(); - wrmsrl(MSR_IA32_PL3_SSP, addr + size); - wrmsrl(MSR_IA32_U_CET, CET_SHSTK_EN); + wrmsrq(MSR_IA32_PL3_SSP, addr + size); + wrmsrq(MSR_IA32_U_CET, CET_SHSTK_EN); fpregs_unlock(); shstk->base = addr; @@ -239,7 +239,7 @@ static unsigned long get_user_shstk_addr(void) fpregs_lock_and_load(); - rdmsrl(MSR_IA32_PL3_SSP, ssp); + rdmsrq(MSR_IA32_PL3_SSP, ssp); fpregs_unlock(); @@ -372,7 +372,7 @@ int setup_signal_shadow_stack(struct ksignal *ksig) return -EFAULT; fpregs_lock_and_load(); - wrmsrl(MSR_IA32_PL3_SSP, ssp); + wrmsrq(MSR_IA32_PL3_SSP, ssp); fpregs_unlock(); return 0; @@ -396,7 +396,7 @@ int restore_signal_shadow_stack(void) return err; fpregs_lock_and_load(); - wrmsrl(MSR_IA32_PL3_SSP, ssp); + wrmsrq(MSR_IA32_PL3_SSP, ssp); fpregs_unlock(); return 0; @@ -460,7 +460,7 @@ static int wrss_control(bool enable) return 0; fpregs_lock_and_load(); - rdmsrl(MSR_IA32_U_CET, msrval); + rdmsrq(MSR_IA32_U_CET, msrval); if (enable) { features_set(ARCH_SHSTK_WRSS); @@ -473,7 +473,7 @@ static int wrss_control(bool enable) msrval &= ~CET_WRSS_EN; } - wrmsrl(MSR_IA32_U_CET, msrval); + wrmsrq(MSR_IA32_U_CET, msrval); unlock: fpregs_unlock(); @@ -492,8 +492,8 @@ static int shstk_disable(void) fpregs_lock_and_load(); /* Disable WRSS too when disabling shadow stack */ - wrmsrl(MSR_IA32_U_CET, 0); - wrmsrl(MSR_IA32_PL3_SSP, 0); + wrmsrq(MSR_IA32_U_CET, 0); + wrmsrq(MSR_IA32_PL3_SSP, 0); fpregs_unlock(); shstk_free(current); diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 5f441039b572..2404233336ab 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -255,7 +255,7 @@ static void handle_signal(struct ksignal *ksig, struct pt_regs *regs) { bool stepping, failed; - struct fpu *fpu = ¤t->thread.fpu; + struct fpu *fpu = x86_task_fpu(current); if (v8086_mode(regs)) save_v86_state((struct kernel_vm86_regs *) regs, VM86_SIGNAL); @@ -423,14 +423,14 @@ bool sigaltstack_size_valid(size_t ss_size) if (!fpu_state_size_dynamic() && !strict_sigaltstack_size) return true; - fsize += current->group_leader->thread.fpu.perm.__user_state_size; + fsize += x86_task_fpu(current->group_leader)->perm.__user_state_size; if (likely(ss_size > fsize)) return true; if (strict_sigaltstack_size) return ss_size > fsize; - mask = current->group_leader->thread.fpu.perm.__state_perm; + mask = x86_task_fpu(current->group_leader)->perm.__state_perm; if (mask & XFEATURE_MASK_USER_DYNAMIC) return ss_size > fsize; diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index d6cf1e23c2a3..b90d872aa0c8 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -64,7 +64,7 @@ #include <asm/acpi.h> #include <asm/cacheinfo.h> -#include <asm/cpuid.h> +#include <asm/cpuid/api.h> #include <asm/desc.h> 
#include <asm/nmi.h> #include <asm/irq.h> @@ -1188,6 +1188,12 @@ void cpu_disable_common(void) remove_siblinginfo(cpu); + /* + * Stop allowing kernel-mode FPU. This is needed so that if the CPU is + * brought online again, the initial state is not allowed: + */ + this_cpu_write(kernel_fpu_allowed, false); + /* It's now safe to remove this processor from the online map */ lock_vector_lock(); remove_cpu_from_maps(cpu); diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index c3d7ff44b29a..378c388d1b31 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -108,7 +108,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, if (system_state == SYSTEM_BOOTING || modinit) return text_poke_early(insn, code, size); - text_poke_bp(insn, code, size, emulate); + smp_text_poke_single(insn, code, size, emulate); } static void __static_call_validate(u8 *insn, bool tail, bool tramp) diff --git a/arch/x86/kernel/trace_clock.c b/arch/x86/kernel/trace_clock.c index b8e7abe00b06..708d61743d15 100644 --- a/arch/x86/kernel/trace_clock.c +++ b/arch/x86/kernel/trace_clock.c @@ -4,7 +4,7 @@ */ #include <asm/trace_clock.h> #include <asm/barrier.h> -#include <asm/msr.h> +#include <asm/tsc.h> /* * trace_clock_x86_tsc(): A clock that is just the cycle counter. diff --git a/arch/x86/kernel/tracepoint.c b/arch/x86/kernel/tracepoint.c deleted file mode 100644 index 03ae1caaa878..000000000000 --- a/arch/x86/kernel/tracepoint.c +++ /dev/null @@ -1,21 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2013 Seiji Aguchi <seiji.aguchi@hds.com> - */ -#include <linux/jump_label.h> -#include <linux/atomic.h> - -#include <asm/trace/exceptions.h> - -DEFINE_STATIC_KEY_FALSE(trace_pagefault_key); - -int trace_pagefault_reg(void) -{ - static_branch_inc(&trace_pagefault_key); - return 0; -} - -void trace_pagefault_unreg(void) -{ - static_branch_dec(&trace_pagefault_key); -} diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 9f88b8a78e50..c5c897a86418 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -68,6 +68,7 @@ #include <asm/vdso.h> #include <asm/tdx.h> #include <asm/cfi.h> +#include <asm/msr.h> #ifdef CONFIG_X86_64 #include <asm/x86_init.h> @@ -351,7 +352,7 @@ static noinstr bool handle_bug(struct pt_regs *regs) case BUG_UD1_UBSAN: if (IS_ENABLED(CONFIG_UBSAN_TRAP)) { pr_crit("%s at %pS\n", - report_ubsan_failure(regs, ud_imm), + report_ubsan_failure(ud_imm), (void *)regs->ip); } break; @@ -749,7 +750,7 @@ static bool try_fixup_enqcmd_gp(void) if (current->pasid_activated) return false; - wrmsrl(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID); + wrmsrq(MSR_IA32_PASID, pasid | MSR_IA32_PASID_VALID); current->pasid_activated = 1; return true; @@ -882,16 +883,16 @@ static void do_int3_user(struct pt_regs *regs) DEFINE_IDTENTRY_RAW(exc_int3) { /* - * poke_int3_handler() is completely self contained code; it does (and + * smp_text_poke_int3_handler() is completely self contained code; it does (and * must) *NOT* call out to anything, lest it hits upon yet another * INT3. */ - if (poke_int3_handler(regs)) + if (smp_text_poke_int3_handler(regs)) return; /* * irqentry_enter_from_user_mode() uses static_branch_{,un}likely() - * and therefore can trigger INT3, hence poke_int3_handler() must + * and therefore can trigger INT3, hence smp_text_poke_int3_handler() must * be done before. If the entry came from kernel mode, then use * nmi_enter() because the INT3 could have been hit in any context * including NMI. 
@@ -1120,9 +1121,9 @@ static noinstr void exc_debug_kernel(struct pt_regs *regs, unsigned long dr6) */ unsigned long debugctl; - rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + rdmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); debugctl |= DEBUGCTLMSR_BTF; - wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + wrmsrq(MSR_IA32_DEBUGCTLMSR, debugctl); } /* @@ -1295,7 +1296,7 @@ DEFINE_IDTENTRY_RAW(exc_debug) static void math_error(struct pt_regs *regs, int trapnr) { struct task_struct *task = current; - struct fpu *fpu = &task->thread.fpu; + struct fpu *fpu = x86_task_fpu(task); int si_code; char *str = (trapnr == X86_TRAP_MF) ? "fpu exception" : "simd exception"; @@ -1386,11 +1387,11 @@ static bool handle_xfd_event(struct pt_regs *regs) if (!IS_ENABLED(CONFIG_X86_64) || !cpu_feature_enabled(X86_FEATURE_XFD)) return false; - rdmsrl(MSR_IA32_XFD_ERR, xfd_err); + rdmsrq(MSR_IA32_XFD_ERR, xfd_err); if (!xfd_err) return false; - wrmsrl(MSR_IA32_XFD_ERR, 0); + wrmsrq(MSR_IA32_XFD_ERR, 0); /* Die if that happens in kernel space */ if (WARN_ON(!user_mode(regs))) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 88e5a4ed9db3..87e749106dda 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -16,7 +16,7 @@ #include <linux/static_key.h> #include <linux/static_call.h> -#include <asm/cpuid.h> +#include <asm/cpuid/api.h> #include <asm/hpet.h> #include <asm/timer.h> #include <asm/vgtod.h> @@ -29,6 +29,7 @@ #include <asm/apic.h> #include <asm/cpu_device_id.h> #include <asm/i8259.h> +#include <asm/msr.h> #include <asm/topology.h> #include <asm/uv/uv.h> #include <asm/sev.h> @@ -1098,7 +1099,7 @@ static void __init detect_art(void) if (art_base_clk.denominator < ART_MIN_DENOMINATOR) return; - rdmsrl(MSR_IA32_TSC_ADJUST, art_base_clk.offset); + rdmsrq(MSR_IA32_TSC_ADJUST, art_base_clk.offset); /* Make this sticky over multiple CPU init calls */ setup_force_cpu_cap(X86_FEATURE_ART); diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 4334033658ed..ec3aa340d351 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -21,6 +21,7 @@ #include <linux/kernel.h> #include <linux/smp.h> #include <linux/nmi.h> +#include <asm/msr.h> #include <asm/tsc.h> struct tsc_adjust { @@ -65,12 +66,12 @@ void tsc_verify_tsc_adjust(bool resume) adj->nextcheck = jiffies + HZ; - rdmsrl(MSR_IA32_TSC_ADJUST, curval); + rdmsrq(MSR_IA32_TSC_ADJUST, curval); if (adj->adjusted == curval) return; /* Restore the original value */ - wrmsrl(MSR_IA32_TSC_ADJUST, adj->adjusted); + wrmsrq(MSR_IA32_TSC_ADJUST, adj->adjusted); if (!adj->warned || resume) { pr_warn(FW_BUG "TSC ADJUST differs: CPU%u %lld --> %lld. 
Restoring\n", @@ -142,7 +143,7 @@ static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval, if (likely(!tsc_async_resets)) { pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n", cpu, bootval); - wrmsrl(MSR_IA32_TSC_ADJUST, 0); + wrmsrq(MSR_IA32_TSC_ADJUST, 0); bootval = 0; } else { pr_info("TSC ADJUST: CPU%u: %lld NOT forced to 0\n", @@ -165,7 +166,7 @@ bool __init tsc_store_and_check_tsc_adjust(bool bootcpu) if (check_tsc_unstable()) return false; - rdmsrl(MSR_IA32_TSC_ADJUST, bootval); + rdmsrq(MSR_IA32_TSC_ADJUST, bootval); cur->bootval = bootval; cur->nextcheck = jiffies + HZ; tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), bootcpu); @@ -187,7 +188,7 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu) if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST)) return false; - rdmsrl(MSR_IA32_TSC_ADJUST, bootval); + rdmsrq(MSR_IA32_TSC_ADJUST, bootval); cur->bootval = bootval; cur->nextcheck = jiffies + HZ; cur->warned = false; @@ -229,7 +230,7 @@ bool tsc_store_and_check_tsc_adjust(bool bootcpu) */ if (bootval != ref->adjusted) { cur->adjusted = ref->adjusted; - wrmsrl(MSR_IA32_TSC_ADJUST, ref->adjusted); + wrmsrq(MSR_IA32_TSC_ADJUST, ref->adjusted); } /* * We have the TSCs forced to be in sync on this package. Skip sync @@ -518,7 +519,7 @@ retry: pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n", cpu, cur_max_warp, cur->adjusted); - wrmsrl(MSR_IA32_TSC_ADJUST, cur->adjusted); + wrmsrq(MSR_IA32_TSC_ADJUST, cur->adjusted); goto retry; } diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 9194695662b2..6d383839e839 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -840,6 +840,11 @@ static int branch_setup_xol_ops(struct arch_uprobe *auprobe, struct insn *insn) insn_byte_t p; int i; + /* x86_nops[insn->length]; same as jmp with .offs = 0 */ + if (insn->length <= ASM_NOP_MAX && + !memcmp(insn->kaddr, x86_nops[insn->length], insn->length)) + goto setup; + switch (opc1) { case 0xeb: /* jmp 8 */ case 0xe9: /* jmp 32 */ diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index cda5f8362e9d..4fa0be732af1 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -79,11 +79,13 @@ const_cpu_current_top_of_stack = cpu_current_top_of_stack; #define BSS_DECRYPTED \ . = ALIGN(PMD_SIZE); \ __start_bss_decrypted = .; \ + __pi___start_bss_decrypted = .; \ *(.bss..decrypted); \ . = ALIGN(PAGE_SIZE); \ __start_bss_decrypted_unused = .; \ . = ALIGN(PMD_SIZE); \ __end_bss_decrypted = .; \ + __pi___end_bss_decrypted = .; \ #else @@ -128,6 +130,7 @@ SECTIONS /* Text and read-only data */ .text : AT(ADDR(.text) - LOAD_OFFSET) { _text = .; + __pi__text = .; _stext = .; ALIGN_ENTRY_TEXT_BEGIN *(.text..__x86.rethunk_untrain) @@ -391,6 +394,7 @@ SECTIONS . = ALIGN(PAGE_SIZE); /* keep VO_INIT_SIZE page aligned */ _end = .; + __pi__end = .; #ifdef CONFIG_AMD_MEM_ENCRYPT /* |