diff options
Diffstat (limited to 'arch/x86/kernel/cpu')
43 files changed, 2304 insertions, 8880 deletions
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 4efdf5c2efc8..1e26179ff18c 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -24,7 +24,7 @@ obj-y += rdrand.o obj-y += match.o obj-y += bugs.o obj-y += aperfmperf.o -obj-y += cpuid-deps.o +obj-y += cpuid-deps.o cpuid_0x2_table.o obj-y += umwait.o obj-y += capflags.o powerflags.o @@ -38,6 +38,9 @@ obj-y += intel.o tsx.o obj-$(CONFIG_PM) += intel_epb.o endif obj-$(CONFIG_CPU_SUP_AMD) += amd.o +ifeq ($(CONFIG_AMD_NB)$(CONFIG_SYSFS),yy) +obj-y += amd_cache_disable.o +endif obj-$(CONFIG_CPU_SUP_HYGON) += hygon.o obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o obj-$(CONFIG_CPU_SUP_CENTAUR) += centaur.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 2b36379ff675..93da466dfe2c 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -9,6 +9,7 @@ #include <linux/sched/clock.h> #include <linux/random.h> #include <linux/topology.h> +#include <asm/amd/fch.h> #include <asm/processor.h> #include <asm/apic.h> #include <asm/cacheinfo.h> @@ -21,6 +22,7 @@ #include <asm/delay.h> #include <asm/debugreg.h> #include <asm/resctrl.h> +#include <asm/msr.h> #include <asm/sev.h> #ifdef CONFIG_X86_64 @@ -31,7 +33,7 @@ u16 invlpgb_count_max __ro_after_init; -static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) +static inline int rdmsrq_amd_safe(unsigned msr, u64 *p) { u32 gprs[8] = { 0 }; int err; @@ -49,7 +51,7 @@ static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) return err; } -static inline int wrmsrl_amd_safe(unsigned msr, unsigned long long val) +static inline int wrmsrq_amd_safe(unsigned msr, u64 val) { u32 gprs[8] = { 0 }; @@ -383,7 +385,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) (c->x86 == 0x10 && c->x86_model >= 0x2)) { u64 val; - rdmsrl(MSR_K7_HWCR, val); + rdmsrq(MSR_K7_HWCR, val); if (!(val & BIT(24))) pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n"); } @@ -422,7 +424,7 @@ static 
void bsp_init_amd(struct cpuinfo_x86 *c) * Try to cache the base value so further operations can * avoid RMW. If that faults, do not enable SSBD. */ - if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { + if (!rdmsrq_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD); setup_force_cpu_cap(X86_FEATURE_SSBD); x86_amd_ls_cfg_ssbd_mask = 1ULL << bit; @@ -472,6 +474,11 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) case 0x60 ... 0x7f: setup_force_cpu_cap(X86_FEATURE_ZEN5); break; + case 0x50 ... 0x5f: + case 0x90 ... 0xaf: + case 0xc0 ... 0xcf: + setup_force_cpu_cap(X86_FEATURE_ZEN6); + break; default: goto warn; } @@ -508,7 +515,7 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) */ if (cpu_has(c, X86_FEATURE_SME) || cpu_has(c, X86_FEATURE_SEV)) { /* Check if memory encryption is enabled */ - rdmsrl(MSR_AMD64_SYSCFG, msr); + rdmsrq(MSR_AMD64_SYSCFG, msr); if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) goto clear_all; @@ -525,7 +532,7 @@ static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) if (!sme_me_mask) setup_clear_cpu_cap(X86_FEATURE_SME); - rdmsrl(MSR_K7_HWCR, msr); + rdmsrq(MSR_K7_HWCR, msr); if (!(msr & MSR_K7_HWCR_SMMLOCK)) goto clear_sev; @@ -612,7 +619,7 @@ static void early_init_amd(struct cpuinfo_x86 *c) if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && !cpu_has(c, X86_FEATURE_IBPB_BRTYPE)) { if (c->x86 == 0x17 && boot_cpu_has(X86_FEATURE_AMD_IBPB)) setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); - else if (c->x86 >= 0x19 && !wrmsrl_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) { + else if (c->x86 >= 0x19 && !wrmsrq_safe(MSR_IA32_PRED_CMD, PRED_CMD_SBPB)) { setup_force_cpu_cap(X86_FEATURE_IBPB_BRTYPE); setup_force_cpu_cap(X86_FEATURE_SBPB); } @@ -636,14 +643,14 @@ static void init_amd_k8(struct cpuinfo_x86 *c) */ if (c->x86_model < 0x14 && cpu_has(c, X86_FEATURE_LAHF_LM) && !cpu_has(c, X86_FEATURE_HYPERVISOR)) { clear_cpu_cap(c, X86_FEATURE_LAHF_LM); - if (!rdmsrl_amd_safe(0xc001100d, &value)) { + if 
(!rdmsrq_amd_safe(0xc001100d, &value)) { value &= ~BIT_64(32); - wrmsrl_amd_safe(0xc001100d, value); + wrmsrq_amd_safe(0xc001100d, value); } } if (!c->x86_model_id[0]) - strcpy(c->x86_model_id, "Hammer"); + strscpy(c->x86_model_id, "Hammer"); #ifdef CONFIG_SMP /* @@ -788,9 +795,9 @@ static void init_amd_bd(struct cpuinfo_x86 *c) * Disable it on the affected CPUs. */ if ((c->x86_model >= 0x02) && (c->x86_model < 0x20)) { - if (!rdmsrl_safe(MSR_F15H_IC_CFG, &value) && !(value & 0x1E)) { + if (!rdmsrq_safe(MSR_F15H_IC_CFG, &value) && !(value & 0x1E)) { value |= 0x1E; - wrmsrl_safe(MSR_F15H_IC_CFG, value); + wrmsrq_safe(MSR_F15H_IC_CFG, value); } } @@ -839,9 +846,9 @@ void init_spectral_chicken(struct cpuinfo_x86 *c) * suppresses non-branch predictions. */ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { - if (!rdmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) { + if (!rdmsrq_safe(MSR_ZEN2_SPECTRAL_CHICKEN, &value)) { value |= MSR_ZEN2_SPECTRAL_CHICKEN_BIT; - wrmsrl_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value); + wrmsrq_safe(MSR_ZEN2_SPECTRAL_CHICKEN, value); } } #endif @@ -1025,7 +1032,7 @@ static void init_amd(struct cpuinfo_x86 *c) init_amd_cacheinfo(c); if (cpu_has(c, X86_FEATURE_SVM)) { - rdmsrl(MSR_VM_CR, vm_cr); + rdmsrq(MSR_VM_CR, vm_cr); if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) { pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n"); clear_cpu_cap(c, X86_FEATURE_SVM); @@ -1206,7 +1213,7 @@ void amd_set_dr_addr_mask(unsigned long mask, unsigned int dr) if (per_cpu(amd_dr_addr_mask, cpu)[dr] == mask) return; - wrmsr(amd_msr_dr_addr_masks[dr], mask, 0); + wrmsrq(amd_msr_dr_addr_masks[dr], mask); per_cpu(amd_dr_addr_mask, cpu)[dr] = mask; } @@ -1237,3 +1244,56 @@ void amd_check_microcode(void) if (cpu_feature_enabled(X86_FEATURE_ZEN2)) on_each_cpu(zenbleed_check_cpu, NULL, 1); } + +static const char * const s5_reset_reason_txt[] = { + [0] = "thermal pin BP_THERMTRIP_L was tripped", + [1] = "power button was pressed for 4 seconds", + [2] = "shutdown pin was tripped", + [4] = 
"remote ASF power off command was received", + [9] = "internal CPU thermal limit was tripped", + [16] = "system reset pin BP_SYS_RST_L was tripped", + [17] = "software issued PCI reset", + [18] = "software wrote 0x4 to reset control register 0xCF9", + [19] = "software wrote 0x6 to reset control register 0xCF9", + [20] = "software wrote 0xE to reset control register 0xCF9", + [21] = "ACPI power state transition occurred", + [22] = "keyboard reset pin KB_RST_L was tripped", + [23] = "internal CPU shutdown event occurred", + [24] = "system failed to boot before failed boot timer expired", + [25] = "hardware watchdog timer expired", + [26] = "remote ASF reset command was received", + [27] = "an uncorrected error caused a data fabric sync flood event", + [29] = "FCH and MP1 failed warm reset handshake", + [30] = "a parity error occurred", + [31] = "a software sync flood event occurred", +}; + +static __init int print_s5_reset_status_mmio(void) +{ + unsigned long value; + void __iomem *addr; + int i; + + if (!cpu_feature_enabled(X86_FEATURE_ZEN)) + return 0; + + addr = ioremap(FCH_PM_BASE + FCH_PM_S5_RESET_STATUS, sizeof(value)); + if (!addr) + return 0; + + value = ioread32(addr); + iounmap(addr); + + for (i = 0; i < ARRAY_SIZE(s5_reset_reason_txt); i++) { + if (!(value & BIT(i))) + continue; + + if (s5_reset_reason_txt[i]) { + pr_info("x86/amd: Previous system reset reason [0x%08lx]: %s\n", + value, s5_reset_reason_txt[i]); + } + } + + return 0; +} +late_initcall(print_s5_reset_status_mmio); diff --git a/arch/x86/kernel/cpu/amd_cache_disable.c b/arch/x86/kernel/cpu/amd_cache_disable.c new file mode 100644 index 000000000000..8843b9557aea --- /dev/null +++ b/arch/x86/kernel/cpu/amd_cache_disable.c @@ -0,0 +1,301 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * AMD L3 cache_disable_{0,1} sysfs handling + * Documentation/ABI/testing/sysfs-devices-system-cpu + */ + +#include <linux/cacheinfo.h> +#include <linux/capability.h> +#include <linux/pci.h> +#include <linux/sysfs.h> 
+ +#include <asm/amd/nb.h> + +#include "cpu.h" + +/* + * L3 cache descriptors + */ +static void amd_calc_l3_indices(struct amd_northbridge *nb) +{ + struct amd_l3_cache *l3 = &nb->l3_cache; + unsigned int sc0, sc1, sc2, sc3; + u32 val = 0; + + pci_read_config_dword(nb->misc, 0x1C4, &val); + + /* calculate subcache sizes */ + l3->subcaches[0] = sc0 = !(val & BIT(0)); + l3->subcaches[1] = sc1 = !(val & BIT(4)); + + if (boot_cpu_data.x86 == 0x15) { + l3->subcaches[0] = sc0 += !(val & BIT(1)); + l3->subcaches[1] = sc1 += !(val & BIT(5)); + } + + l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); + l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); + + l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; +} + +/* + * check whether a slot used for disabling an L3 index is occupied. + * @l3: L3 cache descriptor + * @slot: slot number (0..1) + * + * @returns: the disabled index if used or negative value if slot free. + */ +static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned int slot) +{ + unsigned int reg = 0; + + pci_read_config_dword(nb->misc, 0x1BC + slot * 4, &reg); + + /* check whether this slot is activated already */ + if (reg & (3UL << 30)) + return reg & 0xfff; + + return -1; +} + +static ssize_t show_cache_disable(struct cacheinfo *ci, char *buf, unsigned int slot) +{ + int index; + struct amd_northbridge *nb = ci->priv; + + index = amd_get_l3_disable_slot(nb, slot); + if (index >= 0) + return sysfs_emit(buf, "%d\n", index); + + return sysfs_emit(buf, "FREE\n"); +} + +#define SHOW_CACHE_DISABLE(slot) \ +static ssize_t \ +cache_disable_##slot##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct cacheinfo *ci = dev_get_drvdata(dev); \ + return show_cache_disable(ci, buf, slot); \ +} + +SHOW_CACHE_DISABLE(0) +SHOW_CACHE_DISABLE(1) + +static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu, + unsigned int slot, unsigned long idx) +{ + int i; + + idx |= BIT(30); + + /* + *
disable index in all 4 subcaches + */ + for (i = 0; i < 4; i++) { + u32 reg = idx | (i << 20); + + if (!nb->l3_cache.subcaches[i]) + continue; + + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); + + /* + * We need to WBINVD on a core on the node containing the L3 + * cache which indices we disable therefore a simple wbinvd() + * is not sufficient. + */ + wbinvd_on_cpu(cpu); + + reg |= BIT(31); + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); + } +} + +/* + * disable a L3 cache index by using a disable-slot + * + * @l3: L3 cache descriptor + * @cpu: A CPU on the node containing the L3 cache + * @slot: slot number (0..1) + * @index: index to disable + * + * @return: 0 on success, error status on failure + */ +static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, + unsigned int slot, unsigned long index) +{ + int ret = 0; + + /* check if @slot is already used or the index is already disabled */ + ret = amd_get_l3_disable_slot(nb, slot); + if (ret >= 0) + return -EEXIST; + + if (index > nb->l3_cache.indices) + return -EINVAL; + + /* check whether the other slot has disabled the same index already */ + if (index == amd_get_l3_disable_slot(nb, !slot)) + return -EEXIST; + + amd_l3_disable_index(nb, cpu, slot, index); + + return 0; +} + +static ssize_t store_cache_disable(struct cacheinfo *ci, const char *buf, + size_t count, unsigned int slot) +{ + struct amd_northbridge *nb = ci->priv; + unsigned long val = 0; + int cpu, err = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + cpu = cpumask_first(&ci->shared_cpu_map); + + if (kstrtoul(buf, 10, &val) < 0) + return -EINVAL; + + err = amd_set_l3_disable_slot(nb, cpu, slot, val); + if (err) { + if (err == -EEXIST) + pr_warn("L3 slot %d in use/index already disabled!\n", + slot); + return err; + } + return count; +} + +#define STORE_CACHE_DISABLE(slot) \ +static ssize_t \ +cache_disable_##slot##_store(struct device *dev, \ + struct device_attribute *attr, \ + const char *buf, size_t 
count) \ +{ \ + struct cacheinfo *ci = dev_get_drvdata(dev); \ + return store_cache_disable(ci, buf, count, slot); \ +} + +STORE_CACHE_DISABLE(0) +STORE_CACHE_DISABLE(1) + +static ssize_t subcaches_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct cacheinfo *ci = dev_get_drvdata(dev); + int cpu = cpumask_first(&ci->shared_cpu_map); + + return sysfs_emit(buf, "%x\n", amd_get_subcaches(cpu)); +} + +static ssize_t subcaches_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cacheinfo *ci = dev_get_drvdata(dev); + int cpu = cpumask_first(&ci->shared_cpu_map); + unsigned long val; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (kstrtoul(buf, 16, &val) < 0) + return -EINVAL; + + if (amd_set_subcaches(cpu, val)) + return -EINVAL; + + return count; +} + +static DEVICE_ATTR_RW(cache_disable_0); +static DEVICE_ATTR_RW(cache_disable_1); +static DEVICE_ATTR_RW(subcaches); + +static umode_t cache_private_attrs_is_visible(struct kobject *kobj, + struct attribute *attr, int unused) +{ + struct device *dev = kobj_to_dev(kobj); + struct cacheinfo *ci = dev_get_drvdata(dev); + umode_t mode = attr->mode; + + if (!ci->priv) + return 0; + + if ((attr == &dev_attr_subcaches.attr) && + amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return mode; + + if ((attr == &dev_attr_cache_disable_0.attr || + attr == &dev_attr_cache_disable_1.attr) && + amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + return mode; + + return 0; +} + +static struct attribute_group cache_private_group = { + .is_visible = cache_private_attrs_is_visible, +}; + +static void init_amd_l3_attrs(void) +{ + static struct attribute **amd_l3_attrs; + int n = 1; + + if (amd_l3_attrs) /* already initialized */ + return; + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + n += 2; + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + n += 1; + + amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL); + if (!amd_l3_attrs) + return; + + n 
= 0; + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { + amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr; + amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr; + } + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + amd_l3_attrs[n++] = &dev_attr_subcaches.attr; + + cache_private_group.attrs = amd_l3_attrs; +} + +const struct attribute_group *cache_get_priv_group(struct cacheinfo *ci) +{ + struct amd_northbridge *nb = ci->priv; + + if (ci->level < 3 || !nb) + return NULL; + + if (nb && nb->l3_cache.indices) + init_amd_l3_attrs(); + + return &cache_private_group; +} + +struct amd_northbridge *amd_init_l3_cache(int index) +{ + struct amd_northbridge *nb; + int node; + + /* only for L3, and not in virtualized environments */ + if (index < 3) + return NULL; + + node = topology_amd_node_id(smp_processor_id()); + nb = node_to_amd_nb(node); + if (nb && !nb->l3_cache.indices) + amd_calc_l3_indices(nb); + + return nb; +} diff --git a/arch/x86/kernel/cpu/aperfmperf.c b/arch/x86/kernel/cpu/aperfmperf.c index 6cf31a1649c4..a315b0627dfb 100644 --- a/arch/x86/kernel/cpu/aperfmperf.c +++ b/arch/x86/kernel/cpu/aperfmperf.c @@ -20,6 +20,7 @@ #include <asm/cpu.h> #include <asm/cpu_device_id.h> #include <asm/intel-family.h> +#include <asm/msr.h> #include "cpu.h" @@ -40,8 +41,8 @@ static void init_counter_refs(void) { u64 aperf, mperf; - rdmsrl(MSR_IA32_APERF, aperf); - rdmsrl(MSR_IA32_MPERF, mperf); + rdmsrq(MSR_IA32_APERF, aperf); + rdmsrq(MSR_IA32_MPERF, mperf); this_cpu_write(cpu_samples.aperf, aperf); this_cpu_write(cpu_samples.mperf, mperf); @@ -99,7 +100,7 @@ static bool __init turbo_disabled(void) u64 misc_en; int err; - err = rdmsrl_safe(MSR_IA32_MISC_ENABLE, &misc_en); + err = rdmsrq_safe(MSR_IA32_MISC_ENABLE, &misc_en); if (err) return false; @@ -110,11 +111,11 @@ static bool __init slv_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) { int err; - err = rdmsrl_safe(MSR_ATOM_CORE_RATIOS, base_freq); + err = rdmsrq_safe(MSR_ATOM_CORE_RATIOS, base_freq); if (err) 
return false; - err = rdmsrl_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); + err = rdmsrq_safe(MSR_ATOM_CORE_TURBO_RATIOS, turbo_freq); if (err) return false; @@ -152,13 +153,13 @@ static bool __init knl_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int err, i; u64 msr; - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr); if (err) return false; @@ -190,17 +191,17 @@ static bool __init skx_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq, int s u32 group_size; int err, i; - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; *base_freq = (*base_freq >> 8) & 0xFF; /* max P state */ - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &ratios); + err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &ratios); if (err) return false; - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT1, &counts); + err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT1, &counts); if (err) return false; @@ -220,11 +221,11 @@ static bool __init core_set_max_freq_ratio(u64 *base_freq, u64 *turbo_freq) u64 msr; int err; - err = rdmsrl_safe(MSR_PLATFORM_INFO, base_freq); + err = rdmsrq_safe(MSR_PLATFORM_INFO, base_freq); if (err) return false; - err = rdmsrl_safe(MSR_TURBO_RATIO_LIMIT, &msr); + err = rdmsrq_safe(MSR_TURBO_RATIO_LIMIT, &msr); if (err) return false; @@ -474,8 +475,8 @@ void arch_scale_freq_tick(void) if (!cpu_feature_enabled(X86_FEATURE_APERFMPERF)) return; - rdmsrl(MSR_IA32_APERF, aperf); - rdmsrl(MSR_IA32_MPERF, mperf); + rdmsrq(MSR_IA32_APERF, aperf); + rdmsrq(MSR_IA32_MPERF, mperf); acnt = aperf - s->aperf; mcnt = mperf - s->mperf; diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 362602b705cc..7f94e6a5497d 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -34,21 
+34,66 @@ #include "cpu.h" +/* + * Speculation Vulnerability Handling + * + * Each vulnerability is handled with the following functions: + * <vuln>_select_mitigation() -- Selects a mitigation to use. This should + * take into account all relevant command line + * options. + * <vuln>_update_mitigation() -- This is called after all vulnerabilities have + * selected a mitigation, in case the selection + * may want to change based on other choices + * made. This function is optional. + * <vuln>_apply_mitigation() -- Enable the selected mitigation. + * + * The compile-time mitigation in all cases should be AUTO. An explicit + * command-line option can override AUTO. If no such option is + * provided, <vuln>_select_mitigation() will override AUTO to the best + * mitigation option. + */ + static void __init spectre_v1_select_mitigation(void); +static void __init spectre_v1_apply_mitigation(void); static void __init spectre_v2_select_mitigation(void); +static void __init spectre_v2_update_mitigation(void); +static void __init spectre_v2_apply_mitigation(void); static void __init retbleed_select_mitigation(void); +static void __init retbleed_update_mitigation(void); +static void __init retbleed_apply_mitigation(void); static void __init spectre_v2_user_select_mitigation(void); +static void __init spectre_v2_user_update_mitigation(void); +static void __init spectre_v2_user_apply_mitigation(void); static void __init ssb_select_mitigation(void); +static void __init ssb_apply_mitigation(void); static void __init l1tf_select_mitigation(void); +static void __init l1tf_apply_mitigation(void); static void __init mds_select_mitigation(void); -static void __init md_clear_update_mitigation(void); -static void __init md_clear_select_mitigation(void); +static void __init mds_update_mitigation(void); +static void __init mds_apply_mitigation(void); static void __init taa_select_mitigation(void); +static void __init taa_update_mitigation(void); +static void __init 
taa_apply_mitigation(void); static void __init mmio_select_mitigation(void); +static void __init mmio_update_mitigation(void); +static void __init mmio_apply_mitigation(void); +static void __init rfds_select_mitigation(void); +static void __init rfds_update_mitigation(void); +static void __init rfds_apply_mitigation(void); static void __init srbds_select_mitigation(void); +static void __init srbds_apply_mitigation(void); static void __init l1d_flush_select_mitigation(void); static void __init srso_select_mitigation(void); +static void __init srso_update_mitigation(void); +static void __init srso_apply_mitigation(void); static void __init gds_select_mitigation(void); +static void __init gds_apply_mitigation(void); +static void __init bhi_select_mitigation(void); +static void __init bhi_update_mitigation(void); +static void __init bhi_apply_mitigation(void); +static void __init its_select_mitigation(void); +static void __init its_update_mitigation(void); +static void __init its_apply_mitigation(void); /* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; @@ -66,11 +111,19 @@ static DEFINE_MUTEX(spec_ctrl_mutex); void (*x86_return_thunk)(void) __ro_after_init = __x86_return_thunk; +static void __init set_return_thunk(void *thunk) +{ + if (x86_return_thunk != __x86_return_thunk) + pr_warn("x86/bugs: return thunk changed\n"); + + x86_return_thunk = thunk; +} + /* Update SPEC_CTRL MSR and its cached copy unconditionally */ static void update_spec_ctrl(u64 val) { this_cpu_write(x86_spec_ctrl_current, val); - wrmsrl(MSR_IA32_SPEC_CTRL, val); + wrmsrq(MSR_IA32_SPEC_CTRL, val); } /* @@ -89,7 +142,7 @@ void update_spec_ctrl_cond(u64 val) * forced the update can be delayed until that time. 
*/ if (!cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS)) - wrmsrl(MSR_IA32_SPEC_CTRL, val); + wrmsrq(MSR_IA32_SPEC_CTRL, val); } noinstr u64 spec_ctrl_current(void) @@ -127,9 +180,13 @@ EXPORT_SYMBOL_GPL(mds_idle_clear); */ DEFINE_STATIC_KEY_FALSE(switch_mm_cond_l1d_flush); -/* Controls CPU Fill buffer clear before KVM guest MMIO accesses */ -DEFINE_STATIC_KEY_FALSE(mmio_stale_data_clear); -EXPORT_SYMBOL_GPL(mmio_stale_data_clear); +/* + * Controls CPU Fill buffer clear before VMenter. This is a subset of + * X86_FEATURE_CLEAR_CPU_BUF, and should only be enabled when KVM-only + * mitigation is required. + */ +DEFINE_STATIC_KEY_FALSE(cpu_buf_vm_clear); +EXPORT_SYMBOL_GPL(cpu_buf_vm_clear); void __init cpu_select_mitigations(void) { @@ -139,7 +196,7 @@ void __init cpu_select_mitigations(void) * init code as it is not enumerated and depends on the family. */ if (cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL)) { - rdmsrl(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); + rdmsrq(MSR_IA32_SPEC_CTRL, x86_spec_ctrl_base); /* * Previously running kernel (kexec), may have some controls @@ -154,30 +211,67 @@ void __init cpu_select_mitigations(void) /* Select the proper CPU mitigations before patching alternatives: */ spectre_v1_select_mitigation(); spectre_v2_select_mitigation(); - /* - * retbleed_select_mitigation() relies on the state set by - * spectre_v2_select_mitigation(); specifically it wants to know about - * spectre_v2=ibrs. - */ retbleed_select_mitigation(); - /* - * spectre_v2_user_select_mitigation() relies on the state set by - * retbleed_select_mitigation(); specifically the STIBP selection is - * forced for UNRET or IBPB. 
- */ spectre_v2_user_select_mitigation(); ssb_select_mitigation(); l1tf_select_mitigation(); - md_clear_select_mitigation(); + mds_select_mitigation(); + taa_select_mitigation(); + mmio_select_mitigation(); + rfds_select_mitigation(); srbds_select_mitigation(); l1d_flush_select_mitigation(); + srso_select_mitigation(); + gds_select_mitigation(); + its_select_mitigation(); + bhi_select_mitigation(); /* - * srso_select_mitigation() depends and must run after - * retbleed_select_mitigation(). + * After mitigations are selected, some may need to update their + * choices. */ - srso_select_mitigation(); - gds_select_mitigation(); + spectre_v2_update_mitigation(); + /* + * retbleed_update_mitigation() relies on the state set by + * spectre_v2_update_mitigation(); specifically it wants to know about + * spectre_v2=ibrs. + */ + retbleed_update_mitigation(); + /* + * its_update_mitigation() depends on spectre_v2_update_mitigation() + * and retbleed_update_mitigation(). + */ + its_update_mitigation(); + + /* + * spectre_v2_user_update_mitigation() depends on + * retbleed_update_mitigation(), specifically the STIBP + * selection is forced for UNRET or IBPB. + */ + spectre_v2_user_update_mitigation(); + mds_update_mitigation(); + taa_update_mitigation(); + mmio_update_mitigation(); + rfds_update_mitigation(); + bhi_update_mitigation(); + /* srso_update_mitigation() depends on retbleed_update_mitigation(). 
*/ + srso_update_mitigation(); + + spectre_v1_apply_mitigation(); + spectre_v2_apply_mitigation(); + retbleed_apply_mitigation(); + spectre_v2_user_apply_mitigation(); + ssb_apply_mitigation(); + l1tf_apply_mitigation(); + mds_apply_mitigation(); + taa_apply_mitigation(); + mmio_apply_mitigation(); + rfds_apply_mitigation(); + srbds_apply_mitigation(); + srso_apply_mitigation(); + gds_apply_mitigation(); + its_apply_mitigation(); + bhi_apply_mitigation(); } /* @@ -227,9 +321,9 @@ static void x86_amd_ssb_disable(void) u64 msrval = x86_amd_ls_cfg_base | x86_amd_ls_cfg_ssbd_mask; if (boot_cpu_has(X86_FEATURE_VIRT_SSBD)) - wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD); + wrmsrq(MSR_AMD64_VIRT_SPEC_CTRL, SPEC_CTRL_SSBD); else if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD)) - wrmsrl(MSR_AMD64_LS_CFG, msrval); + wrmsrq(MSR_AMD64_LS_CFG, msrval); } #undef pr_fmt @@ -280,6 +374,12 @@ enum rfds_mitigations { static enum rfds_mitigations rfds_mitigation __ro_after_init = IS_ENABLED(CONFIG_MITIGATION_RFDS) ? RFDS_MITIGATION_AUTO : RFDS_MITIGATION_OFF; +/* + * Set if any of MDS/TAA/MMIO/RFDS are going to enable VERW clearing + * through X86_FEATURE_CLEAR_CPU_BUF on kernel and guest entry. + */ +static bool verw_clear_cpu_buf_mitigation_selected __ro_after_init; + static void __init mds_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) { @@ -290,12 +390,34 @@ static void __init mds_select_mitigation(void) if (mds_mitigation == MDS_MITIGATION_AUTO) mds_mitigation = MDS_MITIGATION_FULL; + if (mds_mitigation == MDS_MITIGATION_OFF) + return; + + verw_clear_cpu_buf_mitigation_selected = true; +} + +static void __init mds_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_MDS) || cpu_mitigations_off()) + return; + + /* If TAA, MMIO, or RFDS are being mitigated, MDS gets mitigated too. 
*/ + if (verw_clear_cpu_buf_mitigation_selected) + mds_mitigation = MDS_MITIGATION_FULL; + if (mds_mitigation == MDS_MITIGATION_FULL) { if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) mds_mitigation = MDS_MITIGATION_VMWERV; + } - setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + pr_info("%s\n", mds_strings[mds_mitigation]); +} +static void __init mds_apply_mitigation(void) +{ + if (mds_mitigation == MDS_MITIGATION_FULL || + mds_mitigation == MDS_MITIGATION_VMWERV) { + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); if (!boot_cpu_has(X86_BUG_MSBDS_ONLY) && (mds_nosmt || cpu_mitigations_auto_nosmt())) cpu_smt_disable(false); @@ -335,6 +457,11 @@ static const char * const taa_strings[] = { [TAA_MITIGATION_TSX_DISABLED] = "Mitigation: TSX disabled", }; +static bool __init taa_vulnerable(void) +{ + return boot_cpu_has_bug(X86_BUG_TAA) && boot_cpu_has(X86_FEATURE_RTM); +} + static void __init taa_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_TAA)) { @@ -348,48 +475,63 @@ static void __init taa_select_mitigation(void) return; } - if (cpu_mitigations_off()) { + if (cpu_mitigations_off()) taa_mitigation = TAA_MITIGATION_OFF; - return; - } - /* - * TAA mitigation via VERW is turned off if both - * tsx_async_abort=off and mds=off are specified. - */ - if (taa_mitigation == TAA_MITIGATION_OFF && - mds_mitigation == MDS_MITIGATION_OFF) + /* Microcode will be checked in taa_update_mitigation(). */ + if (taa_mitigation == TAA_MITIGATION_AUTO) + taa_mitigation = TAA_MITIGATION_VERW; + + if (taa_mitigation != TAA_MITIGATION_OFF) + verw_clear_cpu_buf_mitigation_selected = true; +} + +static void __init taa_update_mitigation(void) +{ + if (!taa_vulnerable() || cpu_mitigations_off()) return; - if (boot_cpu_has(X86_FEATURE_MD_CLEAR)) + if (verw_clear_cpu_buf_mitigation_selected) taa_mitigation = TAA_MITIGATION_VERW; - else - taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; - /* - * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1. 
- * A microcode update fixes this behavior to clear CPU buffers. It also - * adds support for MSR_IA32_TSX_CTRL which is enumerated by the - * ARCH_CAP_TSX_CTRL_MSR bit. - * - * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode - * update is required. - */ - if ( (x86_arch_cap_msr & ARCH_CAP_MDS_NO) && - !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR)) - taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + if (taa_mitigation == TAA_MITIGATION_VERW) { + /* Check if the requisite ucode is available. */ + if (!boot_cpu_has(X86_FEATURE_MD_CLEAR)) + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; - /* - * TSX is enabled, select alternate mitigation for TAA which is - * the same as MDS. Enable MDS static branch to clear CPU buffers. - * - * For guests that can't determine whether the correct microcode is - * present on host, enable the mitigation for UCODE_NEEDED as well. - */ - setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + /* + * VERW doesn't clear the CPU buffers when MD_CLEAR=1 and MDS_NO=1. + * A microcode update fixes this behavior to clear CPU buffers. It also + * adds support for MSR_IA32_TSX_CTRL which is enumerated by the + * ARCH_CAP_TSX_CTRL_MSR bit. + * + * On MDS_NO=1 CPUs if ARCH_CAP_TSX_CTRL_MSR is not set, microcode + * update is required. + */ + if ((x86_arch_cap_msr & ARCH_CAP_MDS_NO) && + !(x86_arch_cap_msr & ARCH_CAP_TSX_CTRL_MSR)) + taa_mitigation = TAA_MITIGATION_UCODE_NEEDED; + } - if (taa_nosmt || cpu_mitigations_auto_nosmt()) - cpu_smt_disable(false); + pr_info("%s\n", taa_strings[taa_mitigation]); +} + +static void __init taa_apply_mitigation(void) +{ + if (taa_mitigation == TAA_MITIGATION_VERW || + taa_mitigation == TAA_MITIGATION_UCODE_NEEDED) { + /* + * TSX is enabled, select alternate mitigation for TAA which is + * the same as MDS. Enable MDS static branch to clear CPU buffers. + * + * For guests that can't determine whether the correct microcode is + * present on host, enable the mitigation for UCODE_NEEDED as well. 
+ */ + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + + if (taa_nosmt || cpu_mitigations_auto_nosmt()) + cpu_smt_disable(false); + } } static int __init tsx_async_abort_parse_cmdline(char *str) @@ -427,31 +569,67 @@ static const char * const mmio_strings[] = { static void __init mmio_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || - boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN) || cpu_mitigations_off()) { mmio_mitigation = MMIO_MITIGATION_OFF; return; } + /* Microcode will be checked in mmio_update_mitigation(). */ + if (mmio_mitigation == MMIO_MITIGATION_AUTO) + mmio_mitigation = MMIO_MITIGATION_VERW; + if (mmio_mitigation == MMIO_MITIGATION_OFF) return; /* * Enable CPU buffer clear mitigation for host and VMM, if also affected - * by MDS or TAA. Otherwise, enable mitigation for VMM only. + * by MDS or TAA. */ - if (boot_cpu_has_bug(X86_BUG_MDS) || (boot_cpu_has_bug(X86_BUG_TAA) && - boot_cpu_has(X86_FEATURE_RTM))) - setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + if (boot_cpu_has_bug(X86_BUG_MDS) || taa_vulnerable()) + verw_clear_cpu_buf_mitigation_selected = true; +} + +static void __init mmio_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA) || cpu_mitigations_off()) + return; + + if (verw_clear_cpu_buf_mitigation_selected) + mmio_mitigation = MMIO_MITIGATION_VERW; + + if (mmio_mitigation == MMIO_MITIGATION_VERW) { + /* + * Check if the system has the right microcode. + * + * CPU Fill buffer clear mitigation is enumerated by either an explicit + * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS + * affected systems. 
+ */ + if (!((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) || + (boot_cpu_has(X86_FEATURE_MD_CLEAR) && + boot_cpu_has(X86_FEATURE_FLUSH_L1D) && + !(x86_arch_cap_msr & ARCH_CAP_MDS_NO)))) + mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED; + } + + pr_info("%s\n", mmio_strings[mmio_mitigation]); +} + +static void __init mmio_apply_mitigation(void) +{ + if (mmio_mitigation == MMIO_MITIGATION_OFF) + return; /* - * X86_FEATURE_CLEAR_CPU_BUF could be enabled by other VERW based - * mitigations, disable KVM-only mitigation in that case. + * Only enable the VMM mitigation if the CPU buffer clear mitigation is + * not being used. */ - if (boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF)) - static_branch_disable(&mmio_stale_data_clear); - else - static_branch_enable(&mmio_stale_data_clear); + if (verw_clear_cpu_buf_mitigation_selected) { + setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); + static_branch_disable(&cpu_buf_vm_clear); + } else { + static_branch_enable(&cpu_buf_vm_clear); + } /* * If Processor-MMIO-Stale-Data bug is present and Fill Buffer data can @@ -461,21 +639,6 @@ static void __init mmio_select_mitigation(void) if (!(x86_arch_cap_msr & ARCH_CAP_FBSDP_NO)) static_branch_enable(&mds_idle_clear); - /* - * Check if the system has the right microcode. - * - * CPU Fill buffer clear mitigation is enumerated by either an explicit - * FB_CLEAR or by the presence of both MD_CLEAR and L1D_FLUSH on MDS - * affected systems. 
- */ - if ((x86_arch_cap_msr & ARCH_CAP_FB_CLEAR) || - (boot_cpu_has(X86_FEATURE_MD_CLEAR) && - boot_cpu_has(X86_FEATURE_FLUSH_L1D) && - !(x86_arch_cap_msr & ARCH_CAP_MDS_NO))) - mmio_mitigation = MMIO_MITIGATION_VERW; - else - mmio_mitigation = MMIO_MITIGATION_UCODE_NEEDED; - if (mmio_nosmt || cpu_mitigations_auto_nosmt()) cpu_smt_disable(false); } @@ -510,22 +673,48 @@ static const char * const rfds_strings[] = { [RFDS_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", }; +static inline bool __init verw_clears_cpu_reg_file(void) +{ + return (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR); +} + static void __init rfds_select_mitigation(void) { if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off()) { rfds_mitigation = RFDS_MITIGATION_OFF; return; } + + if (rfds_mitigation == RFDS_MITIGATION_AUTO) + rfds_mitigation = RFDS_MITIGATION_VERW; + if (rfds_mitigation == RFDS_MITIGATION_OFF) return; - if (rfds_mitigation == RFDS_MITIGATION_AUTO) + if (verw_clears_cpu_reg_file()) + verw_clear_cpu_buf_mitigation_selected = true; +} + +static void __init rfds_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_RFDS) || cpu_mitigations_off()) + return; + + if (verw_clear_cpu_buf_mitigation_selected) rfds_mitigation = RFDS_MITIGATION_VERW; - if (x86_arch_cap_msr & ARCH_CAP_RFDS_CLEAR) + if (rfds_mitigation == RFDS_MITIGATION_VERW) { + if (!verw_clears_cpu_reg_file()) + rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED; + } + + pr_info("%s\n", rfds_strings[rfds_mitigation]); +} + +static void __init rfds_apply_mitigation(void) +{ + if (rfds_mitigation == RFDS_MITIGATION_VERW) setup_force_cpu_cap(X86_FEATURE_CLEAR_CPU_BUF); - else - rfds_mitigation = RFDS_MITIGATION_UCODE_NEEDED; } static __init int rfds_parse_cmdline(char *str) @@ -546,76 +735,11 @@ static __init int rfds_parse_cmdline(char *str) early_param("reg_file_data_sampling", rfds_parse_cmdline); #undef pr_fmt -#define pr_fmt(fmt) "" fmt - -static void __init md_clear_update_mitigation(void) -{ - if 
(cpu_mitigations_off()) - return; - - if (!boot_cpu_has(X86_FEATURE_CLEAR_CPU_BUF)) - goto out; - - /* - * X86_FEATURE_CLEAR_CPU_BUF is now enabled. Update MDS, TAA and MMIO - * Stale Data mitigation, if necessary. - */ - if (mds_mitigation == MDS_MITIGATION_OFF && - boot_cpu_has_bug(X86_BUG_MDS)) { - mds_mitigation = MDS_MITIGATION_FULL; - mds_select_mitigation(); - } - if (taa_mitigation == TAA_MITIGATION_OFF && - boot_cpu_has_bug(X86_BUG_TAA)) { - taa_mitigation = TAA_MITIGATION_VERW; - taa_select_mitigation(); - } - /* - * MMIO_MITIGATION_OFF is not checked here so that mmio_stale_data_clear - * gets updated correctly as per X86_FEATURE_CLEAR_CPU_BUF state. - */ - if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) { - mmio_mitigation = MMIO_MITIGATION_VERW; - mmio_select_mitigation(); - } - if (rfds_mitigation == RFDS_MITIGATION_OFF && - boot_cpu_has_bug(X86_BUG_RFDS)) { - rfds_mitigation = RFDS_MITIGATION_VERW; - rfds_select_mitigation(); - } -out: - if (boot_cpu_has_bug(X86_BUG_MDS)) - pr_info("MDS: %s\n", mds_strings[mds_mitigation]); - if (boot_cpu_has_bug(X86_BUG_TAA)) - pr_info("TAA: %s\n", taa_strings[taa_mitigation]); - if (boot_cpu_has_bug(X86_BUG_MMIO_STALE_DATA)) - pr_info("MMIO Stale Data: %s\n", mmio_strings[mmio_mitigation]); - else if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) - pr_info("MMIO Stale Data: Unknown: No mitigations\n"); - if (boot_cpu_has_bug(X86_BUG_RFDS)) - pr_info("Register File Data Sampling: %s\n", rfds_strings[rfds_mitigation]); -} - -static void __init md_clear_select_mitigation(void) -{ - mds_select_mitigation(); - taa_select_mitigation(); - mmio_select_mitigation(); - rfds_select_mitigation(); - - /* - * As these mitigations are inter-related and rely on VERW instruction - * to clear the microarchitural buffers, update and print their status - * after mitigation selection is done for each of these vulnerabilities. 
- */ - md_clear_update_mitigation(); -} - -#undef pr_fmt #define pr_fmt(fmt) "SRBDS: " fmt enum srbds_mitigations { SRBDS_MITIGATION_OFF, + SRBDS_MITIGATION_AUTO, SRBDS_MITIGATION_UCODE_NEEDED, SRBDS_MITIGATION_FULL, SRBDS_MITIGATION_TSX_OFF, @@ -623,7 +747,7 @@ enum srbds_mitigations { }; static enum srbds_mitigations srbds_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_SRBDS) ? SRBDS_MITIGATION_FULL : SRBDS_MITIGATION_OFF; + IS_ENABLED(CONFIG_MITIGATION_SRBDS) ? SRBDS_MITIGATION_AUTO : SRBDS_MITIGATION_OFF; static const char * const srbds_strings[] = { [SRBDS_MITIGATION_OFF] = "Vulnerable", @@ -655,7 +779,7 @@ void update_srbds_msr(void) if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) return; - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); switch (srbds_mitigation) { case SRBDS_MITIGATION_OFF: @@ -669,13 +793,18 @@ void update_srbds_msr(void) break; } - wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); } static void __init srbds_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_SRBDS)) + if (!boot_cpu_has_bug(X86_BUG_SRBDS) || cpu_mitigations_off()) { + srbds_mitigation = SRBDS_MITIGATION_OFF; return; + } + + if (srbds_mitigation == SRBDS_MITIGATION_AUTO) + srbds_mitigation = SRBDS_MITIGATION_FULL; /* * Check to see if this is one of the MDS_NO systems supporting TSX that @@ -689,13 +818,17 @@ static void __init srbds_select_mitigation(void) srbds_mitigation = SRBDS_MITIGATION_HYPERVISOR; else if (!boot_cpu_has(X86_FEATURE_SRBDS_CTRL)) srbds_mitigation = SRBDS_MITIGATION_UCODE_NEEDED; - else if (cpu_mitigations_off() || srbds_off) + else if (srbds_off) srbds_mitigation = SRBDS_MITIGATION_OFF; - update_srbds_msr(); pr_info("%s\n", srbds_strings[srbds_mitigation]); } +static void __init srbds_apply_mitigation(void) +{ + update_srbds_msr(); +} + static int __init srbds_parse_cmdline(char *str) { if (!str) @@ -742,6 +875,7 @@ early_param("l1d_flush", l1d_flush_parse_cmdline); enum 
gds_mitigations { GDS_MITIGATION_OFF, + GDS_MITIGATION_AUTO, GDS_MITIGATION_UCODE_NEEDED, GDS_MITIGATION_FORCE, GDS_MITIGATION_FULL, @@ -750,7 +884,7 @@ enum gds_mitigations { }; static enum gds_mitigations gds_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_GDS) ? GDS_MITIGATION_FULL : GDS_MITIGATION_OFF; + IS_ENABLED(CONFIG_MITIGATION_GDS) ? GDS_MITIGATION_AUTO : GDS_MITIGATION_OFF; static const char * const gds_strings[] = { [GDS_MITIGATION_OFF] = "Vulnerable", @@ -775,7 +909,7 @@ void update_gds_msr(void) switch (gds_mitigation) { case GDS_MITIGATION_OFF: - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); mcu_ctrl |= GDS_MITG_DIS; break; case GDS_MITIGATION_FULL_LOCKED: @@ -785,23 +919,24 @@ void update_gds_msr(void) * CPUs. */ case GDS_MITIGATION_FULL: - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); mcu_ctrl &= ~GDS_MITG_DIS; break; case GDS_MITIGATION_FORCE: case GDS_MITIGATION_UCODE_NEEDED: case GDS_MITIGATION_HYPERVISOR: + case GDS_MITIGATION_AUTO: return; } - wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); /* * Check to make sure that the WRMSR value was not ignored. Writes to * GDS_MITG_DIS will be ignored if this processor is locked but the boot * processor was not. 
*/ - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl_after); WARN_ON_ONCE(mcu_ctrl != mcu_ctrl_after); } @@ -814,33 +949,28 @@ static void __init gds_select_mitigation(void) if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) { gds_mitigation = GDS_MITIGATION_HYPERVISOR; - goto out; + return; } if (cpu_mitigations_off()) gds_mitigation = GDS_MITIGATION_OFF; /* Will verify below that mitigation _can_ be disabled */ + if (gds_mitigation == GDS_MITIGATION_AUTO) + gds_mitigation = GDS_MITIGATION_FULL; + /* No microcode */ if (!(x86_arch_cap_msr & ARCH_CAP_GDS_CTRL)) { - if (gds_mitigation == GDS_MITIGATION_FORCE) { - /* - * This only needs to be done on the boot CPU so do it - * here rather than in update_gds_msr() - */ - setup_clear_cpu_cap(X86_FEATURE_AVX); - pr_warn("Microcode update needed! Disabling AVX as mitigation.\n"); - } else { + if (gds_mitigation != GDS_MITIGATION_FORCE) gds_mitigation = GDS_MITIGATION_UCODE_NEEDED; - } - goto out; + return; } /* Microcode has mitigation, use it */ if (gds_mitigation == GDS_MITIGATION_FORCE) gds_mitigation = GDS_MITIGATION_FULL; - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); + rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_ctrl); if (mcu_ctrl & GDS_MITG_LOCKED) { if (gds_mitigation == GDS_MITIGATION_OFF) pr_warn("Mitigation locked. Disable failed.\n"); @@ -854,9 +984,25 @@ static void __init gds_select_mitigation(void) */ gds_mitigation = GDS_MITIGATION_FULL_LOCKED; } +} + +static void __init gds_apply_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_GDS)) + return; + + /* Microcode is present */ + if (x86_arch_cap_msr & ARCH_CAP_GDS_CTRL) + update_gds_msr(); + else if (gds_mitigation == GDS_MITIGATION_FORCE) { + /* + * This only needs to be done on the boot CPU so do it + * here rather than in update_gds_msr() + */ + setup_clear_cpu_cap(X86_FEATURE_AVX); + pr_warn("Microcode update needed! 
Disabling AVX as mitigation.\n"); + } - update_gds_msr(); -out: pr_info("%s\n", gds_strings[gds_mitigation]); } @@ -917,10 +1063,14 @@ static bool smap_works_speculatively(void) static void __init spectre_v1_select_mitigation(void) { - if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) { + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) spectre_v1_mitigation = SPECTRE_V1_MITIGATION_NONE; +} + +static void __init spectre_v1_apply_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1) || cpu_mitigations_off()) return; - } if (spectre_v1_mitigation == SPECTRE_V1_MITIGATION_AUTO) { /* @@ -973,8 +1123,20 @@ enum spectre_v2_mitigation spectre_v2_enabled __ro_after_init = SPECTRE_V2_NONE; #undef pr_fmt #define pr_fmt(fmt) "RETBleed: " fmt +enum its_mitigation { + ITS_MITIGATION_OFF, + ITS_MITIGATION_AUTO, + ITS_MITIGATION_VMEXIT_ONLY, + ITS_MITIGATION_ALIGNED_THUNKS, + ITS_MITIGATION_RETPOLINE_STUFF, +}; + +static enum its_mitigation its_mitigation __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_ITS) ? ITS_MITIGATION_AUTO : ITS_MITIGATION_OFF; + enum retbleed_mitigation { RETBLEED_MITIGATION_NONE, + RETBLEED_MITIGATION_AUTO, RETBLEED_MITIGATION_UNRET, RETBLEED_MITIGATION_IBPB, RETBLEED_MITIGATION_IBRS, @@ -982,14 +1144,6 @@ enum retbleed_mitigation { RETBLEED_MITIGATION_STUFF, }; -enum retbleed_mitigation_cmd { - RETBLEED_CMD_OFF, - RETBLEED_CMD_AUTO, - RETBLEED_CMD_UNRET, - RETBLEED_CMD_IBPB, - RETBLEED_CMD_STUFF, -}; - static const char * const retbleed_strings[] = { [RETBLEED_MITIGATION_NONE] = "Vulnerable", [RETBLEED_MITIGATION_UNRET] = "Mitigation: untrained return thunk", @@ -1000,9 +1154,7 @@ static const char * const retbleed_strings[] = { }; static enum retbleed_mitigation retbleed_mitigation __ro_after_init = - RETBLEED_MITIGATION_NONE; -static enum retbleed_mitigation_cmd retbleed_cmd __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_RETBLEED) ? 
RETBLEED_CMD_AUTO : RETBLEED_CMD_OFF; + IS_ENABLED(CONFIG_MITIGATION_RETBLEED) ? RETBLEED_MITIGATION_AUTO : RETBLEED_MITIGATION_NONE; static int __ro_after_init retbleed_nosmt = false; @@ -1019,15 +1171,15 @@ static int __init retbleed_parse_cmdline(char *str) } if (!strcmp(str, "off")) { - retbleed_cmd = RETBLEED_CMD_OFF; + retbleed_mitigation = RETBLEED_MITIGATION_NONE; } else if (!strcmp(str, "auto")) { - retbleed_cmd = RETBLEED_CMD_AUTO; + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; } else if (!strcmp(str, "unret")) { - retbleed_cmd = RETBLEED_CMD_UNRET; + retbleed_mitigation = RETBLEED_MITIGATION_UNRET; } else if (!strcmp(str, "ibpb")) { - retbleed_cmd = RETBLEED_CMD_IBPB; + retbleed_mitigation = RETBLEED_MITIGATION_IBPB; } else if (!strcmp(str, "stuff")) { - retbleed_cmd = RETBLEED_CMD_STUFF; + retbleed_mitigation = RETBLEED_MITIGATION_STUFF; } else if (!strcmp(str, "nosmt")) { retbleed_nosmt = true; } else if (!strcmp(str, "force")) { @@ -1048,77 +1200,122 @@ early_param("retbleed", retbleed_parse_cmdline); static void __init retbleed_select_mitigation(void) { - bool mitigate_smt = false; - - if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) - return; - - switch (retbleed_cmd) { - case RETBLEED_CMD_OFF: + if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) { + retbleed_mitigation = RETBLEED_MITIGATION_NONE; return; + } - case RETBLEED_CMD_UNRET: - if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) { - retbleed_mitigation = RETBLEED_MITIGATION_UNRET; - } else { + switch (retbleed_mitigation) { + case RETBLEED_MITIGATION_UNRET: + if (!IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) { + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; pr_err("WARNING: kernel not compiled with MITIGATION_UNRET_ENTRY.\n"); - goto do_cmd_auto; } break; - - case RETBLEED_CMD_IBPB: + case RETBLEED_MITIGATION_IBPB: if (!boot_cpu_has(X86_FEATURE_IBPB)) { pr_err("WARNING: CPU does not support IBPB.\n"); - goto do_cmd_auto; - } else if 
(IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { - retbleed_mitigation = RETBLEED_MITIGATION_IBPB; - } else { + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; + } else if (!IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); - goto do_cmd_auto; + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; } break; + case RETBLEED_MITIGATION_STUFF: + if (!IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) { + pr_err("WARNING: kernel not compiled with MITIGATION_CALL_DEPTH_TRACKING.\n"); + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; + } else if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { + pr_err("WARNING: retbleed=stuff only supported for Intel CPUs.\n"); + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; + } + break; + default: + break; + } - case RETBLEED_CMD_STUFF: - if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING) && - spectre_v2_enabled == SPECTRE_V2_RETPOLINE) { - retbleed_mitigation = RETBLEED_MITIGATION_STUFF; + if (retbleed_mitigation != RETBLEED_MITIGATION_AUTO) + return; + + /* Intel mitigation selected in retbleed_update_mitigation() */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { + if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) + retbleed_mitigation = RETBLEED_MITIGATION_UNRET; + else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) && + boot_cpu_has(X86_FEATURE_IBPB)) + retbleed_mitigation = RETBLEED_MITIGATION_IBPB; + else + retbleed_mitigation = RETBLEED_MITIGATION_NONE; + } +} + +static void __init retbleed_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_RETBLEED) || cpu_mitigations_off()) + return; + if (retbleed_mitigation == RETBLEED_MITIGATION_NONE) + goto out; + + /* + * retbleed=stuff is only allowed on Intel. If stuffing can't be used + * then a different mitigation will be selected below. + * + * its=stuff will also attempt to enable stuffing. 
+ */ + if (retbleed_mitigation == RETBLEED_MITIGATION_STUFF || + its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF) { + if (spectre_v2_enabled != SPECTRE_V2_RETPOLINE) { + pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n"); + retbleed_mitigation = RETBLEED_MITIGATION_AUTO; } else { - if (IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) - pr_err("WARNING: retbleed=stuff depends on spectre_v2=retpoline\n"); - else - pr_err("WARNING: kernel not compiled with MITIGATION_CALL_DEPTH_TRACKING.\n"); + if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) + pr_info("Retbleed mitigation updated to stuffing\n"); - goto do_cmd_auto; + retbleed_mitigation = RETBLEED_MITIGATION_STUFF; } - break; - -do_cmd_auto: - case RETBLEED_CMD_AUTO: - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || - boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - if (IS_ENABLED(CONFIG_MITIGATION_UNRET_ENTRY)) - retbleed_mitigation = RETBLEED_MITIGATION_UNRET; - else if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY) && - boot_cpu_has(X86_FEATURE_IBPB)) - retbleed_mitigation = RETBLEED_MITIGATION_IBPB; + } + /* + * Let IBRS trump all on Intel without affecting the effects of the + * retbleed= cmdline option except for call depth based stuffing + */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + switch (spectre_v2_enabled) { + case SPECTRE_V2_IBRS: + retbleed_mitigation = RETBLEED_MITIGATION_IBRS; + break; + case SPECTRE_V2_EIBRS: + case SPECTRE_V2_EIBRS_RETPOLINE: + case SPECTRE_V2_EIBRS_LFENCE: + retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; + break; + default: + if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) + pr_err(RETBLEED_INTEL_MSG); } + /* If nothing has set the mitigation yet, default to NONE. 
*/ + if (retbleed_mitigation == RETBLEED_MITIGATION_AUTO) + retbleed_mitigation = RETBLEED_MITIGATION_NONE; + } +out: + pr_info("%s\n", retbleed_strings[retbleed_mitigation]); +} - /* - * The Intel mitigation (IBRS or eIBRS) was already selected in - * spectre_v2_select_mitigation(). 'retbleed_mitigation' will - * be set accordingly below. - */ - break; - } +static void __init retbleed_apply_mitigation(void) +{ + bool mitigate_smt = false; switch (retbleed_mitigation) { + case RETBLEED_MITIGATION_NONE: + return; + case RETBLEED_MITIGATION_UNRET: setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_UNRET); - x86_return_thunk = retbleed_return_thunk; + set_return_thunk(retbleed_return_thunk); if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) @@ -1153,7 +1350,7 @@ do_cmd_auto: setup_force_cpu_cap(X86_FEATURE_RETHUNK); setup_force_cpu_cap(X86_FEATURE_CALL_DEPTH); - x86_return_thunk = call_depth_return_thunk; + set_return_thunk(call_depth_return_thunk); break; default: @@ -1163,28 +1360,131 @@ do_cmd_auto: if (mitigate_smt && !boot_cpu_has(X86_FEATURE_STIBP) && (retbleed_nosmt || cpu_mitigations_auto_nosmt())) cpu_smt_disable(false); +} + +#undef pr_fmt +#define pr_fmt(fmt) "ITS: " fmt + +static const char * const its_strings[] = { + [ITS_MITIGATION_OFF] = "Vulnerable", + [ITS_MITIGATION_VMEXIT_ONLY] = "Mitigation: Vulnerable, KVM: Not affected", + [ITS_MITIGATION_ALIGNED_THUNKS] = "Mitigation: Aligned branch/return thunks", + [ITS_MITIGATION_RETPOLINE_STUFF] = "Mitigation: Retpolines, Stuffing RSB", +}; + +static int __init its_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!IS_ENABLED(CONFIG_MITIGATION_ITS)) { + pr_err("Mitigation disabled at compile time, ignoring option (%s)", str); + return 0; + } + + if (!strcmp(str, "off")) { + its_mitigation = ITS_MITIGATION_OFF; + } else if (!strcmp(str, "on")) { + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; + } else if (!strcmp(str, 
"force")) { + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; + setup_force_cpu_bug(X86_BUG_ITS); + } else if (!strcmp(str, "vmexit")) { + its_mitigation = ITS_MITIGATION_VMEXIT_ONLY; + } else if (!strcmp(str, "stuff")) { + its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF; + } else { + pr_err("Ignoring unknown indirect_target_selection option (%s).", str); + } + + return 0; +} +early_param("indirect_target_selection", its_parse_cmdline); + +static void __init its_select_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_ITS) || cpu_mitigations_off()) { + its_mitigation = ITS_MITIGATION_OFF; + return; + } + + if (its_mitigation == ITS_MITIGATION_AUTO) + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; + + if (its_mitigation == ITS_MITIGATION_OFF) + return; + + if (!IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) || + !IS_ENABLED(CONFIG_MITIGATION_RETHUNK)) { + pr_err("WARNING: ITS mitigation depends on retpoline and rethunk support\n"); + its_mitigation = ITS_MITIGATION_OFF; + return; + } + + if (IS_ENABLED(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)) { + pr_err("WARNING: ITS mitigation is not compatible with CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B\n"); + its_mitigation = ITS_MITIGATION_OFF; + return; + } + + if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF && + !IS_ENABLED(CONFIG_MITIGATION_CALL_DEPTH_TRACKING)) { + pr_err("RSB stuff mitigation not supported, using default\n"); + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; + } + + if (its_mitigation == ITS_MITIGATION_VMEXIT_ONLY && + !boot_cpu_has_bug(X86_BUG_ITS_NATIVE_ONLY)) + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; +} + +static void __init its_update_mitigation(void) +{ + if (!boot_cpu_has_bug(X86_BUG_ITS) || cpu_mitigations_off()) + return; + + switch (spectre_v2_enabled) { + case SPECTRE_V2_NONE: + pr_err("WARNING: Spectre-v2 mitigation is off, disabling ITS\n"); + its_mitigation = ITS_MITIGATION_OFF; + break; + case SPECTRE_V2_RETPOLINE: + /* Retpoline+CDT mitigates ITS */ + if (retbleed_mitigation == 
RETBLEED_MITIGATION_STUFF) + its_mitigation = ITS_MITIGATION_RETPOLINE_STUFF; + break; + case SPECTRE_V2_LFENCE: + case SPECTRE_V2_EIBRS_LFENCE: + pr_err("WARNING: ITS mitigation is not compatible with lfence mitigation\n"); + its_mitigation = ITS_MITIGATION_OFF; + break; + default: + break; + } /* - * Let IBRS trump all on Intel without affecting the effects of the - * retbleed= cmdline option except for call depth based stuffing + * retbleed_update_mitigation() will try to do stuffing if its=stuff. + * If it can't, such as if spectre_v2!=retpoline, then fall back to + * aligned thunks. */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { - switch (spectre_v2_enabled) { - case SPECTRE_V2_IBRS: - retbleed_mitigation = RETBLEED_MITIGATION_IBRS; - break; - case SPECTRE_V2_EIBRS: - case SPECTRE_V2_EIBRS_RETPOLINE: - case SPECTRE_V2_EIBRS_LFENCE: - retbleed_mitigation = RETBLEED_MITIGATION_EIBRS; - break; - default: - if (retbleed_mitigation != RETBLEED_MITIGATION_STUFF) - pr_err(RETBLEED_INTEL_MSG); - } - } + if (its_mitigation == ITS_MITIGATION_RETPOLINE_STUFF && + retbleed_mitigation != RETBLEED_MITIGATION_STUFF) + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; - pr_info("%s\n", retbleed_strings[retbleed_mitigation]); + pr_info("%s\n", its_strings[its_mitigation]); +} + +static void __init its_apply_mitigation(void) +{ + /* its=stuff forces retbleed stuffing and is enabled there. 
*/ + if (its_mitigation != ITS_MITIGATION_ALIGNED_THUNKS) + return; + + if (!boot_cpu_has(X86_FEATURE_RETPOLINE)) + setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS); + + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + set_return_thunk(its_return_thunk); } #undef pr_fmt @@ -1264,6 +1564,8 @@ enum spectre_v2_mitigation_cmd { SPECTRE_V2_CMD_IBRS, }; +static enum spectre_v2_mitigation_cmd spectre_v2_cmd __ro_after_init = SPECTRE_V2_CMD_AUTO; + enum spectre_v2_user_cmd { SPECTRE_V2_USER_CMD_NONE, SPECTRE_V2_USER_CMD_AUTO, @@ -1302,31 +1604,18 @@ static void __init spec_v2_user_print_cond(const char *reason, bool secure) pr_info("spectre_v2_user=%s forced on command line.\n", reason); } -static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd; - -static enum spectre_v2_user_cmd __init -spectre_v2_parse_user_cmdline(void) +static enum spectre_v2_user_cmd __init spectre_v2_parse_user_cmdline(void) { - enum spectre_v2_user_cmd mode; char arg[20]; int ret, i; - mode = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? - SPECTRE_V2_USER_CMD_AUTO : SPECTRE_V2_USER_CMD_NONE; - - switch (spectre_v2_cmd) { - case SPECTRE_V2_CMD_NONE: + if (cpu_mitigations_off() || !IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2)) return SPECTRE_V2_USER_CMD_NONE; - case SPECTRE_V2_CMD_FORCE: - return SPECTRE_V2_USER_CMD_FORCE; - default: - break; - } ret = cmdline_find_option(boot_command_line, "spectre_v2_user", arg, sizeof(arg)); if (ret < 0) - return mode; + return SPECTRE_V2_USER_CMD_AUTO; for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) { if (match_option(arg, ret, v2_user_options[i].option)) { @@ -1337,7 +1626,7 @@ spectre_v2_parse_user_cmdline(void) } pr_err("Unknown user space protection option (%s). 
Switching to default\n", arg); - return mode; + return SPECTRE_V2_USER_CMD_AUTO; } static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) @@ -1345,60 +1634,72 @@ static inline bool spectre_v2_in_ibrs_mode(enum spectre_v2_mitigation mode) return spectre_v2_in_eibrs_mode(mode) || mode == SPECTRE_V2_IBRS; } -static void __init -spectre_v2_user_select_mitigation(void) +static void __init spectre_v2_user_select_mitigation(void) { - enum spectre_v2_user_mitigation mode = SPECTRE_V2_USER_NONE; - enum spectre_v2_user_cmd cmd; - if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) return; - cmd = spectre_v2_parse_user_cmdline(); - switch (cmd) { + switch (spectre_v2_parse_user_cmdline()) { case SPECTRE_V2_USER_CMD_NONE: - goto set_mode; + return; case SPECTRE_V2_USER_CMD_FORCE: - mode = SPECTRE_V2_USER_STRICT; + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; + spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT; break; case SPECTRE_V2_USER_CMD_AUTO: case SPECTRE_V2_USER_CMD_PRCTL: + spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL; + spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL; + break; case SPECTRE_V2_USER_CMD_PRCTL_IBPB: - mode = SPECTRE_V2_USER_PRCTL; + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; + spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL; break; case SPECTRE_V2_USER_CMD_SECCOMP: + if (IS_ENABLED(CONFIG_SECCOMP)) + spectre_v2_user_ibpb = SPECTRE_V2_USER_SECCOMP; + else + spectre_v2_user_ibpb = SPECTRE_V2_USER_PRCTL; + spectre_v2_user_stibp = spectre_v2_user_ibpb; + break; case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; if (IS_ENABLED(CONFIG_SECCOMP)) - mode = SPECTRE_V2_USER_SECCOMP; + spectre_v2_user_stibp = SPECTRE_V2_USER_SECCOMP; else - mode = SPECTRE_V2_USER_PRCTL; + spectre_v2_user_stibp = SPECTRE_V2_USER_PRCTL; break; } - /* Initialize Indirect Branch Prediction Barrier */ - if (boot_cpu_has(X86_FEATURE_IBPB)) { - static_branch_enable(&switch_vcpu_ibpb); + /* + * At this point, an 
STIBP mode other than "off" has been set. + * If STIBP support is not being forced, check if STIBP always-on + * is preferred. + */ + if ((spectre_v2_user_stibp == SPECTRE_V2_USER_PRCTL || + spectre_v2_user_stibp == SPECTRE_V2_USER_SECCOMP) && + boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) + spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT_PREFERRED; - spectre_v2_user_ibpb = mode; - switch (cmd) { - case SPECTRE_V2_USER_CMD_NONE: - break; - case SPECTRE_V2_USER_CMD_FORCE: - case SPECTRE_V2_USER_CMD_PRCTL_IBPB: - case SPECTRE_V2_USER_CMD_SECCOMP_IBPB: - static_branch_enable(&switch_mm_always_ibpb); - spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; - break; - case SPECTRE_V2_USER_CMD_PRCTL: - case SPECTRE_V2_USER_CMD_AUTO: - case SPECTRE_V2_USER_CMD_SECCOMP: - static_branch_enable(&switch_mm_cond_ibpb); - break; - } + if (!boot_cpu_has(X86_FEATURE_IBPB)) + spectre_v2_user_ibpb = SPECTRE_V2_USER_NONE; - pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", - static_key_enabled(&switch_mm_always_ibpb) ? 
- "always-on" : "conditional"); + if (!boot_cpu_has(X86_FEATURE_STIBP)) + spectre_v2_user_stibp = SPECTRE_V2_USER_NONE; +} + +static void __init spectre_v2_user_update_mitigation(void) +{ + if (!boot_cpu_has(X86_FEATURE_IBPB) && !boot_cpu_has(X86_FEATURE_STIBP)) + return; + + /* The spectre_v2 cmd line can override spectre_v2_user options */ + if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE) { + spectre_v2_user_ibpb = SPECTRE_V2_USER_NONE; + spectre_v2_user_stibp = SPECTRE_V2_USER_NONE; + } else if (spectre_v2_cmd == SPECTRE_V2_CMD_FORCE) { + spectre_v2_user_ibpb = SPECTRE_V2_USER_STRICT; + spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT; } /* @@ -1416,30 +1717,44 @@ spectre_v2_user_select_mitigation(void) if (!boot_cpu_has(X86_FEATURE_STIBP) || !cpu_smt_possible() || (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && - !boot_cpu_has(X86_FEATURE_AUTOIBRS))) + !boot_cpu_has(X86_FEATURE_AUTOIBRS))) { + spectre_v2_user_stibp = SPECTRE_V2_USER_NONE; return; + } - /* - * At this point, an STIBP mode other than "off" has been set. - * If STIBP support is not being forced, check if STIBP always-on - * is preferred. 
- */ - if (mode != SPECTRE_V2_USER_STRICT && - boot_cpu_has(X86_FEATURE_AMD_STIBP_ALWAYS_ON)) - mode = SPECTRE_V2_USER_STRICT_PREFERRED; - - if (retbleed_mitigation == RETBLEED_MITIGATION_UNRET || - retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { - if (mode != SPECTRE_V2_USER_STRICT && - mode != SPECTRE_V2_USER_STRICT_PREFERRED) + if (spectre_v2_user_stibp != SPECTRE_V2_USER_NONE && + (retbleed_mitigation == RETBLEED_MITIGATION_UNRET || + retbleed_mitigation == RETBLEED_MITIGATION_IBPB)) { + if (spectre_v2_user_stibp != SPECTRE_V2_USER_STRICT && + spectre_v2_user_stibp != SPECTRE_V2_USER_STRICT_PREFERRED) pr_info("Selecting STIBP always-on mode to complement retbleed mitigation\n"); - mode = SPECTRE_V2_USER_STRICT_PREFERRED; + spectre_v2_user_stibp = SPECTRE_V2_USER_STRICT_PREFERRED; } + pr_info("%s\n", spectre_v2_user_strings[spectre_v2_user_stibp]); +} - spectre_v2_user_stibp = mode; +static void __init spectre_v2_user_apply_mitigation(void) +{ + /* Initialize Indirect Branch Prediction Barrier */ + if (spectre_v2_user_ibpb != SPECTRE_V2_USER_NONE) { + static_branch_enable(&switch_vcpu_ibpb); + + switch (spectre_v2_user_ibpb) { + case SPECTRE_V2_USER_STRICT: + static_branch_enable(&switch_mm_always_ibpb); + break; + case SPECTRE_V2_USER_PRCTL: + case SPECTRE_V2_USER_SECCOMP: + static_branch_enable(&switch_mm_cond_ibpb); + break; + default: + break; + } -set_mode: - pr_info("%s\n", spectre_v2_user_strings[mode]); + pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n", + static_key_enabled(&switch_mm_always_ibpb) ? + "always-on" : "conditional"); + } } static const char * const spectre_v2_strings[] = { @@ -1659,12 +1974,13 @@ static bool __init spec_ctrl_bhi_dis(void) enum bhi_mitigations { BHI_MITIGATION_OFF, + BHI_MITIGATION_AUTO, BHI_MITIGATION_ON, BHI_MITIGATION_VMEXIT_ONLY, }; static enum bhi_mitigations bhi_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? 
BHI_MITIGATION_ON : BHI_MITIGATION_OFF; + IS_ENABLED(CONFIG_MITIGATION_SPECTRE_BHI) ? BHI_MITIGATION_AUTO : BHI_MITIGATION_OFF; static int __init spectre_bhi_parse_cmdline(char *str) { @@ -1686,6 +2002,25 @@ early_param("spectre_bhi", spectre_bhi_parse_cmdline); static void __init bhi_select_mitigation(void) { + if (!boot_cpu_has(X86_BUG_BHI) || cpu_mitigations_off()) + bhi_mitigation = BHI_MITIGATION_OFF; + + if (bhi_mitigation == BHI_MITIGATION_AUTO) + bhi_mitigation = BHI_MITIGATION_ON; +} + +static void __init bhi_update_mitigation(void) +{ + if (spectre_v2_cmd == SPECTRE_V2_CMD_NONE) + bhi_mitigation = BHI_MITIGATION_OFF; + + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && + spectre_v2_cmd == SPECTRE_V2_CMD_AUTO) + bhi_mitigation = BHI_MITIGATION_OFF; +} + +static void __init bhi_apply_mitigation(void) +{ if (bhi_mitigation == BHI_MITIGATION_OFF) return; @@ -1697,95 +2032,101 @@ static void __init bhi_select_mitigation(void) return; } - /* Mitigate in hardware if supported */ - if (spec_ctrl_bhi_dis()) + if (!IS_ENABLED(CONFIG_X86_64)) return; - if (!IS_ENABLED(CONFIG_X86_64)) + /* Mitigate in hardware if supported */ + if (spec_ctrl_bhi_dis()) return; if (bhi_mitigation == BHI_MITIGATION_VMEXIT_ONLY) { pr_info("Spectre BHI mitigation: SW BHB clearing on VM exit only\n"); - setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT); + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_VMEXIT); return; } pr_info("Spectre BHI mitigation: SW BHB clearing on syscall and VM exit\n"); setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP); - setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT); + setup_force_cpu_cap(X86_FEATURE_CLEAR_BHB_VMEXIT); } static void __init spectre_v2_select_mitigation(void) { - enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); - enum spectre_v2_mitigation mode = SPECTRE_V2_NONE; + spectre_v2_cmd = spectre_v2_parse_cmdline(); - /* - * If the CPU is not affected and the command line mode is NONE or AUTO - * then nothing to do. 
- */ if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && - (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO)) + (spectre_v2_cmd == SPECTRE_V2_CMD_NONE || spectre_v2_cmd == SPECTRE_V2_CMD_AUTO)) return; - switch (cmd) { + switch (spectre_v2_cmd) { case SPECTRE_V2_CMD_NONE: return; case SPECTRE_V2_CMD_FORCE: case SPECTRE_V2_CMD_AUTO: if (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED)) { - mode = SPECTRE_V2_EIBRS; - break; - } - - if (IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY) && - boot_cpu_has_bug(X86_BUG_RETBLEED) && - retbleed_cmd != RETBLEED_CMD_OFF && - retbleed_cmd != RETBLEED_CMD_STUFF && - boot_cpu_has(X86_FEATURE_IBRS) && - boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { - mode = SPECTRE_V2_IBRS; + spectre_v2_enabled = SPECTRE_V2_EIBRS; break; } - mode = spectre_v2_select_retpoline(); + spectre_v2_enabled = spectre_v2_select_retpoline(); break; case SPECTRE_V2_CMD_RETPOLINE_LFENCE: pr_err(SPECTRE_V2_LFENCE_MSG); - mode = SPECTRE_V2_LFENCE; + spectre_v2_enabled = SPECTRE_V2_LFENCE; break; case SPECTRE_V2_CMD_RETPOLINE_GENERIC: - mode = SPECTRE_V2_RETPOLINE; + spectre_v2_enabled = SPECTRE_V2_RETPOLINE; break; case SPECTRE_V2_CMD_RETPOLINE: - mode = spectre_v2_select_retpoline(); + spectre_v2_enabled = spectre_v2_select_retpoline(); break; case SPECTRE_V2_CMD_IBRS: - mode = SPECTRE_V2_IBRS; + spectre_v2_enabled = SPECTRE_V2_IBRS; break; case SPECTRE_V2_CMD_EIBRS: - mode = SPECTRE_V2_EIBRS; + spectre_v2_enabled = SPECTRE_V2_EIBRS; break; case SPECTRE_V2_CMD_EIBRS_LFENCE: - mode = SPECTRE_V2_EIBRS_LFENCE; + spectre_v2_enabled = SPECTRE_V2_EIBRS_LFENCE; break; case SPECTRE_V2_CMD_EIBRS_RETPOLINE: - mode = SPECTRE_V2_EIBRS_RETPOLINE; + spectre_v2_enabled = SPECTRE_V2_EIBRS_RETPOLINE; break; } +} - if (mode == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) +static void __init spectre_v2_update_mitigation(void) +{ + if (spectre_v2_cmd == SPECTRE_V2_CMD_AUTO && + !spectre_v2_in_eibrs_mode(spectre_v2_enabled)) { + if (IS_ENABLED(CONFIG_MITIGATION_IBRS_ENTRY) && + 
boot_cpu_has_bug(X86_BUG_RETBLEED) && + retbleed_mitigation != RETBLEED_MITIGATION_NONE && + retbleed_mitigation != RETBLEED_MITIGATION_STUFF && + boot_cpu_has(X86_FEATURE_IBRS) && + boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + spectre_v2_enabled = SPECTRE_V2_IBRS; + } + } + + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && !cpu_mitigations_off()) + pr_info("%s\n", spectre_v2_strings[spectre_v2_enabled]); +} + +static void __init spectre_v2_apply_mitigation(void) +{ + if (spectre_v2_enabled == SPECTRE_V2_EIBRS && unprivileged_ebpf_enabled()) pr_err(SPECTRE_V2_EIBRS_EBPF_MSG); - if (spectre_v2_in_ibrs_mode(mode)) { + if (spectre_v2_in_ibrs_mode(spectre_v2_enabled)) { if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) { msr_set_bit(MSR_EFER, _EFER_AUTOIBRS); } else { @@ -1794,8 +2135,10 @@ static void __init spectre_v2_select_mitigation(void) } } - switch (mode) { + switch (spectre_v2_enabled) { case SPECTRE_V2_NONE: + return; + case SPECTRE_V2_EIBRS: break; @@ -1821,18 +2164,12 @@ static void __init spectre_v2_select_mitigation(void) * JMPs gets protection against BHI and Intramode-BTI, but RET * prediction from a non-RSB predictor is still a risk. */ - if (mode == SPECTRE_V2_EIBRS_LFENCE || - mode == SPECTRE_V2_EIBRS_RETPOLINE || - mode == SPECTRE_V2_RETPOLINE) + if (spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE || + spectre_v2_enabled == SPECTRE_V2_EIBRS_RETPOLINE || + spectre_v2_enabled == SPECTRE_V2_RETPOLINE) spec_ctrl_disable_kernel_rrsba(); - if (boot_cpu_has(X86_BUG_BHI)) - bhi_select_mitigation(); - - spectre_v2_enabled = mode; - pr_info("%s\n", spectre_v2_strings[mode]); - - spectre_v2_select_rsb_mitigation(mode); + spectre_v2_select_rsb_mitigation(spectre_v2_enabled); /* * Retpoline protects the kernel, but doesn't protect firmware. IBRS @@ -1840,28 +2177,26 @@ static void __init spectre_v2_select_mitigation(void) * firmware calls only when IBRS / Enhanced / Automatic IBRS aren't * otherwise enabled. 
* - * Use "mode" to check Enhanced IBRS instead of boot_cpu_has(), because - * the user might select retpoline on the kernel command line and if - * the CPU supports Enhanced IBRS, kernel might un-intentionally not - * enable IBRS around firmware calls. + * Use "spectre_v2_enabled" to check Enhanced IBRS instead of + * boot_cpu_has(), because the user might select retpoline on the kernel + * command line and if the CPU supports Enhanced IBRS, kernel might + * un-intentionally not enable IBRS around firmware calls. */ if (boot_cpu_has_bug(X86_BUG_RETBLEED) && boot_cpu_has(X86_FEATURE_IBPB) && (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON)) { - if (retbleed_cmd != RETBLEED_CMD_IBPB) { + if (retbleed_mitigation != RETBLEED_MITIGATION_IBPB) { setup_force_cpu_cap(X86_FEATURE_USE_IBPB_FW); pr_info("Enabling Speculation Barrier for firmware calls\n"); } - } else if (boot_cpu_has(X86_FEATURE_IBRS) && !spectre_v2_in_ibrs_mode(mode)) { + } else if (boot_cpu_has(X86_FEATURE_IBRS) && + !spectre_v2_in_ibrs_mode(spectre_v2_enabled)) { setup_force_cpu_cap(X86_FEATURE_USE_IBRS_FW); pr_info("Enabling Restricted Speculation for firmware calls\n"); } - - /* Set up IBPB and STIBP depending on the general spectre V2 command */ - spectre_v2_cmd = cmd; } static void update_stibp_msr(void * __unused) @@ -2050,19 +2385,18 @@ static enum ssb_mitigation_cmd __init ssb_parse_cmdline(void) return cmd; } -static enum ssb_mitigation __init __ssb_select_mitigation(void) +static void __init ssb_select_mitigation(void) { - enum ssb_mitigation mode = SPEC_STORE_BYPASS_NONE; enum ssb_mitigation_cmd cmd; if (!boot_cpu_has(X86_FEATURE_SSBD)) - return mode; + goto out; cmd = ssb_parse_cmdline(); if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS) && (cmd == SPEC_STORE_BYPASS_CMD_NONE || cmd == SPEC_STORE_BYPASS_CMD_AUTO)) - return mode; + return; switch (cmd) { case SPEC_STORE_BYPASS_CMD_SECCOMP: @@ -2071,28 +2405,35 @@ static enum ssb_mitigation __init 
__ssb_select_mitigation(void) * enabled. */ if (IS_ENABLED(CONFIG_SECCOMP)) - mode = SPEC_STORE_BYPASS_SECCOMP; + ssb_mode = SPEC_STORE_BYPASS_SECCOMP; else - mode = SPEC_STORE_BYPASS_PRCTL; + ssb_mode = SPEC_STORE_BYPASS_PRCTL; break; case SPEC_STORE_BYPASS_CMD_ON: - mode = SPEC_STORE_BYPASS_DISABLE; + ssb_mode = SPEC_STORE_BYPASS_DISABLE; break; case SPEC_STORE_BYPASS_CMD_AUTO: case SPEC_STORE_BYPASS_CMD_PRCTL: - mode = SPEC_STORE_BYPASS_PRCTL; + ssb_mode = SPEC_STORE_BYPASS_PRCTL; break; case SPEC_STORE_BYPASS_CMD_NONE: break; } +out: + if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) + pr_info("%s\n", ssb_strings[ssb_mode]); +} + +static void __init ssb_apply_mitigation(void) +{ /* * We have three CPU feature flags that are in play here: * - X86_BUG_SPEC_STORE_BYPASS - CPU is susceptible. * - X86_FEATURE_SSBD - CPU is able to turn off speculative store bypass * - X86_FEATURE_SPEC_STORE_BYPASS_DISABLE - engage the mitigation */ - if (mode == SPEC_STORE_BYPASS_DISABLE) { + if (ssb_mode == SPEC_STORE_BYPASS_DISABLE) { setup_force_cpu_cap(X86_FEATURE_SPEC_STORE_BYPASS_DISABLE); /* * Intel uses the SPEC CTRL MSR Bit(2) for this, while AMD may @@ -2106,16 +2447,6 @@ static enum ssb_mitigation __init __ssb_select_mitigation(void) update_spec_ctrl(x86_spec_ctrl_base); } } - - return mode; -} - -static void ssb_select_mitigation(void) -{ - ssb_mode = __ssb_select_mitigation(); - - if (boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) - pr_info("%s\n", ssb_strings[ssb_mode]); } #undef pr_fmt @@ -2371,7 +2702,7 @@ EXPORT_SYMBOL_GPL(itlb_multihit_kvm_mitigation); /* Default mitigation for L1TF-affected CPUs */ enum l1tf_mitigations l1tf_mitigation __ro_after_init = - IS_ENABLED(CONFIG_MITIGATION_L1TF) ? L1TF_MITIGATION_FLUSH : L1TF_MITIGATION_OFF; + IS_ENABLED(CONFIG_MITIGATION_L1TF) ? 
L1TF_MITIGATION_AUTO : L1TF_MITIGATION_OFF; #if IS_ENABLED(CONFIG_KVM_INTEL) EXPORT_SYMBOL_GPL(l1tf_mitigation); #endif @@ -2419,22 +2750,33 @@ static void override_cache_bits(struct cpuinfo_x86 *c) static void __init l1tf_select_mitigation(void) { + if (!boot_cpu_has_bug(X86_BUG_L1TF) || cpu_mitigations_off()) { + l1tf_mitigation = L1TF_MITIGATION_OFF; + return; + } + + if (l1tf_mitigation == L1TF_MITIGATION_AUTO) { + if (cpu_mitigations_auto_nosmt()) + l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; + else + l1tf_mitigation = L1TF_MITIGATION_FLUSH; + } +} + +static void __init l1tf_apply_mitigation(void) +{ u64 half_pa; if (!boot_cpu_has_bug(X86_BUG_L1TF)) return; - if (cpu_mitigations_off()) - l1tf_mitigation = L1TF_MITIGATION_OFF; - else if (cpu_mitigations_auto_nosmt()) - l1tf_mitigation = L1TF_MITIGATION_FLUSH_NOSMT; - override_cache_bits(&boot_cpu_data); switch (l1tf_mitigation) { case L1TF_MITIGATION_OFF: case L1TF_MITIGATION_FLUSH_NOWARN: case L1TF_MITIGATION_FLUSH: + case L1TF_MITIGATION_AUTO: break; case L1TF_MITIGATION_FLUSH_NOSMT: case L1TF_MITIGATION_FULL: @@ -2494,6 +2836,7 @@ early_param("l1tf", l1tf_cmdline); enum srso_mitigation { SRSO_MITIGATION_NONE, + SRSO_MITIGATION_AUTO, SRSO_MITIGATION_UCODE_NEEDED, SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED, SRSO_MITIGATION_MICROCODE, @@ -2503,14 +2846,6 @@ enum srso_mitigation { SRSO_MITIGATION_BP_SPEC_REDUCE, }; -enum srso_mitigation_cmd { - SRSO_CMD_OFF, - SRSO_CMD_MICROCODE, - SRSO_CMD_SAFE_RET, - SRSO_CMD_IBPB, - SRSO_CMD_IBPB_ON_VMEXIT, -}; - static const char * const srso_strings[] = { [SRSO_MITIGATION_NONE] = "Vulnerable", [SRSO_MITIGATION_UCODE_NEEDED] = "Vulnerable: No microcode", @@ -2522,8 +2857,7 @@ static const char * const srso_strings[] = { [SRSO_MITIGATION_BP_SPEC_REDUCE] = "Mitigation: Reduced Speculation" }; -static enum srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_NONE; -static enum srso_mitigation_cmd srso_cmd __ro_after_init = SRSO_CMD_SAFE_RET; +static enum 
srso_mitigation srso_mitigation __ro_after_init = SRSO_MITIGATION_AUTO; static int __init srso_parse_cmdline(char *str) { @@ -2531,15 +2865,15 @@ static int __init srso_parse_cmdline(char *str) return -EINVAL; if (!strcmp(str, "off")) - srso_cmd = SRSO_CMD_OFF; + srso_mitigation = SRSO_MITIGATION_NONE; else if (!strcmp(str, "microcode")) - srso_cmd = SRSO_CMD_MICROCODE; + srso_mitigation = SRSO_MITIGATION_MICROCODE; else if (!strcmp(str, "safe-ret")) - srso_cmd = SRSO_CMD_SAFE_RET; + srso_mitigation = SRSO_MITIGATION_SAFE_RET; else if (!strcmp(str, "ibpb")) - srso_cmd = SRSO_CMD_IBPB; + srso_mitigation = SRSO_MITIGATION_IBPB; else if (!strcmp(str, "ibpb-vmexit")) - srso_cmd = SRSO_CMD_IBPB_ON_VMEXIT; + srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; else pr_err("Ignoring unknown SRSO option (%s).", str); @@ -2551,132 +2885,85 @@ early_param("spec_rstack_overflow", srso_parse_cmdline); static void __init srso_select_mitigation(void) { - bool has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE); + bool has_microcode; - if (!boot_cpu_has_bug(X86_BUG_SRSO) || - cpu_mitigations_off() || - srso_cmd == SRSO_CMD_OFF) { - if (boot_cpu_has(X86_FEATURE_SBPB)) - x86_pred_cmd = PRED_CMD_SBPB; - goto out; - } + if (!boot_cpu_has_bug(X86_BUG_SRSO) || cpu_mitigations_off()) + srso_mitigation = SRSO_MITIGATION_NONE; + + if (srso_mitigation == SRSO_MITIGATION_NONE) + return; + + if (srso_mitigation == SRSO_MITIGATION_AUTO) + srso_mitigation = SRSO_MITIGATION_SAFE_RET; + has_microcode = boot_cpu_has(X86_FEATURE_IBPB_BRTYPE); if (has_microcode) { /* * Zen1/2 with SMT off aren't vulnerable after the right * IBPB microcode has been applied. - * - * Zen1/2 don't have SBPB, no need to try to enable it here. 
*/ if (boot_cpu_data.x86 < 0x19 && !cpu_smt_possible()) { setup_force_cpu_cap(X86_FEATURE_SRSO_NO); - goto out; - } - - if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB) { - srso_mitigation = SRSO_MITIGATION_IBPB; - goto out; + srso_mitigation = SRSO_MITIGATION_NONE; + return; } } else { pr_warn("IBPB-extending microcode not applied!\n"); pr_warn(SRSO_NOTICE); - - /* may be overwritten by SRSO_CMD_SAFE_RET below */ - srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED; } - switch (srso_cmd) { - case SRSO_CMD_MICROCODE: - if (has_microcode) { - srso_mitigation = SRSO_MITIGATION_MICROCODE; - pr_warn(SRSO_NOTICE); - } - break; - - case SRSO_CMD_SAFE_RET: - if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO)) + switch (srso_mitigation) { + case SRSO_MITIGATION_SAFE_RET: + if (boot_cpu_has(X86_FEATURE_SRSO_USER_KERNEL_NO)) { + srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; goto ibpb_on_vmexit; + } - if (IS_ENABLED(CONFIG_MITIGATION_SRSO)) { - /* - * Enable the return thunk for generated code - * like ftrace, static_call, etc. 
- */ - setup_force_cpu_cap(X86_FEATURE_RETHUNK); - setup_force_cpu_cap(X86_FEATURE_UNRET); - - if (boot_cpu_data.x86 == 0x19) { - setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS); - x86_return_thunk = srso_alias_return_thunk; - } else { - setup_force_cpu_cap(X86_FEATURE_SRSO); - x86_return_thunk = srso_return_thunk; - } - if (has_microcode) - srso_mitigation = SRSO_MITIGATION_SAFE_RET; - else - srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED; - } else { + if (!IS_ENABLED(CONFIG_MITIGATION_SRSO)) { pr_err("WARNING: kernel not compiled with MITIGATION_SRSO.\n"); + srso_mitigation = SRSO_MITIGATION_NONE; } - break; - case SRSO_CMD_IBPB: - if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { - if (has_microcode) { - setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); - setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); - srso_mitigation = SRSO_MITIGATION_IBPB; - - /* - * IBPB on entry already obviates the need for - * software-based untraining so clear those in case some - * other mitigation like Retbleed has selected them. - */ - setup_clear_cpu_cap(X86_FEATURE_UNRET); - setup_clear_cpu_cap(X86_FEATURE_RETHUNK); - - /* - * There is no need for RSB filling: write_ibpb() ensures - * all predictions, including the RSB, are invalidated, - * regardless of IBPB implementation. 
- */ - setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); - } - } else { - pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); - } + if (!has_microcode) + srso_mitigation = SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED; break; - ibpb_on_vmexit: - case SRSO_CMD_IBPB_ON_VMEXIT: + case SRSO_MITIGATION_IBPB_ON_VMEXIT: if (boot_cpu_has(X86_FEATURE_SRSO_BP_SPEC_REDUCE)) { pr_notice("Reducing speculation to address VM/HV SRSO attack vector.\n"); srso_mitigation = SRSO_MITIGATION_BP_SPEC_REDUCE; break; } - - if (IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { - if (has_microcode) { - setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); - srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT; - - /* - * There is no need for RSB filling: write_ibpb() ensures - * all predictions, including the RSB, are invalidated, - * regardless of IBPB implementation. - */ - setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); - } - } else { + fallthrough; + case SRSO_MITIGATION_IBPB: + if (!IS_ENABLED(CONFIG_MITIGATION_IBPB_ENTRY)) { pr_err("WARNING: kernel not compiled with MITIGATION_IBPB_ENTRY.\n"); + srso_mitigation = SRSO_MITIGATION_NONE; } + + if (!has_microcode) + srso_mitigation = SRSO_MITIGATION_UCODE_NEEDED; break; default: break; } +} -out: +static void __init srso_update_mitigation(void) +{ + /* If retbleed is using IBPB, that works for SRSO as well */ + if (retbleed_mitigation == RETBLEED_MITIGATION_IBPB && + boot_cpu_has(X86_FEATURE_IBPB_BRTYPE)) + srso_mitigation = SRSO_MITIGATION_IBPB; + + if (boot_cpu_has_bug(X86_BUG_SRSO) && + !cpu_mitigations_off() && + !boot_cpu_has(X86_FEATURE_SRSO_NO)) + pr_info("%s\n", srso_strings[srso_mitigation]); +} + +static void __init srso_apply_mitigation(void) +{ /* * Clear the feature flag if this mitigation is not selected as that * feature flag controls the BpSpecReduce MSR bit toggling in KVM. 
@@ -2684,8 +2971,52 @@ out: if (srso_mitigation != SRSO_MITIGATION_BP_SPEC_REDUCE) setup_clear_cpu_cap(X86_FEATURE_SRSO_BP_SPEC_REDUCE); - if (srso_mitigation != SRSO_MITIGATION_NONE) - pr_info("%s\n", srso_strings[srso_mitigation]); + if (srso_mitigation == SRSO_MITIGATION_NONE) { + if (boot_cpu_has(X86_FEATURE_SBPB)) + x86_pred_cmd = PRED_CMD_SBPB; + return; + } + + switch (srso_mitigation) { + case SRSO_MITIGATION_SAFE_RET: + case SRSO_MITIGATION_SAFE_RET_UCODE_NEEDED: + /* + * Enable the return thunk for generated code + * like ftrace, static_call, etc. + */ + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + setup_force_cpu_cap(X86_FEATURE_UNRET); + + if (boot_cpu_data.x86 == 0x19) { + setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS); + set_return_thunk(srso_alias_return_thunk); + } else { + setup_force_cpu_cap(X86_FEATURE_SRSO); + set_return_thunk(srso_return_thunk); + } + break; + case SRSO_MITIGATION_IBPB: + setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB); + /* + * IBPB on entry already obviates the need for + * software-based untraining so clear those in case some + * other mitigation like Retbleed has selected them. + */ + setup_clear_cpu_cap(X86_FEATURE_UNRET); + setup_clear_cpu_cap(X86_FEATURE_RETHUNK); + fallthrough; + case SRSO_MITIGATION_IBPB_ON_VMEXIT: + setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT); + /* + * There is no need for RSB filling: entry_ibpb() ensures + * all predictions, including the RSB, are invalidated, + * regardless of IBPB implementation. 
+ */ + setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT); + break; + default: + break; + } } #undef pr_fmt @@ -2780,9 +3111,6 @@ static ssize_t tsx_async_abort_show_state(char *buf) static ssize_t mmio_stale_data_show_state(char *buf) { - if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) - return sysfs_emit(buf, "Unknown: No mitigations\n"); - if (mmio_mitigation == MMIO_MITIGATION_OFF) return sysfs_emit(buf, "%s\n", mmio_strings[mmio_mitigation]); @@ -2800,6 +3128,19 @@ static ssize_t rfds_show_state(char *buf) return sysfs_emit(buf, "%s\n", rfds_strings[rfds_mitigation]); } +static ssize_t old_microcode_show_state(char *buf) +{ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return sysfs_emit(buf, "Unknown: running under hypervisor"); + + return sysfs_emit(buf, "Vulnerable\n"); +} + +static ssize_t its_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", its_strings[its_mitigation]); +} + static char *stibp_state(void) { if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && @@ -2858,7 +3199,7 @@ static const char *spectre_bhi_state(void) !boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE) && rrsba_disabled) return "; BHI: Retpoline"; - else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT)) + else if (boot_cpu_has(X86_FEATURE_CLEAR_BHB_VMEXIT)) return "; BHI: Vulnerable, KVM: SW loop"; return "; BHI: Vulnerable"; @@ -2967,7 +3308,6 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr return srbds_show_state(buf); case X86_BUG_MMIO_STALE_DATA: - case X86_BUG_MMIO_UNKNOWN: return mmio_stale_data_show_state(buf); case X86_BUG_RETBLEED: @@ -2982,6 +3322,12 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_RFDS: return rfds_show_state(buf); + case X86_BUG_OLD_MICROCODE: + return old_microcode_show_state(buf); + + case X86_BUG_ITS: + return its_show_state(buf); + default: break; } @@ -3036,10 +3382,7 @@ ssize_t cpu_show_srbds(struct device *dev, struct device_attribute *attr, char * ssize_t 
cpu_show_mmio_stale_data(struct device *dev, struct device_attribute *attr, char *buf) { - if (boot_cpu_has_bug(X86_BUG_MMIO_UNKNOWN)) - return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_UNKNOWN); - else - return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); + return cpu_show_common(dev, attr, buf, X86_BUG_MMIO_STALE_DATA); } ssize_t cpu_show_retbleed(struct device *dev, struct device_attribute *attr, char *buf) @@ -3061,6 +3404,16 @@ ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attrib { return cpu_show_common(dev, attr, buf, X86_BUG_RFDS); } + +ssize_t cpu_show_old_microcode(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_OLD_MICROCODE); +} + +ssize_t cpu_show_indirect_target_selection(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_ITS); +} #endif void __warn_thunk(void) diff --git a/arch/x86/kernel/cpu/bus_lock.c b/arch/x86/kernel/cpu/bus_lock.c index 237faf7e700c..981f8b1f0792 100644 --- a/arch/x86/kernel/cpu/bus_lock.c +++ b/arch/x86/kernel/cpu/bus_lock.c @@ -10,6 +10,7 @@ #include <asm/cmdline.h> #include <asm/traps.h> #include <asm/cpu.h> +#include <asm/msr.h> enum split_lock_detect_state { sld_off = 0, @@ -95,15 +96,15 @@ static bool split_lock_verify_msr(bool on) { u64 ctrl, tmp; - if (rdmsrl_safe(MSR_TEST_CTRL, &ctrl)) + if (rdmsrq_safe(MSR_TEST_CTRL, &ctrl)) return false; if (on) ctrl |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; else ctrl &= ~MSR_TEST_CTRL_SPLIT_LOCK_DETECT; - if (wrmsrl_safe(MSR_TEST_CTRL, ctrl)) + if (wrmsrq_safe(MSR_TEST_CTRL, ctrl)) return false; - rdmsrl(MSR_TEST_CTRL, tmp); + rdmsrq(MSR_TEST_CTRL, tmp); return ctrl == tmp; } @@ -137,7 +138,7 @@ static void __init __split_lock_setup(void) return; } - rdmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + rdmsrq(MSR_TEST_CTRL, msr_test_ctrl_cache); if (!split_lock_verify_msr(true)) { pr_info("MSR access failed: Disabled\n"); @@ 
-145,7 +146,7 @@ static void __init __split_lock_setup(void) } /* Restore the MSR to its cached value. */ - wrmsrl(MSR_TEST_CTRL, msr_test_ctrl_cache); + wrmsrq(MSR_TEST_CTRL, msr_test_ctrl_cache); setup_force_cpu_cap(X86_FEATURE_SPLIT_LOCK_DETECT); } @@ -162,7 +163,7 @@ static void sld_update_msr(bool on) if (on) test_ctrl_val |= MSR_TEST_CTRL_SPLIT_LOCK_DETECT; - wrmsrl(MSR_TEST_CTRL, test_ctrl_val); + wrmsrq(MSR_TEST_CTRL, test_ctrl_val); } void split_lock_init(void) @@ -297,7 +298,7 @@ void bus_lock_init(void) if (!boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT)) return; - rdmsrl(MSR_IA32_DEBUGCTLMSR, val); + rdmsrq(MSR_IA32_DEBUGCTLMSR, val); if ((boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT) && (sld_state == sld_warn || sld_state == sld_fatal)) || @@ -311,7 +312,7 @@ void bus_lock_init(void) val |= DEBUGCTLMSR_BUS_LOCK_DETECT; } - wrmsrl(MSR_IA32_DEBUGCTLMSR, val); + wrmsrq(MSR_IA32_DEBUGCTLMSR, val); } bool handle_user_split_lock(struct pt_regs *regs, long error_code) @@ -375,7 +376,7 @@ static void __init split_lock_setup(struct cpuinfo_x86 *c) * MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT is. All CPUs that set * it have split lock detection. */ - rdmsrl(MSR_IA32_CORE_CAPS, ia32_core_caps); + rdmsrq(MSR_IA32_CORE_CAPS, ia32_core_caps); if (ia32_core_caps & MSR_IA32_CORE_CAPS_SPLIT_LOCK_DETECT) goto supported; diff --git a/arch/x86/kernel/cpu/cacheinfo.c b/arch/x86/kernel/cpu/cacheinfo.c index b3a520959b51..adfa7e8bb865 100644 --- a/arch/x86/kernel/cpu/cacheinfo.c +++ b/arch/x86/kernel/cpu/cacheinfo.c @@ -1,35 +1,28 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Routines to identify caches on Intel CPU. + * x86 CPU caches detection and configuration * - * Changes: - * Venkatesh Pallipadi : Adding cache identification through cpuid(4) - * Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure. - * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. 
+ * Previous changes + * - Venkatesh Pallipadi: Cache identification through CPUID(0x4) + * - Ashok Raj <ashok.raj@intel.com>: Work with CPU hotplug infrastructure + * - Andi Kleen / Andreas Herrmann: CPUID(0x4) emulation on AMD */ #include <linux/cacheinfo.h> -#include <linux/capability.h> #include <linux/cpu.h> #include <linux/cpuhotplug.h> -#include <linux/pci.h> #include <linux/stop_machine.h> -#include <linux/sysfs.h> -#include <asm/amd_nb.h> +#include <asm/amd/nb.h> #include <asm/cacheinfo.h> #include <asm/cpufeature.h> +#include <asm/cpuid/api.h> #include <asm/mtrr.h> #include <asm/smp.h> #include <asm/tlbflush.h> #include "cpu.h" -#define LVL_1_INST 1 -#define LVL_1_DATA 2 -#define LVL_2 3 -#define LVL_3 4 - /* Shared last level cache maps */ DEFINE_PER_CPU_READ_MOSTLY(cpumask_var_t, cpu_llc_shared_map); @@ -41,208 +34,127 @@ static cpumask_var_t cpu_cacheinfo_mask; /* Kernel controls MTRR and/or PAT MSRs. */ unsigned int memory_caching_control __ro_after_init; -struct _cache_table { - unsigned char descriptor; - char cache_type; - short size; -}; - -#define MB(x) ((x) * 1024) - -/* All the cache descriptor types we care about (no TLB or - trace cache entries) */ - -static const struct _cache_table cache_table[] = -{ - { 0x06, LVL_1_INST, 8 }, /* 4-way set assoc, 32 byte line size */ - { 0x08, LVL_1_INST, 16 }, /* 4-way set assoc, 32 byte line size */ - { 0x09, LVL_1_INST, 32 }, /* 4-way set assoc, 64 byte line size */ - { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */ - { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */ - { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ - { 0x0e, LVL_1_DATA, 24 }, /* 6-way set assoc, 64 byte line size */ - { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ - { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x25, LVL_3, MB(2) }, /* 8-way set 
assoc, sectored cache, 64 byte line size */ - { 0x29, LVL_3, MB(4) }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */ - { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */ - { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x3a, LVL_2, 192 }, /* 6-way set assoc, sectored cache, 64 byte line size */ - { 0x3b, LVL_2, 128 }, /* 2-way set assoc, sectored cache, 64 byte line size */ - { 0x3c, LVL_2, 256 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x3d, LVL_2, 384 }, /* 6-way set assoc, sectored cache, 64 byte line size */ - { 0x3e, LVL_2, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x3f, LVL_2, 256 }, /* 2-way set assoc, 64 byte line size */ - { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */ - { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */ - { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */ - { 0x44, LVL_2, MB(1) }, /* 4-way set assoc, 32 byte line size */ - { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */ - { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */ - { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */ - { 0x48, LVL_2, MB(3) }, /* 12-way set assoc, 64 byte line size */ - { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ - { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */ - { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ - { 0x4c, LVL_3, MB(12) }, /* 12-way set assoc, 64 byte line size */ - { 0x4d, LVL_3, MB(16) }, /* 16-way set assoc, 64 byte line size */ - { 0x4e, LVL_2, MB(6) }, /* 24-way set assoc, 64 byte line size */ - { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte 
line size */ - { 0x68, LVL_1_DATA, 32 }, /* 4-way set assoc, sectored cache, 64 byte line size */ - { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */ - { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ - { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */ - { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ - { 0x80, LVL_2, 512 }, /* 8-way set assoc, 64 byte line size */ - { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ - { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ - { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */ - { 0x85, LVL_2, MB(2) }, /* 8-way set assoc, 32 byte line size */ - { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */ - { 0x87, LVL_2, MB(1) }, /* 8-way set assoc, 64 byte line size */ - { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */ - { 0xd1, LVL_3, MB(1) }, /* 4-way set assoc, 64 byte line size */ - { 0xd2, LVL_3, MB(2) }, /* 4-way set assoc, 64 byte line size */ - { 0xd6, LVL_3, MB(1) }, /* 8-way set assoc, 64 byte line size */ - { 0xd7, LVL_3, MB(2) }, /* 8-way set assoc, 64 byte line size */ - { 0xd8, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */ - { 0xdc, LVL_3, MB(2) }, /* 12-way set assoc, 64 byte line size */ - { 0xdd, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */ - { 0xde, LVL_3, MB(8) }, /* 12-way set assoc, 64 byte line size */ - { 0xe2, LVL_3, MB(2) }, /* 16-way set assoc, 64 byte line size */ - { 0xe3, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ - { 0xe4, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ - { 0xea, LVL_3, MB(12) }, /* 24-way set assoc, 64 byte line size */ - { 0xeb, LVL_3, MB(18) }, /* 24-way 
set assoc, 64 byte line size */ - { 0xec, LVL_3, MB(24) }, /* 24-way set assoc, 64 byte line size */ - { 0x00, 0, 0} -}; - - enum _cache_type { - CTYPE_NULL = 0, - CTYPE_DATA = 1, - CTYPE_INST = 2, - CTYPE_UNIFIED = 3 + CTYPE_NULL = 0, + CTYPE_DATA = 1, + CTYPE_INST = 2, + CTYPE_UNIFIED = 3 }; union _cpuid4_leaf_eax { struct { - enum _cache_type type:5; - unsigned int level:3; - unsigned int is_self_initializing:1; - unsigned int is_fully_associative:1; - unsigned int reserved:4; - unsigned int num_threads_sharing:12; - unsigned int num_cores_on_die:6; + enum _cache_type type :5; + unsigned int level :3; + unsigned int is_self_initializing :1; + unsigned int is_fully_associative :1; + unsigned int reserved :4; + unsigned int num_threads_sharing :12; + unsigned int num_cores_on_die :6; } split; u32 full; }; union _cpuid4_leaf_ebx { struct { - unsigned int coherency_line_size:12; - unsigned int physical_line_partition:10; - unsigned int ways_of_associativity:10; + unsigned int coherency_line_size :12; + unsigned int physical_line_partition :10; + unsigned int ways_of_associativity :10; } split; u32 full; }; union _cpuid4_leaf_ecx { struct { - unsigned int number_of_sets:32; + unsigned int number_of_sets :32; } split; u32 full; }; -struct _cpuid4_info_regs { +struct _cpuid4_info { union _cpuid4_leaf_eax eax; union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned int id; unsigned long size; - struct amd_northbridge *nb; }; -/* AMD doesn't have CPUID4. Emulate it here to report the same - information to the user. This makes some assumptions about the machine: - L2 not shared, no SMT etc. that is currently true on AMD CPUs. 
+/* Map CPUID(0x4) EAX.cache_type to <linux/cacheinfo.h> types */ +static const enum cache_type cache_type_map[] = { + [CTYPE_NULL] = CACHE_TYPE_NOCACHE, + [CTYPE_DATA] = CACHE_TYPE_DATA, + [CTYPE_INST] = CACHE_TYPE_INST, + [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED, +}; + +/* + * Fallback AMD CPUID(0x4) emulation + * AMD CPUs with TOPOEXT can just use CPUID(0x8000001d) + * + * @AMD_L2_L3_INVALID_ASSOC: cache info for the respective L2/L3 cache should + * be determined from CPUID(0x8000001d) instead of CPUID(0x80000006). + */ + +#define AMD_CPUID4_FULLY_ASSOCIATIVE 0xffff +#define AMD_L2_L3_INVALID_ASSOC 0x9 - In theory the TLBs could be reported as fake type (they are in "dummy"). - Maybe later */ union l1_cache { struct { - unsigned line_size:8; - unsigned lines_per_tag:8; - unsigned assoc:8; - unsigned size_in_kb:8; + unsigned line_size :8; + unsigned lines_per_tag :8; + unsigned assoc :8; + unsigned size_in_kb :8; }; - unsigned val; + unsigned int val; }; union l2_cache { struct { - unsigned line_size:8; - unsigned lines_per_tag:4; - unsigned assoc:4; - unsigned size_in_kb:16; + unsigned line_size :8; + unsigned lines_per_tag :4; + unsigned assoc :4; + unsigned size_in_kb :16; }; - unsigned val; + unsigned int val; }; union l3_cache { struct { - unsigned line_size:8; - unsigned lines_per_tag:4; - unsigned assoc:4; - unsigned res:2; - unsigned size_encoded:14; + unsigned line_size :8; + unsigned lines_per_tag :4; + unsigned assoc :4; + unsigned res :2; + unsigned size_encoded :14; }; - unsigned val; + unsigned int val; }; +/* L2/L3 associativity mapping */ static const unsigned short assocs[] = { - [1] = 1, - [2] = 2, - [4] = 4, - [6] = 8, - [8] = 16, - [0xa] = 32, - [0xb] = 48, - [0xc] = 64, - [0xd] = 96, - [0xe] = 128, - [0xf] = 0xffff /* fully associative - no way to show this currently */ + [1] = 1, + [2] = 2, + [3] = 3, + [4] = 4, + [5] = 6, + [6] = 8, + [8] = 16, + [0xa] = 32, + [0xb] = 48, + [0xc] = 64, + [0xd] = 96, + [0xe] = 128, + [0xf] = 
AMD_CPUID4_FULLY_ASSOCIATIVE }; static const unsigned char levels[] = { 1, 1, 2, 3 }; -static const unsigned char types[] = { 1, 2, 3, 3 }; +static const unsigned char types[] = { 1, 2, 3, 3 }; -static const enum cache_type cache_type_map[] = { - [CTYPE_NULL] = CACHE_TYPE_NOCACHE, - [CTYPE_DATA] = CACHE_TYPE_DATA, - [CTYPE_INST] = CACHE_TYPE_INST, - [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED, -}; - -static void -amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, - union _cpuid4_leaf_ebx *ebx, - union _cpuid4_leaf_ecx *ecx) +static void legacy_amd_cpuid4(int index, union _cpuid4_leaf_eax *eax, + union _cpuid4_leaf_ebx *ebx, union _cpuid4_leaf_ecx *ecx) { - unsigned dummy; - unsigned line_size, lines_per_tag, assoc, size_in_kb; - union l1_cache l1i, l1d; + unsigned int dummy, line_size, lines_per_tag, assoc, size_in_kb; + union l1_cache l1i, l1d, *l1; union l2_cache l2; union l3_cache l3; - union l1_cache *l1 = &l1d; eax->full = 0; ebx->full = 0; @@ -251,430 +163,155 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, cpuid(0x80000005, &dummy, &dummy, &l1d.val, &l1i.val); cpuid(0x80000006, &dummy, &dummy, &l2.val, &l3.val); - switch (leaf) { + l1 = &l1d; + switch (index) { case 1: l1 = &l1i; fallthrough; case 0: if (!l1->val) return; - assoc = assocs[l1->assoc]; - line_size = l1->line_size; - lines_per_tag = l1->lines_per_tag; - size_in_kb = l1->size_in_kb; + + assoc = (l1->assoc == 0xff) ? 
AMD_CPUID4_FULLY_ASSOCIATIVE : l1->assoc; + line_size = l1->line_size; + lines_per_tag = l1->lines_per_tag; + size_in_kb = l1->size_in_kb; break; case 2: - if (!l2.val) + if (!l2.assoc || l2.assoc == AMD_L2_L3_INVALID_ASSOC) return; - assoc = assocs[l2.assoc]; - line_size = l2.line_size; - lines_per_tag = l2.lines_per_tag; - /* cpu_data has errata corrections for K7 applied */ - size_in_kb = __this_cpu_read(cpu_info.x86_cache_size); + + /* Use x86_cache_size as it might have K7 errata fixes */ + assoc = assocs[l2.assoc]; + line_size = l2.line_size; + lines_per_tag = l2.lines_per_tag; + size_in_kb = __this_cpu_read(cpu_info.x86_cache_size); break; case 3: - if (!l3.val) + if (!l3.assoc || l3.assoc == AMD_L2_L3_INVALID_ASSOC) return; - assoc = assocs[l3.assoc]; - line_size = l3.line_size; - lines_per_tag = l3.lines_per_tag; - size_in_kb = l3.size_encoded * 512; + + assoc = assocs[l3.assoc]; + line_size = l3.line_size; + lines_per_tag = l3.lines_per_tag; + size_in_kb = l3.size_encoded * 512; if (boot_cpu_has(X86_FEATURE_AMD_DCM)) { - size_in_kb = size_in_kb >> 1; - assoc = assoc >> 1; + size_in_kb = size_in_kb >> 1; + assoc = assoc >> 1; } break; default: return; } - eax->split.is_self_initializing = 1; - eax->split.type = types[leaf]; - eax->split.level = levels[leaf]; - eax->split.num_threads_sharing = 0; - eax->split.num_cores_on_die = topology_num_cores_per_package(); + eax->split.is_self_initializing = 1; + eax->split.type = types[index]; + eax->split.level = levels[index]; + eax->split.num_threads_sharing = 0; + eax->split.num_cores_on_die = topology_num_cores_per_package(); - - if (assoc == 0xffff) + if (assoc == AMD_CPUID4_FULLY_ASSOCIATIVE) eax->split.is_fully_associative = 1; - ebx->split.coherency_line_size = line_size - 1; - ebx->split.ways_of_associativity = assoc - 1; - ebx->split.physical_line_partition = lines_per_tag - 1; - ecx->split.number_of_sets = (size_in_kb * 1024) / line_size / - (ebx->split.ways_of_associativity + 1) - 1; -} - -#if 
defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS) - -/* - * L3 cache descriptors - */ -static void amd_calc_l3_indices(struct amd_northbridge *nb) -{ - struct amd_l3_cache *l3 = &nb->l3_cache; - unsigned int sc0, sc1, sc2, sc3; - u32 val = 0; - - pci_read_config_dword(nb->misc, 0x1C4, &val); - - /* calculate subcache sizes */ - l3->subcaches[0] = sc0 = !(val & BIT(0)); - l3->subcaches[1] = sc1 = !(val & BIT(4)); - - if (boot_cpu_data.x86 == 0x15) { - l3->subcaches[0] = sc0 += !(val & BIT(1)); - l3->subcaches[1] = sc1 += !(val & BIT(5)); - } - - l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); - l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); - - l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; -} - -/* - * check whether a slot used for disabling an L3 index is occupied. - * @l3: L3 cache descriptor - * @slot: slot number (0..1) - * - * @returns: the disabled index if used or negative value if slot free. - */ -static int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot) -{ - unsigned int reg = 0; - - pci_read_config_dword(nb->misc, 0x1BC + slot * 4, ®); - - /* check whether this slot is activated already */ - if (reg & (3UL << 30)) - return reg & 0xfff; - - return -1; -} - -static ssize_t show_cache_disable(struct cacheinfo *this_leaf, char *buf, - unsigned int slot) -{ - int index; - struct amd_northbridge *nb = this_leaf->priv; - - index = amd_get_l3_disable_slot(nb, slot); - if (index >= 0) - return sprintf(buf, "%d\n", index); - - return sprintf(buf, "FREE\n"); -} - -#define SHOW_CACHE_DISABLE(slot) \ -static ssize_t \ -cache_disable_##slot##_show(struct device *dev, \ - struct device_attribute *attr, char *buf) \ -{ \ - struct cacheinfo *this_leaf = dev_get_drvdata(dev); \ - return show_cache_disable(this_leaf, buf, slot); \ -} -SHOW_CACHE_DISABLE(0) -SHOW_CACHE_DISABLE(1) - -static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu, - unsigned slot, unsigned long idx) -{ - int i; - idx |= BIT(30); - - 
/* - * disable index in all 4 subcaches - */ - for (i = 0; i < 4; i++) { - u32 reg = idx | (i << 20); - - if (!nb->l3_cache.subcaches[i]) - continue; - - pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); - - /* - * We need to WBINVD on a core on the node containing the L3 - * cache which indices we disable therefore a simple wbinvd() - * is not sufficient. - */ - wbinvd_on_cpu(cpu); - - reg |= BIT(31); - pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); - } -} - -/* - * disable a L3 cache index by using a disable-slot - * - * @l3: L3 cache descriptor - * @cpu: A CPU on the node containing the L3 cache - * @slot: slot number (0..1) - * @index: index to disable - * - * @return: 0 on success, error status on failure - */ -static int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, - unsigned slot, unsigned long index) -{ - int ret = 0; - - /* check if @slot is already used or the index is already disabled */ - ret = amd_get_l3_disable_slot(nb, slot); - if (ret >= 0) - return -EEXIST; - - if (index > nb->l3_cache.indices) - return -EINVAL; - - /* check whether the other slot has disabled the same index already */ - if (index == amd_get_l3_disable_slot(nb, !slot)) - return -EEXIST; - - amd_l3_disable_index(nb, cpu, slot, index); - - return 0; -} - -static ssize_t store_cache_disable(struct cacheinfo *this_leaf, - const char *buf, size_t count, - unsigned int slot) -{ - unsigned long val = 0; - int cpu, err = 0; - struct amd_northbridge *nb = this_leaf->priv; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - cpu = cpumask_first(&this_leaf->shared_cpu_map); - - if (kstrtoul(buf, 10, &val) < 0) - return -EINVAL; - - err = amd_set_l3_disable_slot(nb, cpu, slot, val); - if (err) { - if (err == -EEXIST) - pr_warn("L3 slot %d in use/index already disabled!\n", - slot); - return err; - } - return count; -} - -#define STORE_CACHE_DISABLE(slot) \ -static ssize_t \ -cache_disable_##slot##_store(struct device *dev, \ - struct device_attribute *attr, \ 
- const char *buf, size_t count) \ -{ \ - struct cacheinfo *this_leaf = dev_get_drvdata(dev); \ - return store_cache_disable(this_leaf, buf, count, slot); \ -} -STORE_CACHE_DISABLE(0) -STORE_CACHE_DISABLE(1) - -static ssize_t subcaches_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct cacheinfo *this_leaf = dev_get_drvdata(dev); - int cpu = cpumask_first(&this_leaf->shared_cpu_map); - - return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); -} - -static ssize_t subcaches_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct cacheinfo *this_leaf = dev_get_drvdata(dev); - int cpu = cpumask_first(&this_leaf->shared_cpu_map); - unsigned long val; - - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (kstrtoul(buf, 16, &val) < 0) - return -EINVAL; - - if (amd_set_subcaches(cpu, val)) - return -EINVAL; - - return count; + ebx->split.coherency_line_size = line_size - 1; + ebx->split.ways_of_associativity = assoc - 1; + ebx->split.physical_line_partition = lines_per_tag - 1; + ecx->split.number_of_sets = (size_in_kb * 1024) / line_size / + (ebx->split.ways_of_associativity + 1) - 1; } -static DEVICE_ATTR_RW(cache_disable_0); -static DEVICE_ATTR_RW(cache_disable_1); -static DEVICE_ATTR_RW(subcaches); - -static umode_t -cache_private_attrs_is_visible(struct kobject *kobj, - struct attribute *attr, int unused) +static int cpuid4_info_fill_done(struct _cpuid4_info *id4, union _cpuid4_leaf_eax eax, + union _cpuid4_leaf_ebx ebx, union _cpuid4_leaf_ecx ecx) { - struct device *dev = kobj_to_dev(kobj); - struct cacheinfo *this_leaf = dev_get_drvdata(dev); - umode_t mode = attr->mode; - - if (!this_leaf->priv) - return 0; - - if ((attr == &dev_attr_subcaches.attr) && - amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) - return mode; + if (eax.split.type == CTYPE_NULL) + return -EIO; - if ((attr == &dev_attr_cache_disable_0.attr || - attr == &dev_attr_cache_disable_1.attr) && - 
amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) - return mode; + id4->eax = eax; + id4->ebx = ebx; + id4->ecx = ecx; + id4->size = (ecx.split.number_of_sets + 1) * + (ebx.split.coherency_line_size + 1) * + (ebx.split.physical_line_partition + 1) * + (ebx.split.ways_of_associativity + 1); return 0; } -static struct attribute_group cache_private_group = { - .is_visible = cache_private_attrs_is_visible, -}; - -static void init_amd_l3_attrs(void) -{ - int n = 1; - static struct attribute **amd_l3_attrs; - - if (amd_l3_attrs) /* already initialized */ - return; - - if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) - n += 2; - if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) - n += 1; - - amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL); - if (!amd_l3_attrs) - return; - - n = 0; - if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { - amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr; - amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr; - } - if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) - amd_l3_attrs[n++] = &dev_attr_subcaches.attr; - - cache_private_group.attrs = amd_l3_attrs; -} - -const struct attribute_group * -cache_get_priv_group(struct cacheinfo *this_leaf) +static int amd_fill_cpuid4_info(int index, struct _cpuid4_info *id4) { - struct amd_northbridge *nb = this_leaf->priv; - - if (this_leaf->level < 3 || !nb) - return NULL; + union _cpuid4_leaf_eax eax; + union _cpuid4_leaf_ebx ebx; + union _cpuid4_leaf_ecx ecx; + u32 ignored; - if (nb && nb->l3_cache.indices) - init_amd_l3_attrs(); + if (boot_cpu_has(X86_FEATURE_TOPOEXT) || boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) + cpuid_count(0x8000001d, index, &eax.full, &ebx.full, &ecx.full, &ignored); + else + legacy_amd_cpuid4(index, &eax, &ebx, &ecx); - return &cache_private_group; + return cpuid4_info_fill_done(id4, eax, ebx, ecx); } -static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) +static int intel_fill_cpuid4_info(int index, struct _cpuid4_info *id4) { - int node; + union 
_cpuid4_leaf_eax eax; + union _cpuid4_leaf_ebx ebx; + union _cpuid4_leaf_ecx ecx; + u32 ignored; - /* only for L3, and not in virtualized environments */ - if (index < 3) - return; + cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &ignored); - node = topology_amd_node_id(smp_processor_id()); - this_leaf->nb = node_to_amd_nb(node); - if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) - amd_calc_l3_indices(this_leaf->nb); + return cpuid4_info_fill_done(id4, eax, ebx, ecx); } -#else -#define amd_init_l3_cache(x, y) -#endif /* CONFIG_AMD_NB && CONFIG_SYSFS */ -static int -cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) +static int fill_cpuid4_info(int index, struct _cpuid4_info *id4) { - union _cpuid4_leaf_eax eax; - union _cpuid4_leaf_ebx ebx; - union _cpuid4_leaf_ecx ecx; - unsigned edx; - - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) - cpuid_count(0x8000001d, index, &eax.full, - &ebx.full, &ecx.full, &edx); - else - amd_cpuid4(index, &eax, &ebx, &ecx); - amd_init_l3_cache(this_leaf, index); - } else if (boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) { - cpuid_count(0x8000001d, index, &eax.full, - &ebx.full, &ecx.full, &edx); - amd_init_l3_cache(this_leaf, index); - } else { - cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); - } + u8 cpu_vendor = boot_cpu_data.x86_vendor; - if (eax.split.type == CTYPE_NULL) - return -EIO; /* better error ? */ - - this_leaf->eax = eax; - this_leaf->ebx = ebx; - this_leaf->ecx = ecx; - this_leaf->size = (ecx.split.number_of_sets + 1) * - (ebx.split.coherency_line_size + 1) * - (ebx.split.physical_line_partition + 1) * - (ebx.split.ways_of_associativity + 1); - return 0; + return (cpu_vendor == X86_VENDOR_AMD || cpu_vendor == X86_VENDOR_HYGON) ? 
+ amd_fill_cpuid4_info(index, id4) : + intel_fill_cpuid4_info(index, id4); } static int find_num_cache_leaves(struct cpuinfo_x86 *c) { - unsigned int eax, ebx, ecx, edx, op; - union _cpuid4_leaf_eax cache_eax; - int i = -1; - - if (c->x86_vendor == X86_VENDOR_AMD || - c->x86_vendor == X86_VENDOR_HYGON) - op = 0x8000001d; - else - op = 4; + unsigned int eax, ebx, ecx, edx, op; + union _cpuid4_leaf_eax cache_eax; + int i = -1; + /* Do a CPUID(op) loop to calculate num_cache_leaves */ + op = (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) ? 0x8000001d : 4; do { ++i; - /* Do cpuid(op) loop to find out num_cache_leaves */ cpuid_count(op, i, &eax, &ebx, &ecx, &edx); cache_eax.full = eax; } while (cache_eax.split.type != CTYPE_NULL); return i; } +/* + * AMD/Hygon CPUs may have multiple LLCs if L3 caches exist. + */ + void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id) { - /* - * We may have multiple LLCs if L3 caches exist, so check if we - * have an L3 cache by looking at the L3 cache CPUID leaf. - */ - if (!cpuid_edx(0x80000006)) + if (!cpuid_amd_hygon_has_l3_cache()) return; if (c->x86 < 0x17) { - /* LLC is at the node level. */ + /* Pre-Zen: LLC is at the node level */ c->topo.llc_id = die_id; } else if (c->x86 == 0x17 && c->x86_model <= 0x1F) { /* - * LLC is at the core complex level. - * Core complex ID is ApicId[3] for these processors. + * Family 17h up to 1F models: LLC is at the core + * complex level. Core complex ID is ApicId[3]. */ c->topo.llc_id = c->topo.apicid >> 3; } else { /* - * LLC ID is calculated from the number of threads sharing the - * cache. - * */ + * Newer families: LLC ID is calculated from the number + * of threads sharing the L3 cache. 
+ */ u32 eax, ebx, ecx, edx, num_sharing_cache = 0; u32 llc_index = find_num_cache_leaves(c) - 1; @@ -683,25 +320,21 @@ void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id) num_sharing_cache = ((eax >> 14) & 0xfff) + 1; if (num_sharing_cache) { - int bits = get_count_order(num_sharing_cache); + int index_msb = get_count_order(num_sharing_cache); - c->topo.llc_id = c->topo.apicid >> bits; + c->topo.llc_id = c->topo.apicid >> index_msb; } } } void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c) { - /* - * We may have multiple LLCs if L3 caches exist, so check if we - * have an L3 cache by looking at the L3 cache CPUID leaf. - */ - if (!cpuid_edx(0x80000006)) + if (!cpuid_amd_hygon_has_l3_cache()) return; /* - * LLC is at the core complex level. - * Core complex ID is ApicId[3] for these processors. + * Hygons are similar to AMD Family 17h up to 1F models: LLC is + * at the core complex level. Core complex ID is ApicId[3]. */ c->topo.llc_id = c->topo.apicid >> 3; } @@ -710,14 +343,10 @@ void init_amd_cacheinfo(struct cpuinfo_x86 *c) { struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index); - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + if (boot_cpu_has(X86_FEATURE_TOPOEXT)) ci->num_leaves = find_num_cache_leaves(c); - } else if (c->extended_cpuid_level >= 0x80000006) { - if (cpuid_edx(0x80000006) & 0xf000) - ci->num_leaves = 4; - else - ci->num_leaves = 3; - } + else if (c->extended_cpuid_level >= 0x80000006) + ci->num_leaves = (cpuid_edx(0x80000006) & 0xf000) ? 
4 : 3; } void init_hygon_cacheinfo(struct cpuinfo_x86 *c) @@ -727,148 +356,131 @@ void init_hygon_cacheinfo(struct cpuinfo_x86 *c) ci->num_leaves = find_num_cache_leaves(c); } -void init_intel_cacheinfo(struct cpuinfo_x86 *c) +static void intel_cacheinfo_done(struct cpuinfo_x86 *c, unsigned int l3, + unsigned int l2, unsigned int l1i, unsigned int l1d) { - /* Cache sizes */ - unsigned int l1i = 0, l1d = 0, l2 = 0, l3 = 0; - unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */ - unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */ - unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb; - struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index); + /* + * If llc_id is still unset, then cpuid_level < 4, which implies + * that the only possibility left is SMT. Since CPUID(0x2) doesn't + * specify any shared caches and SMT shares all caches, we can + * unconditionally set LLC ID to the package ID so that all + * threads share it. + */ + if (c->topo.llc_id == BAD_APICID) + c->topo.llc_id = c->topo.pkg_id; - if (c->cpuid_level > 3) { - /* - * There should be at least one leaf. A non-zero value means - * that the number of leaves has been initialized. - */ - if (!ci->num_leaves) - ci->num_leaves = find_num_cache_leaves(c); + c->x86_cache_size = l3 ? l3 : (l2 ? l2 : l1i + l1d); - /* - * Whenever possible use cpuid(4), deterministic cache - * parameters cpuid leaf to find the cache details - */ - for (i = 0; i < ci->num_leaves; i++) { - struct _cpuid4_info_regs this_leaf = {}; - int retval; + if (!l2) + cpu_detect_cache_sizes(c); +} - retval = cpuid4_cache_lookup_regs(i, &this_leaf); - if (retval < 0) - continue; +/* + * Legacy Intel CPUID(0x2) path if CPUID(0x4) is not available. 
+ */ +static void intel_cacheinfo_0x2(struct cpuinfo_x86 *c) +{ + unsigned int l1i = 0, l1d = 0, l2 = 0, l3 = 0; + const struct leaf_0x2_table *desc; + union leaf_0x2_regs regs; + u8 *ptr; - switch (this_leaf.eax.split.level) { - case 1: - if (this_leaf.eax.split.type == CTYPE_DATA) - new_l1d = this_leaf.size/1024; - else if (this_leaf.eax.split.type == CTYPE_INST) - new_l1i = this_leaf.size/1024; - break; - case 2: - new_l2 = this_leaf.size/1024; - num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; - index_msb = get_count_order(num_threads_sharing); - l2_id = c->topo.apicid & ~((1 << index_msb) - 1); - break; - case 3: - new_l3 = this_leaf.size/1024; - num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing; - index_msb = get_count_order(num_threads_sharing); - l3_id = c->topo.apicid & ~((1 << index_msb) - 1); - break; - default: - break; - } - } - } + if (c->cpuid_level < 2) + return; - /* Don't use CPUID(2) if CPUID(4) is supported. */ - if (!ci->num_leaves && c->cpuid_level > 1) { - /* supports eax=2 call */ - int j, n; - unsigned int regs[4]; - unsigned char *dp = (unsigned char *)regs; - - /* Number of times to iterate */ - n = cpuid_eax(2) & 0xFF; - - for (i = 0 ; i < n ; i++) { - cpuid(2, ®s[0], ®s[1], ®s[2], ®s[3]); - - /* If bit 31 is set, this is an unknown format */ - for (j = 0 ; j < 4 ; j++) - if (regs[j] & (1 << 31)) - regs[j] = 0; - - /* Byte 0 is level count, not a descriptor */ - for (j = 1 ; j < 16 ; j++) { - unsigned char des = dp[j]; - unsigned char k = 0; - - /* look up this descriptor in the table */ - while (cache_table[k].descriptor != 0) { - if (cache_table[k].descriptor == des) { - switch (cache_table[k].cache_type) { - case LVL_1_INST: - l1i += cache_table[k].size; - break; - case LVL_1_DATA: - l1d += cache_table[k].size; - break; - case LVL_2: - l2 += cache_table[k].size; - break; - case LVL_3: - l3 += cache_table[k].size; - break; - } - - break; - } - - k++; - } - } + cpuid_leaf_0x2(®s); + 
for_each_cpuid_0x2_desc(regs, ptr, desc) { + switch (desc->c_type) { + case CACHE_L1_INST: l1i += desc->c_size; break; + case CACHE_L1_DATA: l1d += desc->c_size; break; + case CACHE_L2: l2 += desc->c_size; break; + case CACHE_L3: l3 += desc->c_size; break; } } - if (new_l1d) - l1d = new_l1d; + intel_cacheinfo_done(c, l3, l2, l1i, l1d); +} - if (new_l1i) - l1i = new_l1i; +static unsigned int calc_cache_topo_id(struct cpuinfo_x86 *c, const struct _cpuid4_info *id4) +{ + unsigned int num_threads_sharing; + int index_msb; - if (new_l2) { - l2 = new_l2; - c->topo.llc_id = l2_id; - c->topo.l2c_id = l2_id; - } + num_threads_sharing = 1 + id4->eax.split.num_threads_sharing; + index_msb = get_count_order(num_threads_sharing); + return c->topo.apicid & ~((1 << index_msb) - 1); +} - if (new_l3) { - l3 = new_l3; - c->topo.llc_id = l3_id; - } +static bool intel_cacheinfo_0x4(struct cpuinfo_x86 *c) +{ + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(c->cpu_index); + unsigned int l2_id = BAD_APICID, l3_id = BAD_APICID; + unsigned int l1d = 0, l1i = 0, l2 = 0, l3 = 0; + + if (c->cpuid_level < 4) + return false; /* - * If llc_id is not yet set, this means cpuid_level < 4 which in - * turns means that the only possibility is SMT (as indicated in - * cpuid1). Since cpuid2 doesn't specify shared caches, and we know - * that SMT shares all caches, we can unconditionally set cpu_llc_id to - * c->topo.pkg_id. + * There should be at least one leaf. A non-zero value means + * that the number of leaves has been previously initialized. */ - if (c->topo.llc_id == BAD_APICID) - c->topo.llc_id = c->topo.pkg_id; + if (!ci->num_leaves) + ci->num_leaves = find_num_cache_leaves(c); + + if (!ci->num_leaves) + return false; - c->x86_cache_size = l3 ? l3 : (l2 ? 
l2 : (l1i+l1d)); + for (int i = 0; i < ci->num_leaves; i++) { + struct _cpuid4_info id4 = {}; + int ret; - if (!l2) - cpu_detect_cache_sizes(c); + ret = intel_fill_cpuid4_info(i, &id4); + if (ret < 0) + continue; + + switch (id4.eax.split.level) { + case 1: + if (id4.eax.split.type == CTYPE_DATA) + l1d = id4.size / 1024; + else if (id4.eax.split.type == CTYPE_INST) + l1i = id4.size / 1024; + break; + case 2: + l2 = id4.size / 1024; + l2_id = calc_cache_topo_id(c, &id4); + break; + case 3: + l3 = id4.size / 1024; + l3_id = calc_cache_topo_id(c, &id4); + break; + default: + break; + } + } + + c->topo.l2c_id = l2_id; + c->topo.llc_id = (l3_id == BAD_APICID) ? l2_id : l3_id; + intel_cacheinfo_done(c, l3, l2, l1i, l1d); + return true; +} + +void init_intel_cacheinfo(struct cpuinfo_x86 *c) +{ + /* Don't use CPUID(0x2) if CPUID(0x4) is supported. */ + if (intel_cacheinfo_0x4(c)) + return; + + intel_cacheinfo_0x2(c); } +/* + * <linux/cacheinfo.h> shared_cpu_map setup, AMD/Hygon + */ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, - struct _cpuid4_info_regs *base) + const struct _cpuid4_info *id4) { struct cpu_cacheinfo *this_cpu_ci; - struct cacheinfo *this_leaf; + struct cacheinfo *ci; int i, sibling; /* @@ -880,18 +492,18 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, this_cpu_ci = get_cpu_cacheinfo(i); if (!this_cpu_ci->info_list) continue; - this_leaf = this_cpu_ci->info_list + index; + + ci = this_cpu_ci->info_list + index; for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { if (!cpu_online(sibling)) continue; - cpumask_set_cpu(sibling, - &this_leaf->shared_cpu_map); + cpumask_set_cpu(sibling, &ci->shared_cpu_map); } } } else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { unsigned int apicid, nshared, first, last; - nshared = base->eax.split.num_threads_sharing + 1; + nshared = id4->eax.split.num_threads_sharing + 1; apicid = cpu_data(cpu).topo.apicid; first = apicid - (apicid % nshared); last = first + nshared - 1; @@ -905,14 +517,13 @@ 
static int __cache_amd_cpumap_setup(unsigned int cpu, int index, if ((apicid < first) || (apicid > last)) continue; - this_leaf = this_cpu_ci->info_list + index; + ci = this_cpu_ci->info_list + index; for_each_online_cpu(sibling) { apicid = cpu_data(sibling).topo.apicid; if ((apicid < first) || (apicid > last)) continue; - cpumask_set_cpu(sibling, - &this_leaf->shared_cpu_map); + cpumask_set_cpu(sibling, &ci->shared_cpu_map); } } } else @@ -921,25 +532,27 @@ static int __cache_amd_cpumap_setup(unsigned int cpu, int index, return 1; } +/* + * <linux/cacheinfo.h> shared_cpu_map setup, Intel + fallback AMD/Hygon + */ static void __cache_cpumap_setup(unsigned int cpu, int index, - struct _cpuid4_info_regs *base) + const struct _cpuid4_info *id4) { struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); - struct cacheinfo *this_leaf, *sibling_leaf; + struct cpuinfo_x86 *c = &cpu_data(cpu); + struct cacheinfo *ci, *sibling_ci; unsigned long num_threads_sharing; int index_msb, i; - struct cpuinfo_x86 *c = &cpu_data(cpu); - if (c->x86_vendor == X86_VENDOR_AMD || - c->x86_vendor == X86_VENDOR_HYGON) { - if (__cache_amd_cpumap_setup(cpu, index, base)) + if (c->x86_vendor == X86_VENDOR_AMD || c->x86_vendor == X86_VENDOR_HYGON) { + if (__cache_amd_cpumap_setup(cpu, index, id4)) return; } - this_leaf = this_cpu_ci->info_list + index; - num_threads_sharing = 1 + base->eax.split.num_threads_sharing; + ci = this_cpu_ci->info_list + index; + num_threads_sharing = 1 + id4->eax.split.num_threads_sharing; - cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map); + cpumask_set_cpu(cpu, &ci->shared_cpu_map); if (num_threads_sharing == 1) return; @@ -949,30 +562,29 @@ static void __cache_cpumap_setup(unsigned int cpu, int index, if (cpu_data(i).topo.apicid >> index_msb == c->topo.apicid >> index_msb) { struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i); + /* Skip if itself or no cacheinfo */ if (i == cpu || !sib_cpu_ci->info_list) - continue;/* skip if itself or no cacheinfo */ - 
sibling_leaf = sib_cpu_ci->info_list + index; - cpumask_set_cpu(i, &this_leaf->shared_cpu_map); - cpumask_set_cpu(cpu, &sibling_leaf->shared_cpu_map); + continue; + + sibling_ci = sib_cpu_ci->info_list + index; + cpumask_set_cpu(i, &ci->shared_cpu_map); + cpumask_set_cpu(cpu, &sibling_ci->shared_cpu_map); } } -static void ci_leaf_init(struct cacheinfo *this_leaf, - struct _cpuid4_info_regs *base) +static void ci_info_init(struct cacheinfo *ci, const struct _cpuid4_info *id4, + struct amd_northbridge *nb) { - this_leaf->id = base->id; - this_leaf->attributes = CACHE_ID; - this_leaf->level = base->eax.split.level; - this_leaf->type = cache_type_map[base->eax.split.type]; - this_leaf->coherency_line_size = - base->ebx.split.coherency_line_size + 1; - this_leaf->ways_of_associativity = - base->ebx.split.ways_of_associativity + 1; - this_leaf->size = base->size; - this_leaf->number_of_sets = base->ecx.split.number_of_sets + 1; - this_leaf->physical_line_partition = - base->ebx.split.physical_line_partition + 1; - this_leaf->priv = base->nb; + ci->id = id4->id; + ci->attributes = CACHE_ID; + ci->level = id4->eax.split.level; + ci->type = cache_type_map[id4->eax.split.type]; + ci->coherency_line_size = id4->ebx.split.coherency_line_size + 1; + ci->ways_of_associativity = id4->ebx.split.ways_of_associativity + 1; + ci->size = id4->size; + ci->number_of_sets = id4->ecx.split.number_of_sets + 1; + ci->physical_line_partition = id4->ebx.split.physical_line_partition + 1; + ci->priv = nb; } int init_cache_level(unsigned int cpu) @@ -987,38 +599,45 @@ int init_cache_level(unsigned int cpu) } /* - * The max shared threads number comes from CPUID.4:EAX[25-14] with input + * The max shared threads number comes from CPUID(0x4) EAX[25-14] with input * ECX as cache index. Then right shift apicid by the number's order to get * cache id for this cache node. 
*/ -static void get_cache_id(int cpu, struct _cpuid4_info_regs *id4_regs) +static void get_cache_id(int cpu, struct _cpuid4_info *id4) { struct cpuinfo_x86 *c = &cpu_data(cpu); unsigned long num_threads_sharing; int index_msb; - num_threads_sharing = 1 + id4_regs->eax.split.num_threads_sharing; + num_threads_sharing = 1 + id4->eax.split.num_threads_sharing; index_msb = get_count_order(num_threads_sharing); - id4_regs->id = c->topo.apicid >> index_msb; + id4->id = c->topo.apicid >> index_msb; } int populate_cache_leaves(unsigned int cpu) { - unsigned int idx, ret; struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); - struct cacheinfo *this_leaf = this_cpu_ci->info_list; - struct _cpuid4_info_regs id4_regs = {}; + struct cacheinfo *ci = this_cpu_ci->info_list; + u8 cpu_vendor = boot_cpu_data.x86_vendor; + struct amd_northbridge *nb = NULL; + struct _cpuid4_info id4 = {}; + int idx, ret; for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) { - ret = cpuid4_cache_lookup_regs(idx, &id4_regs); + ret = fill_cpuid4_info(idx, &id4); if (ret) return ret; - get_cache_id(cpu, &id4_regs); - ci_leaf_init(this_leaf++, &id4_regs); - __cache_cpumap_setup(cpu, idx, &id4_regs); + + get_cache_id(cpu, &id4); + + if (cpu_vendor == X86_VENDOR_AMD || cpu_vendor == X86_VENDOR_HYGON) + nb = amd_init_l3_cache(idx); + + ci_info_init(ci++, &id4, nb); + __cache_cpumap_setup(cpu, idx, &id4); } - this_cpu_ci->cpu_map_populated = true; + this_cpu_ci->cpu_map_populated = true; return 0; } @@ -1034,31 +653,33 @@ int populate_cache_leaves(unsigned int cpu) static unsigned long saved_cr4; static DEFINE_RAW_SPINLOCK(cache_disable_lock); +/* + * Cache flushing is the most time-consuming step when programming the + * MTRRs. On many Intel CPUs without known erratas, it can be skipped + * if the CPU declares cache self-snooping support. 
+ */ +static void maybe_flush_caches(void) +{ + if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) + wbinvd(); +} + void cache_disable(void) __acquires(cache_disable_lock) { unsigned long cr0; /* - * Note that this is not ideal - * since the cache is only flushed/disabled for this CPU while the - * MTRRs are changed, but changing this requires more invasive - * changes to the way the kernel boots + * This is not ideal since the cache is only flushed/disabled + * for this CPU while the MTRRs are changed, but changing this + * requires more invasive changes to the way the kernel boots. */ - raw_spin_lock(&cache_disable_lock); /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ cr0 = read_cr0() | X86_CR0_CD; write_cr0(cr0); - /* - * Cache flushing is the most time-consuming step when programming - * the MTRRs. Fortunately, as per the Intel Software Development - * Manual, we can skip it if the processor supports cache self- - * snooping. - */ - if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) - wbinvd(); + maybe_flush_caches(); /* Save value of CR4 and clear Page Global Enable (bit 7) */ if (cpu_feature_enabled(X86_FEATURE_PGE)) { @@ -1073,9 +694,7 @@ void cache_disable(void) __acquires(cache_disable_lock) if (cpu_feature_enabled(X86_FEATURE_MTRR)) mtrr_disable(); - /* Again, only flush caches if we have to. 
*/ - if (!static_cpu_has(X86_FEATURE_SELFSNOOP)) - wbinvd(); + maybe_flush_caches(); } void cache_enable(void) __releases(cache_disable_lock) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 12126adbc3a9..8feb8fd2957a 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -29,7 +29,7 @@ #include <asm/alternative.h> #include <asm/cmdline.h> -#include <asm/cpuid.h> +#include <asm/cpuid/api.h> #include <asm/perf_event.h> #include <asm/mmu_context.h> #include <asm/doublefault.h> @@ -148,7 +148,7 @@ static void ppin_init(struct cpuinfo_x86 *c) */ info = (struct ppin_info *)id->driver_data; - if (rdmsrl_safe(info->msr_ppin_ctl, &val)) + if (rdmsrq_safe(info->msr_ppin_ctl, &val)) goto clear_ppin; if ((val & 3UL) == 1UL) { @@ -158,13 +158,13 @@ static void ppin_init(struct cpuinfo_x86 *c) /* If PPIN is disabled, try to enable */ if (!(val & 2UL)) { - wrmsrl_safe(info->msr_ppin_ctl, val | 2UL); - rdmsrl_safe(info->msr_ppin_ctl, &val); + wrmsrq_safe(info->msr_ppin_ctl, val | 2UL); + rdmsrq_safe(info->msr_ppin_ctl, &val); } /* Is the enable bit set? 
*/ if (val & 2UL) { - c->ppin = __rdmsr(info->msr_ppin); + c->ppin = native_rdmsrq(info->msr_ppin); set_cpu_cap(c, info->feature); return; } @@ -242,6 +242,7 @@ DEFINE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page) = { .gdt = { #endif } }; EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); +SYM_PIC_ALIAS(gdt_page); #ifdef CONFIG_X86_64 static int __init x86_nopcid_setup(char *s) @@ -321,7 +322,7 @@ static int __init cachesize_setup(char *str) __setup("cachesize=", cachesize_setup); /* Probe for the CPUID instruction */ -bool have_cpuid_p(void) +bool cpuid_feature(void) { return flag_is_changeable_p(X86_EFLAGS_ID); } @@ -562,9 +563,9 @@ __noendbr u64 ibt_save(bool disable) u64 msr = 0; if (cpu_feature_enabled(X86_FEATURE_IBT)) { - rdmsrl(MSR_IA32_S_CET, msr); + rdmsrq(MSR_IA32_S_CET, msr); if (disable) - wrmsrl(MSR_IA32_S_CET, msr & ~CET_ENDBR_EN); + wrmsrq(MSR_IA32_S_CET, msr & ~CET_ENDBR_EN); } return msr; @@ -575,10 +576,10 @@ __noendbr void ibt_restore(u64 save) u64 msr; if (cpu_feature_enabled(X86_FEATURE_IBT)) { - rdmsrl(MSR_IA32_S_CET, msr); + rdmsrq(MSR_IA32_S_CET, msr); msr &= ~CET_ENDBR_EN; msr |= (save & CET_ENDBR_EN); - wrmsrl(MSR_IA32_S_CET, msr); + wrmsrq(MSR_IA32_S_CET, msr); } } @@ -602,15 +603,15 @@ static __always_inline void setup_cet(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_USER_SHSTK); if (kernel_ibt) - wrmsrl(MSR_IA32_S_CET, CET_ENDBR_EN); + wrmsrq(MSR_IA32_S_CET, CET_ENDBR_EN); else - wrmsrl(MSR_IA32_S_CET, 0); + wrmsrq(MSR_IA32_S_CET, 0); cr4_set_bits(X86_CR4_CET); if (kernel_ibt && ibt_selftest()) { pr_err("IBT selftest: Failed!\n"); - wrmsrl(MSR_IA32_S_CET, 0); + wrmsrq(MSR_IA32_S_CET, 0); setup_clear_cpu_cap(X86_FEATURE_IBT); } } @@ -621,8 +622,8 @@ __noendbr void cet_disable(void) cpu_feature_enabled(X86_FEATURE_SHSTK))) return; - wrmsrl(MSR_IA32_S_CET, 0); - wrmsrl(MSR_IA32_U_CET, 0); + wrmsrq(MSR_IA32_S_CET, 0); + wrmsrq(MSR_IA32_U_CET, 0); } /* @@ -751,9 +752,9 @@ void __init switch_gdt_and_percpu_base(int cpu) * No need to load %gs. 
It is already correct. * * Writing %gs on 64bit would zero GSBASE which would make any per - * CPU operation up to the point of the wrmsrl() fault. + * CPU operation up to the point of the wrmsrq() fault. * - * Set GSBASE to the new offset. Until the wrmsrl() happens the + * Set GSBASE to the new offset. Until the wrmsrq() happens the * early mapping is still valid. That means the GSBASE update will * lose any prior per CPU data which was not copied over in * setup_per_cpu_areas(). @@ -761,7 +762,7 @@ void __init switch_gdt_and_percpu_base(int cpu) * This works even with stackprotector enabled because the * per CPU stack canary is 0 in both per CPU areas. */ - wrmsrl(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); + wrmsrq(MSR_GS_BASE, cpu_kernelmode_gs_base(cpu)); #else /* * %fs is already set to __KERNEL_PERCPU, but after switching GDT @@ -1005,17 +1006,18 @@ void get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[CPUID_D_1_EAX] = eax; } - /* AMD-defined flags: level 0x80000001 */ + /* + * Check if extended CPUID leaves are implemented: Max extended + * CPUID leaf must be in the 0x80000001-0x8000ffff range. + */ eax = cpuid_eax(0x80000000); - c->extended_cpuid_level = eax; + c->extended_cpuid_level = ((eax & 0xffff0000) == 0x80000000) ? 
eax : 0; - if ((eax & 0xffff0000) == 0x80000000) { - if (eax >= 0x80000001) { - cpuid(0x80000001, &eax, &ebx, &ecx, &edx); + if (c->extended_cpuid_level >= 0x80000001) { + cpuid(0x80000001, &eax, &ebx, &ecx, &edx); - c->x86_capability[CPUID_8000_0001_ECX] = ecx; - c->x86_capability[CPUID_8000_0001_EDX] = edx; - } + c->x86_capability[CPUID_8000_0001_ECX] = ecx; + c->x86_capability[CPUID_8000_0001_EDX] = edx; } if (c->extended_cpuid_level >= 0x80000007) { @@ -1227,6 +1229,10 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define GDS BIT(6) /* CPU is affected by Register File Data Sampling */ #define RFDS BIT(7) +/* CPU is affected by Indirect Target Selection */ +#define ITS BIT(8) +/* CPU is affected by Indirect Target Selection, but guest-host isolation is not affected */ +#define ITS_NATIVE_ONLY BIT(9) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPS(INTEL_IVYBRIDGE, X86_STEP_MAX, SRBDS), @@ -1238,22 +1244,25 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPS(INTEL_BROADWELL_G, X86_STEP_MAX, SRBDS), VULNBL_INTEL_STEPS(INTEL_BROADWELL_X, X86_STEP_MAX, MMIO), VULNBL_INTEL_STEPS(INTEL_BROADWELL, X86_STEP_MAX, SRBDS), - VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, 0x5, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPS(INTEL_SKYLAKE_X, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS), VULNBL_INTEL_STEPS(INTEL_SKYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), VULNBL_INTEL_STEPS(INTEL_SKYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, 0xb, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE_L, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE, 0xc, 
MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPS(INTEL_KABYLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | SRBDS | ITS), VULNBL_INTEL_STEPS(INTEL_CANNONLAKE_L, X86_STEP_MAX, RETBLEED), - VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS), - VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS), - VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED), - VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS), - VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS), + VULNBL_INTEL_STEPS(INTEL_ICELAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPS(INTEL_ICELAKE_D, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPS(INTEL_ICELAKE_X, X86_STEP_MAX, MMIO | GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, 0x0, MMIO | RETBLEED | ITS), + VULNBL_INTEL_STEPS(INTEL_COMETLAKE_L, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), + VULNBL_INTEL_STEPS(INTEL_TIGERLAKE_L, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPS(INTEL_TIGERLAKE, X86_STEP_MAX, GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_STEPS(INTEL_LAKEFIELD, X86_STEP_MAX, MMIO | MMIO_SBDS | RETBLEED), - VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPS(INTEL_ROCKETLAKE, X86_STEP_MAX, MMIO | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_TYPE(INTEL_ALDERLAKE, ATOM, RFDS), VULNBL_INTEL_STEPS(INTEL_ALDERLAKE_L, X86_STEP_MAX, RFDS), VULNBL_INTEL_TYPE(INTEL_RAPTORLAKE, ATOM, RFDS), @@ -1288,7 +1297,7 @@ u64 x86_read_arch_cap_msr(void) u64 x86_arch_cap_msr = 0; if 
(boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) - rdmsrl(MSR_IA32_ARCH_CAPABILITIES, x86_arch_cap_msr); + rdmsrq(MSR_IA32_ARCH_CAPABILITIES, x86_arch_cap_msr); return x86_arch_cap_msr; } @@ -1318,10 +1327,78 @@ static bool __init vulnerable_to_rfds(u64 x86_arch_cap_msr) return cpu_matches(cpu_vuln_blacklist, RFDS); } +static bool __init vulnerable_to_its(u64 x86_arch_cap_msr) +{ + /* The "immunity" bit trumps everything else: */ + if (x86_arch_cap_msr & ARCH_CAP_ITS_NO) + return false; + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return false; + + /* None of the affected CPUs have BHI_CTRL */ + if (boot_cpu_has(X86_FEATURE_BHI_CTRL)) + return false; + + /* + * If a VMM did not expose ITS_NO, assume that a guest could + * be running on a vulnerable hardware or may migrate to such + * hardware. + */ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return true; + + if (cpu_matches(cpu_vuln_blacklist, ITS)) + return true; + + return false; +} + +static struct x86_cpu_id cpu_latest_microcode[] = { +#include "microcode/intel-ucode-defs.h" + {} +}; + +static bool __init cpu_has_old_microcode(void) +{ + const struct x86_cpu_id *m = x86_match_cpu(cpu_latest_microcode); + + /* Give unknown CPUs a pass: */ + if (!m) { + /* Intel CPUs should be in the list. Warn if not: */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + pr_info("x86/CPU: Model not found in latest microcode list\n"); + return false; + } + + /* + * Hosts usually lie to guests with a super high microcode + * version. 
Just ignore what hosts tell guests: + */ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return false; + + /* Consider all debug microcode to be old: */ + if (boot_cpu_data.microcode & BIT(31)) + return true; + + /* Give new microcode a pass: */ + if (boot_cpu_data.microcode >= m->driver_data) + return false; + + /* Uh oh, too old: */ + return true; +} + static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { u64 x86_arch_cap_msr = x86_read_arch_cap_msr(); + if (cpu_has_old_microcode()) { + pr_warn("x86/CPU: Running old microcode\n"); + setup_force_cpu_bug(X86_BUG_OLD_MICROCODE); + add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); + } + /* Set ITLB_MULTIHIT bug if cpu is not in the whitelist and not mitigated */ if (!cpu_matches(cpu_vuln_whitelist, NO_ITLB_MULTIHIT) && !(x86_arch_cap_msr & ARCH_CAP_PSCHANGE_MC_NO)) @@ -1402,15 +1479,10 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * Affected CPU list is generally enough to enumerate the vulnerability, * but for virtualization case check for ARCH_CAP MSR bits also, VMM may * not want the guest to enumerate the bug. - * - * Set X86_BUG_MMIO_UNKNOWN for CPUs that are neither in the blacklist, - * nor in the whitelist and also don't enumerate MSR ARCH_CAP MMIO bits. */ if (!arch_cap_mmio_immune(x86_arch_cap_msr)) { if (cpu_matches(cpu_vuln_blacklist, MMIO)) setup_force_cpu_bug(X86_BUG_MMIO_STALE_DATA); - else if (!cpu_matches(cpu_vuln_whitelist, NO_MMIO)) - setup_force_cpu_bug(X86_BUG_MMIO_UNKNOWN); } if (!cpu_has(c, X86_FEATURE_BTC_NO)) { @@ -1439,9 +1511,12 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (vulnerable_to_rfds(x86_arch_cap_msr)) setup_force_cpu_bug(X86_BUG_RFDS); - /* When virtualized, eIBRS could be hidden, assume vulnerable */ - if (!(x86_arch_cap_msr & ARCH_CAP_BHI_NO) && - !cpu_matches(cpu_vuln_whitelist, NO_BHI) && + /* + * Intel parts with eIBRS are vulnerable to BHI attacks. 
Parts with + * BHI_NO still need to use the BHI mitigation to prevent Intra-mode + * attacks. When virtualized, eIBRS could be hidden, assume vulnerable. + */ + if (!cpu_matches(cpu_vuln_whitelist, NO_BHI) && (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED) || boot_cpu_has(X86_FEATURE_HYPERVISOR))) setup_force_cpu_bug(X86_BUG_BHI); @@ -1449,6 +1524,12 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_AMD_IBPB) && !cpu_has(c, X86_FEATURE_AMD_IBPB_RET)) setup_force_cpu_bug(X86_BUG_IBPB_NO_RET); + if (vulnerable_to_its(x86_arch_cap_msr)) { + setup_force_cpu_bug(X86_BUG_ITS); + if (cpu_matches(cpu_vuln_blacklist, ITS_NATIVE_ONLY)) + setup_force_cpu_bug(X86_BUG_ITS_NATIVE_ONLY); + } + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; @@ -1630,11 +1711,11 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) memset(&c->x86_capability, 0, sizeof(c->x86_capability)); c->extended_cpuid_level = 0; - if (!have_cpuid_p()) + if (!cpuid_feature()) identify_cpu_without_cpuid(c); /* cyrix could have cpuid enabled via c_identify()*/ - if (have_cpuid_p()) { + if (cpuid_feature()) { cpu_detect(c); get_cpu_vendor(c); intel_unlock_cpuid_leafs(c); @@ -1749,11 +1830,11 @@ static bool detect_null_seg_behavior(void) */ unsigned long old_base, tmp; - rdmsrl(MSR_FS_BASE, old_base); - wrmsrl(MSR_FS_BASE, 1); + rdmsrq(MSR_FS_BASE, old_base); + wrmsrq(MSR_FS_BASE, 1); loadsegment(fs, 0); - rdmsrl(MSR_FS_BASE, tmp); - wrmsrl(MSR_FS_BASE, old_base); + rdmsrq(MSR_FS_BASE, tmp); + wrmsrq(MSR_FS_BASE, old_base); return tmp == 0; } @@ -1794,11 +1875,11 @@ static void generic_identify(struct cpuinfo_x86 *c) { c->extended_cpuid_level = 0; - if (!have_cpuid_p()) + if (!cpuid_feature()) identify_cpu_without_cpuid(c); /* cyrix could have cpuid enabled via c_identify()*/ - if (!have_cpuid_p()) + if (!cpuid_feature()) return; cpu_detect(c); @@ -1982,9 +2063,9 @@ void enable_sep_cpu(void) */ tss->x86_tss.ss1 = __KERNEL_CS; - wrmsr(MSR_IA32_SYSENTER_CS, 
tss->x86_tss.ss1, 0); - wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0); - wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); + wrmsrq(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1); + wrmsrq(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1)); + wrmsrq(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32); put_cpu(); } @@ -2091,7 +2172,7 @@ DEFINE_PER_CPU_CACHE_HOT(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_ DEFINE_PER_CPU_CACHE_HOT(u64, __x86_call_depth); EXPORT_PER_CPU_SYMBOL(__x86_call_depth); -static void wrmsrl_cstar(unsigned long val) +static void wrmsrq_cstar(unsigned long val) { /* * Intel CPUs do not support 32-bit SYSCALL. Writing to MSR_CSTAR @@ -2099,37 +2180,37 @@ static void wrmsrl_cstar(unsigned long val) * guest. Avoid the pointless write on all Intel CPUs. */ if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) - wrmsrl(MSR_CSTAR, val); + wrmsrq(MSR_CSTAR, val); } static inline void idt_syscall_init(void) { - wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); + wrmsrq(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); if (ia32_enabled()) { - wrmsrl_cstar((unsigned long)entry_SYSCALL_compat); + wrmsrq_cstar((unsigned long)entry_SYSCALL_compat); /* * This only works on Intel CPUs. * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. * This does not cause SYSENTER to jump to the wrong location, because * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). 
*/ - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, + wrmsrq_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + wrmsrq_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(smp_processor_id()) + 1)); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); + wrmsrq_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); } else { - wrmsrl_cstar((unsigned long)entry_SYSCALL32_ignore); - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); + wrmsrq_cstar((unsigned long)entry_SYSCALL32_ignore); + wrmsrq_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); + wrmsrq_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrq_safe(MSR_IA32_SYSENTER_EIP, 0ULL); } /* * Flags to clear on syscall; clear as much as possible * to minimize user space-kernel interference. */ - wrmsrl(MSR_SYSCALL_MASK, + wrmsrq(MSR_SYSCALL_MASK, X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF| X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF| X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF| @@ -2198,7 +2279,7 @@ static inline void setup_getcpu(int cpu) struct desc_struct d = { }; if (boot_cpu_has(X86_FEATURE_RDTSCP) || boot_cpu_has(X86_FEATURE_RDPID)) - wrmsr(MSR_TSC_AUX, cpudata, 0); + wrmsrq(MSR_TSC_AUX, cpudata); /* Store CPU and node number in limit. 
*/ d.limit0 = cpudata; @@ -2313,8 +2394,8 @@ void cpu_init(void) memset(cur->thread.tls_array, 0, GDT_ENTRY_TLS_ENTRIES * 8); syscall_init(); - wrmsrl(MSR_FS_BASE, 0); - wrmsrl(MSR_KERNEL_GS_BASE, 0); + wrmsrq(MSR_FS_BASE, 0); + wrmsrq(MSR_KERNEL_GS_BASE, 0); barrier(); x2apic_setup(); diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 51deb60a9d26..bc38b2d56f26 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -75,6 +75,15 @@ extern void check_null_seg_clears_base(struct cpuinfo_x86 *c); void cacheinfo_amd_init_llc_id(struct cpuinfo_x86 *c, u16 die_id); void cacheinfo_hygon_init_llc_id(struct cpuinfo_x86 *c); +#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS) +struct amd_northbridge *amd_init_l3_cache(int index); +#else +static inline struct amd_northbridge *amd_init_l3_cache(int index) +{ + return NULL; +} +#endif + unsigned int aperfmperf_get_khz(int cpu); void cpu_select_mitigations(void); diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c index a2fbea0be535..46efcbd6afa4 100644 --- a/arch/x86/kernel/cpu/cpuid-deps.c +++ b/arch/x86/kernel/cpu/cpuid-deps.c @@ -28,6 +28,7 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_PKU, X86_FEATURE_XSAVE }, { X86_FEATURE_MPX, X86_FEATURE_XSAVE }, { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE }, + { X86_FEATURE_APX, X86_FEATURE_XSAVE }, { X86_FEATURE_CMOV, X86_FEATURE_FXSR }, { X86_FEATURE_MMX, X86_FEATURE_FXSR }, { X86_FEATURE_MMXEXT, X86_FEATURE_MMX }, @@ -82,8 +83,12 @@ static const struct cpuid_dep cpuid_deps[] = { { X86_FEATURE_XFD, X86_FEATURE_XSAVES }, { X86_FEATURE_XFD, X86_FEATURE_XGETBV1 }, { X86_FEATURE_AMX_TILE, X86_FEATURE_XFD }, + { X86_FEATURE_AMX_FP16, X86_FEATURE_AMX_TILE }, + { X86_FEATURE_AMX_BF16, X86_FEATURE_AMX_TILE }, + { X86_FEATURE_AMX_INT8, X86_FEATURE_AMX_TILE }, { X86_FEATURE_SHSTK, X86_FEATURE_XSAVES }, { X86_FEATURE_FRED, X86_FEATURE_LKGS }, + { X86_FEATURE_SPEC_CTRL_SSBD, X86_FEATURE_SPEC_CTRL }, {} }; diff 
--git a/arch/x86/kernel/cpu/cpuid_0x2_table.c b/arch/x86/kernel/cpu/cpuid_0x2_table.c new file mode 100644 index 000000000000..89bc8db5e9c6 --- /dev/null +++ b/arch/x86/kernel/cpu/cpuid_0x2_table.c @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/sizes.h> + +#include <asm/cpuid/types.h> + +#include "cpu.h" + +#define CACHE_ENTRY(_desc, _type, _size) \ + [_desc] = { \ + .c_type = (_type), \ + .c_size = (_size) / SZ_1K, \ + } + +#define TLB_ENTRY(_desc, _type, _entries) \ + [_desc] = { \ + .t_type = (_type), \ + .entries = (_entries), \ + } + +const struct leaf_0x2_table cpuid_0x2_table[256] = { + CACHE_ENTRY(0x06, CACHE_L1_INST, SZ_8K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x08, CACHE_L1_INST, SZ_16K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x09, CACHE_L1_INST, SZ_32K ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x0a, CACHE_L1_DATA, SZ_8K ), /* 2 way set assoc, 32 byte line size */ + CACHE_ENTRY(0x0c, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x0d, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x0e, CACHE_L1_DATA, SZ_24K ), /* 6-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x21, CACHE_L2, SZ_256K ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x22, CACHE_L3, SZ_512K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x23, CACHE_L3, SZ_1M ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x25, CACHE_L3, SZ_2M ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x29, CACHE_L3, SZ_4M ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x2c, CACHE_L1_DATA, SZ_32K ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x30, CACHE_L1_INST, SZ_32K ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x39, CACHE_L2, SZ_128K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3a, CACHE_L2, 
SZ_192K ), /* 6-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3b, CACHE_L2, SZ_128K ), /* 2-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3c, CACHE_L2, SZ_256K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3d, CACHE_L2, SZ_384K ), /* 6-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3e, CACHE_L2, SZ_512K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x3f, CACHE_L2, SZ_256K ), /* 2-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x41, CACHE_L2, SZ_128K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x42, CACHE_L2, SZ_256K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x43, CACHE_L2, SZ_512K ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x44, CACHE_L2, SZ_1M ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x45, CACHE_L2, SZ_2M ), /* 4-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x46, CACHE_L3, SZ_4M ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x47, CACHE_L3, SZ_8M ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x48, CACHE_L2, SZ_3M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x49, CACHE_L3, SZ_4M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x4a, CACHE_L3, SZ_6M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x4b, CACHE_L3, SZ_8M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x4c, CACHE_L3, SZ_12M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x4d, CACHE_L3, SZ_16M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x4e, CACHE_L2, SZ_6M ), /* 24-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x60, CACHE_L1_DATA, SZ_16K ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x66, CACHE_L1_DATA, SZ_8K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x67, CACHE_L1_DATA, SZ_16K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + 
CACHE_ENTRY(0x68, CACHE_L1_DATA, SZ_32K ), /* 4-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x78, CACHE_L2, SZ_1M ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x79, CACHE_L2, SZ_128K ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x7a, CACHE_L2, SZ_256K ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x7b, CACHE_L2, SZ_512K ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x7c, CACHE_L2, SZ_1M ), /* 8-way set assoc, sectored cache, 64 byte line size */ + CACHE_ENTRY(0x7d, CACHE_L2, SZ_2M ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x7f, CACHE_L2, SZ_512K ), /* 2-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x80, CACHE_L2, SZ_512K ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x82, CACHE_L2, SZ_256K ), /* 8-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x83, CACHE_L2, SZ_512K ), /* 8-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x84, CACHE_L2, SZ_1M ), /* 8-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x85, CACHE_L2, SZ_2M ), /* 8-way set assoc, 32 byte line size */ + CACHE_ENTRY(0x86, CACHE_L2, SZ_512K ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0x87, CACHE_L2, SZ_1M ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd0, CACHE_L3, SZ_512K ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd1, CACHE_L3, SZ_1M ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd2, CACHE_L3, SZ_2M ), /* 4-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd6, CACHE_L3, SZ_1M ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd7, CACHE_L3, SZ_2M ), /* 8-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xd8, CACHE_L3, SZ_4M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xdc, CACHE_L3, SZ_2M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xdd, CACHE_L3, SZ_4M ), /* 12-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xde, CACHE_L3, SZ_8M ), /* 12-way set 
assoc, 64 byte line size */ + CACHE_ENTRY(0xe2, CACHE_L3, SZ_2M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xe3, CACHE_L3, SZ_4M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xe4, CACHE_L3, SZ_8M ), /* 16-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xea, CACHE_L3, SZ_12M ), /* 24-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xeb, CACHE_L3, SZ_18M ), /* 24-way set assoc, 64 byte line size */ + CACHE_ENTRY(0xec, CACHE_L3, SZ_24M ), /* 24-way set assoc, 64 byte line size */ + + TLB_ENTRY( 0x01, TLB_INST_4K, 32 ), /* TLB_INST 4 KByte pages, 4-way set associative */ + TLB_ENTRY( 0x02, TLB_INST_4M, 2 ), /* TLB_INST 4 MByte pages, full associative */ + TLB_ENTRY( 0x03, TLB_DATA_4K, 64 ), /* TLB_DATA 4 KByte pages, 4-way set associative */ + TLB_ENTRY( 0x04, TLB_DATA_4M, 8 ), /* TLB_DATA 4 MByte pages, 4-way set associative */ + TLB_ENTRY( 0x05, TLB_DATA_4M, 32 ), /* TLB_DATA 4 MByte pages, 4-way set associative */ + TLB_ENTRY( 0x0b, TLB_INST_4M, 4 ), /* TLB_INST 4 MByte pages, 4-way set associative */ + TLB_ENTRY( 0x4f, TLB_INST_4K, 32 ), /* TLB_INST 4 KByte pages */ + TLB_ENTRY( 0x50, TLB_INST_ALL, 64 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ + TLB_ENTRY( 0x51, TLB_INST_ALL, 128 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ + TLB_ENTRY( 0x52, TLB_INST_ALL, 256 ), /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ + TLB_ENTRY( 0x55, TLB_INST_2M_4M, 7 ), /* TLB_INST 2-MByte or 4-MByte pages, fully associative */ + TLB_ENTRY( 0x56, TLB_DATA0_4M, 16 ), /* TLB_DATA0 4 MByte pages, 4-way set associative */ + TLB_ENTRY( 0x57, TLB_DATA0_4K, 16 ), /* TLB_DATA0 4 KByte pages, 4-way associative */ + TLB_ENTRY( 0x59, TLB_DATA0_4K, 16 ), /* TLB_DATA0 4 KByte pages, fully associative */ + TLB_ENTRY( 0x5a, TLB_DATA0_2M_4M, 32 ), /* TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative */ + TLB_ENTRY( 0x5b, TLB_DATA_4K_4M, 64 ), /* TLB_DATA 4 KByte and 4 MByte pages */ + TLB_ENTRY( 0x5c, TLB_DATA_4K_4M, 128 ), /* 
TLB_DATA 4 KByte and 4 MByte pages */ + TLB_ENTRY( 0x5d, TLB_DATA_4K_4M, 256 ), /* TLB_DATA 4 KByte and 4 MByte pages */ + TLB_ENTRY( 0x61, TLB_INST_4K, 48 ), /* TLB_INST 4 KByte pages, full associative */ + TLB_ENTRY( 0x63, TLB_DATA_1G_2M_4M, 4 ), /* TLB_DATA 1 GByte pages, 4-way set associative + * (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here) */ + TLB_ENTRY( 0x6b, TLB_DATA_4K, 256 ), /* TLB_DATA 4 KByte pages, 8-way associative */ + TLB_ENTRY( 0x6c, TLB_DATA_2M_4M, 128 ), /* TLB_DATA 2 MByte or 4 MByte pages, 8-way associative */ + TLB_ENTRY( 0x6d, TLB_DATA_1G, 16 ), /* TLB_DATA 1 GByte pages, fully associative */ + TLB_ENTRY( 0x76, TLB_INST_2M_4M, 8 ), /* TLB_INST 2-MByte or 4-MByte pages, fully associative */ + TLB_ENTRY( 0xb0, TLB_INST_4K, 128 ), /* TLB_INST 4 KByte pages, 4-way set associative */ + TLB_ENTRY( 0xb1, TLB_INST_2M_4M, 4 ), /* TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries */ + TLB_ENTRY( 0xb2, TLB_INST_4K, 64 ), /* TLB_INST 4KByte pages, 4-way set associative */ + TLB_ENTRY( 0xb3, TLB_DATA_4K, 128 ), /* TLB_DATA 4 KByte pages, 4-way set associative */ + TLB_ENTRY( 0xb4, TLB_DATA_4K, 256 ), /* TLB_DATA 4 KByte pages, 4-way associative */ + TLB_ENTRY( 0xb5, TLB_INST_4K, 64 ), /* TLB_INST 4 KByte pages, 8-way set associative */ + TLB_ENTRY( 0xb6, TLB_INST_4K, 128 ), /* TLB_INST 4 KByte pages, 8-way set associative */ + TLB_ENTRY( 0xba, TLB_DATA_4K, 64 ), /* TLB_DATA 4 KByte pages, 4-way associative */ + TLB_ENTRY( 0xc0, TLB_DATA_4K_4M, 8 ), /* TLB_DATA 4 KByte and 4 MByte pages, 4-way associative */ + TLB_ENTRY( 0xc1, STLB_4K_2M, 1024 ), /* STLB 4 KByte and 2 MByte pages, 8-way associative */ + TLB_ENTRY( 0xc2, TLB_DATA_2M_4M, 16 ), /* TLB_DATA 2 MByte/4MByte pages, 4-way associative */ + TLB_ENTRY( 0xca, STLB_4K, 512 ), /* STLB 4 KByte pages, 4-way associative */ +}; diff --git a/arch/x86/kernel/cpu/feat_ctl.c b/arch/x86/kernel/cpu/feat_ctl.c index 4a4118784c13..d69757246bde 100644 --- 
a/arch/x86/kernel/cpu/feat_ctl.c +++ b/arch/x86/kernel/cpu/feat_ctl.c @@ -4,6 +4,7 @@ #include <asm/cpu.h> #include <asm/cpufeature.h> #include <asm/msr-index.h> +#include <asm/msr.h> #include <asm/processor.h> #include <asm/vmx.h> @@ -118,7 +119,7 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) bool enable_vmx; u64 msr; - if (rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr)) { + if (rdmsrq_safe(MSR_IA32_FEAT_CTL, &msr)) { clear_cpu_cap(c, X86_FEATURE_VMX); clear_cpu_cap(c, X86_FEATURE_SGX); return; @@ -165,7 +166,7 @@ void init_ia32_feat_ctl(struct cpuinfo_x86 *c) msr |= FEAT_CTL_SGX_LC_ENABLED; } - wrmsrl(MSR_IA32_FEAT_CTL, msr); + wrmsrq(MSR_IA32_FEAT_CTL, msr); update_caps: set_cpu_cap(c, X86_FEATURE_MSR_IA32_FEAT_CTL); diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 6af4a4a90a52..2154f12766fb 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -15,6 +15,7 @@ #include <asm/cacheinfo.h> #include <asm/spec-ctrl.h> #include <asm/delay.h> +#include <asm/msr.h> #include "cpu.h" @@ -96,7 +97,7 @@ static void bsp_init_hygon(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { u64 val; - rdmsrl(MSR_K7_HWCR, val); + rdmsrq(MSR_K7_HWCR, val); if (!(val & BIT(24))) pr_warn(FW_BUG "TSC doesn't count with P0 frequency!\n"); } @@ -110,7 +111,7 @@ static void bsp_init_hygon(struct cpuinfo_x86 *c) * Try to cache the base value so further operations can * avoid RMW. If that faults, do not enable SSBD. 
*/ - if (!rdmsrl_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { + if (!rdmsrq_safe(MSR_AMD64_LS_CFG, &x86_amd_ls_cfg_base)) { setup_force_cpu_cap(X86_FEATURE_LS_CFG_SSBD); setup_force_cpu_cap(X86_FEATURE_SSBD); x86_amd_ls_cfg_ssbd_mask = 1ULL << 10; @@ -194,7 +195,7 @@ static void init_hygon(struct cpuinfo_x86 *c) init_hygon_cacheinfo(c); if (cpu_has(c, X86_FEATURE_SVM)) { - rdmsrl(MSR_VM_CR, vm_cr); + rdmsrq(MSR_VM_CR, vm_cr); if (vm_cr & SVM_VM_CR_SVM_DIS_MASK) { pr_notice_once("SVM disabled (by BIOS) in MSR_VM_CR\n"); clear_cpu_cap(c, X86_FEATURE_SVM); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index cdc9813871ef..076eaa41b8c8 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -6,6 +6,7 @@ #include <linux/minmax.h> #include <linux/smp.h> #include <linux/string.h> +#include <linux/types.h> #ifdef CONFIG_X86_64 #include <linux/topology.h> @@ -15,6 +16,7 @@ #include <asm/cpu_device_id.h> #include <asm/cpufeature.h> #include <asm/cpu.h> +#include <asm/cpuid/api.h> #include <asm/hwcap2.h> #include <asm/intel-family.h> #include <asm/microcode.h> @@ -157,7 +159,7 @@ static void detect_tme_early(struct cpuinfo_x86 *c) u64 tme_activate; int keyid_bits; - rdmsrl(MSR_IA32_TME_ACTIVATE, tme_activate); + rdmsrq(MSR_IA32_TME_ACTIVATE, tme_activate); if (!TME_ACTIVATE_LOCKED(tme_activate) || !TME_ACTIVATE_ENABLED(tme_activate)) { pr_info_once("x86/tme: not enabled by BIOS\n"); @@ -299,7 +301,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) * string flag and enhanced fast string capabilities accordingly. 
*/ if (c->x86_vfm >= INTEL_PENTIUM_M_DOTHAN) { - rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); + rdmsrq(MSR_IA32_MISC_ENABLE, misc_enable); if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { /* X86_FEATURE_ERMS is set based on CPUID */ set_cpu_cap(c, X86_FEATURE_REP_GOOD); @@ -488,7 +490,7 @@ static void init_cpuid_fault(struct cpuinfo_x86 *c) { u64 msr; - if (!rdmsrl_safe(MSR_PLATFORM_INFO, &msr)) { + if (!rdmsrq_safe(MSR_PLATFORM_INFO, &msr)) { if (msr & MSR_PLATFORM_INFO_CPUID_FAULT) set_cpu_cap(c, X86_FEATURE_CPUID_FAULT); } @@ -498,7 +500,7 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) { u64 msr; - if (rdmsrl_safe(MSR_MISC_FEATURES_ENABLES, &msr)) + if (rdmsrq_safe(MSR_MISC_FEATURES_ENABLES, &msr)) return; /* Clear all MISC features */ @@ -509,7 +511,7 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) probe_xeon_phi_r3mwait(c); msr = this_cpu_read(msr_misc_features_shadow); - wrmsrl(MSR_MISC_FEATURES_ENABLES, msr); + wrmsrq(MSR_MISC_FEATURES_ENABLES, msr); } /* @@ -646,103 +648,11 @@ static unsigned int intel_size_cache(struct cpuinfo_x86 *c, unsigned int size) } #endif -#define TLB_INST_4K 0x01 -#define TLB_INST_4M 0x02 -#define TLB_INST_2M_4M 0x03 - -#define TLB_INST_ALL 0x05 -#define TLB_INST_1G 0x06 - -#define TLB_DATA_4K 0x11 -#define TLB_DATA_4M 0x12 -#define TLB_DATA_2M_4M 0x13 -#define TLB_DATA_4K_4M 0x14 - -#define TLB_DATA_1G 0x16 -#define TLB_DATA_1G_2M_4M 0x17 - -#define TLB_DATA0_4K 0x21 -#define TLB_DATA0_4M 0x22 -#define TLB_DATA0_2M_4M 0x23 - -#define STLB_4K 0x41 -#define STLB_4K_2M 0x42 - -/* - * All of leaf 0x2's one-byte TLB descriptors implies the same number of - * entries for their respective TLB types. The 0x63 descriptor is an - * exception: it implies 4 dTLB entries for 1GB pages 32 dTLB entries - * for 2MB or 4MB pages. Encode descriptor 0x63 dTLB entry count for - * 2MB/4MB pages here, as its count for dTLB 1GB pages is already at the - * intel_tlb_table[] mapping. 
- */ -#define TLB_0x63_2M_4M_ENTRIES 32 - -struct _tlb_table { - unsigned char descriptor; - char tlb_type; - unsigned int entries; -}; - -static const struct _tlb_table intel_tlb_table[] = { - { 0x01, TLB_INST_4K, 32}, /* TLB_INST 4 KByte pages, 4-way set associative */ - { 0x02, TLB_INST_4M, 2}, /* TLB_INST 4 MByte pages, full associative */ - { 0x03, TLB_DATA_4K, 64}, /* TLB_DATA 4 KByte pages, 4-way set associative */ - { 0x04, TLB_DATA_4M, 8}, /* TLB_DATA 4 MByte pages, 4-way set associative */ - { 0x05, TLB_DATA_4M, 32}, /* TLB_DATA 4 MByte pages, 4-way set associative */ - { 0x0b, TLB_INST_4M, 4}, /* TLB_INST 4 MByte pages, 4-way set associative */ - { 0x4f, TLB_INST_4K, 32}, /* TLB_INST 4 KByte pages */ - { 0x50, TLB_INST_ALL, 64}, /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ - { 0x51, TLB_INST_ALL, 128}, /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ - { 0x52, TLB_INST_ALL, 256}, /* TLB_INST 4 KByte and 2-MByte or 4-MByte pages */ - { 0x55, TLB_INST_2M_4M, 7}, /* TLB_INST 2-MByte or 4-MByte pages, fully associative */ - { 0x56, TLB_DATA0_4M, 16}, /* TLB_DATA0 4 MByte pages, 4-way set associative */ - { 0x57, TLB_DATA0_4K, 16}, /* TLB_DATA0 4 KByte pages, 4-way associative */ - { 0x59, TLB_DATA0_4K, 16}, /* TLB_DATA0 4 KByte pages, fully associative */ - { 0x5a, TLB_DATA0_2M_4M, 32}, /* TLB_DATA0 2-MByte or 4 MByte pages, 4-way set associative */ - { 0x5b, TLB_DATA_4K_4M, 64}, /* TLB_DATA 4 KByte and 4 MByte pages */ - { 0x5c, TLB_DATA_4K_4M, 128}, /* TLB_DATA 4 KByte and 4 MByte pages */ - { 0x5d, TLB_DATA_4K_4M, 256}, /* TLB_DATA 4 KByte and 4 MByte pages */ - { 0x61, TLB_INST_4K, 48}, /* TLB_INST 4 KByte pages, full associative */ - { 0x63, TLB_DATA_1G_2M_4M, 4}, /* TLB_DATA 1 GByte pages, 4-way set associative - * (plus 32 entries TLB_DATA 2 MByte or 4 MByte pages, not encoded here) */ - { 0x6b, TLB_DATA_4K, 256}, /* TLB_DATA 4 KByte pages, 8-way associative */ - { 0x6c, TLB_DATA_2M_4M, 128}, /* TLB_DATA 2 MByte or 4 MByte pages, 8-way 
associative */ - { 0x6d, TLB_DATA_1G, 16}, /* TLB_DATA 1 GByte pages, fully associative */ - { 0x76, TLB_INST_2M_4M, 8}, /* TLB_INST 2-MByte or 4-MByte pages, fully associative */ - { 0xb0, TLB_INST_4K, 128}, /* TLB_INST 4 KByte pages, 4-way set associative */ - { 0xb1, TLB_INST_2M_4M, 4}, /* TLB_INST 2M pages, 4-way, 8 entries or 4M pages, 4-way entries */ - { 0xb2, TLB_INST_4K, 64}, /* TLB_INST 4KByte pages, 4-way set associative */ - { 0xb3, TLB_DATA_4K, 128}, /* TLB_DATA 4 KByte pages, 4-way set associative */ - { 0xb4, TLB_DATA_4K, 256}, /* TLB_DATA 4 KByte pages, 4-way associative */ - { 0xb5, TLB_INST_4K, 64}, /* TLB_INST 4 KByte pages, 8-way set associative */ - { 0xb6, TLB_INST_4K, 128}, /* TLB_INST 4 KByte pages, 8-way set associative */ - { 0xba, TLB_DATA_4K, 64}, /* TLB_DATA 4 KByte pages, 4-way associative */ - { 0xc0, TLB_DATA_4K_4M, 8}, /* TLB_DATA 4 KByte and 4 MByte pages, 4-way associative */ - { 0xc1, STLB_4K_2M, 1024}, /* STLB 4 KByte and 2 MByte pages, 8-way associative */ - { 0xc2, TLB_DATA_2M_4M, 16}, /* TLB_DATA 2 MByte/4MByte pages, 4-way associative */ - { 0xca, STLB_4K, 512}, /* STLB 4 KByte pages, 4-way associative */ - { 0x00, 0, 0 } -}; - -static void intel_tlb_lookup(const unsigned char desc) +static void intel_tlb_lookup(const struct leaf_0x2_table *desc) { - unsigned int entries; - unsigned char k; - - if (desc == 0) - return; - - /* look up this descriptor in the table */ - for (k = 0; intel_tlb_table[k].descriptor != desc && - intel_tlb_table[k].descriptor != 0; k++) - ; + short entries = desc->entries; - if (intel_tlb_table[k].tlb_type == 0) - return; - - entries = intel_tlb_table[k].entries; - switch (intel_tlb_table[k].tlb_type) { + switch (desc->t_type) { case STLB_4K: tlb_lli_4k = max(tlb_lli_4k, entries); tlb_lld_4k = max(tlb_lld_4k, entries); @@ -799,28 +709,16 @@ static void intel_tlb_lookup(const unsigned char desc) static void intel_detect_tlb(struct cpuinfo_x86 *c) { - int i, j, n; - unsigned int regs[4]; - unsigned 
char *desc = (unsigned char *)regs; + const struct leaf_0x2_table *desc; + union leaf_0x2_regs regs; + u8 *ptr; if (c->cpuid_level < 2) return; - /* Number of times to iterate */ - n = cpuid_eax(2) & 0xFF; - - for (i = 0 ; i < n ; i++) { - cpuid(2, &regs[0], &regs[1], &regs[2], &regs[3]); - - /* If bit 31 is set, this is an unknown format */ - for (j = 0 ; j < 4 ; j++) - if (regs[j] & (1 << 31)) - regs[j] = 0; - - /* Byte 0 is level count, not a descriptor */ - for (j = 1 ; j < 16 ; j++) - intel_tlb_lookup(desc[j]); - } + cpuid_leaf_0x2(&regs); + for_each_cpuid_0x2_desc(regs, ptr, desc) + intel_tlb_lookup(desc); } static const struct cpu_dev intel_cpu_dev = { diff --git a/arch/x86/kernel/cpu/intel_epb.c b/arch/x86/kernel/cpu/intel_epb.c index 30b1d63b97f3..bc7671f920a7 100644 --- a/arch/x86/kernel/cpu/intel_epb.c +++ b/arch/x86/kernel/cpu/intel_epb.c @@ -79,7 +79,7 @@ static int intel_epb_save(void) { u64 epb; - rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + rdmsrq(MSR_IA32_ENERGY_PERF_BIAS, epb); /* * Ensure that saved_epb will always be nonzero after this write even if * the EPB value read from the MSR is 0.
@@ -94,7 +94,7 @@ static void intel_epb_restore(void) u64 val = this_cpu_read(saved_epb); u64 epb; - rdmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb); + rdmsrq(MSR_IA32_ENERGY_PERF_BIAS, epb); if (val) { val &= EPB_MASK; } else { @@ -111,7 +111,7 @@ static void intel_epb_restore(void) pr_warn_once("ENERGY_PERF_BIAS: Set to 'normal', was 'performance'\n"); } } - wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val); + wrmsrq(MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val); } static struct syscore_ops intel_epb_syscore_ops = { @@ -135,7 +135,7 @@ static ssize_t energy_perf_bias_show(struct device *dev, u64 epb; int ret; - ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + ret = rdmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); if (ret < 0) return ret; @@ -157,11 +157,11 @@ static ssize_t energy_perf_bias_store(struct device *dev, else if (kstrtou64(buf, 0, &val) || val > MAX_EPB) return -EINVAL; - ret = rdmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); + ret = rdmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, &epb); if (ret < 0) return ret; - ret = wrmsrl_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, + ret = wrmsrq_on_cpu(cpu, MSR_IA32_ENERGY_PERF_BIAS, (epb & ~EPB_MASK) | val); if (ret < 0) return ret; diff --git a/arch/x86/kernel/cpu/mce/amd.c b/arch/x86/kernel/cpu/mce/amd.c index 1075a90141da..9d852c3b2cb5 100644 --- a/arch/x86/kernel/cpu/mce/amd.c +++ b/arch/x86/kernel/cpu/mce/amd.c @@ -662,12 +662,12 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) return; } - rdmsrl(MSR_K7_HWCR, hwcr); + rdmsrq(MSR_K7_HWCR, hwcr); /* McStatusWrEn has to be set */ need_toggle = !(hwcr & BIT(18)); if (need_toggle) - wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); + wrmsrq(MSR_K7_HWCR, hwcr | BIT(18)); /* Clear CntP bit safely */ for (i = 0; i < num_msrs; i++) @@ -675,7 +675,7 @@ static void disable_err_thresholding(struct cpuinfo_x86 *c, unsigned int bank) /* restore old settings */ if (need_toggle) - wrmsrl(MSR_K7_HWCR, hwcr); + wrmsrq(MSR_K7_HWCR, 
hwcr); } /* cpu init entry point, called from mce.c with preempt off */ @@ -805,12 +805,12 @@ static void __log_error(unsigned int bank, u64 status, u64 addr, u64 misc) } if (mce_flags.smca) { - rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid); + rdmsrq(MSR_AMD64_SMCA_MCx_IPID(bank), m->ipid); if (m->status & MCI_STATUS_SYNDV) { - rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd); - rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1); - rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2); + rdmsrq(MSR_AMD64_SMCA_MCx_SYND(bank), m->synd); + rdmsrq(MSR_AMD64_SMCA_MCx_SYND1(bank), err.vendor.amd.synd1); + rdmsrq(MSR_AMD64_SMCA_MCx_SYND2(bank), err.vendor.amd.synd2); } } @@ -834,16 +834,16 @@ _log_error_bank(unsigned int bank, u32 msr_stat, u32 msr_addr, u64 misc) { u64 status, addr = 0; - rdmsrl(msr_stat, status); + rdmsrq(msr_stat, status); if (!(status & MCI_STATUS_VAL)) return false; if (status & MCI_STATUS_ADDRV) - rdmsrl(msr_addr, addr); + rdmsrq(msr_addr, addr); __log_error(bank, status, addr, misc); - wrmsrl(msr_stat, 0); + wrmsrq(msr_stat, 0); return status & MCI_STATUS_DEFERRED; } @@ -862,7 +862,7 @@ static bool _log_error_deferred(unsigned int bank, u32 misc) return true; /* Clear MCA_DESTAT if the deferred error was logged from MCA_STATUS. 
*/ - wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); + wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(bank), 0); return true; } diff --git a/arch/x86/kernel/cpu/mce/core.c b/arch/x86/kernel/cpu/mce/core.c index f6fd71b64b66..e9b3c5d4a52e 100644 --- a/arch/x86/kernel/cpu/mce/core.c +++ b/arch/x86/kernel/cpu/mce/core.c @@ -121,7 +121,7 @@ void mce_prep_record_common(struct mce *m) { m->cpuid = cpuid_eax(1); m->cpuvendor = boot_cpu_data.x86_vendor; - m->mcgcap = __rdmsr(MSR_IA32_MCG_CAP); + m->mcgcap = native_rdmsrq(MSR_IA32_MCG_CAP); /* need the internal __ version to avoid deadlocks */ m->time = __ktime_get_real_seconds(); } @@ -388,9 +388,9 @@ void ex_handler_msr_mce(struct pt_regs *regs, bool wrmsr) } /* MSR access wrappers used for error injection */ -noinstr u64 mce_rdmsrl(u32 msr) +noinstr u64 mce_rdmsrq(u32 msr) { - DECLARE_ARGS(val, low, high); + EAX_EDX_DECLARE_ARGS(val, low, high); if (__this_cpu_read(injectm.finished)) { int offset; @@ -423,7 +423,7 @@ noinstr u64 mce_rdmsrl(u32 msr) return EAX_EDX_VAL(val, low, high); } -static noinstr void mce_wrmsrl(u32 msr, u64 v) +static noinstr void mce_wrmsrq(u32 msr, u64 v) { u32 low, high; @@ -444,7 +444,7 @@ static noinstr void mce_wrmsrl(u32 msr, u64 v) low = (u32)v; high = (u32)(v >> 32); - /* See comment in mce_rdmsrl() */ + /* See comment in mce_rdmsrq() */ asm volatile("1: wrmsr\n" "2:\n" _ASM_EXTABLE_TYPE(1b, 2b, EX_TYPE_WRMSR_IN_MCE) @@ -468,7 +468,7 @@ static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs instrumentation_end(); m = &err->m; - m->mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); + m->mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS); if (regs) { /* * Get the address of the instruction at the time of @@ -488,7 +488,7 @@ static noinstr void mce_gather_info(struct mce_hw_err *err, struct pt_regs *regs } /* Use accurate RIP reporting if available. 
*/ if (mca_cfg.rip_msr) - m->ip = mce_rdmsrl(mca_cfg.rip_msr); + m->ip = mce_rdmsrq(mca_cfg.rip_msr); } } @@ -684,10 +684,10 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i) struct mce *m = &err->m; if (m->status & MCI_STATUS_MISCV) - m->misc = mce_rdmsrl(mca_msr_reg(i, MCA_MISC)); + m->misc = mce_rdmsrq(mca_msr_reg(i, MCA_MISC)); if (m->status & MCI_STATUS_ADDRV) { - m->addr = mce_rdmsrl(mca_msr_reg(i, MCA_ADDR)); + m->addr = mce_rdmsrq(mca_msr_reg(i, MCA_ADDR)); /* * Mask the reported address by the reported granularity. @@ -702,12 +702,12 @@ static noinstr void mce_read_aux(struct mce_hw_err *err, int i) } if (mce_flags.smca) { - m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i)); + m->ipid = mce_rdmsrq(MSR_AMD64_SMCA_MCx_IPID(i)); if (m->status & MCI_STATUS_SYNDV) { - m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i)); - err->vendor.amd.synd1 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND1(i)); - err->vendor.amd.synd2 = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND2(i)); + m->synd = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND(i)); + err->vendor.amd.synd1 = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND1(i)); + err->vendor.amd.synd2 = mce_rdmsrq(MSR_AMD64_SMCA_MCx_SYND2(i)); } } } @@ -753,7 +753,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) m->bank = i; barrier(); - m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS)); /* * Update storm tracking here, before checking for the @@ -829,7 +829,7 @@ clear_it: /* * Clear state for this bank. 
*/ - mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); } /* @@ -887,8 +887,8 @@ quirk_sandybridge_ifu(int bank, struct mce *m, struct pt_regs *regs) */ static noinstr bool quirk_skylake_repmov(void) { - u64 mcgstatus = mce_rdmsrl(MSR_IA32_MCG_STATUS); - u64 misc_enable = mce_rdmsrl(MSR_IA32_MISC_ENABLE); + u64 mcgstatus = mce_rdmsrq(MSR_IA32_MCG_STATUS); + u64 misc_enable = mce_rdmsrq(MSR_IA32_MISC_ENABLE); u64 mc1_status; /* @@ -899,7 +899,7 @@ static noinstr bool quirk_skylake_repmov(void) !(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) return false; - mc1_status = mce_rdmsrl(MSR_IA32_MCx_STATUS(1)); + mc1_status = mce_rdmsrq(MSR_IA32_MCx_STATUS(1)); /* Check for a software-recoverable data fetch error. */ if ((mc1_status & @@ -910,8 +910,8 @@ static noinstr bool quirk_skylake_repmov(void) MCI_STATUS_ADDRV | MCI_STATUS_MISCV | MCI_STATUS_AR | MCI_STATUS_S)) { misc_enable &= ~MSR_IA32_MISC_ENABLE_FAST_STRING; - mce_wrmsrl(MSR_IA32_MISC_ENABLE, misc_enable); - mce_wrmsrl(MSR_IA32_MCx_STATUS(1), 0); + mce_wrmsrq(MSR_IA32_MISC_ENABLE, misc_enable); + mce_wrmsrq(MSR_IA32_MCx_STATUS(1), 0); instrumentation_begin(); pr_err_once("Erratum detected, disable fast string copy instructions.\n"); @@ -955,7 +955,7 @@ static __always_inline int mce_no_way_out(struct mce_hw_err *err, char **msg, un int i; for (i = 0; i < this_cpu_read(mce_num_banks); i++) { - m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS)); if (!(m->status & MCI_STATUS_VAL)) continue; @@ -1274,7 +1274,7 @@ static __always_inline void mce_clear_state(unsigned long *toclear) for (i = 0; i < this_cpu_read(mce_num_banks); i++) { if (arch_test_bit(i, toclear)) - mce_wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + mce_wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); } } @@ -1298,7 +1298,7 @@ static noinstr bool mce_check_crashing_cpu(void) (crashing_cpu != -1 && crashing_cpu != cpu)) { u64 mcgstatus; - mcgstatus = 
__rdmsr(MSR_IA32_MCG_STATUS); + mcgstatus = native_rdmsrq(MSR_IA32_MCG_STATUS); if (boot_cpu_data.x86_vendor == X86_VENDOR_ZHAOXIN) { if (mcgstatus & MCG_STATUS_LMCES) @@ -1306,7 +1306,7 @@ static noinstr bool mce_check_crashing_cpu(void) } if (mcgstatus & MCG_STATUS_RIPV) { - __wrmsr(MSR_IA32_MCG_STATUS, 0, 0); + native_wrmsrq(MSR_IA32_MCG_STATUS, 0); return true; } } @@ -1335,7 +1335,7 @@ __mc_scan_banks(struct mce_hw_err *err, struct pt_regs *regs, m->addr = 0; m->bank = i; - m->status = mce_rdmsrl(mca_msr_reg(i, MCA_STATUS)); + m->status = mce_rdmsrq(mca_msr_reg(i, MCA_STATUS)); if (!(m->status & MCI_STATUS_VAL)) continue; @@ -1693,7 +1693,7 @@ out: instrumentation_end(); clear: - mce_wrmsrl(MSR_IA32_MCG_STATUS, 0); + mce_wrmsrq(MSR_IA32_MCG_STATUS, 0); } EXPORT_SYMBOL_GPL(do_machine_check); @@ -1822,7 +1822,7 @@ static void __mcheck_cpu_cap_init(void) u64 cap; u8 b; - rdmsrl(MSR_IA32_MCG_CAP, cap); + rdmsrq(MSR_IA32_MCG_CAP, cap); b = cap & MCG_BANKCNT_MASK; @@ -1863,7 +1863,7 @@ static void __mcheck_cpu_init_generic(void) cr4_set_bits(X86_CR4_MCE); - rdmsrl(MSR_IA32_MCG_CAP, cap); + rdmsrq(MSR_IA32_MCG_CAP, cap); if (cap & MCG_CTL_P) wrmsr(MSR_IA32_MCG_CTL, 0xffffffff, 0xffffffff); } @@ -1878,8 +1878,8 @@ static void __mcheck_cpu_init_clear_banks(void) if (!b->init) continue; - wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); - wrmsrl(mca_msr_reg(i, MCA_STATUS), 0); + wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl); + wrmsrq(mca_msr_reg(i, MCA_STATUS), 0); } } @@ -1905,7 +1905,7 @@ static void __mcheck_cpu_check_banks(void) if (!b->init) continue; - rdmsrl(mca_msr_reg(i, MCA_CTL), msrval); + rdmsrq(mca_msr_reg(i, MCA_CTL), msrval); b->init = !!msrval; } } @@ -2436,7 +2436,7 @@ static void mce_disable_error_reporting(void) struct mce_bank *b = &mce_banks[i]; if (b->init) - wrmsrl(mca_msr_reg(i, MCA_CTL), 0); + wrmsrq(mca_msr_reg(i, MCA_CTL), 0); } return; } @@ -2786,7 +2786,7 @@ static void mce_reenable_cpu(void) struct mce_bank *b = &mce_banks[i]; if (b->init) - 
wrmsrl(mca_msr_reg(i, MCA_CTL), b->ctl); + wrmsrq(mca_msr_reg(i, MCA_CTL), b->ctl); } } diff --git a/arch/x86/kernel/cpu/mce/inject.c b/arch/x86/kernel/cpu/mce/inject.c index 06e3cf7229ce..d02c4f556cd0 100644 --- a/arch/x86/kernel/cpu/mce/inject.c +++ b/arch/x86/kernel/cpu/mce/inject.c @@ -24,10 +24,11 @@ #include <linux/pci.h> #include <linux/uaccess.h> -#include <asm/amd_nb.h> +#include <asm/amd/nb.h> #include <asm/apic.h> #include <asm/irq_vectors.h> #include <asm/mce.h> +#include <asm/msr.h> #include <asm/nmi.h> #include <asm/smp.h> @@ -475,27 +476,27 @@ static void prepare_msrs(void *info) struct mce m = *(struct mce *)info; u8 b = m.bank; - wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); + wrmsrq(MSR_IA32_MCG_STATUS, m.mcgstatus); if (boot_cpu_has(X86_FEATURE_SMCA)) { if (m.inject_flags == DFR_INT_INJ) { - wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status); - wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr); + wrmsrq(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status); + wrmsrq(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr); } else { - wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status); - wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr); + wrmsrq(MSR_AMD64_SMCA_MCx_STATUS(b), m.status); + wrmsrq(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr); } - wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd); + wrmsrq(MSR_AMD64_SMCA_MCx_SYND(b), m.synd); if (m.misc) - wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc); + wrmsrq(MSR_AMD64_SMCA_MCx_MISC(b), m.misc); } else { - wrmsrl(MSR_IA32_MCx_STATUS(b), m.status); - wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr); + wrmsrq(MSR_IA32_MCx_STATUS(b), m.status); + wrmsrq(MSR_IA32_MCx_ADDR(b), m.addr); if (m.misc) - wrmsrl(MSR_IA32_MCx_MISC(b), m.misc); + wrmsrq(MSR_IA32_MCx_MISC(b), m.misc); } } @@ -589,7 +590,7 @@ static int inj_bank_set(void *data, u64 val) u64 cap; /* Get bank count on target CPU so we can handle non-uniform values. 
*/ - rdmsrl_on_cpu(m->extcpu, MSR_IA32_MCG_CAP, &cap); + rdmsrq_on_cpu(m->extcpu, MSR_IA32_MCG_CAP, &cap); n_banks = cap & MCG_BANKCNT_MASK; if (val >= n_banks) { @@ -613,7 +614,7 @@ static int inj_bank_set(void *data, u64 val) if (cpu_feature_enabled(X86_FEATURE_SMCA)) { u64 ipid; - if (rdmsrl_on_cpu(m->extcpu, MSR_AMD64_SMCA_MCx_IPID(val), &ipid)) { + if (rdmsrq_on_cpu(m->extcpu, MSR_AMD64_SMCA_MCx_IPID(val), &ipid)) { pr_err("Error reading IPID on CPU%d\n", m->extcpu); return -EINVAL; } @@ -741,15 +742,15 @@ static void check_hw_inj_possible(void) u64 status = MCI_STATUS_VAL, ipid; /* Check whether bank is populated */ - rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), ipid); + rdmsrq(MSR_AMD64_SMCA_MCx_IPID(bank), ipid); if (!ipid) continue; toggle_hw_mce_inject(cpu, true); - wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), status); - rdmsrl_safe(mca_msr_reg(bank, MCA_STATUS), &status); - wrmsrl_safe(mca_msr_reg(bank, MCA_STATUS), 0); + wrmsrq_safe(mca_msr_reg(bank, MCA_STATUS), status); + rdmsrq_safe(mca_msr_reg(bank, MCA_STATUS), &status); + wrmsrq_safe(mca_msr_reg(bank, MCA_STATUS), 0); if (!status) { hw_injection_possible = false; diff --git a/arch/x86/kernel/cpu/mce/intel.c b/arch/x86/kernel/cpu/mce/intel.c index f863df0ff42c..efcf21e9552e 100644 --- a/arch/x86/kernel/cpu/mce/intel.c +++ b/arch/x86/kernel/cpu/mce/intel.c @@ -94,7 +94,7 @@ static bool cmci_supported(int *banks) if (!boot_cpu_has(X86_FEATURE_APIC) || lapic_get_maxlvt() < 6) return false; - rdmsrl(MSR_IA32_MCG_CAP, cap); + rdmsrq(MSR_IA32_MCG_CAP, cap); *banks = min_t(unsigned, MAX_NR_BANKS, cap & MCG_BANKCNT_MASK); return !!(cap & MCG_CMCI_P); } @@ -106,7 +106,7 @@ static bool lmce_supported(void) if (mca_cfg.lmce_disabled) return false; - rdmsrl(MSR_IA32_MCG_CAP, tmp); + rdmsrq(MSR_IA32_MCG_CAP, tmp); /* * LMCE depends on recovery support in the processor. 
Hence both @@ -123,7 +123,7 @@ static bool lmce_supported(void) * WARN if the MSR isn't locked as init_ia32_feat_ctl() unconditionally * locks the MSR in the event that it wasn't already locked by BIOS. */ - rdmsrl(MSR_IA32_FEAT_CTL, tmp); + rdmsrq(MSR_IA32_FEAT_CTL, tmp); if (WARN_ON_ONCE(!(tmp & FEAT_CTL_LOCKED))) return false; @@ -141,9 +141,9 @@ static void cmci_set_threshold(int bank, int thresh) u64 val; raw_spin_lock_irqsave(&cmci_discover_lock, flags); - rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + rdmsrq(MSR_IA32_MCx_CTL2(bank), val); val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; - wrmsrl(MSR_IA32_MCx_CTL2(bank), val | thresh); + wrmsrq(MSR_IA32_MCx_CTL2(bank), val | thresh); raw_spin_unlock_irqrestore(&cmci_discover_lock, flags); } @@ -184,7 +184,7 @@ static bool cmci_skip_bank(int bank, u64 *val) if (test_bit(bank, mce_banks_ce_disabled)) return true; - rdmsrl(MSR_IA32_MCx_CTL2(bank), *val); + rdmsrq(MSR_IA32_MCx_CTL2(bank), *val); /* Already owned by someone else? */ if (*val & MCI_CTL2_CMCI_EN) { @@ -232,8 +232,8 @@ static void cmci_claim_bank(int bank, u64 val, int bios_zero_thresh, int *bios_w struct mca_storm_desc *storm = this_cpu_ptr(&storm_desc); val |= MCI_CTL2_CMCI_EN; - wrmsrl(MSR_IA32_MCx_CTL2(bank), val); - rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + wrmsrq(MSR_IA32_MCx_CTL2(bank), val); + rdmsrq(MSR_IA32_MCx_CTL2(bank), val); /* If the enable bit did not stick, this bank should be polled. 
*/ if (!(val & MCI_CTL2_CMCI_EN)) { @@ -324,9 +324,9 @@ static void __cmci_disable_bank(int bank) if (!test_bit(bank, this_cpu_ptr(mce_banks_owned))) return; - rdmsrl(MSR_IA32_MCx_CTL2(bank), val); + rdmsrq(MSR_IA32_MCx_CTL2(bank), val); val &= ~MCI_CTL2_CMCI_EN; - wrmsrl(MSR_IA32_MCx_CTL2(bank), val); + wrmsrq(MSR_IA32_MCx_CTL2(bank), val); __clear_bit(bank, this_cpu_ptr(mce_banks_owned)); if ((val & MCI_CTL2_CMCI_THRESHOLD_MASK) == CMCI_STORM_THRESHOLD) @@ -430,10 +430,10 @@ void intel_init_lmce(void) if (!lmce_supported()) return; - rdmsrl(MSR_IA32_MCG_EXT_CTL, val); + rdmsrq(MSR_IA32_MCG_EXT_CTL, val); if (!(val & MCG_EXT_CTL_LMCE_EN)) - wrmsrl(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN); + wrmsrq(MSR_IA32_MCG_EXT_CTL, val | MCG_EXT_CTL_LMCE_EN); } void intel_clear_lmce(void) @@ -443,9 +443,9 @@ void intel_clear_lmce(void) if (!lmce_supported()) return; - rdmsrl(MSR_IA32_MCG_EXT_CTL, val); + rdmsrq(MSR_IA32_MCG_EXT_CTL, val); val &= ~MCG_EXT_CTL_LMCE_EN; - wrmsrl(MSR_IA32_MCG_EXT_CTL, val); + wrmsrq(MSR_IA32_MCG_EXT_CTL, val); } /* @@ -460,10 +460,10 @@ static void intel_imc_init(struct cpuinfo_x86 *c) case INTEL_SANDYBRIDGE_X: case INTEL_IVYBRIDGE_X: case INTEL_HASWELL_X: - if (rdmsrl_safe(MSR_ERROR_CONTROL, &error_control)) + if (rdmsrq_safe(MSR_ERROR_CONTROL, &error_control)) return; error_control |= 2; - wrmsrl_safe(MSR_ERROR_CONTROL, error_control); + wrmsrq_safe(MSR_ERROR_CONTROL, error_control); break; } } diff --git a/arch/x86/kernel/cpu/mce/internal.h b/arch/x86/kernel/cpu/mce/internal.h index 95a504ece43e..b5ba598e54cb 100644 --- a/arch/x86/kernel/cpu/mce/internal.h +++ b/arch/x86/kernel/cpu/mce/internal.h @@ -312,7 +312,7 @@ static __always_inline void pentium_machine_check(struct pt_regs *regs) {} static __always_inline void winchip_machine_check(struct pt_regs *regs) {} #endif -noinstr u64 mce_rdmsrl(u32 msr); +noinstr u64 mce_rdmsrq(u32 msr); static __always_inline u32 mca_msr_reg(int bank, enum mca_msr reg) { diff --git 
a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index 96cb992d50ef..097e39327942 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -217,7 +217,6 @@ static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsi { struct patch_digest *pd = NULL; u8 digest[SHA256_DIGEST_SIZE]; - struct sha256_state s; int i; if (x86_family(bsp_cpuid_1_eax) < 0x17) @@ -235,9 +234,7 @@ static bool verify_sha256_digest(u32 patch_id, u32 cur_rev, const u8 *data, unsi return false; } - sha256_init(&s); - sha256_update(&s, data, len); - sha256_final(&s, digest); + sha256(data, len, digest); if (memcmp(digest, pd->sha256, sizeof(digest))) { pr_err("Patch 0x%x SHA256 digest mismatch!\n", patch_id); @@ -607,7 +604,7 @@ static bool __apply_microcode_amd(struct microcode_amd *mc, u32 *cur_rev, if (!verify_sha256_digest(mc->hdr.patch_id, *cur_rev, (const u8 *)p_addr, psize)) return false; - native_wrmsrl(MSR_AMD64_PATCH_LOADER, p_addr); + native_wrmsrq(MSR_AMD64_PATCH_LOADER, p_addr); if (x86_family(bsp_cpuid_1_eax) == 0x17) { unsigned long p_addr_end = p_addr + psize - 1; @@ -1178,11 +1175,18 @@ static void microcode_fini_cpu_amd(int cpu) uci->mc = NULL; } +static void finalize_late_load_amd(int result) +{ + if (result) + cleanup(); +} + static struct microcode_ops microcode_amd_ops = { .request_microcode_fw = request_microcode_amd, .collect_cpu_info = collect_cpu_info_amd, .apply_microcode = apply_microcode_amd, .microcode_fini_cpu = microcode_fini_cpu_amd, + .finalize_late_load = finalize_late_load_amd, .nmi_safe = true, }; diff --git a/arch/x86/kernel/cpu/microcode/core.c b/arch/x86/kernel/cpu/microcode/core.c index 079f046ee26d..fe50eb5b7c4a 100644 --- a/arch/x86/kernel/cpu/microcode/core.c +++ b/arch/x86/kernel/cpu/microcode/core.c @@ -37,6 +37,7 @@ #include <asm/perf_event.h> #include <asm/processor.h> #include <asm/cmdline.h> +#include <asm/msr.h> #include <asm/setup.h> #include "internal.h" @@ 
-117,7 +118,7 @@ bool __init microcode_loader_disabled(void) * 3) Certain AMD patch levels are not allowed to be * overwritten. */ - if (!have_cpuid_p() || + if (!cpuid_feature() || native_cpuid_ecx(1) & BIT(31) || amd_check_current_patch_level()) dis_ucode_ldr = true; @@ -696,6 +697,8 @@ static int load_late_locked(void) return load_late_stop_cpus(true); case UCODE_NFOUND: return -ENOENT; + case UCODE_OK: + return 0; default: return -EBADFD; } diff --git a/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h new file mode 100644 index 000000000000..cb6e601701ab --- /dev/null +++ b/arch/x86/kernel/cpu/microcode/intel-ucode-defs.h @@ -0,0 +1,150 @@ +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x03, .steppings = 0x0004, .driver_data = 0x2 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0001, .driver_data = 0x45 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0002, .driver_data = 0x40 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0004, .driver_data = 0x2c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x05, .steppings = 0x0008, .driver_data = 0x10 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0001, .driver_data = 0xa }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0020, .driver_data = 0x3 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x0400, .driver_data = 0xd }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x06, .steppings = 0x2000, .driver_data = 0x7 }, +{ .flags = 
X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0002, .driver_data = 0x14 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0004, .driver_data = 0x38 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x07, .steppings = 0x0008, .driver_data = 0x2e }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0002, .driver_data = 0x11 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0008, .driver_data = 0x8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0040, .driver_data = 0xc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x08, .steppings = 0x0400, .driver_data = 0x5 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x09, .steppings = 0x0020, .driver_data = 0x47 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0a, .steppings = 0x0001, .driver_data = 0x3 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0a, .steppings = 0x0002, .driver_data = 0x1 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0b, .steppings = 0x0002, .driver_data = 0x1d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0b, .steppings = 0x0010, .driver_data = 0x2 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0d, .steppings = 0x0040, .driver_data = 0x18 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0e, .steppings = 0x0100, .driver_data = 0x39 }, +{ .flags = 
X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0e, .steppings = 0x1000, .driver_data = 0x59 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0004, .driver_data = 0x5d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0040, .driver_data = 0xd2 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0080, .driver_data = 0x6b }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0400, .driver_data = 0x95 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x0800, .driver_data = 0xbc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x0f, .steppings = 0x2000, .driver_data = 0xa4 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x16, .steppings = 0x0002, .driver_data = 0x44 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0040, .driver_data = 0x60f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0080, .driver_data = 0x70a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x17, .steppings = 0x0400, .driver_data = 0xa0b }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1a, .steppings = 0x0010, .driver_data = 0x12 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1a, .steppings = 0x0020, .driver_data = 0x1d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1c, .steppings = 0x0004, .driver_data = 0x219 }, +{ 
.flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1c, .steppings = 0x0400, .driver_data = 0x107 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1d, .steppings = 0x0002, .driver_data = 0x29 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x1e, .steppings = 0x0020, .driver_data = 0xa }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x25, .steppings = 0x0004, .driver_data = 0x11 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x25, .steppings = 0x0020, .driver_data = 0x7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x26, .steppings = 0x0002, .driver_data = 0x105 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2a, .steppings = 0x0080, .driver_data = 0x2f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2c, .steppings = 0x0004, .driver_data = 0x1f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2d, .steppings = 0x0040, .driver_data = 0x621 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2d, .steppings = 0x0080, .driver_data = 0x71a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2e, .steppings = 0x0040, .driver_data = 0xd }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x2f, .steppings = 0x0004, .driver_data = 0x3b }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x37, .steppings = 0x0100, .driver_data = 0x838 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x37, .steppings = 0x0200, .driver_data = 0x90d 
}, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3a, .steppings = 0x0200, .driver_data = 0x21 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3c, .steppings = 0x0008, .driver_data = 0x28 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3d, .steppings = 0x0010, .driver_data = 0x2f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0010, .driver_data = 0x42e }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0040, .driver_data = 0x600 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3e, .steppings = 0x0080, .driver_data = 0x715 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3f, .steppings = 0x0004, .driver_data = 0x49 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x3f, .steppings = 0x0010, .driver_data = 0x1a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x45, .steppings = 0x0002, .driver_data = 0x26 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x46, .steppings = 0x0002, .driver_data = 0x1c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x47, .steppings = 0x0002, .driver_data = 0x22 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4c, .steppings = 0x0008, .driver_data = 0x368 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4c, .steppings = 0x0010, .driver_data = 0x411 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4d, .steppings = 0x0100, .driver_data 
= 0x12d }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x4e, .steppings = 0x0008, .driver_data = 0xf0 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0008, .driver_data = 0x1000191 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0010, .driver_data = 0x2007006 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0020, .driver_data = 0x3000010 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0040, .driver_data = 0x4003605 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0080, .driver_data = 0x5003707 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x55, .steppings = 0x0800, .driver_data = 0x7002904 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0004, .driver_data = 0x1c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0008, .driver_data = 0x700001c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0010, .driver_data = 0xf00001a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x56, .steppings = 0x0020, .driver_data = 0xe000015 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0004, .driver_data = 0x14 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5c, .steppings = 0x0200, .driver_data = 0x48 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, 
.model = 0x5c, .steppings = 0x0400, .driver_data = 0x28 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5e, .steppings = 0x0008, .driver_data = 0xf0 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x5f, .steppings = 0x0002, .driver_data = 0x3e }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x66, .steppings = 0x0008, .driver_data = 0x2a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0020, .driver_data = 0xc0002f0 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6a, .steppings = 0x0040, .driver_data = 0xd0003e7 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x6c, .steppings = 0x0002, .driver_data = 0x10002b0 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0002, .driver_data = 0x42 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7a, .steppings = 0x0100, .driver_data = 0x24 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x7e, .steppings = 0x0020, .driver_data = 0xc6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8a, .steppings = 0x0002, .driver_data = 0x33 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0002, .driver_data = 0xb8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8c, .steppings = 0x0004, .driver_data = 0x38 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8d, .steppings = 0x0002, .driver_data = 0x52 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = 
X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0200, .driver_data = 0xf6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0400, .driver_data = 0xf6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x0800, .driver_data = 0xf6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8e, .steppings = 0x1000, .driver_data = 0xfc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0100, .driver_data = 0x2c000390 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0080, .driver_data = 0x2b000603 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0040, .driver_data = 0x2c000390 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0020, .driver_data = 0x2c000390 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x8f, .steppings = 0x0010, .driver_data = 0x2c000390 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x96, .steppings = 0x0002, .driver_data = 0x1a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0004, .driver_data = 0x37 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x97, .steppings = 0x0020, .driver_data = 0x37 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0004, .driver_data = 0x37 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbf, .steppings = 0x0020, .driver_data = 0x37 }, +{ .flags = 
X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0008, .driver_data = 0x435 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9a, .steppings = 0x0010, .driver_data = 0x435 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9c, .steppings = 0x0001, .driver_data = 0x24000026 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0200, .driver_data = 0xf8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0400, .driver_data = 0xf8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x0800, .driver_data = 0xf6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x1000, .driver_data = 0xf8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0x9e, .steppings = 0x2000, .driver_data = 0x100 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0004, .driver_data = 0xfc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0008, .driver_data = 0xfc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa5, .steppings = 0x0020, .driver_data = 0xfc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0001, .driver_data = 0xfe }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa6, .steppings = 0x0002, .driver_data = 0xfc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xa7, .steppings = 0x0002, .driver_data = 0x62 }, 
+{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xaa, .steppings = 0x0010, .driver_data = 0x20 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xb7, .steppings = 0x0002, .driver_data = 0x12b }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0004, .driver_data = 0x4123 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0008, .driver_data = 0x4123 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xba, .steppings = 0x0100, .driver_data = 0x4123 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xbe, .steppings = 0x0001, .driver_data = 0x1a }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0004, .driver_data = 0x21000283 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0x6, .model = 0xcf, .steppings = 0x0002, .driver_data = 0x21000283 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0080, .driver_data = 0x12 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x00, .steppings = 0x0400, .driver_data = 0x15 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x01, .steppings = 0x0004, .driver_data = 0x2e }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0010, .driver_data = 0x21 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0020, .driver_data = 0x2c }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0040, 
.driver_data = 0x10 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0080, .driver_data = 0x39 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x02, .steppings = 0x0200, .driver_data = 0x2f }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0004, .driver_data = 0xa }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0008, .driver_data = 0xc }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x03, .steppings = 0x0010, .driver_data = 0x17 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0002, .driver_data = 0x17 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0008, .driver_data = 0x5 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0010, .driver_data = 0x6 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0080, .driver_data = 0x3 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0100, .driver_data = 0xe }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0200, .driver_data = 0x3 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x04, .steppings = 0x0400, .driver_data = 0x4 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0004, .driver_data = 0xf }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0010, 
.driver_data = 0x4 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0020, .driver_data = 0x8 }, +{ .flags = X86_CPU_ID_FLAG_ENTRY_VALID, .vendor = X86_VENDOR_INTEL, .family = 0xf, .model = 0x06, .steppings = 0x0100, .driver_data = 0x9 }, diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 2a397da43923..371ca6eac00e 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -320,7 +320,7 @@ static enum ucode_state __apply_microcode(struct ucode_cpu_info *uci, } /* write microcode via MSR 0x79 */ - native_wrmsrl(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); + native_wrmsrq(MSR_IA32_UCODE_WRITE, (unsigned long)mc->bits); rev = intel_get_microcode_revision(); if (rev != mc->hdr.rev) diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 3e2533954675..c78f860419d6 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -30,6 +30,7 @@ #include <asm/reboot.h> #include <asm/nmi.h> #include <clocksource/hyperv_timer.h> +#include <asm/msr.h> #include <asm/numa.h> #include <asm/svm.h> @@ -70,7 +71,7 @@ u64 hv_get_non_nested_msr(unsigned int reg) if (hv_is_synic_msr(reg) && ms_hyperv.paravisor_present) hv_ivm_msr_read(reg, &value); else - rdmsrl(reg, value); + rdmsrq(reg, value); return value; } EXPORT_SYMBOL_GPL(hv_get_non_nested_msr); @@ -82,9 +83,9 @@ void hv_set_non_nested_msr(unsigned int reg, u64 value) /* Write proxy bit via wrmsl instruction */ if (hv_is_sint_msr(reg)) - wrmsrl(reg, value | 1 << 20); + wrmsrq(reg, value | 1 << 20); } else { - wrmsrl(reg, value); + wrmsrq(reg, value); } } EXPORT_SYMBOL_GPL(hv_set_non_nested_msr); @@ -345,7 +346,7 @@ static unsigned long hv_get_tsc_khz(void) { unsigned long freq; - rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq); + rdmsrq(HV_X64_MSR_TSC_FREQUENCY, freq); return freq / 1000; } @@ -541,7 +542,7 @@ static void __init 
ms_hyperv_init_platform(void) */ u64 hv_lapic_frequency; - rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); + rdmsrq(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ); lapic_timer_period = hv_lapic_frequency; pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n", @@ -574,7 +575,7 @@ static void __init ms_hyperv_init_platform(void) * setting of this MSR bit should happen before init_intel() * is called. */ - wrmsrl(HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_EXPOSE_INVARIANT_TSC); + wrmsrq(HV_X64_MSR_TSC_INVARIANT_CONTROL, HV_EXPOSE_INVARIANT_TSC); setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); } diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index e2c6b471d230..8c18327eb10b 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -593,7 +593,7 @@ static void get_fixed_ranges(mtrr_type *frs) void mtrr_save_fixed_ranges(void *info) { - if (boot_cpu_has(X86_FEATURE_MTRR)) + if (mtrr_state.have_fixed) get_fixed_ranges(mtrr_state.fixed_ranges); } diff --git a/arch/x86/kernel/cpu/resctrl/Makefile b/arch/x86/kernel/cpu/resctrl/Makefile index 0c13b0befd8a..d8a04b195da2 100644 --- a/arch/x86/kernel/cpu/resctrl/Makefile +++ b/arch/x86/kernel/cpu/resctrl/Makefile @@ -2,4 +2,6 @@ obj-$(CONFIG_X86_CPU_RESCTRL) += core.o rdtgroup.o monitor.o obj-$(CONFIG_X86_CPU_RESCTRL) += ctrlmondata.o obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o + +# To allow define_trace.h's recursive include: CFLAGS_pseudo_lock.o = -I$(src) diff --git a/arch/x86/kernel/cpu/resctrl/core.c b/arch/x86/kernel/cpu/resctrl/core.c index cf29681d01e0..7109cbfcad4f 100644 --- a/arch/x86/kernel/cpu/resctrl/core.c +++ b/arch/x86/kernel/cpu/resctrl/core.c @@ -22,6 +22,7 @@ #include <linux/cpuhotplug.h> #include <asm/cpu_device_id.h> +#include <asm/msr.h> #include <asm/resctrl.h> #include "internal.h" @@ -60,7 +61,6 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { 
[RDT_RESOURCE_L3] = { .r_resctrl = { - .rid = RDT_RESOURCE_L3, .name = "L3", .ctrl_scope = RESCTRL_L3_CACHE, .mon_scope = RESCTRL_L3_CACHE, @@ -74,7 +74,6 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { [RDT_RESOURCE_L2] = { .r_resctrl = { - .rid = RDT_RESOURCE_L2, .name = "L2", .ctrl_scope = RESCTRL_L2_CACHE, .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_L2), @@ -86,7 +85,6 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { [RDT_RESOURCE_MBA] = { .r_resctrl = { - .rid = RDT_RESOURCE_MBA, .name = "MB", .ctrl_scope = RESCTRL_L3_CACHE, .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_MBA), @@ -96,7 +94,6 @@ struct rdt_hw_resource rdt_resources_all[RDT_NUM_RESOURCES] = { [RDT_RESOURCE_SMBA] = { .r_resctrl = { - .rid = RDT_RESOURCE_SMBA, .name = "SMBA", .ctrl_scope = RESCTRL_L3_CACHE, .ctrl_domains = ctrl_domain_init(RDT_RESOURCE_SMBA), @@ -145,10 +142,10 @@ static inline void cache_alloc_hsw_probe(void) struct rdt_resource *r = &hw_res->r_resctrl; u64 max_cbm = BIT_ULL_MASK(20) - 1, l3_cbm_0; - if (wrmsrl_safe(MSR_IA32_L3_CBM_BASE, max_cbm)) + if (wrmsrq_safe(MSR_IA32_L3_CBM_BASE, max_cbm)) return; - rdmsrl(MSR_IA32_L3_CBM_BASE, l3_cbm_0); + rdmsrq(MSR_IA32_L3_CBM_BASE, l3_cbm_0); /* If all the bits were set in MSR, return success */ if (l3_cbm_0 != max_cbm) @@ -164,21 +161,6 @@ static inline void cache_alloc_hsw_probe(void) rdt_alloc_capable = true; } -bool is_mba_sc(struct rdt_resource *r) -{ - if (!r) - r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); - - /* - * The software controller support is only applicable to MBA resource. - * Make sure to check for resource type. - */ - if (r->rid != RDT_RESOURCE_MBA) - return false; - - return r->membw.mba_sc; -} - /* * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values * exposed to user interface and the h/w understandable delay values. 
@@ -309,7 +291,7 @@ static void mba_wrmsr_amd(struct msr_param *m) unsigned int i; for (i = m->low; i < m->high; i++) - wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]); + wrmsrq(hw_res->msr_base + i, hw_dom->ctrl_val[i]); } /* @@ -334,7 +316,7 @@ static void mba_wrmsr_intel(struct msr_param *m) /* Write the delay values for mba. */ for (i = m->low; i < m->high; i++) - wrmsrl(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], m->res)); + wrmsrq(hw_res->msr_base + i, delay_bw_map(hw_dom->ctrl_val[i], m->res)); } static void cat_wrmsr(struct msr_param *m) @@ -344,7 +326,7 @@ static void cat_wrmsr(struct msr_param *m) unsigned int i; for (i = m->low; i < m->high; i++) - wrmsrl(hw_res->msr_base + i, hw_dom->ctrl_val[i]); + wrmsrq(hw_res->msr_base + i, hw_dom->ctrl_val[i]); } u32 resctrl_arch_get_num_closid(struct rdt_resource *r) @@ -737,7 +719,7 @@ struct rdt_options { bool force_off, force_on; }; -static struct rdt_options rdt_options[] __initdata = { +static struct rdt_options rdt_options[] __ro_after_init = { RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC), RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL), RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL), @@ -777,7 +759,7 @@ static int __init set_rdt_options(char *str) } __setup("rdt", set_rdt_options); -bool __init rdt_cpu_has(int flag) +bool rdt_cpu_has(int flag) { bool ret = boot_cpu_has(flag); struct rdt_options *o; @@ -797,7 +779,7 @@ bool __init rdt_cpu_has(int flag) return ret; } -__init bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt) +bool resctrl_arch_is_evt_configurable(enum resctrl_event_id evt) { if (!rdt_cpu_has(X86_FEATURE_BMEC)) return false; @@ -1011,7 +993,11 @@ void resctrl_cpu_detect(struct cpuinfo_x86 *c) static int __init resctrl_arch_late_init(void) { struct rdt_resource *r; - int state, ret; + int state, ret, i; + + /* for_each_rdt_resource() requires all rid to be initialised. 
*/ + for (i = 0; i < RDT_NUM_RESOURCES; i++) + rdt_resources_all[i].r_resctrl.rid = i; /* * Initialize functions(or definitions) that are different diff --git a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c index 0a0ac5f6112e..1189c0df4ad7 100644 --- a/arch/x86/kernel/cpu/resctrl/ctrlmondata.c +++ b/arch/x86/kernel/cpu/resctrl/ctrlmondata.c @@ -16,277 +16,9 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/cpu.h> -#include <linux/kernfs.h> -#include <linux/seq_file.h> -#include <linux/slab.h> -#include <linux/tick.h> #include "internal.h" -struct rdt_parse_data { - struct rdtgroup *rdtgrp; - char *buf; -}; - -typedef int (ctrlval_parser_t)(struct rdt_parse_data *data, - struct resctrl_schema *s, - struct rdt_ctrl_domain *d); - -/* - * Check whether MBA bandwidth percentage value is correct. The value is - * checked against the minimum and max bandwidth values specified by the - * hardware. The allocated bandwidth percentage is rounded to the next - * control step available on the hardware. - */ -static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) -{ - int ret; - u32 bw; - - /* - * Only linear delay values is supported for current Intel SKUs. - */ - if (!r->membw.delay_linear && r->membw.arch_needs_linear) { - rdt_last_cmd_puts("No support for non-linear MB domains\n"); - return false; - } - - ret = kstrtou32(buf, 10, &bw); - if (ret) { - rdt_last_cmd_printf("Invalid MB value %s\n", buf); - return false; - } - - /* Nothing else to do if software controller is enabled. 
*/ - if (is_mba_sc(r)) { - *data = bw; - return true; - } - - if (bw < r->membw.min_bw || bw > r->membw.max_bw) { - rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n", - bw, r->membw.min_bw, r->membw.max_bw); - return false; - } - - *data = roundup(bw, (unsigned long)r->membw.bw_gran); - return true; -} - -static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_ctrl_domain *d) -{ - struct resctrl_staged_config *cfg; - u32 closid = data->rdtgrp->closid; - struct rdt_resource *r = s->res; - u32 bw_val; - - cfg = &d->staged_config[s->conf_type]; - if (cfg->have_new_ctrl) { - rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); - return -EINVAL; - } - - if (!bw_validate(data->buf, &bw_val, r)) - return -EINVAL; - - if (is_mba_sc(r)) { - d->mbps_val[closid] = bw_val; - return 0; - } - - cfg->new_ctrl = bw_val; - cfg->have_new_ctrl = true; - - return 0; -} - -/* - * Check whether a cache bit mask is valid. - * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID: - * - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1 - * - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1 - * - * Haswell does not support a non-contiguous 1s value and additionally - * requires at least two bits set. - * AMD allows non-contiguous bitmasks. - */ -static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) -{ - u32 supported_bits = BIT_MASK(r->cache.cbm_len) - 1; - unsigned int cbm_len = r->cache.cbm_len; - unsigned long first_bit, zero_bit, val; - int ret; - - ret = kstrtoul(buf, 16, &val); - if (ret) { - rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf); - return false; - } - - if ((r->cache.min_cbm_bits > 0 && val == 0) || val > supported_bits) { - rdt_last_cmd_puts("Mask out of range\n"); - return false; - } - - first_bit = find_first_bit(&val, cbm_len); - zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - - /* Are non-contiguous bitmasks allowed? 
*/ - if (!r->cache.arch_has_sparse_bitmasks && - (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) { - rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val); - return false; - } - - if ((zero_bit - first_bit) < r->cache.min_cbm_bits) { - rdt_last_cmd_printf("Need at least %d bits in the mask\n", - r->cache.min_cbm_bits); - return false; - } - - *data = val; - return true; -} - -/* - * Read one cache bit mask (hex). Check that it is valid for the current - * resource type. - */ -static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, - struct rdt_ctrl_domain *d) -{ - struct rdtgroup *rdtgrp = data->rdtgrp; - struct resctrl_staged_config *cfg; - struct rdt_resource *r = s->res; - u32 cbm_val; - - cfg = &d->staged_config[s->conf_type]; - if (cfg->have_new_ctrl) { - rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); - return -EINVAL; - } - - /* - * Cannot set up more than one pseudo-locked region in a cache - * hierarchy. - */ - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && - rdtgroup_pseudo_locked_in_hierarchy(d)) { - rdt_last_cmd_puts("Pseudo-locked region in hierarchy\n"); - return -EINVAL; - } - - if (!cbm_validate(data->buf, &cbm_val, r)) - return -EINVAL; - - if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE || - rdtgrp->mode == RDT_MODE_SHAREABLE) && - rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) { - rdt_last_cmd_puts("CBM overlaps with pseudo-locked region\n"); - return -EINVAL; - } - - /* - * The CBM may not overlap with the CBM of another closid if - * either is exclusive. 
- */ - if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) { - rdt_last_cmd_puts("Overlaps with exclusive group\n"); - return -EINVAL; - } - - if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) { - if (rdtgrp->mode == RDT_MODE_EXCLUSIVE || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - rdt_last_cmd_puts("Overlaps with other group\n"); - return -EINVAL; - } - } - - cfg->new_ctrl = cbm_val; - cfg->have_new_ctrl = true; - - return 0; -} - -/* - * For each domain in this resource we expect to find a series of: - * id=mask - * separated by ";". The "id" is in decimal, and must match one of - * the "id"s for this resource. - */ -static int parse_line(char *line, struct resctrl_schema *s, - struct rdtgroup *rdtgrp) -{ - enum resctrl_conf_type t = s->conf_type; - ctrlval_parser_t *parse_ctrlval = NULL; - struct resctrl_staged_config *cfg; - struct rdt_resource *r = s->res; - struct rdt_parse_data data; - struct rdt_ctrl_domain *d; - char *dom = NULL, *id; - unsigned long dom_id; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - switch (r->schema_fmt) { - case RESCTRL_SCHEMA_BITMAP: - parse_ctrlval = &parse_cbm; - break; - case RESCTRL_SCHEMA_RANGE: - parse_ctrlval = &parse_bw; - break; - } - - if (WARN_ON_ONCE(!parse_ctrlval)) - return -EINVAL; - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && - (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) { - rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); - return -EINVAL; - } - -next: - if (!line || line[0] == '\0') - return 0; - dom = strsep(&line, ";"); - id = strsep(&dom, "="); - if (!dom || kstrtoul(id, 10, &dom_id)) { - rdt_last_cmd_puts("Missing '=' or non-numeric domain\n"); - return -EINVAL; - } - dom = strim(dom); - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - if (d->hdr.id == dom_id) { - data.buf = dom; - data.rdtgrp = rdtgrp; - if (parse_ctrlval(&data, s, d)) - return -EINVAL; - if (rdtgrp->mode == 
RDT_MODE_PSEUDO_LOCKSETUP) { - cfg = &d->staged_config[t]; - /* - * In pseudo-locking setup mode and just - * parsed a valid CBM that should be - * pseudo-locked. Only one locked region per - * resource group and domain so just do - * the required initialization for single - * region and return. - */ - rdtgrp->plr->s = s; - rdtgrp->plr->d = d; - rdtgrp->plr->cbm = cfg->new_ctrl; - d->plr = rdtgrp->plr; - return 0; - } - goto next; - } - } - return -EINVAL; -} - int resctrl_arch_update_one(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type t, u32 cfg_val) { @@ -351,100 +83,6 @@ int resctrl_arch_update_domains(struct rdt_resource *r, u32 closid) return 0; } -static int rdtgroup_parse_resource(char *resname, char *tok, - struct rdtgroup *rdtgrp) -{ - struct resctrl_schema *s; - - list_for_each_entry(s, &resctrl_schema_all, list) { - if (!strcmp(resname, s->name) && rdtgrp->closid < s->num_closid) - return parse_line(tok, s, rdtgrp); - } - rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname); - return -EINVAL; -} - -ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct resctrl_schema *s; - struct rdtgroup *rdtgrp; - struct rdt_resource *r; - char *tok, *resname; - int ret = 0; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - buf[nbytes - 1] = '\0'; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - rdt_last_cmd_clear(); - - /* - * No changes to pseudo-locked region allowed. It has to be removed - * and re-created instead. 
- */ - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - ret = -EINVAL; - rdt_last_cmd_puts("Resource group is pseudo-locked\n"); - goto out; - } - - rdt_staged_configs_clear(); - - while ((tok = strsep(&buf, "\n")) != NULL) { - resname = strim(strsep(&tok, ":")); - if (!tok) { - rdt_last_cmd_puts("Missing ':'\n"); - ret = -EINVAL; - goto out; - } - if (tok[0] == '\0') { - rdt_last_cmd_printf("Missing '%s' value\n", resname); - ret = -EINVAL; - goto out; - } - ret = rdtgroup_parse_resource(resname, tok, rdtgrp); - if (ret) - goto out; - } - - list_for_each_entry(s, &resctrl_schema_all, list) { - r = s->res; - - /* - * Writes to mba_sc resources update the software controller, - * not the control MSR. - */ - if (is_mba_sc(r)) - continue; - - ret = resctrl_arch_update_domains(r, rdtgrp->closid); - if (ret) - goto out; - } - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - /* - * If pseudo-locking fails we keep the resource group in - * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service - * active and updated for just the domain the pseudo-locked - * region was requested for. 
- */ - ret = rdtgroup_pseudo_lock_create(rdtgrp); - } - -out: - rdt_staged_configs_clear(); - rdtgroup_kn_unlock(of->kn); - return ret ?: nbytes; -} - u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, u32 closid, enum resctrl_conf_type type) { @@ -453,276 +91,3 @@ u32 resctrl_arch_get_config(struct rdt_resource *r, struct rdt_ctrl_domain *d, return hw_dom->ctrl_val[idx]; } - -static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) -{ - struct rdt_resource *r = schema->res; - struct rdt_ctrl_domain *dom; - bool sep = false; - u32 ctrl_val; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - seq_printf(s, "%*s:", max_name_width, schema->name); - list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { - if (sep) - seq_puts(s, ";"); - - if (is_mba_sc(r)) - ctrl_val = dom->mbps_val[closid]; - else - ctrl_val = resctrl_arch_get_config(r, dom, closid, - schema->conf_type); - - seq_printf(s, schema->fmt_str, dom->hdr.id, ctrl_val); - sep = true; - } - seq_puts(s, "\n"); -} - -int rdtgroup_schemata_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct resctrl_schema *schema; - struct rdtgroup *rdtgrp; - int ret = 0; - u32 closid; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - list_for_each_entry(schema, &resctrl_schema_all, list) { - seq_printf(s, "%s:uninitialized\n", schema->name); - } - } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - if (!rdtgrp->plr->d) { - rdt_last_cmd_clear(); - rdt_last_cmd_puts("Cache domain offline\n"); - ret = -ENODEV; - } else { - seq_printf(s, "%s:%d=%x\n", - rdtgrp->plr->s->res->name, - rdtgrp->plr->d->hdr.id, - rdtgrp->plr->cbm); - } - } else { - closid = rdtgrp->closid; - list_for_each_entry(schema, &resctrl_schema_all, list) { - if (closid < schema->num_closid) - show_doms(s, schema, closid); - } - } - } else { - ret = -ENOENT; - } - 
rdtgroup_kn_unlock(of->kn); - return ret; -} - -static int smp_mon_event_count(void *arg) -{ - mon_event_count(arg); - - return 0; -} - -ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct rdtgroup *rdtgrp; - int ret = 0; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - buf[nbytes - 1] = '\0'; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - rdt_last_cmd_clear(); - - if (!strcmp(buf, "mbm_local_bytes")) { - if (resctrl_arch_is_mbm_local_enabled()) - rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID; - else - ret = -EINVAL; - } else if (!strcmp(buf, "mbm_total_bytes")) { - if (resctrl_arch_is_mbm_total_enabled()) - rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID; - else - ret = -EINVAL; - } else { - ret = -EINVAL; - } - - if (ret) - rdt_last_cmd_printf("Unsupported event id '%s'\n", buf); - - rdtgroup_kn_unlock(of->kn); - - return ret ?: nbytes; -} - -int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - - if (rdtgrp) { - switch (rdtgrp->mba_mbps_event) { - case QOS_L3_MBM_LOCAL_EVENT_ID: - seq_puts(s, "mbm_local_bytes\n"); - break; - case QOS_L3_MBM_TOTAL_EVENT_ID: - seq_puts(s, "mbm_total_bytes\n"); - break; - default: - pr_warn_once("Bad event %d\n", rdtgrp->mba_mbps_event); - ret = -EINVAL; - break; - } - } else { - ret = -ENOENT; - } - - rdtgroup_kn_unlock(of->kn); - - return ret; -} - -struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id, - struct list_head **pos) -{ - struct rdt_domain_hdr *d; - struct list_head *l; - - list_for_each(l, h) { - d = list_entry(l, struct rdt_domain_hdr, list); - /* When id is found, return its domain. 
*/ - if (id == d->id) - return d; - /* Stop searching when finding id's position in sorted list. */ - if (id < d->id) - break; - } - - if (pos) - *pos = l; - - return NULL; -} - -void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, - struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, - cpumask_t *cpumask, int evtid, int first) -{ - int cpu; - - /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - /* - * Setup the parameters to pass to mon_event_count() to read the data. - */ - rr->rgrp = rdtgrp; - rr->evtid = evtid; - rr->r = r; - rr->d = d; - rr->first = first; - rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); - if (IS_ERR(rr->arch_mon_ctx)) { - rr->err = -EINVAL; - return; - } - - cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU); - - /* - * cpumask_any_housekeeping() prefers housekeeping CPUs, but - * are all the CPUs nohz_full? If yes, pick a CPU to IPI. - * MPAM's resctrl_arch_rmid_read() is unable to read the - * counters on some platforms if its called in IRQ context. 
- */ - if (tick_nohz_full_cpu(cpu)) - smp_call_function_any(cpumask, mon_event_count, rr, 1); - else - smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); - - resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); -} - -int rdtgroup_mondata_show(struct seq_file *m, void *arg) -{ - struct kernfs_open_file *of = m->private; - struct rdt_domain_hdr *hdr; - struct rmid_read rr = {0}; - struct rdt_mon_domain *d; - u32 resid, evtid, domid; - struct rdtgroup *rdtgrp; - struct rdt_resource *r; - union mon_data_bits md; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - ret = -ENOENT; - goto out; - } - - md.priv = of->kn->priv; - resid = md.u.rid; - domid = md.u.domid; - evtid = md.u.evtid; - r = resctrl_arch_get_resource(resid); - - if (md.u.sum) { - /* - * This file requires summing across all domains that share - * the L3 cache id that was provided in the "domid" field of the - * mon_data_bits union. Search all domains in the resource for - * one that matches this cache id. - */ - list_for_each_entry(d, &r->mon_domains, hdr.list) { - if (d->ci->id == domid) { - rr.ci = d->ci; - mon_event_read(&rr, r, NULL, rdtgrp, - &d->ci->shared_cpu_map, evtid, false); - goto checkresult; - } - } - ret = -ENOENT; - goto out; - } else { - /* - * This file provides data from a single domain. Search - * the resource to find the domain with "domid". 
- */ - hdr = resctrl_find_domain(&r->mon_domains, domid, NULL); - if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) { - ret = -ENOENT; - goto out; - } - d = container_of(hdr, struct rdt_mon_domain, hdr); - mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false); - } - -checkresult: - - if (rr.err == -EIO) - seq_puts(m, "Error\n"); - else if (rr.err == -EINVAL) - seq_puts(m, "Unavailable\n"); - else - seq_printf(m, "%llu\n", rr.val); - -out: - rdtgroup_kn_unlock(of->kn); - return ret; -} diff --git a/arch/x86/kernel/cpu/resctrl/internal.h b/arch/x86/kernel/cpu/resctrl/internal.h index eaae99602b61..5e3c41b36437 100644 --- a/arch/x86/kernel/cpu/resctrl/internal.h +++ b/arch/x86/kernel/cpu/resctrl/internal.h @@ -3,28 +3,21 @@ #define _ASM_X86_RESCTRL_INTERNAL_H #include <linux/resctrl.h> -#include <linux/sched.h> -#include <linux/kernfs.h> -#include <linux/fs_context.h> -#include <linux/jump_label.h> -#include <linux/tick.h> - -#include <asm/resctrl.h> #define L3_QOS_CDP_ENABLE 0x01ULL #define L2_QOS_CDP_ENABLE 0x01ULL -#define CQM_LIMBOCHECK_INTERVAL 1000 - #define MBM_CNTR_WIDTH_BASE 24 -#define MBM_OVERFLOW_INTERVAL 1000 -#define MAX_MBA_BW 100u + #define MBA_IS_LINEAR 0x4 + #define MBM_CNTR_WIDTH_OFFSET_AMD 20 #define RMID_VAL_ERROR BIT_ULL(63) + #define RMID_VAL_UNAVAIL BIT_ULL(62) + /* * With the above fields in use 62 bits remain in MSR_IA32_QM_CTR for * data to be returned. The counter width is discovered from the hardware @@ -33,278 +26,6 @@ #define MBM_CNTR_WIDTH_OFFSET_MAX (62 - MBM_CNTR_WIDTH_BASE) /** - * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that - * aren't marked nohz_full - * @mask: The mask to pick a CPU from. - * @exclude_cpu:The CPU to avoid picking. - * - * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping - * CPUs that don't use nohz_full, these are preferred. Pass - * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs. 
- * - * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available. - */ -static inline unsigned int -cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu) -{ - unsigned int cpu, hk_cpu; - - if (exclude_cpu == RESCTRL_PICK_ANY_CPU) - cpu = cpumask_any(mask); - else - cpu = cpumask_any_but(mask, exclude_cpu); - - /* Only continue if tick_nohz_full_mask has been initialized. */ - if (!tick_nohz_full_enabled()) - return cpu; - - /* If the CPU picked isn't marked nohz_full nothing more needs doing. */ - if (cpu < nr_cpu_ids && !tick_nohz_full_cpu(cpu)) - return cpu; - - /* Try to find a CPU that isn't nohz_full to use in preference */ - hk_cpu = cpumask_nth_andnot(0, mask, tick_nohz_full_mask); - if (hk_cpu == exclude_cpu) - hk_cpu = cpumask_nth_andnot(1, mask, tick_nohz_full_mask); - - if (hk_cpu < nr_cpu_ids) - cpu = hk_cpu; - - return cpu; -} - -struct rdt_fs_context { - struct kernfs_fs_context kfc; - bool enable_cdpl2; - bool enable_cdpl3; - bool enable_mba_mbps; - bool enable_debug; -}; - -static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) -{ - struct kernfs_fs_context *kfc = fc->fs_private; - - return container_of(kfc, struct rdt_fs_context, kfc); -} - -/** - * struct mon_evt - Entry in the event list of a resource - * @evtid: event id - * @name: name of the event - * @configurable: true if the event is configurable - * @list: entry in &rdt_resource->evt_list - */ -struct mon_evt { - enum resctrl_event_id evtid; - char *name; - bool configurable; - struct list_head list; -}; - -/** - * union mon_data_bits - Monitoring details for each event file. - * @priv: Used to store monitoring event data in @u - * as kernfs private data. - * @u.rid: Resource id associated with the event file. - * @u.evtid: Event id associated with the event file. - * @u.sum: Set when event must be summed across multiple - * domains. - * @u.domid: When @u.sum is zero this is the domain to which - * the event file belongs. 
When @sum is one this - * is the id of the L3 cache that all domains to be - * summed share. - * @u: Name of the bit fields struct. - */ -union mon_data_bits { - void *priv; - struct { - unsigned int rid : 10; - enum resctrl_event_id evtid : 7; - unsigned int sum : 1; - unsigned int domid : 14; - } u; -}; - -/** - * struct rmid_read - Data passed across smp_call*() to read event count. - * @rgrp: Resource group for which the counter is being read. If it is a parent - * resource group then its event count is summed with the count from all - * its child resource groups. - * @r: Resource describing the properties of the event being read. - * @d: Domain that the counter should be read from. If NULL then sum all - * domains in @r sharing L3 @ci.id - * @evtid: Which monitor event to read. - * @first: Initialize MBM counter when true. - * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. - * @err: Error encountered when reading counter. - * @val: Returned value of event counter. If @rgrp is a parent resource group, - * @val includes the sum of event counts from its child resource groups. - * If @d is NULL, @val includes the sum of all domains in @r sharing @ci.id, - * (summed across child resource groups if @rgrp is a parent resource group). - * @arch_mon_ctx: Hardware monitor allocated for this read request (MPAM only). 
- */ -struct rmid_read { - struct rdtgroup *rgrp; - struct rdt_resource *r; - struct rdt_mon_domain *d; - enum resctrl_event_id evtid; - bool first; - struct cacheinfo *ci; - int err; - u64 val; - void *arch_mon_ctx; -}; - -extern struct list_head resctrl_schema_all; -extern bool resctrl_mounted; - -enum rdt_group_type { - RDTCTRL_GROUP = 0, - RDTMON_GROUP, - RDT_NUM_GROUP, -}; - -/** - * enum rdtgrp_mode - Mode of a RDT resource group - * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations - * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed - * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking - * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations - * allowed AND the allocations are Cache Pseudo-Locked - * @RDT_NUM_MODES: Total number of modes - * - * The mode of a resource group enables control over the allowed overlap - * between allocations associated with different resource groups (classes - * of service). User is able to modify the mode of a resource group by - * writing to the "mode" resctrl file associated with the resource group. - * - * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by - * writing the appropriate text to the "mode" file. A resource group enters - * "pseudo-locked" mode after the schemata is written while the resource - * group is in "pseudo-locksetup" mode. - */ -enum rdtgrp_mode { - RDT_MODE_SHAREABLE = 0, - RDT_MODE_EXCLUSIVE, - RDT_MODE_PSEUDO_LOCKSETUP, - RDT_MODE_PSEUDO_LOCKED, - - /* Must be last */ - RDT_NUM_MODES, -}; - -/** - * struct mongroup - store mon group's data in resctrl fs. 
- * @mon_data_kn: kernfs node for the mon_data directory - * @parent: parent rdtgrp - * @crdtgrp_list: child rdtgroup node list - * @rmid: rmid for this rdtgroup - */ -struct mongroup { - struct kernfs_node *mon_data_kn; - struct rdtgroup *parent; - struct list_head crdtgrp_list; - u32 rmid; -}; - -/** - * struct rdtgroup - store rdtgroup's data in resctrl file system. - * @kn: kernfs node - * @rdtgroup_list: linked list for all rdtgroups - * @closid: closid for this rdtgroup - * @cpu_mask: CPUs assigned to this rdtgroup - * @flags: status bits - * @waitcount: how many cpus expect to find this - * group when they acquire rdtgroup_mutex - * @type: indicates type of this rdtgroup - either - * monitor only or ctrl_mon group - * @mon: mongroup related data - * @mode: mode of resource group - * @mba_mbps_event: input monitoring event id when mba_sc is enabled - * @plr: pseudo-locked region - */ -struct rdtgroup { - struct kernfs_node *kn; - struct list_head rdtgroup_list; - u32 closid; - struct cpumask cpu_mask; - int flags; - atomic_t waitcount; - enum rdt_group_type type; - struct mongroup mon; - enum rdtgrp_mode mode; - enum resctrl_event_id mba_mbps_event; - struct pseudo_lock_region *plr; -}; - -/* rdtgroup.flags */ -#define RDT_DELETED 1 - -/* rftype.flags */ -#define RFTYPE_FLAGS_CPUS_LIST 1 - -/* - * Define the file type flags for base and info directories. 
- */ -#define RFTYPE_INFO BIT(0) -#define RFTYPE_BASE BIT(1) -#define RFTYPE_CTRL BIT(4) -#define RFTYPE_MON BIT(5) -#define RFTYPE_TOP BIT(6) -#define RFTYPE_RES_CACHE BIT(8) -#define RFTYPE_RES_MB BIT(9) -#define RFTYPE_DEBUG BIT(10) -#define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) -#define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) -#define RFTYPE_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) -#define RFTYPE_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) -#define RFTYPE_MON_BASE (RFTYPE_BASE | RFTYPE_MON) - -/* List of all resource groups */ -extern struct list_head rdt_all_groups; - -extern int max_name_width; - -/** - * struct rftype - describe each file in the resctrl file system - * @name: File name - * @mode: Access mode - * @kf_ops: File operations - * @flags: File specific RFTYPE_FLAGS_* flags - * @fflags: File specific RFTYPE_* flags - * @seq_show: Show content of the file - * @write: Write to the file - */ -struct rftype { - char *name; - umode_t mode; - const struct kernfs_ops *kf_ops; - unsigned long flags; - unsigned long fflags; - - int (*seq_show)(struct kernfs_open_file *of, - struct seq_file *sf, void *v); - /* - * write() is the generic write callback which maps directly to - * kernfs write operation and overrides all other operations. - * Maximum write size is determined by ->max_write_len. - */ - ssize_t (*write)(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -}; - -/** - * struct mbm_state - status for each MBM counter in each domain - * @prev_bw_bytes: Previous bytes value read for bandwidth calculation - * @prev_bw: The most recent bandwidth in MBps - */ -struct mbm_state { - u64 prev_bw_bytes; - u32 prev_bw; -}; - -/** * struct arch_mbm_state - values used to compute resctrl_arch_rmid_read()s * return value. 
* @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) @@ -401,24 +122,7 @@ static inline struct rdt_hw_resource *resctrl_to_arch_res(struct rdt_resource *r return container_of(r, struct rdt_hw_resource, r_resctrl); } -extern struct mutex rdtgroup_mutex; - -static inline const char *rdt_kn_name(const struct kernfs_node *kn) -{ - return rcu_dereference_check(kn->name, lockdep_is_held(&rdtgroup_mutex)); -} - extern struct rdt_hw_resource rdt_resources_all[]; -extern struct rdtgroup rdtgroup_default; -extern struct dentry *debugfs_resctrl; -extern enum resctrl_event_id mba_mbps_default_event; - -static inline bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l) -{ - return rdt_resources_all[l].cdp_enabled; -} - -int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable); void arch_mon_domain_online(struct rdt_resource *r, struct rdt_mon_domain *d); @@ -455,99 +159,14 @@ union cpuid_0x10_x_edx { unsigned int full; }; -void rdt_last_cmd_clear(void); -void rdt_last_cmd_puts(const char *s); -__printf(1, 2) -void rdt_last_cmd_printf(const char *fmt, ...); - void rdt_ctrl_update(void *arg); -struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); -void rdtgroup_kn_unlock(struct kernfs_node *kn); -int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name); -int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, - umode_t mask); -ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -int rdtgroup_schemata_show(struct kernfs_open_file *of, - struct seq_file *s, void *v); -ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off); -int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, - struct seq_file *s, void *v); -bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, - unsigned long cbm, int closid, bool exclusive); -unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, 
struct rdt_ctrl_domain *d, - unsigned long cbm); -enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); -int rdtgroup_tasks_assigned(struct rdtgroup *r); -int closids_supported(void); -void closid_free(int closid); -int alloc_rmid(u32 closid); -void free_rmid(u32 closid, u32 rmid); -int rdt_get_mon_l3_config(struct rdt_resource *r); -void resctrl_mon_resource_exit(void); -bool __init rdt_cpu_has(int flag); -void mon_event_count(void *info); -int rdtgroup_mondata_show(struct seq_file *m, void *arg); -void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, - struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, - cpumask_t *cpumask, int evtid, int first); -int __init resctrl_mon_resource_init(void); -void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, - unsigned long delay_ms, - int exclude_cpu); -void mbm_handle_overflow(struct work_struct *work); -void __init intel_rdt_mbm_apply_quirk(void); -bool is_mba_sc(struct rdt_resource *r); -void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, - int exclude_cpu); -void cqm_handle_limbo(struct work_struct *work); -bool has_busy_rmid(struct rdt_mon_domain *d); -void __check_limbo(struct rdt_mon_domain *d, bool force_free); -void rdt_domain_reconfigure_cdp(struct rdt_resource *r); -void resctrl_file_fflags_init(const char *config, unsigned long fflags); -void rdt_staged_configs_clear(void); -bool closid_allocated(unsigned int closid); -int resctrl_find_cleanest_closid(void); - -#ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK -int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); -int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp); -bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm); -bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d); -int rdt_pseudo_lock_init(void); -void rdt_pseudo_lock_release(void); -int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); -void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); -#else -static 
inline int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) -{ - return -EOPNOTSUPP; -} -static inline int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) -{ - return -EOPNOTSUPP; -} - -static inline bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm) -{ - return false; -} +int rdt_get_mon_l3_config(struct rdt_resource *r); -static inline bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d) -{ - return false; -} +bool rdt_cpu_has(int flag); -static inline int rdt_pseudo_lock_init(void) { return 0; } -static inline void rdt_pseudo_lock_release(void) { } -static inline int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) -{ - return -EOPNOTSUPP; -} +void __init intel_rdt_mbm_apply_quirk(void); -static inline void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) { } -#endif /* CONFIG_RESCTRL_FS_PSEUDO_LOCK */ +void rdt_domain_reconfigure_cdp(struct rdt_resource *r); #endif /* _ASM_X86_RESCTRL_INTERNAL_H */ diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index a93ed7d2a160..c261558276cd 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -18,62 +18,12 @@ #define pr_fmt(fmt) "resctrl: " fmt #include <linux/cpu.h> -#include <linux/module.h> -#include <linux/sizes.h> -#include <linux/slab.h> +#include <linux/resctrl.h> #include <asm/cpu_device_id.h> -#include <asm/resctrl.h> +#include <asm/msr.h> #include "internal.h" -#include "trace.h" - -/** - * struct rmid_entry - dirty tracking for all RMID. - * @closid: The CLOSID for this entry. - * @rmid: The RMID for this entry. - * @busy: The number of domains with cached data using this RMID. - * @list: Member of the rmid_free_lru list when busy == 0. - * - * Depending on the architecture the correct monitor is accessed using - * both @closid and @rmid, or @rmid only. - * - * Take the rdtgroup_mutex when accessing. 
- */ -struct rmid_entry { - u32 closid; - u32 rmid; - int busy; - struct list_head list; -}; - -/* - * @rmid_free_lru - A least recently used list of free RMIDs - * These RMIDs are guaranteed to have an occupancy less than the - * threshold occupancy - */ -static LIST_HEAD(rmid_free_lru); - -/* - * @closid_num_dirty_rmid The number of dirty RMID each CLOSID has. - * Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined. - * Indexed by CLOSID. Protected by rdtgroup_mutex. - */ -static u32 *closid_num_dirty_rmid; - -/* - * @rmid_limbo_count - count of currently unused but (potentially) - * dirty RMIDs. - * This counts RMIDs that no one is currently using but that - * may have a occupancy value > resctrl_rmid_realloc_threshold. User can - * change the threshold occupancy value. - */ -static unsigned int rmid_limbo_count; - -/* - * @rmid_entry - The entry in the limbo and free lists. - */ -static struct rmid_entry *rmid_ptrs; /* * Global boolean for rdt_monitor which is true if any @@ -86,23 +36,12 @@ bool rdt_mon_capable; */ unsigned int rdt_mon_features; -/* - * This is the threshold cache occupancy in bytes at which we will consider an - * RMID available for re-allocation. - */ -unsigned int resctrl_rmid_realloc_threshold; - -/* - * This is the maximum value for the reallocation threshold, in bytes. - */ -unsigned int resctrl_rmid_realloc_limit; - #define CF(cf) ((unsigned long)(1048576 * (cf) + 0.5)) static int snc_nodes_per_l3_cache = 1; /* - * The correction factor table is documented in Documentation/arch/x86/resctrl.rst. + * The correction factor table is documented in Documentation/filesystems/resctrl.rst. * If rmid > rmid threshold, MBM total and local values should be multiplied * by the correction factor. 
* @@ -151,6 +90,7 @@ static const struct mbm_correction_factor_table { }; static u32 mbm_cf_rmidthreshold __read_mostly = UINT_MAX; + static u64 mbm_cf __read_mostly; static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) @@ -163,33 +103,6 @@ static inline u64 get_corrected_mbm_count(u32 rmid, unsigned long val) } /* - * x86 and arm64 differ in their handling of monitoring. - * x86's RMID are independent numbers, there is only one source of traffic - * with an RMID value of '1'. - * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of - * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID - * value is no longer unique. - * To account for this, resctrl uses an index. On x86 this is just the RMID, - * on arm64 it encodes the CLOSID and RMID. This gives a unique number. - * - * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code - * must accept an attempt to read every index. - */ -static inline struct rmid_entry *__rmid_entry(u32 idx) -{ - struct rmid_entry *entry; - u32 closid, rmid; - - entry = &rmid_ptrs[idx]; - resctrl_arch_rmid_idx_decode(idx, &closid, &rmid); - - WARN_ON_ONCE(entry->closid != closid); - WARN_ON_ONCE(entry->rmid != rmid); - - return entry; -} - -/* * When Sub-NUMA Cluster (SNC) mode is not enabled (as indicated by * "snc_nodes_per_l3_cache == 1") no translation of the RMID value is * needed. The physical RMID is the same as the logical RMID. @@ -238,7 +151,7 @@ static int __rmid_read_phys(u32 prmid, enum resctrl_event_id eventid, u64 *val) * are error bits. 
*/ wrmsr(MSR_IA32_QM_EVTSEL, eventid, prmid); - rdmsrl(MSR_IA32_QM_CTR, msr_val); + rdmsrq(MSR_IA32_QM_CTR, msr_val); if (msr_val & RMID_VAL_ERROR) return -EIO; @@ -260,12 +173,11 @@ static struct arch_mbm_state *get_arch_mbm_state(struct rdt_hw_mon_domain *hw_do return &hw_dom->arch_mbm_total[rmid]; case QOS_L3_MBM_LOCAL_EVENT_ID: return &hw_dom->arch_mbm_local[rmid]; + default: + /* Never expect to get here */ + WARN_ON_ONCE(1); + return NULL; } - - /* Never expect to get here */ - WARN_ON_ONCE(1); - - return NULL; } void resctrl_arch_reset_rmid(struct rdt_resource *r, struct rdt_mon_domain *d, @@ -346,769 +258,6 @@ int resctrl_arch_rmid_read(struct rdt_resource *r, struct rdt_mon_domain *d, return 0; } -static void limbo_release_entry(struct rmid_entry *entry) -{ - lockdep_assert_held(&rdtgroup_mutex); - - rmid_limbo_count--; - list_add_tail(&entry->list, &rmid_free_lru); - - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) - closid_num_dirty_rmid[entry->closid]--; -} - -/* - * Check the RMIDs that are marked as busy for this domain. If the - * reported LLC occupancy is below the threshold clear the busy bit and - * decrement the count. If the busy count gets to zero on an RMID, we - * free the RMID - */ -void __check_limbo(struct rdt_mon_domain *d, bool force_free) -{ - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - struct rmid_entry *entry; - u32 idx, cur_idx = 1; - void *arch_mon_ctx; - bool rmid_dirty; - u64 val = 0; - - arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID); - if (IS_ERR(arch_mon_ctx)) { - pr_warn_ratelimited("Failed to allocate monitor context: %ld", - PTR_ERR(arch_mon_ctx)); - return; - } - - /* - * Skip RMID 0 and start from RMID 1 and check all the RMIDs that - * are marked as busy for occupancy < threshold. 
If the occupancy - * is less than the threshold decrement the busy counter of the - * RMID and move it to the free list when the counter reaches 0. - */ - for (;;) { - idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx); - if (idx >= idx_limit) - break; - - entry = __rmid_entry(idx); - if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, - QOS_L3_OCCUP_EVENT_ID, &val, - arch_mon_ctx)) { - rmid_dirty = true; - } else { - rmid_dirty = (val >= resctrl_rmid_realloc_threshold); - - /* - * x86's CLOSID and RMID are independent numbers, so the entry's - * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the - * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't - * used to select the configuration. It is thus necessary to track both - * CLOSID and RMID because there may be dependencies between them - * on some architectures. - */ - trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val); - } - - if (force_free || !rmid_dirty) { - clear_bit(idx, d->rmid_busy_llc); - if (!--entry->busy) - limbo_release_entry(entry); - } - cur_idx = idx + 1; - } - - resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx); -} - -bool has_busy_rmid(struct rdt_mon_domain *d) -{ - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - - return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit; -} - -static struct rmid_entry *resctrl_find_free_rmid(u32 closid) -{ - struct rmid_entry *itr; - u32 itr_idx, cmp_idx; - - if (list_empty(&rmid_free_lru)) - return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC); - - list_for_each_entry(itr, &rmid_free_lru, list) { - /* - * Get the index of this free RMID, and the index it would need - * to be if it were used with this CLOSID. - * If the CLOSID is irrelevant on this architecture, the two - * index values are always the same on every entry and thus the - * very first entry will be returned. 
- */ - itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid); - cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid); - - if (itr_idx == cmp_idx) - return itr; - } - - return ERR_PTR(-ENOSPC); -} - -/** - * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated - * RMID are clean, or the CLOSID that has - * the most clean RMID. - * - * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID - * may not be able to allocate clean RMID. To avoid this the allocator will - * choose the CLOSID with the most clean RMID. - * - * When the CLOSID and RMID are independent numbers, the first free CLOSID will - * be returned. - */ -int resctrl_find_cleanest_closid(void) -{ - u32 cleanest_closid = ~0; - int i = 0; - - lockdep_assert_held(&rdtgroup_mutex); - - if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) - return -EIO; - - for (i = 0; i < closids_supported(); i++) { - int num_dirty; - - if (closid_allocated(i)) - continue; - - num_dirty = closid_num_dirty_rmid[i]; - if (num_dirty == 0) - return i; - - if (cleanest_closid == ~0) - cleanest_closid = i; - - if (num_dirty < closid_num_dirty_rmid[cleanest_closid]) - cleanest_closid = i; - } - - if (cleanest_closid == ~0) - return -ENOSPC; - - return cleanest_closid; -} - -/* - * For MPAM the RMID value is not unique, and has to be considered with - * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which - * allows all domains to be managed by a single free list. - * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler. 
- */ -int alloc_rmid(u32 closid) -{ - struct rmid_entry *entry; - - lockdep_assert_held(&rdtgroup_mutex); - - entry = resctrl_find_free_rmid(closid); - if (IS_ERR(entry)) - return PTR_ERR(entry); - - list_del(&entry->list); - return entry->rmid; -} - -static void add_rmid_to_limbo(struct rmid_entry *entry) -{ - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - struct rdt_mon_domain *d; - u32 idx; - - lockdep_assert_held(&rdtgroup_mutex); - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid); - - entry->busy = 0; - list_for_each_entry(d, &r->mon_domains, hdr.list) { - /* - * For the first limbo RMID in the domain, - * setup up the limbo worker. - */ - if (!has_busy_rmid(d)) - cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL, - RESCTRL_PICK_ANY_CPU); - set_bit(idx, d->rmid_busy_llc); - entry->busy++; - } - - rmid_limbo_count++; - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) - closid_num_dirty_rmid[entry->closid]++; -} - -void free_rmid(u32 closid, u32 rmid) -{ - u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); - struct rmid_entry *entry; - - lockdep_assert_held(&rdtgroup_mutex); - - /* - * Do not allow the default rmid to be free'd. Comparing by index - * allows architectures that ignore the closid parameter to avoid an - * unnecessary check. 
- */ - if (!resctrl_arch_mon_capable() || - idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, - RESCTRL_RESERVED_RMID)) - return; - - entry = __rmid_entry(idx); - - if (resctrl_arch_is_llc_occupancy_enabled()) - add_rmid_to_limbo(entry); - else - list_add_tail(&entry->list, &rmid_free_lru); -} - -static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, - u32 rmid, enum resctrl_event_id evtid) -{ - u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); - - switch (evtid) { - case QOS_L3_MBM_TOTAL_EVENT_ID: - return &d->mbm_total[idx]; - case QOS_L3_MBM_LOCAL_EVENT_ID: - return &d->mbm_local[idx]; - default: - return NULL; - } -} - -static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) -{ - int cpu = smp_processor_id(); - struct rdt_mon_domain *d; - struct mbm_state *m; - int err, ret; - u64 tval = 0; - - if (rr->first) { - resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); - m = get_mbm_state(rr->d, closid, rmid, rr->evtid); - if (m) - memset(m, 0, sizeof(struct mbm_state)); - return 0; - } - - if (rr->d) { - /* Reading a single domain, must be on a CPU in that domain. */ - if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask)) - return -EINVAL; - rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); - if (rr->err) - return rr->err; - - rr->val += tval; - - return 0; - } - - /* Summing domains that share a cache, must be on a CPU for that cache. */ - if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map)) - return -EINVAL; - - /* - * Legacy files must report the sum of an event across all - * domains that share the same L3 cache instance. - * Report success if a read from any domain succeeds, -EINVAL - * (translated to "Unavailable" for user space) if reading from - * all domains fail for any reason. 
- */ - ret = -EINVAL; - list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { - if (d->ci->id != rr->ci->id) - continue; - err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, - rr->evtid, &tval, rr->arch_mon_ctx); - if (!err) { - rr->val += tval; - ret = 0; - } - } - - if (ret) - rr->err = ret; - - return ret; -} - -/* - * mbm_bw_count() - Update bw count from values previously read by - * __mon_event_count(). - * @closid: The closid used to identify the cached mbm_state. - * @rmid: The rmid used to identify the cached mbm_state. - * @rr: The struct rmid_read populated by __mon_event_count(). - * - * Supporting function to calculate the memory bandwidth - * and delta bandwidth in MBps. The chunks value previously read by - * __mon_event_count() is compared with the chunks value from the previous - * invocation. This must be called once per second to maintain values in MBps. - */ -static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) -{ - u64 cur_bw, bytes, cur_bytes; - struct mbm_state *m; - - m = get_mbm_state(rr->d, closid, rmid, rr->evtid); - if (WARN_ON_ONCE(!m)) - return; - - cur_bytes = rr->val; - bytes = cur_bytes - m->prev_bw_bytes; - m->prev_bw_bytes = cur_bytes; - - cur_bw = bytes / SZ_1M; - - m->prev_bw = cur_bw; -} - -/* - * This is scheduled by mon_event_read() to read the CQM/MBM counters - * on a domain. - */ -void mon_event_count(void *info) -{ - struct rdtgroup *rdtgrp, *entry; - struct rmid_read *rr = info; - struct list_head *head; - int ret; - - rdtgrp = rr->rgrp; - - ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr); - - /* - * For Ctrl groups read data from child monitor groups and - * add them together. Count events which are read successfully. - * Discard the rmid_read's reporting errors. 
- */ - head = &rdtgrp->mon.crdtgrp_list; - - if (rdtgrp->type == RDTCTRL_GROUP) { - list_for_each_entry(entry, head, mon.crdtgrp_list) { - if (__mon_event_count(entry->closid, entry->mon.rmid, - rr) == 0) - ret = 0; - } - } - - /* - * __mon_event_count() calls for newly created monitor groups may - * report -EINVAL/Unavailable if the monitor hasn't seen any traffic. - * Discard error if any of the monitor event reads succeeded. - */ - if (ret == 0) - rr->err = 0; -} - -static struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, - struct rdt_resource *r) -{ - struct rdt_ctrl_domain *d; - - lockdep_assert_cpus_held(); - - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) - return d; - } - - return NULL; -} - -/* - * Feedback loop for MBA software controller (mba_sc) - * - * mba_sc is a feedback loop where we periodically read MBM counters and - * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so - * that: - * - * current bandwidth(cur_bw) < user specified bandwidth(user_bw) - * - * This uses the MBM counters to measure the bandwidth and MBA throttle - * MSRs to control the bandwidth for a particular rdtgrp. It builds on the - * fact that resctrl rdtgroups have both monitoring and control. - * - * The frequency of the checks is 1s and we just tag along the MBM overflow - * timer. Having 1s interval makes the calculation of bandwidth simpler. - * - * Although MBA's goal is to restrict the bandwidth to a maximum, there may - * be a need to increase the bandwidth to avoid unnecessarily restricting - * the L2 <-> L3 traffic. - * - * Since MBA controls the L2 external bandwidth where as MBM measures the - * L3 external bandwidth the following sequence could lead to such a - * situation. 
- * - * Consider an rdtgroup which had high L3 <-> memory traffic in initial - * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but - * after some time rdtgroup has mostly L2 <-> L3 traffic. - * - * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its - * throttle MSRs already have low percentage values. To avoid - * unnecessarily restricting such rdtgroups, we also increase the bandwidth. - */ -static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) -{ - u32 closid, rmid, cur_msr_val, new_msr_val; - struct mbm_state *pmbm_data, *cmbm_data; - struct rdt_ctrl_domain *dom_mba; - enum resctrl_event_id evt_id; - struct rdt_resource *r_mba; - struct list_head *head; - struct rdtgroup *entry; - u32 cur_bw, user_bw; - - r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA); - evt_id = rgrp->mba_mbps_event; - - closid = rgrp->closid; - rmid = rgrp->mon.rmid; - pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id); - if (WARN_ON_ONCE(!pmbm_data)) - return; - - dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba); - if (!dom_mba) { - pr_warn_once("Failure to get domain for MBA update\n"); - return; - } - - cur_bw = pmbm_data->prev_bw; - user_bw = dom_mba->mbps_val[closid]; - - /* MBA resource doesn't support CDP */ - cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); - - /* - * For Ctrl groups read data from child monitor groups. - */ - head = &rgrp->mon.crdtgrp_list; - list_for_each_entry(entry, head, mon.crdtgrp_list) { - cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id); - if (WARN_ON_ONCE(!cmbm_data)) - return; - cur_bw += cmbm_data->prev_bw; - } - - /* - * Scale up/down the bandwidth linearly for the ctrl group. The - * bandwidth step is the bandwidth granularity specified by the - * hardware. - * Always increase throttling if current bandwidth is above the - * target set by user. 
- * But avoid thrashing up and down on every poll by checking - * whether a decrease in throttling is likely to push the group - * back over target. E.g. if currently throttling to 30% of bandwidth - * on a system with 10% granularity steps, check whether moving to - * 40% would go past the limit by multiplying current bandwidth by - * "(30 + 10) / 30". - */ - if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) { - new_msr_val = cur_msr_val - r_mba->membw.bw_gran; - } else if (cur_msr_val < MAX_MBA_BW && - (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) { - new_msr_val = cur_msr_val + r_mba->membw.bw_gran; - } else { - return; - } - - resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); -} - -static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d, - u32 closid, u32 rmid, enum resctrl_event_id evtid) -{ - struct rmid_read rr = {0}; - - rr.r = r; - rr.d = d; - rr.evtid = evtid; - rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); - if (IS_ERR(rr.arch_mon_ctx)) { - pr_warn_ratelimited("Failed to allocate monitor context: %ld", - PTR_ERR(rr.arch_mon_ctx)); - return; - } - - __mon_event_count(closid, rmid, &rr); - - /* - * If the software controller is enabled, compute the - * bandwidth for this event id. - */ - if (is_mba_sc(NULL)) - mbm_bw_count(closid, rmid, &rr); - - resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); -} - -static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, - u32 closid, u32 rmid) -{ - /* - * This is protected from concurrent reads from user as both - * the user and overflow handler hold the global mutex. 
- */ - if (resctrl_arch_is_mbm_total_enabled()) - mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID); - - if (resctrl_arch_is_mbm_local_enabled()) - mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID); -} - -/* - * Handler to scan the limbo list and move the RMIDs - * to free list whose occupancy < threshold_occupancy. - */ -void cqm_handle_limbo(struct work_struct *work) -{ - unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); - struct rdt_mon_domain *d; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - d = container_of(work, struct rdt_mon_domain, cqm_limbo.work); - - __check_limbo(d, false); - - if (has_busy_rmid(d)) { - d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, - RESCTRL_PICK_ANY_CPU); - schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo, - delay); - } - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); -} - -/** - * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this - * domain. - * @dom: The domain the limbo handler should run for. - * @delay_ms: How far in the future the handler should run. - * @exclude_cpu: Which CPU the handler should not run on, - * RESCTRL_PICK_ANY_CPU to pick any CPU. - */ -void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, - int exclude_cpu) -{ - unsigned long delay = msecs_to_jiffies(delay_ms); - int cpu; - - cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); - dom->cqm_work_cpu = cpu; - - if (cpu < nr_cpu_ids) - schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); -} - -void mbm_handle_overflow(struct work_struct *work) -{ - unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); - struct rdtgroup *prgrp, *crgrp; - struct rdt_mon_domain *d; - struct list_head *head; - struct rdt_resource *r; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - /* - * If the filesystem has been unmounted this work no longer needs to - * run. 
- */ - if (!resctrl_mounted || !resctrl_arch_mon_capable()) - goto out_unlock; - - r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - d = container_of(work, struct rdt_mon_domain, mbm_over.work); - - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mbm_update(r, d, prgrp->closid, prgrp->mon.rmid); - - head = &prgrp->mon.crdtgrp_list; - list_for_each_entry(crgrp, head, mon.crdtgrp_list) - mbm_update(r, d, crgrp->closid, crgrp->mon.rmid); - - if (is_mba_sc(NULL)) - update_mba_bw(prgrp, d); - } - - /* - * Re-check for housekeeping CPUs. This allows the overflow handler to - * move off a nohz_full CPU quickly. - */ - d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, - RESCTRL_PICK_ANY_CPU); - schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay); - -out_unlock: - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); -} - -/** - * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this - * domain. - * @dom: The domain the overflow handler should run for. - * @delay_ms: How far in the future the handler should run. - * @exclude_cpu: Which CPU the handler should not run on, - * RESCTRL_PICK_ANY_CPU to pick any CPU. - */ -void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, - int exclude_cpu) -{ - unsigned long delay = msecs_to_jiffies(delay_ms); - int cpu; - - /* - * When a domain comes online there is no guarantee the filesystem is - * mounted. If not, there is no need to catch counter overflow. 
- */ - if (!resctrl_mounted || !resctrl_arch_mon_capable()) - return; - cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); - dom->mbm_work_cpu = cpu; - - if (cpu < nr_cpu_ids) - schedule_delayed_work_on(cpu, &dom->mbm_over, delay); -} - -static int dom_data_init(struct rdt_resource *r) -{ - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - u32 num_closid = resctrl_arch_get_num_closid(r); - struct rmid_entry *entry = NULL; - int err = 0, i; - u32 idx; - - mutex_lock(&rdtgroup_mutex); - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - u32 *tmp; - - /* - * If the architecture hasn't provided a sanitised value here, - * this may result in larger arrays than necessary. Resctrl will - * use a smaller system wide value based on the resources in - * use. - */ - tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL); - if (!tmp) { - err = -ENOMEM; - goto out_unlock; - } - - closid_num_dirty_rmid = tmp; - } - - rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL); - if (!rmid_ptrs) { - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - kfree(closid_num_dirty_rmid); - closid_num_dirty_rmid = NULL; - } - err = -ENOMEM; - goto out_unlock; - } - - for (i = 0; i < idx_limit; i++) { - entry = &rmid_ptrs[i]; - INIT_LIST_HEAD(&entry->list); - - resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid); - list_add_tail(&entry->list, &rmid_free_lru); - } - - /* - * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and - * are always allocated. These are used for the rdtgroup_default - * control group, which will be setup later in resctrl_init(). 
- */ - idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, - RESCTRL_RESERVED_RMID); - entry = __rmid_entry(idx); - list_del(&entry->list); - -out_unlock: - mutex_unlock(&rdtgroup_mutex); - - return err; -} - -static void dom_data_exit(struct rdt_resource *r) -{ - mutex_lock(&rdtgroup_mutex); - - if (!r->mon_capable) - goto out_unlock; - - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { - kfree(closid_num_dirty_rmid); - closid_num_dirty_rmid = NULL; - } - - kfree(rmid_ptrs); - rmid_ptrs = NULL; - -out_unlock: - mutex_unlock(&rdtgroup_mutex); -} - -static struct mon_evt llc_occupancy_event = { - .name = "llc_occupancy", - .evtid = QOS_L3_OCCUP_EVENT_ID, -}; - -static struct mon_evt mbm_total_event = { - .name = "mbm_total_bytes", - .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, -}; - -static struct mon_evt mbm_local_event = { - .name = "mbm_local_bytes", - .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, -}; - -/* - * Initialize the event list for the resource. - * - * Note that MBM events are also part of RDT_RESOURCE_L3 resource - * because as per the SDM the total and local memory bandwidth - * are enumerated as part of L3 monitoring. - */ -static void l3_mon_evt_init(struct rdt_resource *r) -{ - INIT_LIST_HEAD(&r->evt_list); - - if (resctrl_arch_is_llc_occupancy_enabled()) - list_add_tail(&llc_occupancy_event.list, &r->evt_list); - if (resctrl_arch_is_mbm_total_enabled()) - list_add_tail(&mbm_total_event.list, &r->evt_list); - if (resctrl_arch_is_mbm_local_enabled()) - list_add_tail(&mbm_local_event.list, &r->evt_list); -} - /* * The power-on reset value of MSR_RMID_SNC_CONFIG is 0x1 * which indicates that RMIDs are configured in legacy mode. @@ -1192,51 +341,6 @@ static __init int snc_get_config(void) return ret; } -/** - * resctrl_mon_resource_init() - Initialise global monitoring structures. - * - * Allocate and initialise global monitor resources that do not belong to a - * specific domain. i.e. the rmid_ptrs[] used for the limbo and free lists. 
- * Called once during boot after the struct rdt_resource's have been configured - * but before the filesystem is mounted. - * Resctrl's cpuhp callbacks may be called before this point to bring a domain - * online. - * - * Returns 0 for success, or -ENOMEM. - */ -int __init resctrl_mon_resource_init(void) -{ - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - int ret; - - if (!r->mon_capable) - return 0; - - ret = dom_data_init(r); - if (ret) - return ret; - - l3_mon_evt_init(r); - - if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) { - mbm_total_event.configurable = true; - resctrl_file_fflags_init("mbm_total_bytes_config", - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); - } - if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) { - mbm_local_event.configurable = true; - resctrl_file_fflags_init("mbm_local_bytes_config", - RFTYPE_MON_INFO | RFTYPE_RES_CACHE); - } - - if (resctrl_arch_is_mbm_local_enabled()) - mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID; - else if (resctrl_arch_is_mbm_total_enabled()) - mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID; - - return 0; -} - int __init rdt_get_mon_l3_config(struct rdt_resource *r) { unsigned int mbm_offset = boot_cpu_data.x86_cache_mbm_width_offset; @@ -1284,13 +388,6 @@ int __init rdt_get_mon_l3_config(struct rdt_resource *r) return 0; } -void resctrl_mon_resource_exit(void) -{ - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - - dom_data_exit(r); -} - void __init intel_rdt_mbm_apply_quirk(void) { int cf_index; diff --git a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c index 92ea1472bde9..de580eca3363 100644 --- a/arch/x86/kernel/cpu/resctrl/pseudo_lock.c +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock.c @@ -11,26 +11,22 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt +#include <linux/cacheflush.h> #include <linux/cpu.h> -#include <linux/cpumask.h> -#include <linux/debugfs.h> -#include <linux/kthread.h> -#include 
<linux/mman.h> #include <linux/perf_event.h> #include <linux/pm_qos.h> -#include <linux/slab.h> -#include <linux/uaccess.h> +#include <linux/resctrl.h> -#include <asm/cacheflush.h> #include <asm/cpu_device_id.h> -#include <asm/resctrl.h> #include <asm/perf_event.h> +#include <asm/msr.h> #include "../../events/perf_event.h" /* For X86_CONFIG() */ #include "internal.h" #define CREATE_TRACE_POINTS -#include "trace.h" + +#include "pseudo_lock_trace.h" /* * The bits needed to disable hardware prefetching varies based on the @@ -38,29 +34,6 @@ */ static u64 prefetch_disable_bits; -/* - * Major number assigned to and shared by all devices exposing - * pseudo-locked regions. - */ -static unsigned int pseudo_lock_major; -static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0); - -static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode) -{ - const struct rdtgroup *rdtgrp; - - rdtgrp = dev_get_drvdata(dev); - if (mode) - *mode = 0600; - guard(mutex)(&rdtgroup_mutex); - return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdt_kn_name(rdtgrp->kn)); -} - -static const struct class pseudo_lock_class = { - .name = "pseudo_lock", - .devnode = pseudo_lock_devnode, -}; - /** * resctrl_arch_get_prefetch_disable_bits - prefetch disable bits of supported * platforms @@ -122,298 +95,6 @@ u64 resctrl_arch_get_prefetch_disable_bits(void) } /** - * pseudo_lock_minor_get - Obtain available minor number - * @minor: Pointer to where new minor number will be stored - * - * A bitmask is used to track available minor numbers. Here the next free - * minor number is marked as unavailable and returned. - * - * Return: 0 on success, <0 on failure. 
- */ -static int pseudo_lock_minor_get(unsigned int *minor) -{ - unsigned long first_bit; - - first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS); - - if (first_bit == MINORBITS) - return -ENOSPC; - - __clear_bit(first_bit, &pseudo_lock_minor_avail); - *minor = first_bit; - - return 0; -} - -/** - * pseudo_lock_minor_release - Return minor number to available - * @minor: The minor number made available - */ -static void pseudo_lock_minor_release(unsigned int minor) -{ - __set_bit(minor, &pseudo_lock_minor_avail); -} - -/** - * region_find_by_minor - Locate a pseudo-lock region by inode minor number - * @minor: The minor number of the device representing pseudo-locked region - * - * When the character device is accessed we need to determine which - * pseudo-locked region it belongs to. This is done by matching the minor - * number of the device to the pseudo-locked region it belongs. - * - * Minor numbers are assigned at the time a pseudo-locked region is associated - * with a cache instance. - * - * Return: On success return pointer to resource group owning the pseudo-locked - * region, NULL on failure. 
- */ -static struct rdtgroup *region_find_by_minor(unsigned int minor) -{ - struct rdtgroup *rdtgrp, *rdtgrp_match = NULL; - - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (rdtgrp->plr && rdtgrp->plr->minor == minor) { - rdtgrp_match = rdtgrp; - break; - } - } - return rdtgrp_match; -} - -/** - * struct pseudo_lock_pm_req - A power management QoS request list entry - * @list: Entry within the @pm_reqs list for a pseudo-locked region - * @req: PM QoS request - */ -struct pseudo_lock_pm_req { - struct list_head list; - struct dev_pm_qos_request req; -}; - -static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) -{ - struct pseudo_lock_pm_req *pm_req, *next; - - list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) { - dev_pm_qos_remove_request(&pm_req->req); - list_del(&pm_req->list); - kfree(pm_req); - } -} - -/** - * pseudo_lock_cstates_constrain - Restrict cores from entering C6 - * @plr: Pseudo-locked region - * - * To prevent the cache from being affected by power management entering - * C6 has to be avoided. This is accomplished by requesting a latency - * requirement lower than lowest C6 exit latency of all supported - * platforms as found in the cpuidle state tables in the intel_idle driver. - * At this time it is possible to do so with a single latency requirement - * for all supported platforms. - * - * Since Goldmont is supported, which is affected by X86_BUG_MONITOR, - * the ACPI latencies need to be considered while keeping in mind that C2 - * may be set to map to deeper sleep states. In this case the latency - * requirement needs to prevent entering C2 also. 
- * - * Return: 0 on success, <0 on failure - */ -static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr) -{ - struct pseudo_lock_pm_req *pm_req; - int cpu; - int ret; - - for_each_cpu(cpu, &plr->d->hdr.cpu_mask) { - pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL); - if (!pm_req) { - rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n"); - ret = -ENOMEM; - goto out_err; - } - ret = dev_pm_qos_add_request(get_cpu_device(cpu), - &pm_req->req, - DEV_PM_QOS_RESUME_LATENCY, - 30); - if (ret < 0) { - rdt_last_cmd_printf("Failed to add latency req CPU%d\n", - cpu); - kfree(pm_req); - ret = -1; - goto out_err; - } - list_add(&pm_req->list, &plr->pm_reqs); - } - - return 0; - -out_err: - pseudo_lock_cstates_relax(plr); - return ret; -} - -/** - * pseudo_lock_region_clear - Reset pseudo-lock region data - * @plr: pseudo-lock region - * - * All content of the pseudo-locked region is reset - any memory allocated - * freed. - * - * Return: void - */ -static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) -{ - plr->size = 0; - plr->line_size = 0; - kfree(plr->kmem); - plr->kmem = NULL; - plr->s = NULL; - if (plr->d) - plr->d->plr = NULL; - plr->d = NULL; - plr->cbm = 0; - plr->debugfs_dir = NULL; -} - -/** - * pseudo_lock_region_init - Initialize pseudo-lock region information - * @plr: pseudo-lock region - * - * Called after user provided a schemata to be pseudo-locked. From the - * schemata the &struct pseudo_lock_region is on entry already initialized - * with the resource, domain, and capacity bitmask. Here the information - * required for pseudo-locking is deduced from this data and &struct - * pseudo_lock_region initialized further. 
This information includes: - * - size in bytes of the region to be pseudo-locked - * - cache line size to know the stride with which data needs to be accessed - * to be pseudo-locked - * - a cpu associated with the cache instance on which the pseudo-locking - * flow can be executed - * - * Return: 0 on success, <0 on failure. Descriptive error will be written - * to last_cmd_status buffer. - */ -static int pseudo_lock_region_init(struct pseudo_lock_region *plr) -{ - enum resctrl_scope scope = plr->s->res->ctrl_scope; - struct cacheinfo *ci; - int ret; - - if (WARN_ON_ONCE(scope != RESCTRL_L2_CACHE && scope != RESCTRL_L3_CACHE)) - return -ENODEV; - - /* Pick the first cpu we find that is associated with the cache. */ - plr->cpu = cpumask_first(&plr->d->hdr.cpu_mask); - - if (!cpu_online(plr->cpu)) { - rdt_last_cmd_printf("CPU %u associated with cache not online\n", - plr->cpu); - ret = -ENODEV; - goto out_region; - } - - ci = get_cpu_cacheinfo_level(plr->cpu, scope); - if (ci) { - plr->line_size = ci->coherency_line_size; - plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm); - return 0; - } - - ret = -1; - rdt_last_cmd_puts("Unable to determine cache line size\n"); -out_region: - pseudo_lock_region_clear(plr); - return ret; -} - -/** - * pseudo_lock_init - Initialize a pseudo-lock region - * @rdtgrp: resource group to which new pseudo-locked region will belong - * - * A pseudo-locked region is associated with a resource group. When this - * association is created the pseudo-locked region is initialized. The - * details of the pseudo-locked region are not known at this time so only - * allocation is done and association established. 
- * - * Return: 0 on success, <0 on failure - */ -static int pseudo_lock_init(struct rdtgroup *rdtgrp) -{ - struct pseudo_lock_region *plr; - - plr = kzalloc(sizeof(*plr), GFP_KERNEL); - if (!plr) - return -ENOMEM; - - init_waitqueue_head(&plr->lock_thread_wq); - INIT_LIST_HEAD(&plr->pm_reqs); - rdtgrp->plr = plr; - return 0; -} - -/** - * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked - * @plr: pseudo-lock region - * - * Initialize the details required to set up the pseudo-locked region and - * allocate the contiguous memory that will be pseudo-locked to the cache. - * - * Return: 0 on success, <0 on failure. Descriptive error will be written - * to last_cmd_status buffer. - */ -static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr) -{ - int ret; - - ret = pseudo_lock_region_init(plr); - if (ret < 0) - return ret; - - /* - * We do not yet support contiguous regions larger than - * KMALLOC_MAX_SIZE. - */ - if (plr->size > KMALLOC_MAX_SIZE) { - rdt_last_cmd_puts("Requested region exceeds maximum size\n"); - ret = -E2BIG; - goto out_region; - } - - plr->kmem = kzalloc(plr->size, GFP_KERNEL); - if (!plr->kmem) { - rdt_last_cmd_puts("Unable to allocate memory\n"); - ret = -ENOMEM; - goto out_region; - } - - ret = 0; - goto out; -out_region: - pseudo_lock_region_clear(plr); -out: - return ret; -} - -/** - * pseudo_lock_free - Free a pseudo-locked region - * @rdtgrp: resource group to which pseudo-locked region belonged - * - * The pseudo-locked region's resources have already been released, or not - * yet created at this point. Now it can be freed and disassociated from the - * resource group. 
- * - * Return: void - */ -static void pseudo_lock_free(struct rdtgroup *rdtgrp) -{ - pseudo_lock_region_clear(rdtgrp->plr); - kfree(rdtgrp->plr); - rdtgrp->plr = NULL; -} - -/** * resctrl_arch_pseudo_lock_fn - Load kernel memory into cache * @_plr: the pseudo-lock region descriptor * @@ -481,8 +162,8 @@ int resctrl_arch_pseudo_lock_fn(void *_plr) * the buffer and evict pseudo-locked memory read earlier from the * cache. */ - saved_msr = __rdmsr(MSR_MISC_FEATURE_CONTROL); - __wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + saved_msr = native_rdmsrq(MSR_MISC_FEATURE_CONTROL); + native_wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits); closid_p = this_cpu_read(pqr_state.cur_closid); rmid_p = this_cpu_read(pqr_state.cur_rmid); mem_r = plr->kmem; @@ -494,7 +175,7 @@ int resctrl_arch_pseudo_lock_fn(void *_plr) * pseudo-locked followed by reading of kernel memory to load it * into the cache. */ - __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, plr->closid); + native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, plr->closid); /* * Cache was flushed earlier. Now access kernel memory to read it @@ -531,10 +212,10 @@ int resctrl_arch_pseudo_lock_fn(void *_plr) * Critical section end: restore closid with capacity bitmask that * does not overlap with pseudo-locked region. */ - __wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p); + native_wrmsr(MSR_IA32_PQR_ASSOC, rmid_p, closid_p); /* Re-enable the hardware prefetcher(s) */ - wrmsrl(MSR_MISC_FEATURE_CONTROL, saved_msr); + wrmsrq(MSR_MISC_FEATURE_CONTROL, saved_msr); local_irq_enable(); plr->thread_done = 1; @@ -543,340 +224,6 @@ int resctrl_arch_pseudo_lock_fn(void *_plr) } /** - * rdtgroup_monitor_in_progress - Test if monitoring in progress - * @rdtgrp: resource group being queried - * - * Return: 1 if monitor groups have been created for this resource - * group, 0 otherwise. 
- */ -static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp) -{ - return !list_empty(&rdtgrp->mon.crdtgrp_list); -} - -/** - * rdtgroup_locksetup_user_restrict - Restrict user access to group - * @rdtgrp: resource group needing access restricted - * - * A resource group used for cache pseudo-locking cannot have cpus or tasks - * assigned to it. This is communicated to the user by restricting access - * to all the files that can be used to make such changes. - * - * Permissions restored with rdtgroup_locksetup_user_restore() - * - * Return: 0 on success, <0 on failure. If a failure occurs during the - * restriction of access an attempt will be made to restore permissions but - * the state of the mode of these files will be uncertain when a failure - * occurs. - */ -static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) -{ - int ret; - - ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); - if (ret) - return ret; - - ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); - if (ret) - goto err_tasks; - - ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); - if (ret) - goto err_cpus; - - if (resctrl_arch_mon_capable()) { - ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups"); - if (ret) - goto err_cpus_list; - } - - ret = 0; - goto out; - -err_cpus_list: - rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); -err_cpus: - rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); -err_tasks: - rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); -out: - return ret; -} - -/** - * rdtgroup_locksetup_user_restore - Restore user access to group - * @rdtgrp: resource group needing access restored - * - * Restore all file access previously removed using - * rdtgroup_locksetup_user_restrict() - * - * Return: 0 on success, <0 on failure. If a failure occurs during the - * restoration of access an attempt will be made to restrict permissions - * again but the state of the mode of these files will be uncertain when - * a failure occurs. 
- */ -static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) -{ - int ret; - - ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); - if (ret) - return ret; - - ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); - if (ret) - goto err_tasks; - - ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); - if (ret) - goto err_cpus; - - if (resctrl_arch_mon_capable()) { - ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777); - if (ret) - goto err_cpus_list; - } - - ret = 0; - goto out; - -err_cpus_list: - rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); -err_cpus: - rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); -err_tasks: - rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); -out: - return ret; -} - -/** - * rdtgroup_locksetup_enter - Resource group enters locksetup mode - * @rdtgrp: resource group requested to enter locksetup mode - * - * A resource group enters locksetup mode to reflect that it would be used - * to represent a pseudo-locked region and is in the process of being set - * up to do so. A resource group used for a pseudo-locked region would - * lose the closid associated with it so we cannot allow it to have any - * tasks or cpus assigned nor permit tasks or cpus to be assigned in the - * future. Monitoring of a pseudo-locked region is not allowed either. - * - * The above and more restrictions on a pseudo-locked region are checked - * for and enforced before the resource group enters the locksetup mode. - * - * Returns: 0 if the resource group successfully entered locksetup mode, <0 - * on failure. On failure the last_cmd_status buffer is updated with text to - * communicate details of failure to the user. - */ -int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) -{ - int ret; - - /* - * The default resource group can neither be removed nor lose the - * default closid associated with it. 
- */ - if (rdtgrp == &rdtgroup_default) { - rdt_last_cmd_puts("Cannot pseudo-lock default group\n"); - return -EINVAL; - } - - /* - * Cache Pseudo-locking not supported when CDP is enabled. - * - * Some things to consider if you would like to enable this - * support (using L3 CDP as example): - * - When CDP is enabled two separate resources are exposed, - * L3DATA and L3CODE, but they are actually on the same cache. - * The implication for pseudo-locking is that if a - * pseudo-locked region is created on a domain of one - * resource (eg. L3CODE), then a pseudo-locked region cannot - * be created on that same domain of the other resource - * (eg. L3DATA). This is because the creation of a - * pseudo-locked region involves a call to wbinvd that will - * affect all cache allocations on particular domain. - * - Considering the previous, it may be possible to only - * expose one of the CDP resources to pseudo-locking and - * hide the other. For example, we could consider to only - * expose L3DATA and since the L3 cache is unified it is - * still possible to place instructions there are execute it. - * - If only one region is exposed to pseudo-locking we should - * still keep in mind that availability of a portion of cache - * for pseudo-locking should take into account both resources. - * Similarly, if a pseudo-locked region is created in one - * resource, the portion of cache used by it should be made - * unavailable to all future allocations from both resources. - */ - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) || - resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) { - rdt_last_cmd_puts("CDP enabled\n"); - return -EINVAL; - } - - /* - * Not knowing the bits to disable prefetching implies that this - * platform does not support Cache Pseudo-Locking. 
- */ - if (resctrl_arch_get_prefetch_disable_bits() == 0) { - rdt_last_cmd_puts("Pseudo-locking not supported\n"); - return -EINVAL; - } - - if (rdtgroup_monitor_in_progress(rdtgrp)) { - rdt_last_cmd_puts("Monitoring in progress\n"); - return -EINVAL; - } - - if (rdtgroup_tasks_assigned(rdtgrp)) { - rdt_last_cmd_puts("Tasks assigned to resource group\n"); - return -EINVAL; - } - - if (!cpumask_empty(&rdtgrp->cpu_mask)) { - rdt_last_cmd_puts("CPUs assigned to resource group\n"); - return -EINVAL; - } - - if (rdtgroup_locksetup_user_restrict(rdtgrp)) { - rdt_last_cmd_puts("Unable to modify resctrl permissions\n"); - return -EIO; - } - - ret = pseudo_lock_init(rdtgrp); - if (ret) { - rdt_last_cmd_puts("Unable to init pseudo-lock region\n"); - goto out_release; - } - - /* - * If this system is capable of monitoring a rmid would have been - * allocated when the control group was created. This is not needed - * anymore when this group would be used for pseudo-locking. This - * is safe to call on platforms not capable of monitoring. - */ - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - - ret = 0; - goto out; - -out_release: - rdtgroup_locksetup_user_restore(rdtgrp); -out: - return ret; -} - -/** - * rdtgroup_locksetup_exit - resource group exist locksetup mode - * @rdtgrp: resource group - * - * When a resource group exits locksetup mode the earlier restrictions are - * lifted. 
- * - * Return: 0 on success, <0 on failure - */ -int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) -{ - int ret; - - if (resctrl_arch_mon_capable()) { - ret = alloc_rmid(rdtgrp->closid); - if (ret < 0) { - rdt_last_cmd_puts("Out of RMIDs\n"); - return ret; - } - rdtgrp->mon.rmid = ret; - } - - ret = rdtgroup_locksetup_user_restore(rdtgrp); - if (ret) { - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - return ret; - } - - pseudo_lock_free(rdtgrp); - return 0; -} - -/** - * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked - * @d: RDT domain - * @cbm: CBM to test - * - * @d represents a cache instance and @cbm a capacity bitmask that is - * considered for it. Determine if @cbm overlaps with any existing - * pseudo-locked region on @d. - * - * @cbm is unsigned long, even if only 32 bits are used, to make the - * bitmap functions work correctly. - * - * Return: true if @cbm overlaps with pseudo-locked region on @d, false - * otherwise. - */ -bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm) -{ - unsigned int cbm_len; - unsigned long cbm_b; - - if (d->plr) { - cbm_len = d->plr->s->res->cache.cbm_len; - cbm_b = d->plr->cbm; - if (bitmap_intersects(&cbm, &cbm_b, cbm_len)) - return true; - } - return false; -} - -/** - * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy - * @d: RDT domain under test - * - * The setup of a pseudo-locked region affects all cache instances within - * the hierarchy of the region. It is thus essential to know if any - * pseudo-locked regions exist within a cache hierarchy to prevent any - * attempts to create new pseudo-locked regions in the same hierarchy. - * - * Return: true if a pseudo-locked region exists in the hierarchy of @d or - * if it is not possible to test due to memory allocation issue, - * false otherwise. 
- */ -bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d) -{ - struct rdt_ctrl_domain *d_i; - cpumask_var_t cpu_with_psl; - struct rdt_resource *r; - bool ret = false; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL)) - return true; - - /* - * First determine which cpus have pseudo-locked regions - * associated with them. - */ - for_each_alloc_capable_rdt_resource(r) { - list_for_each_entry(d_i, &r->ctrl_domains, hdr.list) { - if (d_i->plr) - cpumask_or(cpu_with_psl, cpu_with_psl, - &d_i->hdr.cpu_mask); - } - } - - /* - * Next test if new pseudo-locked region would intersect with - * existing region. - */ - if (cpumask_intersects(&d->hdr.cpu_mask, cpu_with_psl)) - ret = true; - - free_cpumask_var(cpu_with_psl); - return ret; -} - -/** * resctrl_arch_measure_cycles_lat_fn - Measure cycle latency to read * pseudo-locked memory * @_plr: pseudo-lock region to measure @@ -904,7 +251,7 @@ int resctrl_arch_measure_cycles_lat_fn(void *_plr) * Disable hardware prefetchers. */ rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); - wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits); mem_r = READ_ONCE(plr->kmem); /* * Dummy execute of the time measurement to load the needed @@ -1000,7 +347,7 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, * Disable hardware prefetchers. */ rdmsr(MSR_MISC_FEATURE_CONTROL, saved_low, saved_high); - wrmsr(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits, 0x0); + wrmsrq(MSR_MISC_FEATURE_CONTROL, prefetch_disable_bits); /* Initialize rest of local variables */ /* @@ -1018,8 +365,8 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, * used in L1 cache, second to capture accurate value that does not * include cache misses incurred because of instruction loads. 
*/ - rdpmcl(hit_pmcnum, hits_before); - rdpmcl(miss_pmcnum, miss_before); + hits_before = rdpmc(hit_pmcnum); + miss_before = rdpmc(miss_pmcnum); /* * From SDM: Performing back-to-back fast reads are not guaranteed * to be monotonic. @@ -1027,8 +374,8 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, * before proceeding. */ rmb(); - rdpmcl(hit_pmcnum, hits_before); - rdpmcl(miss_pmcnum, miss_before); + hits_before = rdpmc(hit_pmcnum); + miss_before = rdpmc(miss_pmcnum); /* * Use LFENCE to ensure all previous instructions are retired * before proceeding. @@ -1050,8 +397,8 @@ static int measure_residency_fn(struct perf_event_attr *miss_attr, * before proceeding. */ rmb(); - rdpmcl(hit_pmcnum, hits_after); - rdpmcl(miss_pmcnum, miss_after); + hits_after = rdpmc(hit_pmcnum); + miss_after = rdpmc(miss_pmcnum); /* * Use LFENCE to ensure all previous instructions are retired * before proceeding. @@ -1168,433 +515,3 @@ out: wake_up_interruptible(&plr->lock_thread_wq); return 0; } - -/** - * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region - * @rdtgrp: Resource group to which the pseudo-locked region belongs. - * @sel: Selector of which measurement to perform on a pseudo-locked region. - * - * The measurement of latency to access a pseudo-locked region should be - * done from a cpu that is associated with that pseudo-locked region. - * Determine which cpu is associated with this region and start a thread on - * that cpu to perform the measurement, wait for that thread to complete. 
- * - * Return: 0 on success, <0 on failure - */ -static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) -{ - struct pseudo_lock_region *plr = rdtgrp->plr; - struct task_struct *thread; - unsigned int cpu; - int ret = -1; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - if (rdtgrp->flags & RDT_DELETED) { - ret = -ENODEV; - goto out; - } - - if (!plr->d) { - ret = -ENODEV; - goto out; - } - - plr->thread_done = 0; - cpu = cpumask_first(&plr->d->hdr.cpu_mask); - if (!cpu_online(cpu)) { - ret = -ENODEV; - goto out; - } - - plr->cpu = cpu; - - if (sel == 1) - thread = kthread_run_on_cpu(resctrl_arch_measure_cycles_lat_fn, - plr, cpu, "pseudo_lock_measure/%u"); - else if (sel == 2) - thread = kthread_run_on_cpu(resctrl_arch_measure_l2_residency, - plr, cpu, "pseudo_lock_measure/%u"); - else if (sel == 3) - thread = kthread_run_on_cpu(resctrl_arch_measure_l3_residency, - plr, cpu, "pseudo_lock_measure/%u"); - else - goto out; - - if (IS_ERR(thread)) { - ret = PTR_ERR(thread); - goto out; - } - - ret = wait_event_interruptible(plr->lock_thread_wq, - plr->thread_done == 1); - if (ret < 0) - goto out; - - ret = 0; - -out: - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - return ret; -} - -static ssize_t pseudo_lock_measure_trigger(struct file *file, - const char __user *user_buf, - size_t count, loff_t *ppos) -{ - struct rdtgroup *rdtgrp = file->private_data; - size_t buf_size; - char buf[32]; - int ret; - int sel; - - buf_size = min(count, (sizeof(buf) - 1)); - if (copy_from_user(buf, user_buf, buf_size)) - return -EFAULT; - - buf[buf_size] = '\0'; - ret = kstrtoint(buf, 10, &sel); - if (ret == 0) { - if (sel != 1 && sel != 2 && sel != 3) - return -EINVAL; - ret = debugfs_file_get(file->f_path.dentry); - if (ret) - return ret; - ret = pseudo_lock_measure_cycles(rdtgrp, sel); - if (ret == 0) - ret = count; - debugfs_file_put(file->f_path.dentry); - } - - return ret; -} - -static const struct file_operations pseudo_measure_fops = { - .write = 
pseudo_lock_measure_trigger, - .open = simple_open, - .llseek = default_llseek, -}; - -/** - * rdtgroup_pseudo_lock_create - Create a pseudo-locked region - * @rdtgrp: resource group to which pseudo-lock region belongs - * - * Called when a resource group in the pseudo-locksetup mode receives a - * valid schemata that should be pseudo-locked. Since the resource group is - * in pseudo-locksetup mode the &struct pseudo_lock_region has already been - * allocated and initialized with the essential information. If a failure - * occurs the resource group remains in the pseudo-locksetup mode with the - * &struct pseudo_lock_region associated with it, but cleared from all - * information and ready for the user to re-attempt pseudo-locking by - * writing the schemata again. - * - * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0 - * on failure. Descriptive error will be written to last_cmd_status buffer. - */ -int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) -{ - struct pseudo_lock_region *plr = rdtgrp->plr; - struct task_struct *thread; - unsigned int new_minor; - struct device *dev; - char *kn_name __free(kfree) = NULL; - int ret; - - ret = pseudo_lock_region_alloc(plr); - if (ret < 0) - return ret; - - ret = pseudo_lock_cstates_constrain(plr); - if (ret < 0) { - ret = -EINVAL; - goto out_region; - } - kn_name = kstrdup(rdt_kn_name(rdtgrp->kn), GFP_KERNEL); - if (!kn_name) { - ret = -ENOMEM; - goto out_cstates; - } - - plr->thread_done = 0; - - thread = kthread_run_on_cpu(resctrl_arch_pseudo_lock_fn, plr, - plr->cpu, "pseudo_lock/%u"); - if (IS_ERR(thread)) { - ret = PTR_ERR(thread); - rdt_last_cmd_printf("Locking thread returned error %d\n", ret); - goto out_cstates; - } - - ret = wait_event_interruptible(plr->lock_thread_wq, - plr->thread_done == 1); - if (ret < 0) { - /* - * If the thread does not get on the CPU for whatever - * reason and the process which sets up the region is - * interrupted then this will leave the thread in 
runnable - * state and once it gets on the CPU it will dereference - * the cleared, but not freed, plr struct resulting in an - * empty pseudo-locking loop. - */ - rdt_last_cmd_puts("Locking thread interrupted\n"); - goto out_cstates; - } - - ret = pseudo_lock_minor_get(&new_minor); - if (ret < 0) { - rdt_last_cmd_puts("Unable to obtain a new minor number\n"); - goto out_cstates; - } - - /* - * Unlock access but do not release the reference. The - * pseudo-locked region will still be here on return. - * - * The mutex has to be released temporarily to avoid a potential - * deadlock with the mm->mmap_lock which is obtained in the - * device_create() and debugfs_create_dir() callpath below as well as - * before the mmap() callback is called. - */ - mutex_unlock(&rdtgroup_mutex); - - if (!IS_ERR_OR_NULL(debugfs_resctrl)) { - plr->debugfs_dir = debugfs_create_dir(kn_name, debugfs_resctrl); - if (!IS_ERR_OR_NULL(plr->debugfs_dir)) - debugfs_create_file("pseudo_lock_measure", 0200, - plr->debugfs_dir, rdtgrp, - &pseudo_measure_fops); - } - - dev = device_create(&pseudo_lock_class, NULL, - MKDEV(pseudo_lock_major, new_minor), - rdtgrp, "%s", kn_name); - - mutex_lock(&rdtgroup_mutex); - - if (IS_ERR(dev)) { - ret = PTR_ERR(dev); - rdt_last_cmd_printf("Failed to create character device: %d\n", - ret); - goto out_debugfs; - } - - /* We released the mutex - check if group was removed while we did so */ - if (rdtgrp->flags & RDT_DELETED) { - ret = -ENODEV; - goto out_device; - } - - plr->minor = new_minor; - - rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED; - closid_free(rdtgrp->closid); - rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444); - rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444); - - ret = 0; - goto out; - -out_device: - device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor)); -out_debugfs: - debugfs_remove_recursive(plr->debugfs_dir); - pseudo_lock_minor_release(new_minor); -out_cstates: - pseudo_lock_cstates_relax(plr); -out_region: - 
pseudo_lock_region_clear(plr); -out: - return ret; -} - -/** - * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region - * @rdtgrp: resource group to which the pseudo-locked region belongs - * - * The removal of a pseudo-locked region can be initiated when the resource - * group is removed from user space via a "rmdir" from userspace or the - * unmount of the resctrl filesystem. On removal the resource group does - * not go back to pseudo-locksetup mode before it is removed, instead it is - * removed directly. There is thus asymmetry with the creation where the - * &struct pseudo_lock_region is removed here while it was not created in - * rdtgroup_pseudo_lock_create(). - * - * Return: void - */ -void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) -{ - struct pseudo_lock_region *plr = rdtgrp->plr; - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - /* - * Default group cannot be a pseudo-locked region so we can - * free closid here. - */ - closid_free(rdtgrp->closid); - goto free; - } - - pseudo_lock_cstates_relax(plr); - debugfs_remove_recursive(rdtgrp->plr->debugfs_dir); - device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor)); - pseudo_lock_minor_release(plr->minor); - -free: - pseudo_lock_free(rdtgrp); -} - -static int pseudo_lock_dev_open(struct inode *inode, struct file *filp) -{ - struct rdtgroup *rdtgrp; - - mutex_lock(&rdtgroup_mutex); - - rdtgrp = region_find_by_minor(iminor(inode)); - if (!rdtgrp) { - mutex_unlock(&rdtgroup_mutex); - return -ENODEV; - } - - filp->private_data = rdtgrp; - atomic_inc(&rdtgrp->waitcount); - /* Perform a non-seekable open - llseek is not supported */ - filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); - - mutex_unlock(&rdtgroup_mutex); - - return 0; -} - -static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) -{ - struct rdtgroup *rdtgrp; - - mutex_lock(&rdtgroup_mutex); - rdtgrp = filp->private_data; - WARN_ON(!rdtgrp); - if (!rdtgrp) { - 
mutex_unlock(&rdtgroup_mutex); - return -ENODEV; - } - filp->private_data = NULL; - atomic_dec(&rdtgrp->waitcount); - mutex_unlock(&rdtgroup_mutex); - return 0; -} - -static int pseudo_lock_dev_mremap(struct vm_area_struct *area) -{ - /* Not supported */ - return -EINVAL; -} - -static const struct vm_operations_struct pseudo_mmap_ops = { - .mremap = pseudo_lock_dev_mremap, -}; - -static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) -{ - unsigned long vsize = vma->vm_end - vma->vm_start; - unsigned long off = vma->vm_pgoff << PAGE_SHIFT; - struct pseudo_lock_region *plr; - struct rdtgroup *rdtgrp; - unsigned long physical; - unsigned long psize; - - mutex_lock(&rdtgroup_mutex); - - rdtgrp = filp->private_data; - WARN_ON(!rdtgrp); - if (!rdtgrp) { - mutex_unlock(&rdtgroup_mutex); - return -ENODEV; - } - - plr = rdtgrp->plr; - - if (!plr->d) { - mutex_unlock(&rdtgroup_mutex); - return -ENODEV; - } - - /* - * Task is required to run with affinity to the cpus associated - * with the pseudo-locked region. If this is not the case the task - * may be scheduled elsewhere and invalidate entries in the - * pseudo-locked region. - */ - if (!cpumask_subset(current->cpus_ptr, &plr->d->hdr.cpu_mask)) { - mutex_unlock(&rdtgroup_mutex); - return -EINVAL; - } - - physical = __pa(plr->kmem) >> PAGE_SHIFT; - psize = plr->size - off; - - if (off > plr->size) { - mutex_unlock(&rdtgroup_mutex); - return -ENOSPC; - } - - /* - * Ensure changes are carried directly to the memory being mapped, - * do not allow copy-on-write mapping. 
- */ - if (!(vma->vm_flags & VM_SHARED)) { - mutex_unlock(&rdtgroup_mutex); - return -EINVAL; - } - - if (vsize > psize) { - mutex_unlock(&rdtgroup_mutex); - return -ENOSPC; - } - - memset(plr->kmem + off, 0, vsize); - - if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff, - vsize, vma->vm_page_prot)) { - mutex_unlock(&rdtgroup_mutex); - return -EAGAIN; - } - vma->vm_ops = &pseudo_mmap_ops; - mutex_unlock(&rdtgroup_mutex); - return 0; -} - -static const struct file_operations pseudo_lock_dev_fops = { - .owner = THIS_MODULE, - .read = NULL, - .write = NULL, - .open = pseudo_lock_dev_open, - .release = pseudo_lock_dev_release, - .mmap = pseudo_lock_dev_mmap, -}; - -int rdt_pseudo_lock_init(void) -{ - int ret; - - ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops); - if (ret < 0) - return ret; - - pseudo_lock_major = ret; - - ret = class_register(&pseudo_lock_class); - if (ret) { - unregister_chrdev(pseudo_lock_major, "pseudo_lock"); - return ret; - } - - return 0; -} - -void rdt_pseudo_lock_release(void) -{ - class_unregister(&pseudo_lock_class); - unregister_chrdev(pseudo_lock_major, "pseudo_lock"); - pseudo_lock_major = 0; -} diff --git a/arch/x86/kernel/cpu/resctrl/trace.h b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h index 2a506316b303..7c8aef08010f 100644 --- a/arch/x86/kernel/cpu/resctrl/trace.h +++ b/arch/x86/kernel/cpu/resctrl/pseudo_lock_trace.h @@ -2,8 +2,8 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM resctrl -#if !defined(_TRACE_RESCTRL_H) || defined(TRACE_HEADER_MULTI_READ) -#define _TRACE_RESCTRL_H +#if !defined(_X86_RESCTRL_PSEUDO_LOCK_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _X86_RESCTRL_PSEUDO_LOCK_TRACE_H #include <linux/tracepoint.h> @@ -35,25 +35,11 @@ TRACE_EVENT(pseudo_lock_l3, TP_printk("hits=%llu miss=%llu", __entry->l3_hits, __entry->l3_miss)); -TRACE_EVENT(mon_llc_occupancy_limbo, - TP_PROTO(u32 ctrl_hw_id, u32 mon_hw_id, int domain_id, u64 llc_occupancy_bytes), - TP_ARGS(ctrl_hw_id, mon_hw_id, 
domain_id, llc_occupancy_bytes), - TP_STRUCT__entry(__field(u32, ctrl_hw_id) - __field(u32, mon_hw_id) - __field(int, domain_id) - __field(u64, llc_occupancy_bytes)), - TP_fast_assign(__entry->ctrl_hw_id = ctrl_hw_id; - __entry->mon_hw_id = mon_hw_id; - __entry->domain_id = domain_id; - __entry->llc_occupancy_bytes = llc_occupancy_bytes;), - TP_printk("ctrl_hw_id=%u mon_hw_id=%u domain_id=%d llc_occupancy_bytes=%llu", - __entry->ctrl_hw_id, __entry->mon_hw_id, __entry->domain_id, - __entry->llc_occupancy_bytes) - ); - -#endif /* _TRACE_RESCTRL_H */ +#endif /* _X86_RESCTRL_PSEUDO_LOCK_TRACE_H */ #undef TRACE_INCLUDE_PATH #define TRACE_INCLUDE_PATH . -#define TRACE_INCLUDE_FILE trace + +#define TRACE_INCLUDE_FILE pseudo_lock_trace + #include <trace/define_trace.h> diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c index cc4a54145c83..885026468440 100644 --- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c +++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c @@ -18,6 +18,7 @@ #include <linux/fs_parser.h> #include <linux/sysfs.h> #include <linux/kernfs.h> +#include <linux/resctrl.h> #include <linux/seq_buf.h> #include <linux/seq_file.h> #include <linux/sched/signal.h> @@ -28,341 +29,17 @@ #include <uapi/linux/magic.h> -#include <asm/resctrl.h> +#include <asm/msr.h> #include "internal.h" DEFINE_STATIC_KEY_FALSE(rdt_enable_key); -DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); -DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); - -/* Mutex to protect rdtgroup access. */ -DEFINE_MUTEX(rdtgroup_mutex); - -static struct kernfs_root *rdt_root; -struct rdtgroup rdtgroup_default; -LIST_HEAD(rdt_all_groups); - -/* list of entries for the schemata file */ -LIST_HEAD(resctrl_schema_all); - -/* The filesystem can only be mounted once. 
*/ -bool resctrl_mounted; - -/* Kernel fs node for "info" directory under root */ -static struct kernfs_node *kn_info; - -/* Kernel fs node for "mon_groups" directory under root */ -static struct kernfs_node *kn_mongrp; - -/* Kernel fs node for "mon_data" directory under root */ -static struct kernfs_node *kn_mondata; - -/* - * Used to store the max resource name width to display the schemata names in - * a tabular format. - */ -int max_name_width; - -static struct seq_buf last_cmd_status; -static char last_cmd_status_buf[512]; - -static int rdtgroup_setup_root(struct rdt_fs_context *ctx); -static void rdtgroup_destroy_root(void); - -struct dentry *debugfs_resctrl; - -/* - * Memory bandwidth monitoring event to use for the default CTRL_MON group - * and each new CTRL_MON group created by the user. Only relevant when - * the filesystem is mounted with the "mba_MBps" option so it does not - * matter that it remains uninitialized on systems that do not support - * the "mba_MBps" option. - */ -enum resctrl_event_id mba_mbps_default_event; - -static bool resctrl_debug; - -void rdt_last_cmd_clear(void) -{ - lockdep_assert_held(&rdtgroup_mutex); - seq_buf_clear(&last_cmd_status); -} - -void rdt_last_cmd_puts(const char *s) -{ - lockdep_assert_held(&rdtgroup_mutex); - seq_buf_puts(&last_cmd_status, s); -} - -void rdt_last_cmd_printf(const char *fmt, ...) 
-{ - va_list ap; - - va_start(ap, fmt); - lockdep_assert_held(&rdtgroup_mutex); - seq_buf_vprintf(&last_cmd_status, fmt, ap); - va_end(ap); -} - -void rdt_staged_configs_clear(void) -{ - struct rdt_ctrl_domain *dom; - struct rdt_resource *r; - - lockdep_assert_held(&rdtgroup_mutex); - - for_each_alloc_capable_rdt_resource(r) { - list_for_each_entry(dom, &r->ctrl_domains, hdr.list) - memset(dom->staged_config, 0, sizeof(dom->staged_config)); - } -} - -static bool resctrl_is_mbm_enabled(void) -{ - return (resctrl_arch_is_mbm_total_enabled() || - resctrl_arch_is_mbm_local_enabled()); -} - -static bool resctrl_is_mbm_event(int e) -{ - return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && - e <= QOS_L3_MBM_LOCAL_EVENT_ID); -} - -/* - * Trivial allocator for CLOSIDs. Since h/w only supports a small number, - * we can keep a bitmap of free CLOSIDs in a single integer. - * - * Using a global CLOSID across all resources has some advantages and - * some drawbacks: - * + We can simply set current's closid to assign a task to a resource - * group. - * + Context switch code can avoid extra memory references deciding which - * CLOSID to load into the PQR_ASSOC MSR - * - We give up some options in configuring resource groups across multi-socket - * systems. - * - Our choices on how to configure each resource become progressively more - * limited as the number of resources grows. 
- */ -static unsigned long closid_free_map; -static int closid_free_map_len; - -int closids_supported(void) -{ - return closid_free_map_len; -} - -static void closid_init(void) -{ - struct resctrl_schema *s; - u32 rdt_min_closid = 32; - - /* Compute rdt_min_closid across all resources */ - list_for_each_entry(s, &resctrl_schema_all, list) - rdt_min_closid = min(rdt_min_closid, s->num_closid); - - closid_free_map = BIT_MASK(rdt_min_closid) - 1; - - /* RESCTRL_RESERVED_CLOSID is always reserved for the default group */ - __clear_bit(RESCTRL_RESERVED_CLOSID, &closid_free_map); - closid_free_map_len = rdt_min_closid; -} - -static int closid_alloc(void) -{ - int cleanest_closid; - u32 closid; - - lockdep_assert_held(&rdtgroup_mutex); - - if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) && - resctrl_arch_is_llc_occupancy_enabled()) { - cleanest_closid = resctrl_find_cleanest_closid(); - if (cleanest_closid < 0) - return cleanest_closid; - closid = cleanest_closid; - } else { - closid = ffs(closid_free_map); - if (closid == 0) - return -ENOSPC; - closid--; - } - __clear_bit(closid, &closid_free_map); - - return closid; -} - -void closid_free(int closid) -{ - lockdep_assert_held(&rdtgroup_mutex); - - __set_bit(closid, &closid_free_map); -} - -/** - * closid_allocated - test if provided closid is in use - * @closid: closid to be tested - * - * Return: true if @closid is currently associated with a resource group, - * false if @closid is free - */ -bool closid_allocated(unsigned int closid) -{ - lockdep_assert_held(&rdtgroup_mutex); - - return !test_bit(closid, &closid_free_map); -} - -/** - * rdtgroup_mode_by_closid - Return mode of resource group with closid - * @closid: closid if the resource group - * - * Each resource group is associated with a @closid. Here the mode - * of a resource group can be queried by searching for it using its closid. 
- * - * Return: mode as &enum rdtgrp_mode of resource group with closid @closid - */ -enum rdtgrp_mode rdtgroup_mode_by_closid(int closid) -{ - struct rdtgroup *rdtgrp; - - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (rdtgrp->closid == closid) - return rdtgrp->mode; - } - - return RDT_NUM_MODES; -} - -static const char * const rdt_mode_str[] = { - [RDT_MODE_SHAREABLE] = "shareable", - [RDT_MODE_EXCLUSIVE] = "exclusive", - [RDT_MODE_PSEUDO_LOCKSETUP] = "pseudo-locksetup", - [RDT_MODE_PSEUDO_LOCKED] = "pseudo-locked", -}; - -/** - * rdtgroup_mode_str - Return the string representation of mode - * @mode: the resource group mode as &enum rdtgroup_mode - * - * Return: string representation of valid mode, "unknown" otherwise - */ -static const char *rdtgroup_mode_str(enum rdtgrp_mode mode) -{ - if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES) - return "unknown"; - - return rdt_mode_str[mode]; -} -/* set uid and gid of rdtgroup dirs and files to that of the creator */ -static int rdtgroup_kn_set_ugid(struct kernfs_node *kn) -{ - struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, - .ia_uid = current_fsuid(), - .ia_gid = current_fsgid(), }; - - if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && - gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) - return 0; - - return kernfs_setattr(kn, &iattr); -} - -static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) -{ - struct kernfs_node *kn; - int ret; - - kn = __kernfs_create_file(parent_kn, rft->name, rft->mode, - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, - 0, rft->kf_ops, rft, NULL, NULL); - if (IS_ERR(kn)) - return PTR_ERR(kn); - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) { - kernfs_remove(kn); - return ret; - } - - return 0; -} - -static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) -{ - struct kernfs_open_file *of = m->private; - struct rftype *rft = of->kn->priv; - - if (rft->seq_show) - return rft->seq_show(of, m, arg); - return 0; -} - -static ssize_t 
rdtgroup_file_write(struct kernfs_open_file *of, char *buf, - size_t nbytes, loff_t off) -{ - struct rftype *rft = of->kn->priv; - - if (rft->write) - return rft->write(of, buf, nbytes, off); - - return -EINVAL; -} - -static const struct kernfs_ops rdtgroup_kf_single_ops = { - .atomic_write_len = PAGE_SIZE, - .write = rdtgroup_file_write, - .seq_show = rdtgroup_seqfile_show, -}; - -static const struct kernfs_ops kf_mondata_ops = { - .atomic_write_len = PAGE_SIZE, - .seq_show = rdtgroup_mondata_show, -}; - -static bool is_cpu_list(struct kernfs_open_file *of) -{ - struct rftype *rft = of->kn->priv; - - return rft->flags & RFTYPE_FLAGS_CPUS_LIST; -} - -static int rdtgroup_cpus_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - struct cpumask *mask; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - - if (rdtgrp) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - if (!rdtgrp->plr->d) { - rdt_last_cmd_clear(); - rdt_last_cmd_puts("Cache domain offline\n"); - ret = -ENODEV; - } else { - mask = &rdtgrp->plr->d->hdr.cpu_mask; - seq_printf(s, is_cpu_list(of) ? - "%*pbl\n" : "%*pb\n", - cpumask_pr_args(mask)); - } - } else { - seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n", - cpumask_pr_args(&rdtgrp->cpu_mask)); - } - } else { - ret = -ENOENT; - } - rdtgroup_kn_unlock(of->kn); +DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); - return ret; -} +DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); /* - * This is safe against resctrl_sched_in() called from __switch_to() + * This is safe against resctrl_arch_sched_in() called from __switch_to() * because __switch_to() is executed with interrupts disabled. A local call * from update_closid_rmid() is protected against __switch_to() because * preemption is disabled. @@ -381,1223 +58,7 @@ void resctrl_arch_sync_cpu_closid_rmid(void *info) * executing task might have its own closid selected. Just reuse * the context switch code. 
*/ - resctrl_sched_in(current); -} - -/* - * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, - * - * Per task closids/rmids must have been set up before calling this function. - * @r may be NULL. - */ -static void -update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) -{ - struct resctrl_cpu_defaults defaults, *p = NULL; - - if (r) { - defaults.closid = r->closid; - defaults.rmid = r->mon.rmid; - p = &defaults; - } - - on_each_cpu_mask(cpu_mask, resctrl_arch_sync_cpu_closid_rmid, p, 1); -} - -static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, - cpumask_var_t tmpmask) -{ - struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp; - struct list_head *head; - - /* Check whether cpus belong to parent ctrl group */ - cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); - if (!cpumask_empty(tmpmask)) { - rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n"); - return -EINVAL; - } - - /* Check whether cpus are dropped from this group */ - cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (!cpumask_empty(tmpmask)) { - /* Give any dropped cpus to parent rdtgroup */ - cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); - update_closid_rmid(tmpmask, prgrp); - } - - /* - * If we added cpus, remove them from previous group that owned them - * and update per-cpu rmid - */ - cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (!cpumask_empty(tmpmask)) { - head = &prgrp->mon.crdtgrp_list; - list_for_each_entry(crgrp, head, mon.crdtgrp_list) { - if (crgrp == rdtgrp) - continue; - cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask, - tmpmask); - } - update_closid_rmid(tmpmask, rdtgrp); - } - - /* Done pushing/pulling - update this group with new mask */ - cpumask_copy(&rdtgrp->cpu_mask, newmask); - - return 0; -} - -static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m) -{ - struct rdtgroup *crgrp; - - cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m); - /* update the child mon group masks as 
well*/ - list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list) - cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask); -} - -static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, - cpumask_var_t tmpmask, cpumask_var_t tmpmask1) -{ - struct rdtgroup *r, *crgrp; - struct list_head *head; - - /* Check whether cpus are dropped from this group */ - cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (!cpumask_empty(tmpmask)) { - /* Can't drop from default group */ - if (rdtgrp == &rdtgroup_default) { - rdt_last_cmd_puts("Can't drop CPUs from default group\n"); - return -EINVAL; - } - - /* Give any dropped cpus to rdtgroup_default */ - cpumask_or(&rdtgroup_default.cpu_mask, - &rdtgroup_default.cpu_mask, tmpmask); - update_closid_rmid(tmpmask, &rdtgroup_default); - } - - /* - * If we added cpus, remove them from previous group and - * the prev group's child groups that owned them - * and update per-cpu closid/rmid. - */ - cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (!cpumask_empty(tmpmask)) { - list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { - if (r == rdtgrp) - continue; - cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); - if (!cpumask_empty(tmpmask1)) - cpumask_rdtgrp_clear(r, tmpmask1); - } - update_closid_rmid(tmpmask, rdtgrp); - } - - /* Done pushing/pulling - update this group with new mask */ - cpumask_copy(&rdtgrp->cpu_mask, newmask); - - /* - * Clear child mon group masks since there is a new parent mask - * now and update the rmid for the cpus the child lost. 
- */ - head = &rdtgrp->mon.crdtgrp_list; - list_for_each_entry(crgrp, head, mon.crdtgrp_list) { - cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask); - update_closid_rmid(tmpmask, rdtgrp); - cpumask_clear(&crgrp->cpu_mask); - } - - return 0; -} - -static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - cpumask_var_t tmpmask, newmask, tmpmask1; - struct rdtgroup *rdtgrp; - int ret; - - if (!buf) - return -EINVAL; - - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) - return -ENOMEM; - if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) { - free_cpumask_var(tmpmask); - return -ENOMEM; - } - if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) { - free_cpumask_var(tmpmask); - free_cpumask_var(newmask); - return -ENOMEM; - } - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - ret = -ENOENT; - goto unlock; - } - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - ret = -EINVAL; - rdt_last_cmd_puts("Pseudo-locking in progress\n"); - goto unlock; - } - - if (is_cpu_list(of)) - ret = cpulist_parse(buf, newmask); - else - ret = cpumask_parse(buf, newmask); - - if (ret) { - rdt_last_cmd_puts("Bad CPU list/mask\n"); - goto unlock; - } - - /* check that user didn't specify any offline cpus */ - cpumask_andnot(tmpmask, newmask, cpu_online_mask); - if (!cpumask_empty(tmpmask)) { - ret = -EINVAL; - rdt_last_cmd_puts("Can only assign online CPUs\n"); - goto unlock; - } - - if (rdtgrp->type == RDTCTRL_GROUP) - ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1); - else if (rdtgrp->type == RDTMON_GROUP) - ret = cpus_mon_write(rdtgrp, newmask, tmpmask); - else - ret = -EINVAL; - -unlock: - rdtgroup_kn_unlock(of->kn); - free_cpumask_var(tmpmask); - free_cpumask_var(newmask); - free_cpumask_var(tmpmask1); - - return ret ?: nbytes; -} - -/** - * rdtgroup_remove - the helper to remove resource group safely - * @rdtgrp: resource group to remove - * - * On resource group creation 
via a mkdir, an extra kernfs_node reference is - * taken to ensure that the rdtgroup structure remains accessible for the - * rdtgroup_kn_unlock() calls where it is removed. - * - * Drop the extra reference here, then free the rdtgroup structure. - * - * Return: void - */ -static void rdtgroup_remove(struct rdtgroup *rdtgrp) -{ - kernfs_put(rdtgrp->kn); - kfree(rdtgrp); -} - -static void _update_task_closid_rmid(void *task) -{ - /* - * If the task is still current on this CPU, update PQR_ASSOC MSR. - * Otherwise, the MSR is updated when the task is scheduled in. - */ - if (task == current) - resctrl_sched_in(task); -} - -static void update_task_closid_rmid(struct task_struct *t) -{ - if (IS_ENABLED(CONFIG_SMP) && task_curr(t)) - smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1); - else - _update_task_closid_rmid(t); -} - -static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp) -{ - u32 closid, rmid = rdtgrp->mon.rmid; - - if (rdtgrp->type == RDTCTRL_GROUP) - closid = rdtgrp->closid; - else if (rdtgrp->type == RDTMON_GROUP) - closid = rdtgrp->mon.parent->closid; - else - return false; - - return resctrl_arch_match_closid(tsk, closid) && - resctrl_arch_match_rmid(tsk, closid, rmid); -} - -static int __rdtgroup_move_task(struct task_struct *tsk, - struct rdtgroup *rdtgrp) -{ - /* If the task is already in rdtgrp, no need to move the task. */ - if (task_in_rdtgroup(tsk, rdtgrp)) - return 0; - - /* - * Set the task's closid/rmid before the PQR_ASSOC MSR can be - * updated by them. - * - * For ctrl_mon groups, move both closid and rmid. - * For monitor groups, can move the tasks only from - * their parent CTRL group. 
- */ - if (rdtgrp->type == RDTMON_GROUP && - !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) { - rdt_last_cmd_puts("Can't move task to different control group\n"); - return -EINVAL; - } - - if (rdtgrp->type == RDTMON_GROUP) - resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid, - rdtgrp->mon.rmid); - else - resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid, - rdtgrp->mon.rmid); - - /* - * Ensure the task's closid and rmid are written before determining if - * the task is current that will decide if it will be interrupted. - * This pairs with the full barrier between the rq->curr update and - * resctrl_sched_in() during context switch. - */ - smp_mb(); - - /* - * By now, the task's closid and rmid are set. If the task is current - * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource - * group go into effect. If the task is not current, the MSR will be - * updated when the task is scheduled in. - */ - update_task_closid_rmid(tsk); - - return 0; -} - -static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) -{ - return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) && - resctrl_arch_match_closid(t, r->closid)); -} - -static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) -{ - return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) && - resctrl_arch_match_rmid(t, r->mon.parent->closid, - r->mon.rmid)); -} - -/** - * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group - * @r: Resource group - * - * Return: 1 if tasks have been assigned to @r, 0 otherwise - */ -int rdtgroup_tasks_assigned(struct rdtgroup *r) -{ - struct task_struct *p, *t; - int ret = 0; - - lockdep_assert_held(&rdtgroup_mutex); - - rcu_read_lock(); - for_each_process_thread(p, t) { - if (is_closid_match(t, r) || is_rmid_match(t, r)) { - ret = 1; - break; - } - } - rcu_read_unlock(); - - return ret; -} - -static int rdtgroup_task_write_permission(struct task_struct *task, - struct 
kernfs_open_file *of) -{ - const struct cred *tcred = get_task_cred(task); - const struct cred *cred = current_cred(); - int ret = 0; - - /* - * Even if we're attaching all tasks in the thread group, we only - * need to check permissions on one of them. - */ - if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && - !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->euid, tcred->suid)) { - rdt_last_cmd_printf("No permission to move task %d\n", task->pid); - ret = -EPERM; - } - - put_cred(tcred); - return ret; -} - -static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp, - struct kernfs_open_file *of) -{ - struct task_struct *tsk; - int ret; - - rcu_read_lock(); - if (pid) { - tsk = find_task_by_vpid(pid); - if (!tsk) { - rcu_read_unlock(); - rdt_last_cmd_printf("No task %d\n", pid); - return -ESRCH; - } - } else { - tsk = current; - } - - get_task_struct(tsk); - rcu_read_unlock(); - - ret = rdtgroup_task_write_permission(tsk, of); - if (!ret) - ret = __rdtgroup_move_task(tsk, rdtgrp); - - put_task_struct(tsk); - return ret; -} - -static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct rdtgroup *rdtgrp; - char *pid_str; - int ret = 0; - pid_t pid; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - rdt_last_cmd_clear(); - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - ret = -EINVAL; - rdt_last_cmd_puts("Pseudo-locking in progress\n"); - goto unlock; - } - - while (buf && buf[0] != '\0' && buf[0] != '\n') { - pid_str = strim(strsep(&buf, ",")); - - if (kstrtoint(pid_str, 0, &pid)) { - rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str); - ret = -EINVAL; - break; - } - - if (pid < 0) { - rdt_last_cmd_printf("Invalid pid %d\n", pid); - ret = -EINVAL; - break; - } - - ret = rdtgroup_move_task(pid, rdtgrp, of); - if (ret) { - rdt_last_cmd_printf("Error while processing task %d\n", pid); - 
break; - } - } - -unlock: - rdtgroup_kn_unlock(of->kn); - - return ret ?: nbytes; -} - -static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) -{ - struct task_struct *p, *t; - pid_t pid; - - rcu_read_lock(); - for_each_process_thread(p, t) { - if (is_closid_match(t, r) || is_rmid_match(t, r)) { - pid = task_pid_vnr(t); - if (pid) - seq_printf(s, "%d\n", pid); - } - } - rcu_read_unlock(); -} - -static int rdtgroup_tasks_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) - show_rdt_tasks(rdtgrp, s); - else - ret = -ENOENT; - rdtgroup_kn_unlock(of->kn); - - return ret; -} - -static int rdtgroup_closid_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) - seq_printf(s, "%u\n", rdtgrp->closid); - else - ret = -ENOENT; - rdtgroup_kn_unlock(of->kn); - - return ret; -} - -static int rdtgroup_rmid_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - int ret = 0; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (rdtgrp) - seq_printf(s, "%u\n", rdtgrp->mon.rmid); - else - ret = -ENOENT; - rdtgroup_kn_unlock(of->kn); - - return ret; -} - -#ifdef CONFIG_PROC_CPU_RESCTRL - -/* - * A task can only be part of one resctrl control group and of one monitor - * group which is associated to that control group. - * - * 1) res: - * mon: - * - * resctrl is not available. - * - * 2) res:/ - * mon: - * - * Task is part of the root resctrl control group, and it is not associated - * to any monitor group. - * - * 3) res:/ - * mon:mon0 - * - * Task is part of the root resctrl control group and monitor group mon0. - * - * 4) res:group0 - * mon: - * - * Task is part of resctrl control group group0, and it is not associated - * to any monitor group. 
- * - * 5) res:group0 - * mon:mon1 - * - * Task is part of resctrl control group group0 and monitor group mon1. - */ -int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, - struct pid *pid, struct task_struct *tsk) -{ - struct rdtgroup *rdtg; - int ret = 0; - - mutex_lock(&rdtgroup_mutex); - - /* Return empty if resctrl has not been mounted. */ - if (!resctrl_mounted) { - seq_puts(s, "res:\nmon:\n"); - goto unlock; - } - - list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) { - struct rdtgroup *crg; - - /* - * Task information is only relevant for shareable - * and exclusive groups. - */ - if (rdtg->mode != RDT_MODE_SHAREABLE && - rdtg->mode != RDT_MODE_EXCLUSIVE) - continue; - - if (!resctrl_arch_match_closid(tsk, rdtg->closid)) - continue; - - seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "", - rdt_kn_name(rdtg->kn)); - seq_puts(s, "mon:"); - list_for_each_entry(crg, &rdtg->mon.crdtgrp_list, - mon.crdtgrp_list) { - if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid, - crg->mon.rmid)) - continue; - seq_printf(s, "%s", rdt_kn_name(crg->kn)); - break; - } - seq_putc(s, '\n'); - goto unlock; - } - /* - * The above search should succeed. Otherwise return - * with an error. - */ - ret = -ENOENT; -unlock: - mutex_unlock(&rdtgroup_mutex); - - return ret; -} -#endif - -static int rdt_last_cmd_status_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - int len; - - mutex_lock(&rdtgroup_mutex); - len = seq_buf_used(&last_cmd_status); - if (len) - seq_printf(seq, "%.*s", len, last_cmd_status_buf); - else - seq_puts(seq, "ok\n"); - mutex_unlock(&rdtgroup_mutex); - return 0; -} - -static void *rdt_kn_parent_priv(struct kernfs_node *kn) -{ - /* - * The parent pointer is only valid within RCU section since it can be - * replaced. 
- */ - guard(rcu)(); - return rcu_dereference(kn->__parent)->priv; -} - -static int rdt_num_closids_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - - seq_printf(seq, "%u\n", s->num_closid); - return 0; -} - -static int rdt_default_ctrl_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - seq_printf(seq, "%x\n", resctrl_get_default_ctrl(r)); - return 0; -} - -static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->cache.min_cbm_bits); - return 0; -} - -static int rdt_shareable_bits_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - seq_printf(seq, "%x\n", r->cache.shareable_bits); - return 0; -} - -/* - * rdt_bit_usage_show - Display current usage of resources - * - * A domain is a shared resource that can now be allocated differently. Here - * we display the current regions of the domain as an annotated bitmask. 
- * For each domain of this resource its allocation bitmask - * is annotated as below to indicate the current usage of the corresponding bit: - * 0 - currently unused - * X - currently available for sharing and used by software and hardware - * H - currently used by hardware only but available for software use - * S - currently used and shareable by software only - * E - currently used exclusively by one resource group - * P - currently pseudo-locked by one resource group - */ -static int rdt_bit_usage_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - /* - * Use unsigned long even though only 32 bits are used to ensure - * test_bit() is used safely. - */ - unsigned long sw_shareable = 0, hw_shareable = 0; - unsigned long exclusive = 0, pseudo_locked = 0; - struct rdt_resource *r = s->res; - struct rdt_ctrl_domain *dom; - int i, hwb, swb, excl, psl; - enum rdtgrp_mode mode; - bool sep = false; - u32 ctrl_val; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - hw_shareable = r->cache.shareable_bits; - list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { - if (sep) - seq_putc(seq, ';'); - sw_shareable = 0; - exclusive = 0; - seq_printf(seq, "%d=", dom->hdr.id); - for (i = 0; i < closids_supported(); i++) { - if (!closid_allocated(i)) - continue; - ctrl_val = resctrl_arch_get_config(r, dom, i, - s->conf_type); - mode = rdtgroup_mode_by_closid(i); - switch (mode) { - case RDT_MODE_SHAREABLE: - sw_shareable |= ctrl_val; - break; - case RDT_MODE_EXCLUSIVE: - exclusive |= ctrl_val; - break; - case RDT_MODE_PSEUDO_LOCKSETUP: - /* - * RDT_MODE_PSEUDO_LOCKSETUP is possible - * here but not included since the CBM - * associated with this CLOSID in this mode - * is not initialized and no task or cpu can be - * assigned this CLOSID. 
- */ - break; - case RDT_MODE_PSEUDO_LOCKED: - case RDT_NUM_MODES: - WARN(1, - "invalid mode for closid %d\n", i); - break; - } - } - for (i = r->cache.cbm_len - 1; i >= 0; i--) { - pseudo_locked = dom->plr ? dom->plr->cbm : 0; - hwb = test_bit(i, &hw_shareable); - swb = test_bit(i, &sw_shareable); - excl = test_bit(i, &exclusive); - psl = test_bit(i, &pseudo_locked); - if (hwb && swb) - seq_putc(seq, 'X'); - else if (hwb && !swb) - seq_putc(seq, 'H'); - else if (!hwb && swb) - seq_putc(seq, 'S'); - else if (excl) - seq_putc(seq, 'E'); - else if (psl) - seq_putc(seq, 'P'); - else /* Unused bits remain */ - seq_putc(seq, '0'); - } - sep = true; - } - seq_putc(seq, '\n'); - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - return 0; -} - -static int rdt_min_bw_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->membw.min_bw); - return 0; -} - -static int rdt_num_rmids_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - - seq_printf(seq, "%d\n", r->num_rmid); - - return 0; -} - -static int rdt_mon_features_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - struct mon_evt *mevt; - - list_for_each_entry(mevt, &r->evt_list, list) { - seq_printf(seq, "%s\n", mevt->name); - if (mevt->configurable) - seq_printf(seq, "%s_config\n", mevt->name); - } - - return 0; -} - -static int rdt_bw_gran_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->membw.bw_gran); - return 0; -} - -static int rdt_delay_linear_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct 
rdt_resource *r = s->res; - - seq_printf(seq, "%u\n", r->membw.delay_linear); - return 0; -} - -static int max_threshold_occ_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold); - - return 0; -} - -static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - switch (r->membw.throttle_mode) { - case THREAD_THROTTLE_PER_THREAD: - seq_puts(seq, "per-thread\n"); - return 0; - case THREAD_THROTTLE_MAX: - seq_puts(seq, "max\n"); - return 0; - case THREAD_THROTTLE_UNDEFINED: - seq_puts(seq, "undefined\n"); - return 0; - } - - WARN_ON_ONCE(1); - - return 0; -} - -static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - unsigned int bytes; - int ret; - - ret = kstrtouint(buf, 0, &bytes); - if (ret) - return ret; - - if (bytes > resctrl_rmid_realloc_limit) - return -EINVAL; - - resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes); - - return nbytes; -} - -/* - * rdtgroup_mode_show - Display mode of this resource group - */ -static int rdtgroup_mode_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct rdtgroup *rdtgrp; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - - seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode)); - - rdtgroup_kn_unlock(of->kn); - return 0; -} - -static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type) -{ - switch (my_type) { - case CDP_CODE: - return CDP_DATA; - case CDP_DATA: - return CDP_CODE; - default: - case CDP_NONE: - return CDP_NONE; - } -} - -static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); - struct rdt_resource *r = s->res; - - 
seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks); - - return 0; -} - -/** - * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other - * @r: Resource to which domain instance @d belongs. - * @d: The domain instance for which @closid is being tested. - * @cbm: Capacity bitmask being tested. - * @closid: Intended closid for @cbm. - * @type: CDP type of @r. - * @exclusive: Only check if overlaps with exclusive resource groups - * - * Checks if provided @cbm intended to be used for @closid on domain - * @d overlaps with any other closids or other hardware usage associated - * with this domain. If @exclusive is true then only overlaps with - * resource groups in exclusive mode will be considered. If @exclusive - * is false then overlaps with any resource group or hardware entities - * will be considered. - * - * @cbm is unsigned long, even if only 32 bits are used, to make the - * bitmap functions work correctly. - * - * Return: false if CBM does not overlap, true if it does. 
- */ -static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_ctrl_domain *d, - unsigned long cbm, int closid, - enum resctrl_conf_type type, bool exclusive) -{ - enum rdtgrp_mode mode; - unsigned long ctrl_b; - int i; - - /* Check for any overlap with regions used by hardware directly */ - if (!exclusive) { - ctrl_b = r->cache.shareable_bits; - if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) - return true; - } - - /* Check for overlap with other resource groups */ - for (i = 0; i < closids_supported(); i++) { - ctrl_b = resctrl_arch_get_config(r, d, i, type); - mode = rdtgroup_mode_by_closid(i); - if (closid_allocated(i) && i != closid && - mode != RDT_MODE_PSEUDO_LOCKSETUP) { - if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) { - if (exclusive) { - if (mode == RDT_MODE_EXCLUSIVE) - return true; - continue; - } - return true; - } - } - } - - return false; -} - -/** - * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware - * @s: Schema for the resource to which domain instance @d belongs. - * @d: The domain instance for which @closid is being tested. - * @cbm: Capacity bitmask being tested. - * @closid: Intended closid for @cbm. - * @exclusive: Only check if overlaps with exclusive resource groups - * - * Resources that can be allocated using a CBM can use the CBM to control - * the overlap of these allocations. rdtgroup_cmb_overlaps() is the test - * for overlap. Overlap test is not limited to the specific resource for - * which the CBM is intended though - when dealing with CDP resources that - * share the underlying hardware the overlap check should be performed on - * the CDP resource sharing the hardware also. - * - * Refer to description of __rdtgroup_cbm_overlaps() for the details of the - * overlap test. 
- * - * Return: true if CBM overlap detected, false if there is no overlap - */ -bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, - unsigned long cbm, int closid, bool exclusive) -{ - enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); - struct rdt_resource *r = s->res; - - if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type, - exclusive)) - return true; - - if (!resctrl_arch_get_cdp_enabled(r->rid)) - return false; - return __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive); -} - -/** - * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive - * @rdtgrp: Resource group identified through its closid. - * - * An exclusive resource group implies that there should be no sharing of - * its allocated resources. At the time this group is considered to be - * exclusive this test can determine if its current schemata supports this - * setting by testing for overlap with all other resource groups. - * - * Return: true if resource group can be exclusive, false if there is overlap - * with allocations of other resource groups and thus this resource group - * cannot be exclusive. 
- */ -static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) -{ - int closid = rdtgrp->closid; - struct rdt_ctrl_domain *d; - struct resctrl_schema *s; - struct rdt_resource *r; - bool has_cache = false; - u32 ctrl; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - list_for_each_entry(s, &resctrl_schema_all, list) { - r = s->res; - if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA) - continue; - has_cache = true; - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - ctrl = resctrl_arch_get_config(r, d, closid, - s->conf_type); - if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) { - rdt_last_cmd_puts("Schemata overlaps\n"); - return false; - } - } - } - - if (!has_cache) { - rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n"); - return false; - } - - return true; -} - -/* - * rdtgroup_mode_write - Modify the resource group's mode - */ -static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, loff_t off) -{ - struct rdtgroup *rdtgrp; - enum rdtgrp_mode mode; - int ret = 0; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - buf[nbytes - 1] = '\0'; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - - rdt_last_cmd_clear(); - - mode = rdtgrp->mode; - - if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) || - (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) || - (!strcmp(buf, "pseudo-locksetup") && - mode == RDT_MODE_PSEUDO_LOCKSETUP) || - (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED)) - goto out; - - if (mode == RDT_MODE_PSEUDO_LOCKED) { - rdt_last_cmd_puts("Cannot change pseudo-locked group\n"); - ret = -EINVAL; - goto out; - } - - if (!strcmp(buf, "shareable")) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - ret = rdtgroup_locksetup_exit(rdtgrp); - if (ret) - goto out; - } - 
rdtgrp->mode = RDT_MODE_SHAREABLE; - } else if (!strcmp(buf, "exclusive")) { - if (!rdtgroup_mode_test_exclusive(rdtgrp)) { - ret = -EINVAL; - goto out; - } - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - ret = rdtgroup_locksetup_exit(rdtgrp); - if (ret) - goto out; - } - rdtgrp->mode = RDT_MODE_EXCLUSIVE; - } else if (IS_ENABLED(CONFIG_RESCTRL_FS_PSEUDO_LOCK) && - !strcmp(buf, "pseudo-locksetup")) { - ret = rdtgroup_locksetup_enter(rdtgrp); - if (ret) - goto out; - rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP; - } else { - rdt_last_cmd_puts("Unknown or unsupported mode\n"); - ret = -EINVAL; - } - -out: - rdtgroup_kn_unlock(of->kn); - return ret ?: nbytes; -} - -/** - * rdtgroup_cbm_to_size - Translate CBM to size in bytes - * @r: RDT resource to which @d belongs. - * @d: RDT domain instance. - * @cbm: bitmask for which the size should be computed. - * - * The bitmask provided associated with the RDT domain instance @d will be - * translated into how many bytes it represents. The size in bytes is - * computed by first dividing the total cache size by the CBM length to - * determine how many bytes each bit in the bitmask represents. The result - * is multiplied with the number of bits set in the bitmask. - * - * @cbm is unsigned long, even if only 32 bits are used to make the - * bitmap functions work correctly. 
- */ -unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, - struct rdt_ctrl_domain *d, unsigned long cbm) -{ - unsigned int size = 0; - struct cacheinfo *ci; - int num_b; - - if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE)) - return size; - - num_b = bitmap_weight(&cbm, r->cache.cbm_len); - ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->ctrl_scope); - if (ci) - size = ci->size / r->cache.cbm_len * num_b; - - return size; -} - -/* - * rdtgroup_size_show - Display size in bytes of allocated regions - * - * The "size" file mirrors the layout of the "schemata" file, printing the - * size in bytes of each region instead of the capacity bitmask. - */ -static int rdtgroup_size_show(struct kernfs_open_file *of, - struct seq_file *s, void *v) -{ - struct resctrl_schema *schema; - enum resctrl_conf_type type; - struct rdt_ctrl_domain *d; - struct rdtgroup *rdtgrp; - struct rdt_resource *r; - unsigned int size; - int ret = 0; - u32 closid; - bool sep; - u32 ctrl; - - rdtgrp = rdtgroup_kn_lock_live(of->kn); - if (!rdtgrp) { - rdtgroup_kn_unlock(of->kn); - return -ENOENT; - } - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - if (!rdtgrp->plr->d) { - rdt_last_cmd_clear(); - rdt_last_cmd_puts("Cache domain offline\n"); - ret = -ENODEV; - } else { - seq_printf(s, "%*s:", max_name_width, - rdtgrp->plr->s->name); - size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res, - rdtgrp->plr->d, - rdtgrp->plr->cbm); - seq_printf(s, "%d=%u\n", rdtgrp->plr->d->hdr.id, size); - } - goto out; - } - - closid = rdtgrp->closid; - - list_for_each_entry(schema, &resctrl_schema_all, list) { - r = schema->res; - type = schema->conf_type; - sep = false; - seq_printf(s, "%*s:", max_name_width, schema->name); - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - if (sep) - seq_putc(s, ';'); - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { - size = 0; - } else { - if (is_mba_sc(r)) - ctrl = d->mbps_val[closid]; - else - ctrl = 
resctrl_arch_get_config(r, d, - closid, - type); - if (r->rid == RDT_RESOURCE_MBA || - r->rid == RDT_RESOURCE_SMBA) - size = ctrl; - else - size = rdtgroup_cbm_to_size(r, d, ctrl); - } - seq_printf(s, "%d=%u", d->hdr.id, size); - sep = true; - } - seq_putc(s, '\n'); - } - -out: - rdtgroup_kn_unlock(of->kn); - - return ret; + resctrl_arch_sched_in(current); } #define INVALID_CONFIG_INDEX UINT_MAX @@ -1635,68 +96,12 @@ void resctrl_arch_mon_event_config_read(void *_config_info) pr_warn_once("Invalid event id %d\n", config_info->evtid); return; } - rdmsrl(MSR_IA32_EVT_CFG_BASE + index, msrval); + rdmsrq(MSR_IA32_EVT_CFG_BASE + index, msrval); /* Report only the valid event configuration bits */ config_info->mon_config = msrval & MAX_EVT_CONFIG_BITS; } -static void mondata_config_read(struct resctrl_mon_config_info *mon_info) -{ - smp_call_function_any(&mon_info->d->hdr.cpu_mask, - resctrl_arch_mon_event_config_read, mon_info, 1); -} - -static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid) -{ - struct resctrl_mon_config_info mon_info; - struct rdt_mon_domain *dom; - bool sep = false; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - list_for_each_entry(dom, &r->mon_domains, hdr.list) { - if (sep) - seq_puts(s, ";"); - - memset(&mon_info, 0, sizeof(struct resctrl_mon_config_info)); - mon_info.r = r; - mon_info.d = dom; - mon_info.evtid = evtid; - mondata_config_read(&mon_info); - - seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config); - sep = true; - } - seq_puts(s, "\n"); - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - - return 0; -} - -static int mbm_total_bytes_config_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - - mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID); - - return 0; -} - -static int mbm_local_bytes_config_show(struct kernfs_open_file *of, - struct seq_file *seq, void *v) -{ - struct rdt_resource *r = 
rdt_kn_parent_priv(of->kn); - - mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID); - - return 0; -} - void resctrl_arch_mon_event_config_write(void *_config_info) { struct resctrl_mon_config_info *config_info = _config_info; @@ -1707,638 +112,21 @@ void resctrl_arch_mon_event_config_write(void *_config_info) pr_warn_once("Invalid event id %d\n", config_info->evtid); return; } - wrmsr(MSR_IA32_EVT_CFG_BASE + index, config_info->mon_config, 0); -} - -static void mbm_config_write_domain(struct rdt_resource *r, - struct rdt_mon_domain *d, u32 evtid, u32 val) -{ - struct resctrl_mon_config_info mon_info = {0}; - - /* - * Read the current config value first. If both are the same then - * no need to write it again. - */ - mon_info.r = r; - mon_info.d = d; - mon_info.evtid = evtid; - mondata_config_read(&mon_info); - if (mon_info.mon_config == val) - return; - - mon_info.mon_config = val; - - /* - * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the - * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE - * are scoped at the domain level. Writing any of these MSRs - * on one CPU is observed by all the CPUs in the domain. - */ - smp_call_function_any(&d->hdr.cpu_mask, resctrl_arch_mon_event_config_write, - &mon_info, 1); - - /* - * When an Event Configuration is changed, the bandwidth counters - * for all RMIDs and Events will be cleared by the hardware. The - * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for - * every RMID on the next read to any event for every RMID. - * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62) - * cleared while it is tracked by the hardware. Clear the - * mbm_local and mbm_total counts for all the RMIDs. 
- */ - resctrl_arch_reset_rmid_all(r, d); -} - -static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) -{ - char *dom_str = NULL, *id_str; - unsigned long dom_id, val; - struct rdt_mon_domain *d; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - -next: - if (!tok || tok[0] == '\0') - return 0; - - /* Start processing the strings for each domain */ - dom_str = strim(strsep(&tok, ";")); - id_str = strsep(&dom_str, "="); - - if (!id_str || kstrtoul(id_str, 10, &dom_id)) { - rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n"); - return -EINVAL; - } - - if (!dom_str || kstrtoul(dom_str, 16, &val)) { - rdt_last_cmd_puts("Non-numeric event configuration value\n"); - return -EINVAL; - } - - /* Value from user cannot be more than the supported set of events */ - if ((val & r->mbm_cfg_mask) != val) { - rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n", - r->mbm_cfg_mask); - return -EINVAL; - } - - list_for_each_entry(d, &r->mon_domains, hdr.list) { - if (d->hdr.id == dom_id) { - mbm_config_write_domain(r, d, evtid, val); - goto next; - } - } - - return -EINVAL; -} - -static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, - loff_t off) -{ - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - int ret; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 1] != '\n') - return -EINVAL; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - rdt_last_cmd_clear(); - - buf[nbytes - 1] = '\0'; - - ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID); - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - - return ret ?: nbytes; -} - -static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, - char *buf, size_t nbytes, - loff_t off) -{ - struct rdt_resource *r = rdt_kn_parent_priv(of->kn); - int ret; - - /* Valid input requires a trailing newline */ - if (nbytes == 0 || buf[nbytes - 
1] != '\n') - return -EINVAL; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - rdt_last_cmd_clear(); - - buf[nbytes - 1] = '\0'; - - ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID); - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - - return ret ?: nbytes; -} - -/* rdtgroup information files for one cache resource. */ -static struct rftype res_common_files[] = { - { - .name = "last_cmd_status", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_last_cmd_status_show, - .fflags = RFTYPE_TOP_INFO, - }, - { - .name = "num_closids", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_num_closids_show, - .fflags = RFTYPE_CTRL_INFO, - }, - { - .name = "mon_features", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_mon_features_show, - .fflags = RFTYPE_MON_INFO, - }, - { - .name = "num_rmids", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_num_rmids_show, - .fflags = RFTYPE_MON_INFO, - }, - { - .name = "cbm_mask", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_default_ctrl_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "min_cbm_bits", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_min_cbm_bits_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "shareable_bits", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_shareable_bits_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "bit_usage", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_bit_usage_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "min_bandwidth", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_min_bw_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, - }, - { - .name = "bandwidth_gran", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_bw_gran_show, - .fflags = 
RFTYPE_CTRL_INFO | RFTYPE_RES_MB, - }, - { - .name = "delay_linear", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_delay_linear_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, - }, - /* - * Platform specific which (if any) capabilities are provided by - * thread_throttle_mode. Defer "fflags" initialization to platform - * discovery. - */ - { - .name = "thread_throttle_mode", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_thread_throttle_mode_show, - }, - { - .name = "max_threshold_occupancy", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = max_threshold_occ_write, - .seq_show = max_threshold_occ_show, - .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "mbm_total_bytes_config", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = mbm_total_bytes_config_show, - .write = mbm_total_bytes_config_write, - }, - { - .name = "mbm_local_bytes_config", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = mbm_local_bytes_config_show, - .write = mbm_local_bytes_config_write, - }, - { - .name = "cpus", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - .fflags = RFTYPE_BASE, - }, - { - .name = "cpus_list", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - .flags = RFTYPE_FLAGS_CPUS_LIST, - .fflags = RFTYPE_BASE, - }, - { - .name = "tasks", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_tasks_write, - .seq_show = rdtgroup_tasks_show, - .fflags = RFTYPE_BASE, - }, - { - .name = "mon_hw_id", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdtgroup_rmid_show, - .fflags = RFTYPE_MON_BASE | RFTYPE_DEBUG, - }, - { - .name = "schemata", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_schemata_write, - .seq_show = rdtgroup_schemata_show, - .fflags = RFTYPE_CTRL_BASE, - 
}, - { - .name = "mba_MBps_event", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_mba_mbps_event_write, - .seq_show = rdtgroup_mba_mbps_event_show, - }, - { - .name = "mode", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_mode_write, - .seq_show = rdtgroup_mode_show, - .fflags = RFTYPE_CTRL_BASE, - }, - { - .name = "size", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdtgroup_size_show, - .fflags = RFTYPE_CTRL_BASE, - }, - { - .name = "sparse_masks", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_has_sparse_bitmasks_show, - .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, - }, - { - .name = "ctrl_hw_id", - .mode = 0444, - .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdtgroup_closid_show, - .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG, - }, - -}; - -static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) -{ - struct rftype *rfts, *rft; - int ret, len; - - rfts = res_common_files; - len = ARRAY_SIZE(res_common_files); - - lockdep_assert_held(&rdtgroup_mutex); - - if (resctrl_debug) - fflags |= RFTYPE_DEBUG; - - for (rft = rfts; rft < rfts + len; rft++) { - if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) { - ret = rdtgroup_add_file(kn, rft); - if (ret) - goto error; - } - } - - return 0; -error: - pr_warn("Failed to add %s, err=%d\n", rft->name, ret); - while (--rft >= rfts) { - if ((fflags & rft->fflags) == rft->fflags) - kernfs_remove_by_name(kn, rft->name); - } - return ret; -} - -static struct rftype *rdtgroup_get_rftype_by_name(const char *name) -{ - struct rftype *rfts, *rft; - int len; - - rfts = res_common_files; - len = ARRAY_SIZE(res_common_files); - - for (rft = rfts; rft < rfts + len; rft++) { - if (!strcmp(rft->name, name)) - return rft; - } - - return NULL; -} - -static void thread_throttle_mode_init(void) -{ - enum membw_throttle_mode throttle_mode = THREAD_THROTTLE_UNDEFINED; - struct rdt_resource *r_mba, *r_smba; - - r_mba = 
resctrl_arch_get_resource(RDT_RESOURCE_MBA); - if (r_mba->alloc_capable && - r_mba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) - throttle_mode = r_mba->membw.throttle_mode; - - r_smba = resctrl_arch_get_resource(RDT_RESOURCE_SMBA); - if (r_smba->alloc_capable && - r_smba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) - throttle_mode = r_smba->membw.throttle_mode; - - if (throttle_mode == THREAD_THROTTLE_UNDEFINED) - return; - - resctrl_file_fflags_init("thread_throttle_mode", - RFTYPE_CTRL_INFO | RFTYPE_RES_MB); -} - -void resctrl_file_fflags_init(const char *config, unsigned long fflags) -{ - struct rftype *rft; - - rft = rdtgroup_get_rftype_by_name(config); - if (rft) - rft->fflags = fflags; -} - -/** - * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file - * @r: The resource group with which the file is associated. - * @name: Name of the file - * - * The permissions of named resctrl file, directory, or link are modified - * to not allow read, write, or execute by any user. - * - * WARNING: This function is intended to communicate to the user that the - * resctrl file has been locked down - that it is not relevant to the - * particular state the system finds itself in. It should not be relied - * on to protect from user access because after the file's permissions - * are restricted the user can still change the permissions using chmod - * from the command line. - * - * Return: 0 on success, <0 on failure. 
- */ -int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name) -{ - struct iattr iattr = {.ia_valid = ATTR_MODE,}; - struct kernfs_node *kn; - int ret = 0; - - kn = kernfs_find_and_get_ns(r->kn, name, NULL); - if (!kn) - return -ENOENT; - - switch (kernfs_type(kn)) { - case KERNFS_DIR: - iattr.ia_mode = S_IFDIR; - break; - case KERNFS_FILE: - iattr.ia_mode = S_IFREG; - break; - case KERNFS_LINK: - iattr.ia_mode = S_IFLNK; - break; - } - - ret = kernfs_setattr(kn, &iattr); - kernfs_put(kn); - return ret; -} - -/** - * rdtgroup_kn_mode_restore - Restore user access to named resctrl file - * @r: The resource group with which the file is associated. - * @name: Name of the file - * @mask: Mask of permissions that should be restored - * - * Restore the permissions of the named file. If @name is a directory the - * permissions of its parent will be used. - * - * Return: 0 on success, <0 on failure. - */ -int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, - umode_t mask) -{ - struct iattr iattr = {.ia_valid = ATTR_MODE,}; - struct kernfs_node *kn, *parent; - struct rftype *rfts, *rft; - int ret, len; - - rfts = res_common_files; - len = ARRAY_SIZE(res_common_files); - - for (rft = rfts; rft < rfts + len; rft++) { - if (!strcmp(rft->name, name)) - iattr.ia_mode = rft->mode & mask; - } - - kn = kernfs_find_and_get_ns(r->kn, name, NULL); - if (!kn) - return -ENOENT; - - switch (kernfs_type(kn)) { - case KERNFS_DIR: - parent = kernfs_get_parent(kn); - if (parent) { - iattr.ia_mode |= parent->mode; - kernfs_put(parent); - } - iattr.ia_mode |= S_IFDIR; - break; - case KERNFS_FILE: - iattr.ia_mode |= S_IFREG; - break; - case KERNFS_LINK: - iattr.ia_mode |= S_IFLNK; - break; - } - - ret = kernfs_setattr(kn, &iattr); - kernfs_put(kn); - return ret; -} - -static int rdtgroup_mkdir_info_resdir(void *priv, char *name, - unsigned long fflags) -{ - struct kernfs_node *kn_subdir; - int ret; - - kn_subdir = kernfs_create_dir(kn_info, name, - kn_info->mode, 
priv); - if (IS_ERR(kn_subdir)) - return PTR_ERR(kn_subdir); - - ret = rdtgroup_kn_set_ugid(kn_subdir); - if (ret) - return ret; - - ret = rdtgroup_add_files(kn_subdir, fflags); - if (!ret) - kernfs_activate(kn_subdir); - - return ret; -} - -static unsigned long fflags_from_resource(struct rdt_resource *r) -{ - switch (r->rid) { - case RDT_RESOURCE_L3: - case RDT_RESOURCE_L2: - return RFTYPE_RES_CACHE; - case RDT_RESOURCE_MBA: - case RDT_RESOURCE_SMBA: - return RFTYPE_RES_MB; - } - - return WARN_ON_ONCE(1); -} - -static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) -{ - struct resctrl_schema *s; - struct rdt_resource *r; - unsigned long fflags; - char name[32]; - int ret; - - /* create the directory */ - kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); - if (IS_ERR(kn_info)) - return PTR_ERR(kn_info); - - ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO); - if (ret) - goto out_destroy; - - /* loop over enabled controls, these are all alloc_capable */ - list_for_each_entry(s, &resctrl_schema_all, list) { - r = s->res; - fflags = fflags_from_resource(r) | RFTYPE_CTRL_INFO; - ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags); - if (ret) - goto out_destroy; - } - - for_each_mon_capable_rdt_resource(r) { - fflags = fflags_from_resource(r) | RFTYPE_MON_INFO; - sprintf(name, "%s_MON", r->name); - ret = rdtgroup_mkdir_info_resdir(r, name, fflags); - if (ret) - goto out_destroy; - } - - ret = rdtgroup_kn_set_ugid(kn_info); - if (ret) - goto out_destroy; - - kernfs_activate(kn_info); - - return 0; - -out_destroy: - kernfs_remove(kn_info); - return ret; -} - -static int -mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, - char *name, struct kernfs_node **dest_kn) -{ - struct kernfs_node *kn; - int ret; - - /* create the directory */ - kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); - if (IS_ERR(kn)) - return PTR_ERR(kn); - - if (dest_kn) - *dest_kn = kn; - - ret = rdtgroup_kn_set_ugid(kn); - if 
(ret) - goto out_destroy; - - kernfs_activate(kn); - - return 0; - -out_destroy: - kernfs_remove(kn); - return ret; + wrmsrq(MSR_IA32_EVT_CFG_BASE + index, config_info->mon_config); } static void l3_qos_cfg_update(void *arg) { bool *enable = arg; - wrmsrl(MSR_IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL); + wrmsrq(MSR_IA32_L3_QOS_CFG, *enable ? L3_QOS_CDP_ENABLE : 0ULL); } static void l2_qos_cfg_update(void *arg) { bool *enable = arg; - wrmsrl(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL); -} - -static inline bool is_mba_linear(void) -{ - return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->membw.delay_linear; + wrmsrq(MSR_IA32_L2_QOS_CFG, *enable ? L2_QOS_CDP_ENABLE : 0ULL); } static int set_cache_qos_cfg(int level, bool enable) @@ -2396,76 +184,6 @@ void rdt_domain_reconfigure_cdp(struct rdt_resource *r) l3_qos_cfg_update(&hw_res->cdp_enabled); } -static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d) -{ - u32 num_closid = resctrl_arch_get_num_closid(r); - int cpu = cpumask_any(&d->hdr.cpu_mask); - int i; - - d->mbps_val = kcalloc_node(num_closid, sizeof(*d->mbps_val), - GFP_KERNEL, cpu_to_node(cpu)); - if (!d->mbps_val) - return -ENOMEM; - - for (i = 0; i < num_closid; i++) - d->mbps_val[i] = MBA_MAX_MBPS; - - return 0; -} - -static void mba_sc_domain_destroy(struct rdt_resource *r, - struct rdt_ctrl_domain *d) -{ - kfree(d->mbps_val); - d->mbps_val = NULL; -} - -/* - * MBA software controller is supported only if - * MBM is supported and MBA is in linear scale, - * and the MBM monitor scope is the same as MBA - * control scope. 
- */ -static bool supports_mba_mbps(void) -{ - struct rdt_resource *rmbm = resctrl_arch_get_resource(RDT_RESOURCE_L3); - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); - - return (resctrl_is_mbm_enabled() && - r->alloc_capable && is_mba_linear() && - r->ctrl_scope == rmbm->mon_scope); -} - -/* - * Enable or disable the MBA software controller - * which helps user specify bandwidth in MBps. - */ -static int set_mba_sc(bool mba_sc) -{ - struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); - u32 num_closid = resctrl_arch_get_num_closid(r); - struct rdt_ctrl_domain *d; - unsigned long fflags; - int i; - - if (!supports_mba_mbps() || mba_sc == is_mba_sc(r)) - return -EINVAL; - - r->membw.mba_sc = mba_sc; - - rdtgroup_default.mba_mbps_event = mba_mbps_default_event; - - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - for (i = 0; i < num_closid; i++) - d->mbps_val[i] = MBA_MAX_MBPS; - } - - fflags = mba_sc ? RFTYPE_CTRL_BASE | RFTYPE_MON_BASE : 0; - resctrl_file_fflags_init("mba_MBps_event", fflags); - - return 0; -} - static int cdp_enable(int level) { struct rdt_resource *r_l = &rdt_resources_all[level].r_resctrl; @@ -2506,419 +224,9 @@ int resctrl_arch_set_cdp_enabled(enum resctrl_res_level l, bool enable) return 0; } -/* - * We don't allow rdtgroup directories to be created anywhere - * except the root directory. Thus when looking for the rdtgroup - * structure for a kernfs node we are either looking at a directory, - * in which case the rdtgroup structure is pointed at by the "priv" - * field, otherwise we have a file, and need only look to the parent - * to find the rdtgroup. - */ -static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn) -{ - if (kernfs_type(kn) == KERNFS_DIR) { - /* - * All the resource directories use "kn->priv" - * to point to the "struct rdtgroup" for the - * resource. "info" and its subdirectories don't - * have rdtgroup structures, so return NULL here. 
- */ - if (kn == kn_info || - rcu_access_pointer(kn->__parent) == kn_info) - return NULL; - else - return kn->priv; - } else { - return rdt_kn_parent_priv(kn); - } -} - -static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn) -{ - atomic_inc(&rdtgrp->waitcount); - kernfs_break_active_protection(kn); -} - -static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn) -{ - if (atomic_dec_and_test(&rdtgrp->waitcount) && - (rdtgrp->flags & RDT_DELETED)) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) - rdtgroup_pseudo_lock_remove(rdtgrp); - kernfs_unbreak_active_protection(kn); - rdtgroup_remove(rdtgrp); - } else { - kernfs_unbreak_active_protection(kn); - } -} - -struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) -{ - struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); - - if (!rdtgrp) - return NULL; - - rdtgroup_kn_get(rdtgrp, kn); - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - /* Was this group deleted while we waited? 
*/ - if (rdtgrp->flags & RDT_DELETED) - return NULL; - - return rdtgrp; -} - -void rdtgroup_kn_unlock(struct kernfs_node *kn) -{ - struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); - - if (!rdtgrp) - return; - - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - - rdtgroup_kn_put(rdtgrp, kn); -} - -static int mkdir_mondata_all(struct kernfs_node *parent_kn, - struct rdtgroup *prgrp, - struct kernfs_node **mon_data_kn); - -static void rdt_disable_ctx(void) -{ - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); - set_mba_sc(false); - - resctrl_debug = false; -} - -static int rdt_enable_ctx(struct rdt_fs_context *ctx) -{ - int ret = 0; - - if (ctx->enable_cdpl2) { - ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true); - if (ret) - goto out_done; - } - - if (ctx->enable_cdpl3) { - ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true); - if (ret) - goto out_cdpl2; - } - - if (ctx->enable_mba_mbps) { - ret = set_mba_sc(true); - if (ret) - goto out_cdpl3; - } - - if (ctx->enable_debug) - resctrl_debug = true; - - return 0; - -out_cdpl3: - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); -out_cdpl2: - resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); -out_done: - return ret; -} - -static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type) -{ - struct resctrl_schema *s; - const char *suffix = ""; - int ret, cl; - - s = kzalloc(sizeof(*s), GFP_KERNEL); - if (!s) - return -ENOMEM; - - s->res = r; - s->num_closid = resctrl_arch_get_num_closid(r); - if (resctrl_arch_get_cdp_enabled(r->rid)) - s->num_closid /= 2; - - s->conf_type = type; - switch (type) { - case CDP_CODE: - suffix = "CODE"; - break; - case CDP_DATA: - suffix = "DATA"; - break; - case CDP_NONE: - suffix = ""; - break; - } - - ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix); - if (ret >= sizeof(s->name)) { - kfree(s); - return -EINVAL; - } - - cl = strlen(s->name); - - /* - * If CDP is 
supported by this resource, but not enabled, - * include the suffix. This ensures the tabular format of the - * schemata file does not change between mounts of the filesystem. - */ - if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid)) - cl += 4; - - if (cl > max_name_width) - max_name_width = cl; - - switch (r->schema_fmt) { - case RESCTRL_SCHEMA_BITMAP: - s->fmt_str = "%d=%x"; - break; - case RESCTRL_SCHEMA_RANGE: - s->fmt_str = "%d=%u"; - break; - } - - if (WARN_ON_ONCE(!s->fmt_str)) { - kfree(s); - return -EINVAL; - } - - INIT_LIST_HEAD(&s->list); - list_add(&s->list, &resctrl_schema_all); - - return 0; -} - -static int schemata_list_create(void) -{ - struct rdt_resource *r; - int ret = 0; - - for_each_alloc_capable_rdt_resource(r) { - if (resctrl_arch_get_cdp_enabled(r->rid)) { - ret = schemata_list_add(r, CDP_CODE); - if (ret) - break; - - ret = schemata_list_add(r, CDP_DATA); - } else { - ret = schemata_list_add(r, CDP_NONE); - } - - if (ret) - break; - } - - return ret; -} - -static void schemata_list_destroy(void) -{ - struct resctrl_schema *s, *tmp; - - list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) { - list_del(&s->list); - kfree(s); - } -} - -static int rdt_get_tree(struct fs_context *fc) -{ - struct rdt_fs_context *ctx = rdt_fc2context(fc); - unsigned long flags = RFTYPE_CTRL_BASE; - struct rdt_mon_domain *dom; - struct rdt_resource *r; - int ret; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - /* - * resctrl file system can only be mounted once. 
- */ - if (resctrl_mounted) { - ret = -EBUSY; - goto out; - } - - ret = rdtgroup_setup_root(ctx); - if (ret) - goto out; - - ret = rdt_enable_ctx(ctx); - if (ret) - goto out_root; - - ret = schemata_list_create(); - if (ret) { - schemata_list_destroy(); - goto out_ctx; - } - - closid_init(); - - if (resctrl_arch_mon_capable()) - flags |= RFTYPE_MON; - - ret = rdtgroup_add_files(rdtgroup_default.kn, flags); - if (ret) - goto out_schemata_free; - - kernfs_activate(rdtgroup_default.kn); - - ret = rdtgroup_create_info_dir(rdtgroup_default.kn); - if (ret < 0) - goto out_schemata_free; - - if (resctrl_arch_mon_capable()) { - ret = mongroup_create_dir(rdtgroup_default.kn, - &rdtgroup_default, "mon_groups", - &kn_mongrp); - if (ret < 0) - goto out_info; - - ret = mkdir_mondata_all(rdtgroup_default.kn, - &rdtgroup_default, &kn_mondata); - if (ret < 0) - goto out_mongrp; - rdtgroup_default.mon.mon_data_kn = kn_mondata; - } - - ret = rdt_pseudo_lock_init(); - if (ret) - goto out_mondata; - - ret = kernfs_get_tree(fc); - if (ret < 0) - goto out_psl; - - if (resctrl_arch_alloc_capable()) - resctrl_arch_enable_alloc(); - if (resctrl_arch_mon_capable()) - resctrl_arch_enable_mon(); - - if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable()) - resctrl_mounted = true; - - if (resctrl_is_mbm_enabled()) { - r = resctrl_arch_get_resource(RDT_RESOURCE_L3); - list_for_each_entry(dom, &r->mon_domains, hdr.list) - mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL, - RESCTRL_PICK_ANY_CPU); - } - - goto out; - -out_psl: - rdt_pseudo_lock_release(); -out_mondata: - if (resctrl_arch_mon_capable()) - kernfs_remove(kn_mondata); -out_mongrp: - if (resctrl_arch_mon_capable()) - kernfs_remove(kn_mongrp); -out_info: - kernfs_remove(kn_info); -out_schemata_free: - schemata_list_destroy(); -out_ctx: - rdt_disable_ctx(); -out_root: - rdtgroup_destroy_root(); -out: - rdt_last_cmd_clear(); - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); - return ret; -} - -enum rdt_param { - Opt_cdp, 
- Opt_cdpl2, - Opt_mba_mbps, - Opt_debug, - nr__rdt_params -}; - -static const struct fs_parameter_spec rdt_fs_parameters[] = { - fsparam_flag("cdp", Opt_cdp), - fsparam_flag("cdpl2", Opt_cdpl2), - fsparam_flag("mba_MBps", Opt_mba_mbps), - fsparam_flag("debug", Opt_debug), - {} -}; - -static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) -{ - struct rdt_fs_context *ctx = rdt_fc2context(fc); - struct fs_parse_result result; - const char *msg; - int opt; - - opt = fs_parse(fc, rdt_fs_parameters, param, &result); - if (opt < 0) - return opt; - - switch (opt) { - case Opt_cdp: - ctx->enable_cdpl3 = true; - return 0; - case Opt_cdpl2: - ctx->enable_cdpl2 = true; - return 0; - case Opt_mba_mbps: - msg = "mba_MBps requires MBM and linear scale MBA at L3 scope"; - if (!supports_mba_mbps()) - return invalfc(fc, msg); - ctx->enable_mba_mbps = true; - return 0; - case Opt_debug: - ctx->enable_debug = true; - return 0; - } - - return -EINVAL; -} - -static void rdt_fs_context_free(struct fs_context *fc) +bool resctrl_arch_get_cdp_enabled(enum resctrl_res_level l) { - struct rdt_fs_context *ctx = rdt_fc2context(fc); - - kernfs_free_fs_context(fc); - kfree(ctx); -} - -static const struct fs_context_operations rdt_fs_context_ops = { - .free = rdt_fs_context_free, - .parse_param = rdt_parse_param, - .get_tree = rdt_get_tree, -}; - -static int rdt_init_fs_context(struct fs_context *fc) -{ - struct rdt_fs_context *ctx; - - ctx = kzalloc(sizeof(struct rdt_fs_context), GFP_KERNEL); - if (!ctx) - return -ENOMEM; - - ctx->kfc.magic = RDTGROUP_SUPER_MAGIC; - fc->fs_private = &ctx->kfc; - fc->ops = &rdt_fs_context_ops; - put_user_ns(fc->user_ns); - fc->user_ns = get_user_ns(&init_user_ns); - fc->global = true; - return 0; + return rdt_resources_all[l].cdp_enabled; } void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) @@ -2952,1460 +260,3 @@ void resctrl_arch_reset_all_ctrls(struct rdt_resource *r) return; } - -/* - * Move tasks from one to the other group. 
If @from is NULL, then all tasks - * in the systems are moved unconditionally (used for teardown). - * - * If @mask is not NULL the cpus on which moved tasks are running are set - * in that mask so the update smp function call is restricted to affected - * cpus. - */ -static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, - struct cpumask *mask) -{ - struct task_struct *p, *t; - - read_lock(&tasklist_lock); - for_each_process_thread(p, t) { - if (!from || is_closid_match(t, from) || - is_rmid_match(t, from)) { - resctrl_arch_set_closid_rmid(t, to->closid, - to->mon.rmid); - - /* - * Order the closid/rmid stores above before the loads - * in task_curr(). This pairs with the full barrier - * between the rq->curr update and resctrl_sched_in() - * during context switch. - */ - smp_mb(); - - /* - * If the task is on a CPU, set the CPU in the mask. - * The detection is inaccurate as tasks might move or - * schedule before the smp function call takes place. - * In such a case the function call is pointless, but - * there is no other side effect. - */ - if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t)) - cpumask_set_cpu(task_cpu(t), mask); - } - } - read_unlock(&tasklist_lock); -} - -static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) -{ - struct rdtgroup *sentry, *stmp; - struct list_head *head; - - head = &rdtgrp->mon.crdtgrp_list; - list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { - free_rmid(sentry->closid, sentry->mon.rmid); - list_del(&sentry->mon.crdtgrp_list); - - if (atomic_read(&sentry->waitcount) != 0) - sentry->flags = RDT_DELETED; - else - rdtgroup_remove(sentry); - } -} - -/* - * Forcibly remove all of subdirectories under root. 
- */ -static void rmdir_all_sub(void) -{ - struct rdtgroup *rdtgrp, *tmp; - - /* Move all tasks to the default resource group */ - rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); - - list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { - /* Free any child rmids */ - free_all_child_rdtgrp(rdtgrp); - - /* Remove each rdtgroup other than root */ - if (rdtgrp == &rdtgroup_default) - continue; - - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) - rdtgroup_pseudo_lock_remove(rdtgrp); - - /* - * Give any CPUs back to the default group. We cannot copy - * cpu_online_mask because a CPU might have executed the - * offline callback already, but is still marked online. - */ - cpumask_or(&rdtgroup_default.cpu_mask, - &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - - kernfs_remove(rdtgrp->kn); - list_del(&rdtgrp->rdtgroup_list); - - if (atomic_read(&rdtgrp->waitcount) != 0) - rdtgrp->flags = RDT_DELETED; - else - rdtgroup_remove(rdtgrp); - } - /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ - update_closid_rmid(cpu_online_mask, &rdtgroup_default); - - kernfs_remove(kn_info); - kernfs_remove(kn_mongrp); - kernfs_remove(kn_mondata); -} - -static void rdt_kill_sb(struct super_block *sb) -{ - struct rdt_resource *r; - - cpus_read_lock(); - mutex_lock(&rdtgroup_mutex); - - rdt_disable_ctx(); - - /* Put everything back to default values. 
*/ - for_each_alloc_capable_rdt_resource(r) - resctrl_arch_reset_all_ctrls(r); - - rmdir_all_sub(); - rdt_pseudo_lock_release(); - rdtgroup_default.mode = RDT_MODE_SHAREABLE; - schemata_list_destroy(); - rdtgroup_destroy_root(); - if (resctrl_arch_alloc_capable()) - resctrl_arch_disable_alloc(); - if (resctrl_arch_mon_capable()) - resctrl_arch_disable_mon(); - resctrl_mounted = false; - kernfs_kill_sb(sb); - mutex_unlock(&rdtgroup_mutex); - cpus_read_unlock(); -} - -static struct file_system_type rdt_fs_type = { - .name = "resctrl", - .init_fs_context = rdt_init_fs_context, - .parameters = rdt_fs_parameters, - .kill_sb = rdt_kill_sb, -}; - -static int mon_addfile(struct kernfs_node *parent_kn, const char *name, - void *priv) -{ - struct kernfs_node *kn; - int ret = 0; - - kn = __kernfs_create_file(parent_kn, name, 0444, - GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, - &kf_mondata_ops, priv, NULL, NULL); - if (IS_ERR(kn)) - return PTR_ERR(kn); - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) { - kernfs_remove(kn); - return ret; - } - - return ret; -} - -static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subname) -{ - struct kernfs_node *kn; - - kn = kernfs_find_and_get(pkn, name); - if (!kn) - return; - kernfs_put(kn); - - if (kn->dir.subdirs <= 1) - kernfs_remove(kn); - else - kernfs_remove_by_name(kn, subname); -} - -/* - * Remove all subdirectories of mon_data of ctrl_mon groups - * and monitor groups for the given domain. - * Remove files and directories containing "sum" of domain data - * when last domain being summed is removed. - */ -static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_mon_domain *d) -{ - struct rdtgroup *prgrp, *crgrp; - char subname[32]; - bool snc_mode; - char name[32]; - - snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? 
d->ci->id : d->hdr.id); - if (snc_mode) - sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id); - - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname); - - list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) - mon_rmdir_one_subdir(crgrp->mon.mon_data_kn, name, subname); - } -} - -static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, - struct rdt_resource *r, struct rdtgroup *prgrp, - bool do_sum) -{ - struct rmid_read rr = {0}; - union mon_data_bits priv; - struct mon_evt *mevt; - int ret; - - if (WARN_ON(list_empty(&r->evt_list))) - return -EPERM; - - priv.u.rid = r->rid; - priv.u.domid = do_sum ? d->ci->id : d->hdr.id; - priv.u.sum = do_sum; - list_for_each_entry(mevt, &r->evt_list, list) { - priv.u.evtid = mevt->evtid; - ret = mon_addfile(kn, mevt->name, priv.priv); - if (ret) - return ret; - - if (!do_sum && resctrl_is_mbm_event(mevt->evtid)) - mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true); - } - - return 0; -} - -static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, - struct rdt_mon_domain *d, - struct rdt_resource *r, struct rdtgroup *prgrp) -{ - struct kernfs_node *kn, *ckn; - char name[32]; - bool snc_mode; - int ret = 0; - - lockdep_assert_held(&rdtgroup_mutex); - - snc_mode = r->mon_scope == RESCTRL_L3_NODE; - sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); - kn = kernfs_find_and_get(parent_kn, name); - if (kn) { - /* - * rdtgroup_mutex will prevent this directory from being - * removed. No need to keep this hold. 
- */ - kernfs_put(kn); - } else { - kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); - if (IS_ERR(kn)) - return PTR_ERR(kn); - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) - goto out_destroy; - ret = mon_add_all_files(kn, d, r, prgrp, snc_mode); - if (ret) - goto out_destroy; - } - - if (snc_mode) { - sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id); - ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp); - if (IS_ERR(ckn)) { - ret = -EINVAL; - goto out_destroy; - } - - ret = rdtgroup_kn_set_ugid(ckn); - if (ret) - goto out_destroy; - - ret = mon_add_all_files(ckn, d, r, prgrp, false); - if (ret) - goto out_destroy; - } - - kernfs_activate(kn); - return 0; - -out_destroy: - kernfs_remove(kn); - return ret; -} - -/* - * Add all subdirectories of mon_data for "ctrl_mon" groups - * and "monitor" groups with given domain id. - */ -static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, - struct rdt_mon_domain *d) -{ - struct kernfs_node *parent_kn; - struct rdtgroup *prgrp, *crgrp; - struct list_head *head; - - list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { - parent_kn = prgrp->mon.mon_data_kn; - mkdir_mondata_subdir(parent_kn, d, r, prgrp); - - head = &prgrp->mon.crdtgrp_list; - list_for_each_entry(crgrp, head, mon.crdtgrp_list) { - parent_kn = crgrp->mon.mon_data_kn; - mkdir_mondata_subdir(parent_kn, d, r, crgrp); - } - } -} - -static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, - struct rdt_resource *r, - struct rdtgroup *prgrp) -{ - struct rdt_mon_domain *dom; - int ret; - - /* Walking r->domains, ensure it can't race with cpuhp */ - lockdep_assert_cpus_held(); - - list_for_each_entry(dom, &r->mon_domains, hdr.list) { - ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); - if (ret) - return ret; - } - - return 0; -} - -/* - * This creates a directory mon_data which contains the monitored data. 
- * - * mon_data has one directory for each domain which are named - * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data - * with L3 domain looks as below: - * ./mon_data: - * mon_L3_00 - * mon_L3_01 - * mon_L3_02 - * ... - * - * Each domain directory has one file per event: - * ./mon_L3_00/: - * llc_occupancy - * - */ -static int mkdir_mondata_all(struct kernfs_node *parent_kn, - struct rdtgroup *prgrp, - struct kernfs_node **dest_kn) -{ - struct rdt_resource *r; - struct kernfs_node *kn; - int ret; - - /* - * Create the mon_data directory first. - */ - ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn); - if (ret) - return ret; - - if (dest_kn) - *dest_kn = kn; - - /* - * Create the subdirectories for each domain. Note that all events - * in a domain like L3 are grouped into a resource whose domain is L3 - */ - for_each_mon_capable_rdt_resource(r) { - ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); - if (ret) - goto out_destroy; - } - - return 0; - -out_destroy: - kernfs_remove(kn); - return ret; -} - -/** - * cbm_ensure_valid - Enforce validity on provided CBM - * @_val: Candidate CBM - * @r: RDT resource to which the CBM belongs - * - * The provided CBM represents all cache portions available for use. This - * may be represented by a bitmap that does not consist of contiguous ones - * and thus be an invalid CBM. - * Here the provided CBM is forced to be a valid CBM by only considering - * the first set of contiguous bits as valid and clearing all bits. - * The intention here is to provide a valid default CBM with which a new - * resource group is initialized. The user can follow this with a - * modification to the CBM if the default does not satisfy the - * requirements. 
- */ -static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r) -{ - unsigned int cbm_len = r->cache.cbm_len; - unsigned long first_bit, zero_bit; - unsigned long val = _val; - - if (!val) - return 0; - - first_bit = find_first_bit(&val, cbm_len); - zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); - - /* Clear any remaining bits to ensure contiguous region */ - bitmap_clear(&val, zero_bit, cbm_len - zero_bit); - return (u32)val; -} - -/* - * Initialize cache resources per RDT domain - * - * Set the RDT domain up to start off with all usable allocations. That is, - * all shareable and unused bits. All-zero CBM is invalid. - */ -static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schema *s, - u32 closid) -{ - enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); - enum resctrl_conf_type t = s->conf_type; - struct resctrl_staged_config *cfg; - struct rdt_resource *r = s->res; - u32 used_b = 0, unused_b = 0; - unsigned long tmp_cbm; - enum rdtgrp_mode mode; - u32 peer_ctl, ctrl_val; - int i; - - cfg = &d->staged_config[t]; - cfg->have_new_ctrl = false; - cfg->new_ctrl = r->cache.shareable_bits; - used_b = r->cache.shareable_bits; - for (i = 0; i < closids_supported(); i++) { - if (closid_allocated(i) && i != closid) { - mode = rdtgroup_mode_by_closid(i); - if (mode == RDT_MODE_PSEUDO_LOCKSETUP) - /* - * ctrl values for locksetup aren't relevant - * until the schemata is written, and the mode - * becomes RDT_MODE_PSEUDO_LOCKED. - */ - continue; - /* - * If CDP is active include peer domain's - * usage to ensure there is no overlap - * with an exclusive group. 
- */ - if (resctrl_arch_get_cdp_enabled(r->rid)) - peer_ctl = resctrl_arch_get_config(r, d, i, - peer_type); - else - peer_ctl = 0; - ctrl_val = resctrl_arch_get_config(r, d, i, - s->conf_type); - used_b |= ctrl_val | peer_ctl; - if (mode == RDT_MODE_SHAREABLE) - cfg->new_ctrl |= ctrl_val | peer_ctl; - } - } - if (d->plr && d->plr->cbm > 0) - used_b |= d->plr->cbm; - unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); - unused_b &= BIT_MASK(r->cache.cbm_len) - 1; - cfg->new_ctrl |= unused_b; - /* - * Force the initial CBM to be valid, user can - * modify the CBM based on system availability. - */ - cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r); - /* - * Assign the u32 CBM to an unsigned long to ensure that - * bitmap_weight() does not access out-of-bound memory. - */ - tmp_cbm = cfg->new_ctrl; - if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) { - rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->hdr.id); - return -ENOSPC; - } - cfg->have_new_ctrl = true; - - return 0; -} - -/* - * Initialize cache resources with default values. - * - * A new RDT group is being created on an allocation capable (CAT) - * supporting system. Set this group up to start off with all usable - * allocations. - * - * If there are no more shareable bits available on any domain then - * the entire allocation will fail. - */ -static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) -{ - struct rdt_ctrl_domain *d; - int ret; - - list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) { - ret = __init_one_rdt_domain(d, s, closid); - if (ret < 0) - return ret; - } - - return 0; -} - -/* Initialize MBA resource with default values. 
*/ -static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) -{ - struct resctrl_staged_config *cfg; - struct rdt_ctrl_domain *d; - - list_for_each_entry(d, &r->ctrl_domains, hdr.list) { - if (is_mba_sc(r)) { - d->mbps_val[closid] = MBA_MAX_MBPS; - continue; - } - - cfg = &d->staged_config[CDP_NONE]; - cfg->new_ctrl = resctrl_get_default_ctrl(r); - cfg->have_new_ctrl = true; - } -} - -/* Initialize the RDT group's allocations. */ -static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) -{ - struct resctrl_schema *s; - struct rdt_resource *r; - int ret = 0; - - rdt_staged_configs_clear(); - - list_for_each_entry(s, &resctrl_schema_all, list) { - r = s->res; - if (r->rid == RDT_RESOURCE_MBA || - r->rid == RDT_RESOURCE_SMBA) { - rdtgroup_init_mba(r, rdtgrp->closid); - if (is_mba_sc(r)) - continue; - } else { - ret = rdtgroup_init_cat(s, rdtgrp->closid); - if (ret < 0) - goto out; - } - - ret = resctrl_arch_update_domains(r, rdtgrp->closid); - if (ret < 0) { - rdt_last_cmd_puts("Failed to initialize allocations\n"); - goto out; - } - - } - - rdtgrp->mode = RDT_MODE_SHAREABLE; - -out: - rdt_staged_configs_clear(); - return ret; -} - -static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) -{ - int ret; - - if (!resctrl_arch_mon_capable()) - return 0; - - ret = alloc_rmid(rdtgrp->closid); - if (ret < 0) { - rdt_last_cmd_puts("Out of RMIDs\n"); - return ret; - } - rdtgrp->mon.rmid = ret; - - ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn); - if (ret) { - rdt_last_cmd_puts("kernfs subdir error\n"); - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - return ret; - } - - return 0; -} - -static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) -{ - if (resctrl_arch_mon_capable()) - free_rmid(rgrp->closid, rgrp->mon.rmid); -} - -/* - * We allow creating mon groups only with in a directory called "mon_groups" - * which is present in every ctrl_mon group. Check if this is a valid - * "mon_groups" directory. - * - * 1. 
The directory should be named "mon_groups". - * 2. The mon group itself should "not" be named "mon_groups". - * This makes sure "mon_groups" directory always has a ctrl_mon group - * as parent. - */ -static bool is_mon_groups(struct kernfs_node *kn, const char *name) -{ - return (!strcmp(rdt_kn_name(kn), "mon_groups") && - strcmp(name, "mon_groups")); -} - -static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, - const char *name, umode_t mode, - enum rdt_group_type rtype, struct rdtgroup **r) -{ - struct rdtgroup *prdtgrp, *rdtgrp; - unsigned long files = 0; - struct kernfs_node *kn; - int ret; - - prdtgrp = rdtgroup_kn_lock_live(parent_kn); - if (!prdtgrp) { - ret = -ENODEV; - goto out_unlock; - } - - /* - * Check that the parent directory for a monitor group is a "mon_groups" - * directory. - */ - if (rtype == RDTMON_GROUP && !is_mon_groups(parent_kn, name)) { - ret = -EPERM; - goto out_unlock; - } - - if (rtype == RDTMON_GROUP && - (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || - prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) { - ret = -EINVAL; - rdt_last_cmd_puts("Pseudo-locking in progress\n"); - goto out_unlock; - } - - /* allocate the rdtgroup. */ - rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); - if (!rdtgrp) { - ret = -ENOSPC; - rdt_last_cmd_puts("Kernel out of memory\n"); - goto out_unlock; - } - *r = rdtgrp; - rdtgrp->mon.parent = prdtgrp; - rdtgrp->type = rtype; - INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list); - - /* kernfs creates the directory for rdtgrp */ - kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); - if (IS_ERR(kn)) { - ret = PTR_ERR(kn); - rdt_last_cmd_puts("kernfs create error\n"); - goto out_free_rgrp; - } - rdtgrp->kn = kn; - - /* - * kernfs_remove() will drop the reference count on "kn" which - * will free it. But we still need it to stick around for the - * rdtgroup_kn_unlock(kn) call. Take one extra reference here, - * which will be dropped by kernfs_put() in rdtgroup_remove(). 
- */ - kernfs_get(kn); - - ret = rdtgroup_kn_set_ugid(kn); - if (ret) { - rdt_last_cmd_puts("kernfs perm error\n"); - goto out_destroy; - } - - if (rtype == RDTCTRL_GROUP) { - files = RFTYPE_BASE | RFTYPE_CTRL; - if (resctrl_arch_mon_capable()) - files |= RFTYPE_MON; - } else { - files = RFTYPE_BASE | RFTYPE_MON; - } - - ret = rdtgroup_add_files(kn, files); - if (ret) { - rdt_last_cmd_puts("kernfs fill error\n"); - goto out_destroy; - } - - /* - * The caller unlocks the parent_kn upon success. - */ - return 0; - -out_destroy: - kernfs_put(rdtgrp->kn); - kernfs_remove(rdtgrp->kn); -out_free_rgrp: - kfree(rdtgrp); -out_unlock: - rdtgroup_kn_unlock(parent_kn); - return ret; -} - -static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) -{ - kernfs_remove(rgrp->kn); - rdtgroup_remove(rgrp); -} - -/* - * Create a monitor group under "mon_groups" directory of a control - * and monitor group(ctrl_mon). This is a resource group - * to monitor a subset of tasks and cpus in its parent ctrl_mon group. - */ -static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, - const char *name, umode_t mode) -{ - struct rdtgroup *rdtgrp, *prgrp; - int ret; - - ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp); - if (ret) - return ret; - - prgrp = rdtgrp->mon.parent; - rdtgrp->closid = prgrp->closid; - - ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); - if (ret) { - mkdir_rdt_prepare_clean(rdtgrp); - goto out_unlock; - } - - kernfs_activate(rdtgrp->kn); - - /* - * Add the rdtgrp to the list of rdtgrps the parent - * ctrl_mon group has to track. - */ - list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); - -out_unlock: - rdtgroup_kn_unlock(parent_kn); - return ret; -} - -/* - * These are rdtgroups created under the root directory. Can be used - * to allocate and monitor resources. 
- */ -static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, - const char *name, umode_t mode) -{ - struct rdtgroup *rdtgrp; - struct kernfs_node *kn; - u32 closid; - int ret; - - ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp); - if (ret) - return ret; - - kn = rdtgrp->kn; - ret = closid_alloc(); - if (ret < 0) { - rdt_last_cmd_puts("Out of CLOSIDs\n"); - goto out_common_fail; - } - closid = ret; - ret = 0; - - rdtgrp->closid = closid; - - ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); - if (ret) - goto out_closid_free; - - kernfs_activate(rdtgrp->kn); - - ret = rdtgroup_init_alloc(rdtgrp); - if (ret < 0) - goto out_rmid_free; - - list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); - - if (resctrl_arch_mon_capable()) { - /* - * Create an empty mon_groups directory to hold the subset - * of tasks and cpus to monitor. - */ - ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL); - if (ret) { - rdt_last_cmd_puts("kernfs subdir error\n"); - goto out_del_list; - } - if (is_mba_sc(NULL)) - rdtgrp->mba_mbps_event = mba_mbps_default_event; - } - - goto out_unlock; - -out_del_list: - list_del(&rdtgrp->rdtgroup_list); -out_rmid_free: - mkdir_rdt_prepare_rmid_free(rdtgrp); -out_closid_free: - closid_free(closid); -out_common_fail: - mkdir_rdt_prepare_clean(rdtgrp); -out_unlock: - rdtgroup_kn_unlock(parent_kn); - return ret; -} - -static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) -{ - /* Do not accept '\n' to avoid unparsable situation. */ - if (strchr(name, '\n')) - return -EINVAL; - - /* - * If the parent directory is the root directory and RDT - * allocation is supported, add a control and monitoring - * subdirectory - */ - if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn) - return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode); - - /* Else, attempt to add a monitoring subdirectory. 
*/ - if (resctrl_arch_mon_capable()) - return rdtgroup_mkdir_mon(parent_kn, name, mode); - - return -EPERM; -} - -static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) -{ - struct rdtgroup *prdtgrp = rdtgrp->mon.parent; - u32 closid, rmid; - int cpu; - - /* Give any tasks back to the parent group */ - rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask); - - /* - * Update per cpu closid/rmid of the moved CPUs first. - * Note: the closid will not change, but the arch code still needs it. - */ - closid = prdtgrp->closid; - rmid = prdtgrp->mon.rmid; - for_each_cpu(cpu, &rdtgrp->cpu_mask) - resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid); - - /* - * Update the MSR on moved CPUs and CPUs which have moved - * task running on them. - */ - cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); - update_closid_rmid(tmpmask, NULL); - - rdtgrp->flags = RDT_DELETED; - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - - /* - * Remove the rdtgrp from the parent ctrl_mon group's list - */ - WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); - list_del(&rdtgrp->mon.crdtgrp_list); - - kernfs_remove(rdtgrp->kn); - - return 0; -} - -static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp) -{ - rdtgrp->flags = RDT_DELETED; - list_del(&rdtgrp->rdtgroup_list); - - kernfs_remove(rdtgrp->kn); - return 0; -} - -static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) -{ - u32 closid, rmid; - int cpu; - - /* Give any tasks back to the default group */ - rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); - - /* Give any CPUs back to the default group */ - cpumask_or(&rdtgroup_default.cpu_mask, - &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - - /* Update per cpu closid and rmid of the moved CPUs first */ - closid = rdtgroup_default.closid; - rmid = rdtgroup_default.mon.rmid; - for_each_cpu(cpu, &rdtgrp->cpu_mask) - resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid); - - /* - * Update the MSR on moved CPUs and CPUs which have moved - 
* task running on them. - */ - cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); - update_closid_rmid(tmpmask, NULL); - - free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); - closid_free(rdtgrp->closid); - - rdtgroup_ctrl_remove(rdtgrp); - - /* - * Free all the child monitor group rmids. - */ - free_all_child_rdtgrp(rdtgrp); - - return 0; -} - -static struct kernfs_node *rdt_kn_parent(struct kernfs_node *kn) -{ - /* - * Valid within the RCU section it was obtained or while rdtgroup_mutex - * is held. - */ - return rcu_dereference_check(kn->__parent, lockdep_is_held(&rdtgroup_mutex)); -} - -static int rdtgroup_rmdir(struct kernfs_node *kn) -{ - struct kernfs_node *parent_kn; - struct rdtgroup *rdtgrp; - cpumask_var_t tmpmask; - int ret = 0; - - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) - return -ENOMEM; - - rdtgrp = rdtgroup_kn_lock_live(kn); - if (!rdtgrp) { - ret = -EPERM; - goto out; - } - parent_kn = rdt_kn_parent(kn); - - /* - * If the rdtgroup is a ctrl_mon group and parent directory - * is the root directory, remove the ctrl_mon group. - * - * If the rdtgroup is a mon group and parent directory - * is a valid "mon_groups" directory, remove the mon group. 
- */ - if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn && - rdtgrp != &rdtgroup_default) { - if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || - rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { - ret = rdtgroup_ctrl_remove(rdtgrp); - } else { - ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask); - } - } else if (rdtgrp->type == RDTMON_GROUP && - is_mon_groups(parent_kn, rdt_kn_name(kn))) { - ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask); - } else { - ret = -EPERM; - } - -out: - rdtgroup_kn_unlock(kn); - free_cpumask_var(tmpmask); - return ret; -} - -/** - * mongrp_reparent() - replace parent CTRL_MON group of a MON group - * @rdtgrp: the MON group whose parent should be replaced - * @new_prdtgrp: replacement parent CTRL_MON group for @rdtgrp - * @cpus: cpumask provided by the caller for use during this call - * - * Replaces the parent CTRL_MON group for a MON group, resulting in all member - * tasks' CLOSID immediately changing to that of the new parent group. - * Monitoring data for the group is unaffected by this operation. - */ -static void mongrp_reparent(struct rdtgroup *rdtgrp, - struct rdtgroup *new_prdtgrp, - cpumask_var_t cpus) -{ - struct rdtgroup *prdtgrp = rdtgrp->mon.parent; - - WARN_ON(rdtgrp->type != RDTMON_GROUP); - WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP); - - /* Nothing to do when simply renaming a MON group. */ - if (prdtgrp == new_prdtgrp) - return; - - WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); - list_move_tail(&rdtgrp->mon.crdtgrp_list, - &new_prdtgrp->mon.crdtgrp_list); - - rdtgrp->mon.parent = new_prdtgrp; - rdtgrp->closid = new_prdtgrp->closid; - - /* Propagate updated closid to all tasks in this group. 
*/ - rdt_move_group_tasks(rdtgrp, rdtgrp, cpus); - - update_closid_rmid(cpus, NULL); -} - -static int rdtgroup_rename(struct kernfs_node *kn, - struct kernfs_node *new_parent, const char *new_name) -{ - struct kernfs_node *kn_parent; - struct rdtgroup *new_prdtgrp; - struct rdtgroup *rdtgrp; - cpumask_var_t tmpmask; - int ret; - - rdtgrp = kernfs_to_rdtgroup(kn); - new_prdtgrp = kernfs_to_rdtgroup(new_parent); - if (!rdtgrp || !new_prdtgrp) - return -ENOENT; - - /* Release both kernfs active_refs before obtaining rdtgroup mutex. */ - rdtgroup_kn_get(rdtgrp, kn); - rdtgroup_kn_get(new_prdtgrp, new_parent); - - mutex_lock(&rdtgroup_mutex); - - rdt_last_cmd_clear(); - - /* - * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if - * either kernfs_node is a file. - */ - if (kernfs_type(kn) != KERNFS_DIR || - kernfs_type(new_parent) != KERNFS_DIR) { - rdt_last_cmd_puts("Source and destination must be directories"); - ret = -EPERM; - goto out; - } - - if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) { - ret = -ENOENT; - goto out; - } - - kn_parent = rdt_kn_parent(kn); - if (rdtgrp->type != RDTMON_GROUP || !kn_parent || - !is_mon_groups(kn_parent, rdt_kn_name(kn))) { - rdt_last_cmd_puts("Source must be a MON group\n"); - ret = -EPERM; - goto out; - } - - if (!is_mon_groups(new_parent, new_name)) { - rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n"); - ret = -EPERM; - goto out; - } - - /* - * If the MON group is monitoring CPUs, the CPUs must be assigned to the - * current parent CTRL_MON group and therefore cannot be assigned to - * the new parent, making the move illegal. - */ - if (!cpumask_empty(&rdtgrp->cpu_mask) && - rdtgrp->mon.parent != new_prdtgrp) { - rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n"); - ret = -EPERM; - goto out; - } - - /* - * Allocate the cpumask for use in mongrp_reparent() to avoid the - * possibility of failing to allocate it after kernfs_rename() has - * succeeded. 
- */ - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) { - ret = -ENOMEM; - goto out; - } - - /* - * Perform all input validation and allocations needed to ensure - * mongrp_reparent() will succeed before calling kernfs_rename(), - * otherwise it would be necessary to revert this call if - * mongrp_reparent() failed. - */ - ret = kernfs_rename(kn, new_parent, new_name); - if (!ret) - mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask); - - free_cpumask_var(tmpmask); - -out: - mutex_unlock(&rdtgroup_mutex); - rdtgroup_kn_put(rdtgrp, kn); - rdtgroup_kn_put(new_prdtgrp, new_parent); - return ret; -} - -static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) -{ - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) - seq_puts(seq, ",cdp"); - - if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) - seq_puts(seq, ",cdpl2"); - - if (is_mba_sc(resctrl_arch_get_resource(RDT_RESOURCE_MBA))) - seq_puts(seq, ",mba_MBps"); - - if (resctrl_debug) - seq_puts(seq, ",debug"); - - return 0; -} - -static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { - .mkdir = rdtgroup_mkdir, - .rmdir = rdtgroup_rmdir, - .rename = rdtgroup_rename, - .show_options = rdtgroup_show_options, -}; - -static int rdtgroup_setup_root(struct rdt_fs_context *ctx) -{ - rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, - KERNFS_ROOT_CREATE_DEACTIVATED | - KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, - &rdtgroup_default); - if (IS_ERR(rdt_root)) - return PTR_ERR(rdt_root); - - ctx->kfc.root = rdt_root; - rdtgroup_default.kn = kernfs_root_to_node(rdt_root); - - return 0; -} - -static void rdtgroup_destroy_root(void) -{ - kernfs_destroy_root(rdt_root); - rdtgroup_default.kn = NULL; -} - -static void __init rdtgroup_setup_default(void) -{ - mutex_lock(&rdtgroup_mutex); - - rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID; - rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID; - rdtgroup_default.type = RDTCTRL_GROUP; - INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); - - 
list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); - - mutex_unlock(&rdtgroup_mutex); -} - -static void domain_destroy_mon_state(struct rdt_mon_domain *d) -{ - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - kfree(d->mbm_local); -} - -void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) -{ - mutex_lock(&rdtgroup_mutex); - - if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) - mba_sc_domain_destroy(r, d); - - mutex_unlock(&rdtgroup_mutex); -} - -void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) -{ - mutex_lock(&rdtgroup_mutex); - - /* - * If resctrl is mounted, remove all the - * per domain monitor data directories. - */ - if (resctrl_mounted && resctrl_arch_mon_capable()) - rmdir_mondata_subdir_allrdtgrp(r, d); - - if (resctrl_is_mbm_enabled()) - cancel_delayed_work(&d->mbm_over); - if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) { - /* - * When a package is going down, forcefully - * decrement rmid->ebusy. There is no way to know - * that the L3 was flushed and hence may lead to - * incorrect counts in rare scenarios, but leaving - * the RMID as busy creates RMID leaks if the - * package never comes back. - */ - __check_limbo(d, true); - cancel_delayed_work(&d->cqm_limbo); - } - - domain_destroy_mon_state(d); - - mutex_unlock(&rdtgroup_mutex); -} - -/** - * domain_setup_mon_state() - Initialise domain monitoring structures. - * @r: The resource for the newly online domain. - * @d: The newly online domain. - * - * Allocate monitor resources that belong to this domain. - * Called when the first CPU of a domain comes online, regardless of whether - * the filesystem is mounted. - * During boot this may be called before global allocations have been made by - * resctrl_mon_resource_init(). - * - * Returns 0 for success, or -ENOMEM. 
- */ -static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d) -{ - u32 idx_limit = resctrl_arch_system_num_rmid_idx(); - size_t tsize; - - if (resctrl_arch_is_llc_occupancy_enabled()) { - d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); - if (!d->rmid_busy_llc) - return -ENOMEM; - } - if (resctrl_arch_is_mbm_total_enabled()) { - tsize = sizeof(*d->mbm_total); - d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_total) { - bitmap_free(d->rmid_busy_llc); - return -ENOMEM; - } - } - if (resctrl_arch_is_mbm_local_enabled()) { - tsize = sizeof(*d->mbm_local); - d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); - if (!d->mbm_local) { - bitmap_free(d->rmid_busy_llc); - kfree(d->mbm_total); - return -ENOMEM; - } - } - - return 0; -} - -int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) -{ - int err = 0; - - mutex_lock(&rdtgroup_mutex); - - if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) { - /* RDT_RESOURCE_MBA is never mon_capable */ - err = mba_sc_domain_allocate(r, d); - } - - mutex_unlock(&rdtgroup_mutex); - - return err; -} - -int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) -{ - int err; - - mutex_lock(&rdtgroup_mutex); - - err = domain_setup_mon_state(r, d); - if (err) - goto out_unlock; - - if (resctrl_is_mbm_enabled()) { - INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); - mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL, - RESCTRL_PICK_ANY_CPU); - } - - if (resctrl_arch_is_llc_occupancy_enabled()) - INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); - - /* - * If the filesystem is not mounted then only the default resource group - * exists. Creation of its directories is deferred until mount time - * by rdt_get_tree() calling mkdir_mondata_all(). - * If resctrl is mounted, add per domain monitor data directories. 
- */ - if (resctrl_mounted && resctrl_arch_mon_capable()) - mkdir_mondata_subdir_allrdtgrp(r, d); - -out_unlock: - mutex_unlock(&rdtgroup_mutex); - - return err; -} - -void resctrl_online_cpu(unsigned int cpu) -{ - mutex_lock(&rdtgroup_mutex); - /* The CPU is set in default rdtgroup after online. */ - cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); - mutex_unlock(&rdtgroup_mutex); -} - -static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) -{ - struct rdtgroup *cr; - - list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { - if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) - break; - } -} - -static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, - struct rdt_resource *r) -{ - struct rdt_mon_domain *d; - - lockdep_assert_cpus_held(); - - list_for_each_entry(d, &r->mon_domains, hdr.list) { - /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) - return d; - } - - return NULL; -} - -void resctrl_offline_cpu(unsigned int cpu) -{ - struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3); - struct rdt_mon_domain *d; - struct rdtgroup *rdtgrp; - - mutex_lock(&rdtgroup_mutex); - list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { - clear_childcpus(rdtgrp, cpu); - break; - } - } - - if (!l3->mon_capable) - goto out_unlock; - - d = get_mon_domain_from_cpu(cpu, l3); - if (d) { - if (resctrl_is_mbm_enabled() && cpu == d->mbm_work_cpu) { - cancel_delayed_work(&d->mbm_over); - mbm_setup_overflow_handler(d, 0, cpu); - } - if (resctrl_arch_is_llc_occupancy_enabled() && - cpu == d->cqm_work_cpu && has_busy_rmid(d)) { - cancel_delayed_work(&d->cqm_limbo); - cqm_setup_limbo_handler(d, 0, cpu); - } - } - -out_unlock: - mutex_unlock(&rdtgroup_mutex); -} - -/* - * resctrl_init - resctrl filesystem initialization - * - * Setup resctrl file system including set up root, create mount point, - * register resctrl filesystem, and 
initialize files under root directory. - * - * Return: 0 on success or -errno - */ -int __init resctrl_init(void) -{ - int ret = 0; - - seq_buf_init(&last_cmd_status, last_cmd_status_buf, - sizeof(last_cmd_status_buf)); - - rdtgroup_setup_default(); - - thread_throttle_mode_init(); - - ret = resctrl_mon_resource_init(); - if (ret) - return ret; - - ret = sysfs_create_mount_point(fs_kobj, "resctrl"); - if (ret) { - resctrl_mon_resource_exit(); - return ret; - } - - ret = register_filesystem(&rdt_fs_type); - if (ret) - goto cleanup_mountpoint; - - /* - * Adding the resctrl debugfs directory here may not be ideal since - * it would let the resctrl debugfs directory appear on the debugfs - * filesystem before the resctrl filesystem is mounted. - * It may also be ok since that would enable debugging of RDT before - * resctrl is mounted. - * The reason why the debugfs directory is created here and not in - * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and - * during the debugfs directory creation also &sb->s_type->i_mutex_key - * (the lockdep class of inode->i_rwsem). Other filesystem - * interactions (eg. SyS_getdents) have the lock ordering: - * &sb->s_type->i_mutex_key --> &mm->mmap_lock - * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex - * is taken, thus creating dependency: - * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause - * issues considering the other two lock dependencies. - * By creating the debugfs directory here we avoid a dependency - * that may cause deadlock (even though file operations cannot - * occur until the filesystem is mounted, but I do not know how to - * tell lockdep that). 
- */ - debugfs_resctrl = debugfs_create_dir("resctrl", NULL); - - return 0; - -cleanup_mountpoint: - sysfs_remove_mount_point(fs_kobj, "resctrl"); - resctrl_mon_resource_exit(); - - return ret; -} - -void __exit resctrl_exit(void) -{ - debugfs_remove_recursive(debugfs_resctrl); - unregister_filesystem(&rdt_fs_type); - sysfs_remove_mount_point(fs_kobj, "resctrl"); - - resctrl_mon_resource_exit(); -} diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 16f3ca30626a..dbf6d71bdf18 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -27,6 +27,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_APERFMPERF, CPUID_ECX, 0, 0x00000006, 0 }, { X86_FEATURE_EPB, CPUID_ECX, 3, 0x00000006, 0 }, { X86_FEATURE_INTEL_PPIN, CPUID_EBX, 0, 0x00000007, 1 }, + { X86_FEATURE_APX, CPUID_EDX, 21, 0x00000007, 1 }, { X86_FEATURE_RRSBA_CTRL, CPUID_EDX, 2, 0x00000007, 2 }, { X86_FEATURE_BHI_CTRL, CPUID_EDX, 4, 0x00000007, 2 }, { X86_FEATURE_CQM_LLC, CPUID_EDX, 1, 0x0000000f, 0 }, @@ -53,7 +54,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, { X86_FEATURE_AMD_LBR_PMC_FREEZE, CPUID_EAX, 2, 0x80000022, 0 }, - { X86_FEATURE_AMD_HETEROGENEOUS_CORES, CPUID_EAX, 30, 0x80000026, 0 }, + { X86_FEATURE_AMD_HTR_CORES, CPUID_EAX, 30, 0x80000026, 0 }, { 0, 0, 0, 0, 0 } }; diff --git a/arch/x86/kernel/cpu/sgx/driver.h b/arch/x86/kernel/cpu/sgx/driver.h index 4eddb4d571ef..30f39f92c98f 100644 --- a/arch/x86/kernel/cpu/sgx/driver.h +++ b/arch/x86/kernel/cpu/sgx/driver.h @@ -2,7 +2,6 @@ #ifndef __ARCH_SGX_DRIVER_H__ #define __ARCH_SGX_DRIVER_H__ -#include <crypto/hash.h> #include <linux/kref.h> #include <linux/mmu_notifier.h> #include <linux/radix-tree.h> diff --git a/arch/x86/kernel/cpu/sgx/ioctl.c b/arch/x86/kernel/cpu/sgx/ioctl.c index 776a20172867..66f1efa16fbb 100644 --- a/arch/x86/kernel/cpu/sgx/ioctl.c +++ 
b/arch/x86/kernel/cpu/sgx/ioctl.c @@ -3,6 +3,7 @@ #include <asm/mman.h> #include <asm/sgx.h> +#include <crypto/sha2.h> #include <linux/mman.h> #include <linux/delay.h> #include <linux/file.h> @@ -463,31 +464,6 @@ static long sgx_ioc_enclave_add_pages(struct sgx_encl *encl, void __user *arg) return ret; } -static int __sgx_get_key_hash(struct crypto_shash *tfm, const void *modulus, - void *hash) -{ - SHASH_DESC_ON_STACK(shash, tfm); - - shash->tfm = tfm; - - return crypto_shash_digest(shash, modulus, SGX_MODULUS_SIZE, hash); -} - -static int sgx_get_key_hash(const void *modulus, void *hash) -{ - struct crypto_shash *tfm; - int ret; - - tfm = crypto_alloc_shash("sha256", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - - ret = __sgx_get_key_hash(tfm, modulus, hash); - - crypto_free_shash(tfm); - return ret; -} - static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, void *token) { @@ -523,9 +499,7 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, sgx_xfrm_reserved_mask) return -EINVAL; - ret = sgx_get_key_hash(sigstruct->modulus, mrsigner); - if (ret) - return ret; + sha256(sigstruct->modulus, SGX_MODULUS_SIZE, (u8 *)mrsigner); mutex_lock(&encl->lock); diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 8ce352fc72ac..2de01b379aa3 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -14,6 +14,7 @@ #include <linux/slab.h> #include <linux/sysfs.h> #include <linux/vmalloc.h> +#include <asm/msr.h> #include <asm/sgx.h> #include "driver.h" #include "encl.h" @@ -719,6 +720,8 @@ int arch_memory_failure(unsigned long pfn, int flags) goto out; } + sgx_unmark_page_reclaimable(page); + /* * TBD: Add additional plumbing to enable pre-emptive * action for asynchronous poison notification. 
Until @@ -871,7 +874,7 @@ void sgx_update_lepubkeyhash(u64 *lepubkeyhash) WARN_ON_ONCE(preemptible()); for (i = 0; i < 4; i++) - wrmsrl(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]); + wrmsrq(MSR_IA32_SGXLEPUBKEYHASH0 + i, lepubkeyhash[i]); } const struct file_operations sgx_provision_fops = { diff --git a/arch/x86/kernel/cpu/topology.c b/arch/x86/kernel/cpu/topology.c index 01456236a6dd..e35ccdc84910 100644 --- a/arch/x86/kernel/cpu/topology.c +++ b/arch/x86/kernel/cpu/topology.c @@ -30,6 +30,7 @@ #include <asm/hypervisor.h> #include <asm/io_apic.h> #include <asm/mpspec.h> +#include <asm/msr.h> #include <asm/smp.h> #include "cpu.h" @@ -154,7 +155,7 @@ static __init bool check_for_real_bsp(u32 apic_id) * kernel must rely on the firmware enumeration order. */ if (has_apic_base) { - rdmsrl(MSR_IA32_APICBASE, msr); + rdmsrq(MSR_IA32_APICBASE, msr); is_bsp = !!(msr & MSR_IA32_APICBASE_BSP); } diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c index 03b3c9c3a45e..843b1655ab45 100644 --- a/arch/x86/kernel/cpu/topology_amd.c +++ b/arch/x86/kernel/cpu/topology_amd.c @@ -3,6 +3,7 @@ #include <asm/apic.h> #include <asm/memtype.h> +#include <asm/msr.h> #include <asm/processor.h> #include "cpu.h" @@ -133,7 +134,7 @@ static void parse_fam10h_node_id(struct topo_scan *tscan) if (!boot_cpu_has(X86_FEATURE_NODEID_MSR)) return; - rdmsrl(MSR_FAM10H_NODE_ID, nid.msr); + rdmsrq(MSR_FAM10H_NODE_ID, nid.msr); store_node(tscan, nid.nodes_per_pkg + 1, nid.node_id); tscan->c->topo.llc_id = nid.node_id; } @@ -160,7 +161,7 @@ static void topoext_fixup(struct topo_scan *tscan) if (msr_set_bit(0xc0011005, 54) <= 0) return; - rdmsrl(0xc0011005, msrval); + rdmsrq(0xc0011005, msrval); if (msrval & BIT_64(54)) { set_cpu_cap(c, X86_FEATURE_TOPOEXT); pr_info_once(FW_INFO "CPU: Re-enabling disabled Topology Extensions Support.\n"); @@ -182,7 +183,7 @@ static void parse_topology_amd(struct topo_scan *tscan) if (cpu_feature_enabled(X86_FEATURE_TOPOEXT)) has_topoext = 
cpu_parse_topology_ext(tscan); - if (cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES)) + if (cpu_feature_enabled(X86_FEATURE_AMD_HTR_CORES)) tscan->c->topo.cpu_type = cpuid_ebx(0x80000026); if (!has_topoext && !parse_8000_0008(tscan)) diff --git a/arch/x86/kernel/cpu/tsx.c b/arch/x86/kernel/cpu/tsx.c index b31ee4f1657a..49782724a943 100644 --- a/arch/x86/kernel/cpu/tsx.c +++ b/arch/x86/kernel/cpu/tsx.c @@ -12,6 +12,7 @@ #include <asm/cmdline.h> #include <asm/cpu.h> +#include <asm/msr.h> #include "cpu.h" @@ -24,7 +25,7 @@ static void tsx_disable(void) { u64 tsx; - rdmsrl(MSR_IA32_TSX_CTRL, tsx); + rdmsrq(MSR_IA32_TSX_CTRL, tsx); /* Force all transactions to immediately abort */ tsx |= TSX_CTRL_RTM_DISABLE; @@ -37,14 +38,14 @@ static void tsx_disable(void) */ tsx |= TSX_CTRL_CPUID_CLEAR; - wrmsrl(MSR_IA32_TSX_CTRL, tsx); + wrmsrq(MSR_IA32_TSX_CTRL, tsx); } static void tsx_enable(void) { u64 tsx; - rdmsrl(MSR_IA32_TSX_CTRL, tsx); + rdmsrq(MSR_IA32_TSX_CTRL, tsx); /* Enable the RTM feature in the cpu */ tsx &= ~TSX_CTRL_RTM_DISABLE; @@ -56,7 +57,7 @@ static void tsx_enable(void) */ tsx &= ~TSX_CTRL_CPUID_CLEAR; - wrmsrl(MSR_IA32_TSX_CTRL, tsx); + wrmsrq(MSR_IA32_TSX_CTRL, tsx); } static enum tsx_ctrl_states x86_get_tsx_auto_mode(void) @@ -115,13 +116,13 @@ static void tsx_clear_cpuid(void) */ if (boot_cpu_has(X86_FEATURE_RTM_ALWAYS_ABORT) && boot_cpu_has(X86_FEATURE_TSX_FORCE_ABORT)) { - rdmsrl(MSR_TSX_FORCE_ABORT, msr); + rdmsrq(MSR_TSX_FORCE_ABORT, msr); msr |= MSR_TFA_TSX_CPUID_CLEAR; - wrmsrl(MSR_TSX_FORCE_ABORT, msr); + wrmsrq(MSR_TSX_FORCE_ABORT, msr); } else if (cpu_feature_enabled(X86_FEATURE_MSR_TSX_CTRL)) { - rdmsrl(MSR_IA32_TSX_CTRL, msr); + rdmsrq(MSR_IA32_TSX_CTRL, msr); msr |= TSX_CTRL_CPUID_CLEAR; - wrmsrl(MSR_IA32_TSX_CTRL, msr); + wrmsrq(MSR_IA32_TSX_CTRL, msr); } } @@ -146,11 +147,11 @@ static void tsx_dev_mode_disable(void) !cpu_feature_enabled(X86_FEATURE_SRBDS_CTRL)) return; - rdmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl); + 
rdmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl); if (mcu_opt_ctrl & RTM_ALLOW) { mcu_opt_ctrl &= ~RTM_ALLOW; - wrmsrl(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl); + wrmsrq(MSR_IA32_MCU_OPT_CTRL, mcu_opt_ctrl); setup_force_cpu_cap(X86_FEATURE_RTM_ALWAYS_ABORT); } } diff --git a/arch/x86/kernel/cpu/umwait.c b/arch/x86/kernel/cpu/umwait.c index 2293efd6ffa6..933fcd7ff250 100644 --- a/arch/x86/kernel/cpu/umwait.c +++ b/arch/x86/kernel/cpu/umwait.c @@ -33,7 +33,7 @@ static DEFINE_MUTEX(umwait_lock); static void umwait_update_control_msr(void * unused) { lockdep_assert_irqs_disabled(); - wrmsr(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached), 0); + wrmsrq(MSR_IA32_UMWAIT_CONTROL, READ_ONCE(umwait_control_cached)); } /* @@ -71,7 +71,7 @@ static int umwait_cpu_offline(unsigned int cpu) * the original control MSR value in umwait_init(). So there * is no race condition here. */ - wrmsr(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached, 0); + wrmsrq(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached); return 0; } @@ -214,7 +214,7 @@ static int __init umwait_init(void) * changed. This is the only place where orig_umwait_control_cached * is modified. */ - rdmsrl(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached); + rdmsrq(MSR_IA32_UMWAIT_CONTROL, orig_umwait_control_cached); ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "umwait:online", umwait_cpu_online, umwait_cpu_offline); diff --git a/arch/x86/kernel/cpu/zhaoxin.c b/arch/x86/kernel/cpu/zhaoxin.c index 90eba7eb5335..89b1c8a70fe8 100644 --- a/arch/x86/kernel/cpu/zhaoxin.c +++ b/arch/x86/kernel/cpu/zhaoxin.c @@ -4,6 +4,7 @@ #include <asm/cpu.h> #include <asm/cpufeature.h> +#include <asm/msr.h> #include "cpu.h" |