diff options
Diffstat (limited to 'tools')
277 files changed, 16020 insertions, 6055 deletions
diff --git a/tools/arch/arm/include/uapi/asm/kvm.h b/tools/arch/arm/include/uapi/asm/kvm.h deleted file mode 100644 index d5dd96902817..000000000000 --- a/tools/arch/arm/include/uapi/asm/kvm.h +++ /dev/null @@ -1,315 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * Copyright (C) 2012 - Virtual Open Systems and Columbia University - * Author: Christoffer Dall <c.dall@virtualopensystems.com> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License, version 2, as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. - */ - -#ifndef __ARM_KVM_H__ -#define __ARM_KVM_H__ - -#include <linux/types.h> -#include <linux/psci.h> -#include <asm/ptrace.h> - -#define __KVM_HAVE_GUEST_DEBUG -#define __KVM_HAVE_IRQ_LINE -#define __KVM_HAVE_READONLY_MEM -#define __KVM_HAVE_VCPU_EVENTS - -#define KVM_COALESCED_MMIO_PAGE_OFFSET 1 - -#define KVM_REG_SIZE(id) \ - (1U << (((id) & KVM_REG_SIZE_MASK) >> KVM_REG_SIZE_SHIFT)) - -/* Valid for svc_regs, abt_regs, und_regs, irq_regs in struct kvm_regs */ -#define KVM_ARM_SVC_sp svc_regs[0] -#define KVM_ARM_SVC_lr svc_regs[1] -#define KVM_ARM_SVC_spsr svc_regs[2] -#define KVM_ARM_ABT_sp abt_regs[0] -#define KVM_ARM_ABT_lr abt_regs[1] -#define KVM_ARM_ABT_spsr abt_regs[2] -#define KVM_ARM_UND_sp und_regs[0] -#define KVM_ARM_UND_lr und_regs[1] -#define KVM_ARM_UND_spsr und_regs[2] -#define KVM_ARM_IRQ_sp irq_regs[0] -#define KVM_ARM_IRQ_lr irq_regs[1] -#define KVM_ARM_IRQ_spsr irq_regs[2] - -/* Valid only for fiq_regs in struct kvm_regs */ -#define KVM_ARM_FIQ_r8 fiq_regs[0] -#define KVM_ARM_FIQ_r9 fiq_regs[1] -#define KVM_ARM_FIQ_r10 fiq_regs[2] -#define KVM_ARM_FIQ_fp fiq_regs[3] -#define KVM_ARM_FIQ_ip fiq_regs[4] -#define KVM_ARM_FIQ_sp fiq_regs[5] -#define KVM_ARM_FIQ_lr fiq_regs[6] -#define KVM_ARM_FIQ_spsr fiq_regs[7] - -struct kvm_regs { - struct pt_regs usr_regs; /* R0_usr - R14_usr, PC, CPSR */ - unsigned long svc_regs[3]; /* SP_svc, LR_svc, SPSR_svc */ - unsigned long abt_regs[3]; /* SP_abt, LR_abt, SPSR_abt */ - unsigned long und_regs[3]; /* SP_und, LR_und, SPSR_und */ - unsigned long irq_regs[3]; /* SP_irq, LR_irq, SPSR_irq */ - unsigned long fiq_regs[8]; /* R8_fiq - R14_fiq, SPSR_fiq */ -}; - -/* Supported Processor Types */ -#define KVM_ARM_TARGET_CORTEX_A15 0 -#define KVM_ARM_TARGET_CORTEX_A7 1 -#define KVM_ARM_NUM_TARGETS 2 - -/* KVM_ARM_SET_DEVICE_ADDR ioctl id encoding */ -#define KVM_ARM_DEVICE_TYPE_SHIFT 0 -#define KVM_ARM_DEVICE_TYPE_MASK (0xffff << KVM_ARM_DEVICE_TYPE_SHIFT) -#define KVM_ARM_DEVICE_ID_SHIFT 16 -#define KVM_ARM_DEVICE_ID_MASK (0xffff << KVM_ARM_DEVICE_ID_SHIFT) - -/* Supported device IDs */ -#define KVM_ARM_DEVICE_VGIC_V2 0 - -/* Supported VGIC address types */ -#define KVM_VGIC_V2_ADDR_TYPE_DIST 0 -#define KVM_VGIC_V2_ADDR_TYPE_CPU 1 - -#define KVM_VGIC_V2_DIST_SIZE 0x1000 -#define KVM_VGIC_V2_CPU_SIZE 0x2000 - -/* Supported VGICv3 address types */ -#define KVM_VGIC_V3_ADDR_TYPE_DIST 2 -#define KVM_VGIC_V3_ADDR_TYPE_REDIST 3 -#define KVM_VGIC_ITS_ADDR_TYPE 4 -#define KVM_VGIC_V3_ADDR_TYPE_REDIST_REGION 5 - -#define KVM_VGIC_V3_DIST_SIZE SZ_64K -#define KVM_VGIC_V3_REDIST_SIZE (2 * SZ_64K) -#define KVM_VGIC_V3_ITS_SIZE (2 * SZ_64K) - -#define KVM_ARM_VCPU_POWER_OFF 0 /* CPU is started in OFF state */ -#define KVM_ARM_VCPU_PSCI_0_2 1 /* CPU uses PSCI v0.2 */ - -struct kvm_vcpu_init { - __u32 target; - __u32 features[7]; -}; - -struct kvm_sregs { -}; - -struct kvm_fpu { -}; - -struct kvm_guest_debug_arch { -}; - -struct kvm_debug_exit_arch { -}; - -struct kvm_sync_regs { - /* Used with KVM_CAP_ARM_USER_IRQ */ - __u64 device_irq_level; -}; - -struct kvm_arch_memory_slot { -}; - -/* for KVM_GET/SET_VCPU_EVENTS */ -struct kvm_vcpu_events { - struct { - __u8 serror_pending; - __u8 serror_has_esr; - __u8 ext_dabt_pending; - /* Align it to 8 bytes */ - __u8 pad[5]; - __u64 serror_esr; - } exception; - __u32 reserved[12]; -}; - -/* If you need to interpret the index values, here is the key: */ -#define KVM_REG_ARM_COPROC_MASK 0x000000000FFF0000 -#define KVM_REG_ARM_COPROC_SHIFT 16 -#define KVM_REG_ARM_32_OPC2_MASK 0x0000000000000007 -#define KVM_REG_ARM_32_OPC2_SHIFT 0 -#define KVM_REG_ARM_OPC1_MASK 0x0000000000000078 -#define KVM_REG_ARM_OPC1_SHIFT 3 -#define KVM_REG_ARM_CRM_MASK 0x0000000000000780 -#define KVM_REG_ARM_CRM_SHIFT 7 -#define KVM_REG_ARM_32_CRN_MASK 0x0000000000007800 -#define KVM_REG_ARM_32_CRN_SHIFT 11 -/* - * For KVM currently all guest registers are nonsecure, but we reserve a bit - * in the encoding to distinguish secure from nonsecure for AArch32 system - * registers that are banked by security. This is 1 for the secure banked - * register, and 0 for the nonsecure banked register or if the register is - * not banked by security. - */ -#define KVM_REG_ARM_SECURE_MASK 0x0000000010000000 -#define KVM_REG_ARM_SECURE_SHIFT 28 - -#define ARM_CP15_REG_SHIFT_MASK(x,n) \ - (((x) << KVM_REG_ARM_ ## n ## _SHIFT) & KVM_REG_ARM_ ## n ## _MASK) - -#define __ARM_CP15_REG(op1,crn,crm,op2) \ - (KVM_REG_ARM | (15 << KVM_REG_ARM_COPROC_SHIFT) | \ - ARM_CP15_REG_SHIFT_MASK(op1, OPC1) | \ - ARM_CP15_REG_SHIFT_MASK(crn, 32_CRN) | \ - ARM_CP15_REG_SHIFT_MASK(crm, CRM) | \ - ARM_CP15_REG_SHIFT_MASK(op2, 32_OPC2)) - -#define ARM_CP15_REG32(...) (__ARM_CP15_REG(__VA_ARGS__) | KVM_REG_SIZE_U32) - -#define __ARM_CP15_REG64(op1,crm) \ - (__ARM_CP15_REG(op1, 0, crm, 0) | KVM_REG_SIZE_U64) -#define ARM_CP15_REG64(...) __ARM_CP15_REG64(__VA_ARGS__) - -/* PL1 Physical Timer Registers */ -#define KVM_REG_ARM_PTIMER_CTL ARM_CP15_REG32(0, 14, 2, 1) -#define KVM_REG_ARM_PTIMER_CNT ARM_CP15_REG64(0, 14) -#define KVM_REG_ARM_PTIMER_CVAL ARM_CP15_REG64(2, 14) - -/* Virtual Timer Registers */ -#define KVM_REG_ARM_TIMER_CTL ARM_CP15_REG32(0, 14, 3, 1) -#define KVM_REG_ARM_TIMER_CNT ARM_CP15_REG64(1, 14) -#define KVM_REG_ARM_TIMER_CVAL ARM_CP15_REG64(3, 14) - -/* Normal registers are mapped as coprocessor 16. */ -#define KVM_REG_ARM_CORE (0x0010 << KVM_REG_ARM_COPROC_SHIFT) -#define KVM_REG_ARM_CORE_REG(name) (offsetof(struct kvm_regs, name) / 4) - -/* Some registers need more space to represent values. */ -#define KVM_REG_ARM_DEMUX (0x0011 << KVM_REG_ARM_COPROC_SHIFT) -#define KVM_REG_ARM_DEMUX_ID_MASK 0x000000000000FF00 -#define KVM_REG_ARM_DEMUX_ID_SHIFT 8 -#define KVM_REG_ARM_DEMUX_ID_CCSIDR (0x00 << KVM_REG_ARM_DEMUX_ID_SHIFT) -#define KVM_REG_ARM_DEMUX_VAL_MASK 0x00000000000000FF -#define KVM_REG_ARM_DEMUX_VAL_SHIFT 0 - -/* VFP registers: we could overload CP10 like ARM does, but that's ugly. */ -#define KVM_REG_ARM_VFP (0x0012 << KVM_REG_ARM_COPROC_SHIFT) -#define KVM_REG_ARM_VFP_MASK 0x000000000000FFFF -#define KVM_REG_ARM_VFP_BASE_REG 0x0 -#define KVM_REG_ARM_VFP_FPSID 0x1000 -#define KVM_REG_ARM_VFP_FPSCR 0x1001 -#define KVM_REG_ARM_VFP_MVFR1 0x1006 -#define KVM_REG_ARM_VFP_MVFR0 0x1007 -#define KVM_REG_ARM_VFP_FPEXC 0x1008 -#define KVM_REG_ARM_VFP_FPINST 0x1009 -#define KVM_REG_ARM_VFP_FPINST2 0x100A - -/* KVM-as-firmware specific pseudo-registers */ -#define KVM_REG_ARM_FW (0x0014 << KVM_REG_ARM_COPROC_SHIFT) -#define KVM_REG_ARM_FW_REG(r) (KVM_REG_ARM | KVM_REG_SIZE_U64 | \ - KVM_REG_ARM_FW | ((r) & 0xffff)) -#define KVM_REG_ARM_PSCI_VERSION KVM_REG_ARM_FW_REG(0) -#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1 KVM_REG_ARM_FW_REG(1) - /* Higher values mean better protection. */ -#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_AVAIL 0 -#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_AVAIL 1 -#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_1_NOT_REQUIRED 2 -#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2 KVM_REG_ARM_FW_REG(2) - /* Higher values mean better protection. */ -#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_AVAIL 0 -#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_UNKNOWN 1 -#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_AVAIL 2 -#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_NOT_REQUIRED 3 -#define KVM_REG_ARM_SMCCC_ARCH_WORKAROUND_2_ENABLED (1U << 4) - -/* Device Control API: ARM VGIC */ -#define KVM_DEV_ARM_VGIC_GRP_ADDR 0 -#define KVM_DEV_ARM_VGIC_GRP_DIST_REGS 1 -#define KVM_DEV_ARM_VGIC_GRP_CPU_REGS 2 -#define KVM_DEV_ARM_VGIC_CPUID_SHIFT 32 -#define KVM_DEV_ARM_VGIC_CPUID_MASK (0xffULL << KVM_DEV_ARM_VGIC_CPUID_SHIFT) -#define KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT 32 -#define KVM_DEV_ARM_VGIC_V3_MPIDR_MASK \ - (0xffffffffULL << KVM_DEV_ARM_VGIC_V3_MPIDR_SHIFT) -#define KVM_DEV_ARM_VGIC_OFFSET_SHIFT 0 -#define KVM_DEV_ARM_VGIC_OFFSET_MASK (0xffffffffULL << KVM_DEV_ARM_VGIC_OFFSET_SHIFT) -#define KVM_DEV_ARM_VGIC_SYSREG_INSTR_MASK (0xffff) -#define KVM_DEV_ARM_VGIC_GRP_NR_IRQS 3 -#define KVM_DEV_ARM_VGIC_GRP_CTRL 4 -#define KVM_DEV_ARM_VGIC_GRP_REDIST_REGS 5 -#define KVM_DEV_ARM_VGIC_GRP_CPU_SYSREGS 6 -#define KVM_DEV_ARM_VGIC_GRP_LEVEL_INFO 7 -#define KVM_DEV_ARM_VGIC_GRP_ITS_REGS 8 -#define KVM_DEV_ARM_VGIC_GRP_MAINT_IRQ 9 -#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT 10 -#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_MASK \ - (0x3fffffULL << KVM_DEV_ARM_VGIC_LINE_LEVEL_INFO_SHIFT) -#define KVM_DEV_ARM_VGIC_LINE_LEVEL_INTID_MASK 0x3ff -#define VGIC_LEVEL_INFO_LINE_LEVEL 0 - -/* Device Control API on vcpu fd */ -#define KVM_ARM_VCPU_PMU_V3_CTRL 0 -#define KVM_ARM_VCPU_PMU_V3_IRQ 0 -#define KVM_ARM_VCPU_PMU_V3_INIT 1 -#define KVM_ARM_VCPU_TIMER_CTRL 1 -#define KVM_ARM_VCPU_TIMER_IRQ_VTIMER 0 -#define KVM_ARM_VCPU_TIMER_IRQ_PTIMER 1 - -#define KVM_DEV_ARM_VGIC_CTRL_INIT 0 -#define KVM_DEV_ARM_ITS_SAVE_TABLES 1 -#define KVM_DEV_ARM_ITS_RESTORE_TABLES 2 -#define KVM_DEV_ARM_VGIC_SAVE_PENDING_TABLES 3 -#define KVM_DEV_ARM_ITS_CTRL_RESET 4 - -/* KVM_IRQ_LINE irq field index values */ -#define KVM_ARM_IRQ_VCPU2_SHIFT 28 -#define KVM_ARM_IRQ_VCPU2_MASK 0xf -#define KVM_ARM_IRQ_TYPE_SHIFT 24 -#define KVM_ARM_IRQ_TYPE_MASK 0xf -#define KVM_ARM_IRQ_VCPU_SHIFT 16 -#define KVM_ARM_IRQ_VCPU_MASK 0xff -#define KVM_ARM_IRQ_NUM_SHIFT 0 -#define KVM_ARM_IRQ_NUM_MASK 0xffff - -/* irq_type field */ -#define KVM_ARM_IRQ_TYPE_CPU 0 -#define KVM_ARM_IRQ_TYPE_SPI 1 -#define KVM_ARM_IRQ_TYPE_PPI 2 - -/* out-of-kernel GIC cpu interrupt injection irq_number field */ -#define KVM_ARM_IRQ_CPU_IRQ 0 -#define KVM_ARM_IRQ_CPU_FIQ 1 - -/* - * This used to hold the highest supported SPI, but it is now obsolete - * and only here to provide source code level compatibility with older - * userland. The highest SPI number can be set via KVM_DEV_ARM_VGIC_GRP_NR_IRQS. - */ -#ifndef __KERNEL__ -#define KVM_ARM_IRQ_GIC_MAX 127 -#endif - -/* One single KVM irqchip, ie. the VGIC */ -#define KVM_NR_IRQCHIPS 1 - -/* PSCI interface */ -#define KVM_PSCI_FN_BASE 0x95c1ba5e -#define KVM_PSCI_FN(n) (KVM_PSCI_FN_BASE + (n)) - -#define KVM_PSCI_FN_CPU_SUSPEND KVM_PSCI_FN(0) -#define KVM_PSCI_FN_CPU_OFF KVM_PSCI_FN(1) -#define KVM_PSCI_FN_CPU_ON KVM_PSCI_FN(2) -#define KVM_PSCI_FN_MIGRATE KVM_PSCI_FN(3) - -#define KVM_PSCI_RET_SUCCESS PSCI_RET_SUCCESS -#define KVM_PSCI_RET_NI PSCI_RET_NOT_SUPPORTED -#define KVM_PSCI_RET_INVAL PSCI_RET_INVALID_PARAMS -#define KVM_PSCI_RET_DENIED PSCI_RET_DENIED - -#endif /* __ARM_KVM_H__ */ diff --git a/tools/arch/s390/include/uapi/asm/kvm_perf.h b/tools/arch/s390/include/uapi/asm/kvm_perf.h deleted file mode 100644 index 84606b8cc49e..000000000000 --- a/tools/arch/s390/include/uapi/asm/kvm_perf.h +++ /dev/null @@ -1,22 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -/* - * Definitions for perf-kvm on s390 - * - * Copyright 2014 IBM Corp. - * Author(s): Alexander Yarygin <yarygin@linux.vnet.ibm.com> - */ - -#ifndef __LINUX_KVM_PERF_S390_H -#define __LINUX_KVM_PERF_S390_H - -#include <asm/sie.h> - -#define DECODE_STR_LEN 40 - -#define VCPU_ID "id" - -#define KVM_ENTRY_TRACE "kvm:kvm_s390_sie_enter" -#define KVM_EXIT_TRACE "kvm:kvm_s390_sie_exit" -#define KVM_EXIT_REASON "icptcode" - -#endif diff --git a/tools/arch/x86/include/uapi/asm/kvm_perf.h b/tools/arch/x86/include/uapi/asm/kvm_perf.h deleted file mode 100644 index 125cf5cdf6c5..000000000000 --- a/tools/arch/x86/include/uapi/asm/kvm_perf.h +++ /dev/null @@ -1,17 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ -#ifndef _ASM_X86_KVM_PERF_H -#define _ASM_X86_KVM_PERF_H - -#include <asm/svm.h> -#include <asm/vmx.h> -#include <asm/kvm.h> - -#define DECODE_STR_LEN 20 - -#define VCPU_ID "vcpu_id" - -#define KVM_ENTRY_TRACE "kvm:kvm_entry" -#define KVM_EXIT_TRACE "kvm:kvm_exit" -#define KVM_EXIT_REASON "exit_reason" - -#endif /* _ASM_X86_KVM_PERF_H */ diff --git a/tools/build/Makefile.feature b/tools/build/Makefile.feature index 649c5ab8e8f2..32bbe29fe5f6 100644 --- a/tools/build/Makefile.feature +++ b/tools/build/Makefile.feature @@ -68,7 +68,6 @@ FEATURE_TESTS_BASIC := \ libdw \ eventfd \ fortify-source \ - get_current_dir_name \ gettid \ glibc \ libbfd \ @@ -80,11 +79,9 @@ FEATURE_TESTS_BASIC := \ libelf-zstd \ libnuma \ numa_num_possible_cpus \ - libperl \ libpython \ libslang \ libtraceevent \ - libtracefs \ libcpupower \ pthread-attr-setaffinity-np \ pthread-barrier \ @@ -121,11 +118,11 @@ FEATURE_TESTS_EXTRA := \ libbfd-liberty \ libbfd-liberty-z \ libopencsd \ + libperl \ cxx \ llvm \ clang \ libbpf \ - libbpf-strings \ libpfm4 \ libdebuginfod \ clang-bpf-co-re \ @@ -144,7 +141,6 @@ FEATURE_DISPLAY ?= \ libelf \ libnuma \ numa_num_possible_cpus \ - libperl \ libpython \ libcapstone \ llvm-perf \ diff --git a/tools/build/feature/Makefile b/tools/build/feature/Makefile index b41a42818d8a..49b0add392b1 100644 --- a/tools/build/feature/Makefile +++ b/tools/build/feature/Makefile @@ -8,7 +8,6 @@ FILES= \ test-libdw.bin \ test-eventfd.bin \ test-fortify-source.bin \ - test-get_current_dir_name.bin \ test-glibc.bin \ test-gtk2.bin \ test-gtk2-infobar.bin \ @@ -34,7 +33,6 @@ FILES= \ test-libperl.bin \ test-libpython.bin \ test-libslang.bin \ - test-libslang-include-subdir.bin \ test-libtraceevent.bin \ test-libcpupower.bin \ test-libtracefs.bin \ @@ -58,7 +56,6 @@ FILES= \ test-lzma.bin \ test-bpf.bin \ test-libbpf.bin \ - test-libbpf-strings.bin \ test-get_cpuid.bin \ test-sdt.bin \ test-cxx.bin \ @@ -94,7 +91,7 @@ else # paths are used instead. ifdef CROSS_COMPILE ifeq ($(PKG_CONFIG_LIBDIR)$(PKG_CONFIG_PATH)$(PKG_CONFIG_SYSROOT_DIR),) - CROSS_ARCH = $(shell $(CC) -dumpmachine) + CROSS_ARCH = $(notdir $(CROSS_COMPILE:%-=%)) PKG_CONFIG_LIBDIR := /usr/local/$(CROSS_ARCH)/lib/pkgconfig/ PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/local/lib/$(CROSS_ARCH)/pkgconfig/ PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/lib/$(CROSS_ARCH)/pkgconfig/ @@ -147,9 +144,6 @@ $(OUTPUT)test-libelf.bin: $(OUTPUT)test-eventfd.bin: $(BUILD) -$(OUTPUT)test-get_current_dir_name.bin: - $(BUILD) - $(OUTPUT)test-glibc.bin: $(BUILD) @@ -234,9 +228,6 @@ $(OUTPUT)test-libunwind-debug-frame-aarch64.bin: $(OUTPUT)test-libslang.bin: $(BUILD) -lslang -$(OUTPUT)test-libslang-include-subdir.bin: - $(BUILD) -lslang - $(OUTPUT)test-libtraceevent.bin: $(BUILD) -ltraceevent @@ -316,10 +307,10 @@ $(OUTPUT)test-libcapstone.bin: $(BUILD) # -lcapstone provided by $(FEATURE_CHECK_LDFLAGS-libcapstone) $(OUTPUT)test-compile-32.bin: - $(CC) -m32 -o $@ test-compile.c + $(CC) -m32 -Wall -Werror -o $@ test-compile.c $(OUTPUT)test-compile-x32.bin: - $(CC) -mx32 -o $@ test-compile.c + $(CC) -mx32 -Wall -Werror -o $@ test-compile.c $(OUTPUT)test-zlib.bin: $(BUILD) -lz @@ -336,9 +327,6 @@ $(OUTPUT)test-bpf.bin: $(OUTPUT)test-libbpf.bin: $(BUILD) -lbpf -$(OUTPUT)test-libbpf-strings.bin: - $(BUILD) - $(OUTPUT)test-sdt.bin: $(BUILD) diff --git a/tools/build/feature/test-all.c b/tools/build/feature/test-all.c index 4419fb4710bd..8a354b81417c 100644 --- a/tools/build/feature/test-all.c +++ b/tools/build/feature/test-all.c @@ -7,17 +7,13 @@ */ /* - * Quirk: Python and Perl headers cannot be in arbitrary places, so keep - * these 3 testcases at the top: + * Quirk: Python headers cannot be in arbitrary places, so keep this testcase at + * the top: */ #define main main_test_libpython # include "test-libpython.c" #undef main -#define main main_test_libperl -# include "test-libperl.c" -#undef main - #define main main_test_hello # include "test-hello.c" #undef main @@ -26,10 +22,6 @@ # include "test-libelf.c" #undef main -#define main main_test_get_current_dir_name -# include "test-get_current_dir_name.c" -#undef main - #define main main_test_gettid # include "test-gettid.c" #undef main @@ -154,17 +146,11 @@ # include "test-libtraceevent.c" #undef main -#define main main_test_libtracefs -# include "test-libtracefs.c" -#undef main - int main(int argc, char *argv[]) { main_test_libpython(); - main_test_libperl(); main_test_hello(); main_test_libelf(); - main_test_get_current_dir_name(); main_test_gettid(); main_test_glibc(); main_test_libdw(); @@ -192,7 +178,6 @@ int main(int argc, char *argv[]) main_test_reallocarray(); main_test_libzstd(); main_test_libtraceevent(); - main_test_libtracefs(); return 0; } diff --git a/tools/build/feature/test-get_current_dir_name.c b/tools/build/feature/test-get_current_dir_name.c deleted file mode 100644 index c3c201691b4f..000000000000 --- a/tools/build/feature/test-get_current_dir_name.c +++ /dev/null @@ -1,11 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#define _GNU_SOURCE -#include <unistd.h> -#include <stdlib.h> - -int main(void) -{ - free(get_current_dir_name()); - return 0; -} -#undef _GNU_SOURCE diff --git a/tools/build/feature/test-libbpf-strings.c b/tools/build/feature/test-libbpf-strings.c deleted file mode 100644 index 83e6c45f5c85..000000000000 --- a/tools/build/feature/test-libbpf-strings.c +++ /dev/null @@ -1,10 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <bpf/btf.h> - -int main(void) -{ - struct btf_dump_type_data_opts opts; - - opts.emit_strings = 0; - return opts.emit_strings; -} diff --git a/tools/build/feature/test-libslang-include-subdir.c b/tools/build/feature/test-libslang-include-subdir.c deleted file mode 100644 index 3ea47ec7590e..000000000000 --- a/tools/build/feature/test-libslang-include-subdir.c +++ /dev/null @@ -1,7 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include <slang/slang.h> - -int main(void) -{ - return SLsmg_init_smg(); -} diff --git a/tools/include/linux/bitmap.h b/tools/include/linux/bitmap.h index d4d300040d01..0d992245c600 100644 --- a/tools/include/linux/bitmap.h +++ b/tools/include/linux/bitmap.h @@ -3,6 +3,7 @@ #define _TOOLS_LINUX_BITMAP_H #include <string.h> +#include <asm-generic/bitsperlong.h> #include <linux/align.h> #include <linux/bitops.h> #include <linux/find.h> diff --git a/tools/include/linux/gfp_types.h b/tools/include/linux/gfp_types.h index 5f9f1ed190a0..65db9349f905 100644 --- a/tools/include/linux/gfp_types.h +++ b/tools/include/linux/gfp_types.h @@ -1 +1,392 @@ -#include "../../../include/linux/gfp_types.h" +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_GFP_TYPES_H +#define __LINUX_GFP_TYPES_H + +#include <linux/bits.h> + +/* The typedef is in types.h but we want the documentation here */ +#if 0 +/** + * typedef gfp_t - Memory allocation flags. + * + * GFP flags are commonly used throughout Linux to indicate how memory + * should be allocated. The GFP acronym stands for get_free_pages(), + * the underlying memory allocation function. Not every GFP flag is + * supported by every function which may allocate memory. Most users + * will want to use a plain ``GFP_KERNEL``. + */ +typedef unsigned int __bitwise gfp_t; +#endif + +/* + * In case of changes, please don't forget to update + * include/trace/events/mmflags.h and tools/perf/builtin-kmem.c + */ + +enum { + ___GFP_DMA_BIT, + ___GFP_HIGHMEM_BIT, + ___GFP_DMA32_BIT, + ___GFP_MOVABLE_BIT, + ___GFP_RECLAIMABLE_BIT, + ___GFP_HIGH_BIT, + ___GFP_IO_BIT, + ___GFP_FS_BIT, + ___GFP_ZERO_BIT, + ___GFP_UNUSED_BIT, /* 0x200u unused */ + ___GFP_DIRECT_RECLAIM_BIT, + ___GFP_KSWAPD_RECLAIM_BIT, + ___GFP_WRITE_BIT, + ___GFP_NOWARN_BIT, + ___GFP_RETRY_MAYFAIL_BIT, + ___GFP_NOFAIL_BIT, + ___GFP_NORETRY_BIT, + ___GFP_MEMALLOC_BIT, + ___GFP_COMP_BIT, + ___GFP_NOMEMALLOC_BIT, + ___GFP_HARDWALL_BIT, + ___GFP_THISNODE_BIT, + ___GFP_ACCOUNT_BIT, + ___GFP_ZEROTAGS_BIT, +#ifdef CONFIG_KASAN_HW_TAGS + ___GFP_SKIP_ZERO_BIT, + ___GFP_SKIP_KASAN_BIT, +#endif +#ifdef CONFIG_LOCKDEP + ___GFP_NOLOCKDEP_BIT, +#endif +#ifdef CONFIG_SLAB_OBJ_EXT + ___GFP_NO_OBJ_EXT_BIT, +#endif + ___GFP_LAST_BIT +}; + +/* Plain integer GFP bitmasks. Do not use this directly. */ +#define ___GFP_DMA BIT(___GFP_DMA_BIT) +#define ___GFP_HIGHMEM BIT(___GFP_HIGHMEM_BIT) +#define ___GFP_DMA32 BIT(___GFP_DMA32_BIT) +#define ___GFP_MOVABLE BIT(___GFP_MOVABLE_BIT) +#define ___GFP_RECLAIMABLE BIT(___GFP_RECLAIMABLE_BIT) +#define ___GFP_HIGH BIT(___GFP_HIGH_BIT) +#define ___GFP_IO BIT(___GFP_IO_BIT) +#define ___GFP_FS BIT(___GFP_FS_BIT) +#define ___GFP_ZERO BIT(___GFP_ZERO_BIT) +/* 0x200u unused */ +#define ___GFP_DIRECT_RECLAIM BIT(___GFP_DIRECT_RECLAIM_BIT) +#define ___GFP_KSWAPD_RECLAIM BIT(___GFP_KSWAPD_RECLAIM_BIT) +#define ___GFP_WRITE BIT(___GFP_WRITE_BIT) +#define ___GFP_NOWARN BIT(___GFP_NOWARN_BIT) +#define ___GFP_RETRY_MAYFAIL BIT(___GFP_RETRY_MAYFAIL_BIT) +#define ___GFP_NOFAIL BIT(___GFP_NOFAIL_BIT) +#define ___GFP_NORETRY BIT(___GFP_NORETRY_BIT) +#define ___GFP_MEMALLOC BIT(___GFP_MEMALLOC_BIT) +#define ___GFP_COMP BIT(___GFP_COMP_BIT) +#define ___GFP_NOMEMALLOC BIT(___GFP_NOMEMALLOC_BIT) +#define ___GFP_HARDWALL BIT(___GFP_HARDWALL_BIT) +#define ___GFP_THISNODE BIT(___GFP_THISNODE_BIT) +#define ___GFP_ACCOUNT BIT(___GFP_ACCOUNT_BIT) +#define ___GFP_ZEROTAGS BIT(___GFP_ZEROTAGS_BIT) +#ifdef CONFIG_KASAN_HW_TAGS +#define ___GFP_SKIP_ZERO BIT(___GFP_SKIP_ZERO_BIT) +#define ___GFP_SKIP_KASAN BIT(___GFP_SKIP_KASAN_BIT) +#else +#define ___GFP_SKIP_ZERO 0 +#define ___GFP_SKIP_KASAN 0 +#endif +#ifdef CONFIG_LOCKDEP +#define ___GFP_NOLOCKDEP BIT(___GFP_NOLOCKDEP_BIT) +#else +#define ___GFP_NOLOCKDEP 0 +#endif +#ifdef CONFIG_SLAB_OBJ_EXT +#define ___GFP_NO_OBJ_EXT BIT(___GFP_NO_OBJ_EXT_BIT) +#else +#define ___GFP_NO_OBJ_EXT 0 +#endif + +/* + * Physical address zone modifiers (see linux/mmzone.h - low four bits) + * + * Do not put any conditional on these. If necessary modify the definitions + * without the underscores and use them consistently. The definitions here may + * be used in bit comparisons. + */ +#define __GFP_DMA ((__force gfp_t)___GFP_DMA) +#define __GFP_HIGHMEM ((__force gfp_t)___GFP_HIGHMEM) +#define __GFP_DMA32 ((__force gfp_t)___GFP_DMA32) +#define __GFP_MOVABLE ((__force gfp_t)___GFP_MOVABLE) /* ZONE_MOVABLE allowed */ +#define GFP_ZONEMASK (__GFP_DMA|__GFP_HIGHMEM|__GFP_DMA32|__GFP_MOVABLE) + +/** + * DOC: Page mobility and placement hints + * + * Page mobility and placement hints + * --------------------------------- + * + * These flags provide hints about how mobile the page is. Pages with similar + * mobility are placed within the same pageblocks to minimise problems due + * to external fragmentation. + * + * %__GFP_MOVABLE (also a zone modifier) indicates that the page can be + * moved by page migration during memory compaction or can be reclaimed. + * + * %__GFP_RECLAIMABLE is used for slab allocations that specify + * SLAB_RECLAIM_ACCOUNT and whose pages can be freed via shrinkers. + * + * %__GFP_WRITE indicates the caller intends to dirty the page. Where possible, + * these pages will be spread between local zones to avoid all the dirty + * pages being in one zone (fair zone allocation policy). + * + * %__GFP_HARDWALL enforces the cpuset memory allocation policy. + * + * %__GFP_THISNODE forces the allocation to be satisfied from the requested + * node with no fallbacks or placement policy enforcements. + * + * %__GFP_ACCOUNT causes the allocation to be accounted to kmemcg. + * + * %__GFP_NO_OBJ_EXT causes slab allocation to have no object extension. + */ +#define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) +#define __GFP_WRITE ((__force gfp_t)___GFP_WRITE) +#define __GFP_HARDWALL ((__force gfp_t)___GFP_HARDWALL) +#define __GFP_THISNODE ((__force gfp_t)___GFP_THISNODE) +#define __GFP_ACCOUNT ((__force gfp_t)___GFP_ACCOUNT) +#define __GFP_NO_OBJ_EXT ((__force gfp_t)___GFP_NO_OBJ_EXT) + +/** + * DOC: Watermark modifiers + * + * Watermark modifiers -- controls access to emergency reserves + * ------------------------------------------------------------ + * + * %__GFP_HIGH indicates that the caller is high-priority and that granting + * the request is necessary before the system can make forward progress. + * For example creating an IO context to clean pages and requests + * from atomic context. + * + * %__GFP_MEMALLOC allows access to all memory. This should only be used when + * the caller guarantees the allocation will allow more memory to be freed + * very shortly e.g. process exiting or swapping. Users either should + * be the MM or co-ordinating closely with the VM (e.g. swap over NFS). + * Users of this flag have to be extremely careful to not deplete the reserve + * completely and implement a throttling mechanism which controls the + * consumption of the reserve based on the amount of freed memory. + * Usage of a pre-allocated pool (e.g. mempool) should be always considered + * before using this flag. + * + * %__GFP_NOMEMALLOC is used to explicitly forbid access to emergency reserves. + * This takes precedence over the %__GFP_MEMALLOC flag if both are set. + */ +#define __GFP_HIGH ((__force gfp_t)___GFP_HIGH) +#define __GFP_MEMALLOC ((__force gfp_t)___GFP_MEMALLOC) +#define __GFP_NOMEMALLOC ((__force gfp_t)___GFP_NOMEMALLOC) + +/** + * DOC: Reclaim modifiers + * + * Reclaim modifiers + * ----------------- + * Please note that all the following flags are only applicable to sleepable + * allocations (e.g. %GFP_NOWAIT and %GFP_ATOMIC will ignore them). + * + * %__GFP_IO can start physical IO. + * + * %__GFP_FS can call down to the low-level FS. Clearing the flag avoids the + * allocator recursing into the filesystem which might already be holding + * locks. + * + * %__GFP_DIRECT_RECLAIM indicates that the caller may enter direct reclaim. + * This flag can be cleared to avoid unnecessary delays when a fallback + * option is available. + * + * %__GFP_KSWAPD_RECLAIM indicates that the caller wants to wake kswapd when + * the low watermark is reached and have it reclaim pages until the high + * watermark is reached. A caller may wish to clear this flag when fallback + * options are available and the reclaim is likely to disrupt the system. The + * canonical example is THP allocation where a fallback is cheap but + * reclaim/compaction may cause indirect stalls. + * + * %__GFP_RECLAIM is shorthand to allow/forbid both direct and kswapd reclaim. + * + * The default allocator behavior depends on the request size. We have a concept + * of so-called costly allocations (with order > %PAGE_ALLOC_COSTLY_ORDER). + * !costly allocations are too essential to fail so they are implicitly + * non-failing by default (with some exceptions like OOM victims might fail so + * the caller still has to check for failures) while costly requests try to be + * not disruptive and back off even without invoking the OOM killer. + * The following three modifiers might be used to override some of these + * implicit rules. Please note that all of them must be used along with + * %__GFP_DIRECT_RECLAIM flag. + * + * %__GFP_NORETRY: The VM implementation will try only very lightweight + * memory direct reclaim to get some memory under memory pressure (thus + * it can sleep). It will avoid disruptive actions like OOM killer. The + * caller must handle the failure which is quite likely to happen under + * heavy memory pressure. The flag is suitable when failure can easily be + * handled at small cost, such as reduced throughput. + * + * %__GFP_RETRY_MAYFAIL: The VM implementation will retry memory reclaim + * procedures that have previously failed if there is some indication + * that progress has been made elsewhere. It can wait for other + * tasks to attempt high-level approaches to freeing memory such as + * compaction (which removes fragmentation) and page-out. + * There is still a definite limit to the number of retries, but it is + * a larger limit than with %__GFP_NORETRY. + * Allocations with this flag may fail, but only when there is + * genuinely little unused memory. While these allocations do not + * directly trigger the OOM killer, their failure indicates that + * the system is likely to need to use the OOM killer soon. The + * caller must handle failure, but can reasonably do so by failing + * a higher-level request, or completing it only in a much less + * efficient manner. + * If the allocation does fail, and the caller is in a position to + * free some non-essential memory, doing so could benefit the system + * as a whole. + * + * %__GFP_NOFAIL: The VM implementation _must_ retry infinitely: the caller + * cannot handle allocation failures. The allocation could block + * indefinitely but will never return with failure. Testing for + * failure is pointless. + * It _must_ be blockable and used together with __GFP_DIRECT_RECLAIM. + * It should _never_ be used in non-sleepable contexts. + * New users should be evaluated carefully (and the flag should be + * used only when there is no reasonable failure policy) but it is + * definitely preferable to use the flag rather than opencode endless + * loop around allocator. + * Allocating pages from the buddy with __GFP_NOFAIL and order > 1 is + * not supported. Please consider using kvmalloc() instead. + */ +#define __GFP_IO ((__force gfp_t)___GFP_IO) +#define __GFP_FS ((__force gfp_t)___GFP_FS) +#define __GFP_DIRECT_RECLAIM ((__force gfp_t)___GFP_DIRECT_RECLAIM) /* Caller can reclaim */ +#define __GFP_KSWAPD_RECLAIM ((__force gfp_t)___GFP_KSWAPD_RECLAIM) /* kswapd can wake */ +#define __GFP_RECLAIM ((__force gfp_t)(___GFP_DIRECT_RECLAIM|___GFP_KSWAPD_RECLAIM)) +#define __GFP_RETRY_MAYFAIL ((__force gfp_t)___GFP_RETRY_MAYFAIL) +#define __GFP_NOFAIL ((__force gfp_t)___GFP_NOFAIL) +#define __GFP_NORETRY ((__force gfp_t)___GFP_NORETRY) + +/** + * DOC: Action modifiers + * + * Action modifiers + * ---------------- + * + * %__GFP_NOWARN suppresses allocation failure reports. + * + * %__GFP_COMP address compound page metadata. + * + * %__GFP_ZERO returns a zeroed page on success. + * + * %__GFP_ZEROTAGS zeroes memory tags at allocation time if the memory itself + * is being zeroed (either via __GFP_ZERO or via init_on_alloc, provided that + * __GFP_SKIP_ZERO is not set). This flag is intended for optimization: setting + * memory tags at the same time as zeroing memory has minimal additional + * performance impact. + * + * %__GFP_SKIP_KASAN makes KASAN skip unpoisoning on page allocation. + * Used for userspace and vmalloc pages; the latter are unpoisoned by + * kasan_unpoison_vmalloc instead. For userspace pages, results in + * poisoning being skipped as well, see should_skip_kasan_poison for + * details. Only effective in HW_TAGS mode. + */ +#define __GFP_NOWARN ((__force gfp_t)___GFP_NOWARN) +#define __GFP_COMP ((__force gfp_t)___GFP_COMP) +#define __GFP_ZERO ((__force gfp_t)___GFP_ZERO) +#define __GFP_ZEROTAGS ((__force gfp_t)___GFP_ZEROTAGS) +#define __GFP_SKIP_ZERO ((__force gfp_t)___GFP_SKIP_ZERO) +#define __GFP_SKIP_KASAN ((__force gfp_t)___GFP_SKIP_KASAN) + +/* Disable lockdep for GFP context tracking */ +#define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) + +/* Room for N __GFP_FOO bits */ +#define __GFP_BITS_SHIFT ___GFP_LAST_BIT +#define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) + +/** + * DOC: Useful GFP flag combinations + * + * Useful GFP flag combinations + * ---------------------------- + * + * Useful GFP flag combinations that are commonly used. It is recommended + * that subsystems start with one of these combinations and then set/clear + * %__GFP_FOO flags as necessary. + * + * %GFP_ATOMIC users can not sleep and need the allocation to succeed. A lower + * watermark is applied to allow access to "atomic reserves". + * The current implementation doesn't support NMI and few other strict + * non-preemptive contexts (e.g. raw_spin_lock). The same applies to %GFP_NOWAIT. + * + * %GFP_KERNEL is typical for kernel-internal allocations. The caller requires + * %ZONE_NORMAL or a lower zone for direct access but can direct reclaim. + * + * %GFP_KERNEL_ACCOUNT is the same as GFP_KERNEL, except the allocation is + * accounted to kmemcg. + * + * %GFP_NOWAIT is for kernel allocations that should not stall for direct + * reclaim, start physical IO or use any filesystem callback. It is very + * likely to fail to allocate memory, even for very small allocations. + * + * %GFP_NOIO will use direct reclaim to discard clean pages or slab pages + * that do not require the starting of any physical IO. + * Please try to avoid using this flag directly and instead use + * memalloc_noio_{save,restore} to mark the whole scope which cannot + * perform any IO with a short explanation why. All allocation requests + * will inherit GFP_NOIO implicitly. + * + * %GFP_NOFS will use direct reclaim but will not use any filesystem interfaces. + * Please try to avoid using this flag directly and instead use + * memalloc_nofs_{save,restore} to mark the whole scope which cannot/shouldn't + * recurse into the FS layer with a short explanation why. All allocation + * requests will inherit GFP_NOFS implicitly. + * + * %GFP_USER is for userspace allocations that also need to be directly + * accessibly by the kernel or hardware. It is typically used by hardware + * for buffers that are mapped to userspace (e.g. graphics) that hardware + * still must DMA to. cpuset limits are enforced for these allocations. + * + * %GFP_DMA exists for historical reasons and should be avoided where possible. + * The flags indicates that the caller requires that the lowest zone be + * used (%ZONE_DMA or 16M on x86-64). Ideally, this would be removed but + * it would require careful auditing as some users really require it and + * others use the flag to avoid lowmem reserves in %ZONE_DMA and treat the + * lowest zone as a type of emergency reserve. + * + * %GFP_DMA32 is similar to %GFP_DMA except that the caller requires a 32-bit + * address. Note that kmalloc(..., GFP_DMA32) does not return DMA32 memory + * because the DMA32 kmalloc cache array is not implemented. + * (Reason: there is no such user in kernel). + * + * %GFP_HIGHUSER is for userspace allocations that may be mapped to userspace, + * do not need to be directly accessible by the kernel but that cannot + * move once in use. An example may be a hardware allocation that maps + * data directly into userspace but has no addressing limitations. + * + * %GFP_HIGHUSER_MOVABLE is for userspace allocations that the kernel does not + * need direct access to but can use kmap() when access is required. They + * are expected to be movable via page reclaim or page migration. Typically, + * pages on the LRU would also be allocated with %GFP_HIGHUSER_MOVABLE. + * + * %GFP_TRANSHUGE and %GFP_TRANSHUGE_LIGHT are used for THP allocations. They + * are compound allocations that will generally fail quickly if memory is not + * available and will not wake kswapd/kcompactd on failure. The _LIGHT + * version does not attempt reclaim/compaction at all and is by default used + * in page fault path, while the non-light is used by khugepaged. + */ +#define GFP_ATOMIC (__GFP_HIGH|__GFP_KSWAPD_RECLAIM) +#define GFP_KERNEL (__GFP_RECLAIM | __GFP_IO | __GFP_FS) +#define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT) +#define GFP_NOWAIT (__GFP_KSWAPD_RECLAIM | __GFP_NOWARN) +#define GFP_NOIO (__GFP_RECLAIM) +#define GFP_NOFS (__GFP_RECLAIM | __GFP_IO) +#define GFP_USER (__GFP_RECLAIM | __GFP_IO | __GFP_FS | __GFP_HARDWALL) +#define GFP_DMA __GFP_DMA +#define GFP_DMA32 __GFP_DMA32 +#define GFP_HIGHUSER (GFP_USER | __GFP_HIGHMEM) +#define GFP_HIGHUSER_MOVABLE (GFP_HIGHUSER | __GFP_MOVABLE | __GFP_SKIP_KASAN) +#define GFP_TRANSHUGE_LIGHT ((GFP_HIGHUSER_MOVABLE | __GFP_COMP | \ + __GFP_NOMEMALLOC | __GFP_NOWARN) & ~__GFP_RECLAIM) +#define GFP_TRANSHUGE (GFP_TRANSHUGE_LIGHT | __GFP_DIRECT_RECLAIM) + +#endif /* __LINUX_GFP_TYPES_H */ diff --git a/tools/include/uapi/linux/genetlink.h b/tools/include/uapi/linux/genetlink.h new file mode 100644 index 000000000000..ddba3ca01e39 --- /dev/null +++ b/tools/include/uapi/linux/genetlink.h @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI__LINUX_GENERIC_NETLINK_H +#define _UAPI__LINUX_GENERIC_NETLINK_H + +#include <linux/types.h> +#include <linux/netlink.h> + +#define GENL_NAMSIZ 16 /* length of family name */ + +#define GENL_MIN_ID NLMSG_MIN_TYPE +#define GENL_MAX_ID 1023 + +struct genlmsghdr { + __u8 cmd; + __u8 version; + __u16 reserved; +}; + +#define GENL_HDRLEN NLMSG_ALIGN(sizeof(struct genlmsghdr)) + +#define GENL_ADMIN_PERM 0x01 +#define GENL_CMD_CAP_DO 0x02 +#define GENL_CMD_CAP_DUMP 0x04 +#define GENL_CMD_CAP_HASPOL 0x08 +#define GENL_UNS_ADMIN_PERM 0x10 + +/* + * List of reserved static generic netlink identifiers: + */ +#define GENL_ID_CTRL NLMSG_MIN_TYPE +#define GENL_ID_VFS_DQUOT (NLMSG_MIN_TYPE + 1) +#define GENL_ID_PMCRAID (NLMSG_MIN_TYPE + 2) +/* must be last reserved + 1 */ +#define GENL_START_ALLOC (NLMSG_MIN_TYPE + 3) + +/************************************************************************** + * Controller + **************************************************************************/ + +enum { + CTRL_CMD_UNSPEC, + CTRL_CMD_NEWFAMILY, + CTRL_CMD_DELFAMILY, + CTRL_CMD_GETFAMILY, + CTRL_CMD_NEWOPS, + CTRL_CMD_DELOPS, + CTRL_CMD_GETOPS, + CTRL_CMD_NEWMCAST_GRP, + CTRL_CMD_DELMCAST_GRP, + CTRL_CMD_GETMCAST_GRP, /* unused */ + CTRL_CMD_GETPOLICY, + __CTRL_CMD_MAX, +}; + +#define CTRL_CMD_MAX (__CTRL_CMD_MAX - 1) + +enum { + CTRL_ATTR_UNSPEC, + CTRL_ATTR_FAMILY_ID, + CTRL_ATTR_FAMILY_NAME, + CTRL_ATTR_VERSION, + CTRL_ATTR_HDRSIZE, + CTRL_ATTR_MAXATTR, + CTRL_ATTR_OPS, + CTRL_ATTR_MCAST_GROUPS, + CTRL_ATTR_POLICY, + CTRL_ATTR_OP_POLICY, + CTRL_ATTR_OP, + __CTRL_ATTR_MAX, +}; + +#define CTRL_ATTR_MAX (__CTRL_ATTR_MAX - 1) + +enum { + CTRL_ATTR_OP_UNSPEC, + CTRL_ATTR_OP_ID, + CTRL_ATTR_OP_FLAGS, + __CTRL_ATTR_OP_MAX, +}; + +#define CTRL_ATTR_OP_MAX (__CTRL_ATTR_OP_MAX - 1) + +enum { + CTRL_ATTR_MCAST_GRP_UNSPEC, + CTRL_ATTR_MCAST_GRP_NAME, + CTRL_ATTR_MCAST_GRP_ID, + __CTRL_ATTR_MCAST_GRP_MAX, +}; + +#define CTRL_ATTR_MCAST_GRP_MAX (__CTRL_ATTR_MCAST_GRP_MAX - 1) + +enum { + CTRL_ATTR_POLICY_UNSPEC, + CTRL_ATTR_POLICY_DO, + CTRL_ATTR_POLICY_DUMP, + + __CTRL_ATTR_POLICY_DUMP_MAX, + CTRL_ATTR_POLICY_DUMP_MAX = __CTRL_ATTR_POLICY_DUMP_MAX - 1 +}; + +#define CTRL_ATTR_POLICY_MAX (__CTRL_ATTR_POLICY_DUMP_MAX - 1) + +#endif /* _UAPI__LINUX_GENERIC_NETLINK_H */ diff --git a/tools/include/uapi/linux/if_addr.h b/tools/include/uapi/linux/if_addr.h new file mode 100644 index 000000000000..aa7958b4e41d --- /dev/null +++ b/tools/include/uapi/linux/if_addr.h @@ -0,0 +1,79 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI__LINUX_IF_ADDR_H +#define _UAPI__LINUX_IF_ADDR_H + +#include <linux/types.h> +#include <linux/netlink.h> + +struct ifaddrmsg { + __u8 ifa_family; + __u8 ifa_prefixlen; /* The prefix length */ + __u8 ifa_flags; /* Flags */ + __u8 ifa_scope; /* Address scope */ + __u32 ifa_index; /* Link index */ +}; + +/* + * Important comment: + * IFA_ADDRESS is prefix address, rather than local interface address. + * It makes no difference for normally configured broadcast interfaces, + * but for point-to-point IFA_ADDRESS is DESTINATION address, + * local address is supplied in IFA_LOCAL attribute. + * + * IFA_FLAGS is a u32 attribute that extends the u8 field ifa_flags. + * If present, the value from struct ifaddrmsg will be ignored. + */ +enum { + IFA_UNSPEC, + IFA_ADDRESS, + IFA_LOCAL, + IFA_LABEL, + IFA_BROADCAST, + IFA_ANYCAST, + IFA_CACHEINFO, + IFA_MULTICAST, + IFA_FLAGS, + IFA_RT_PRIORITY, /* u32, priority/metric for prefix route */ + IFA_TARGET_NETNSID, + IFA_PROTO, /* u8, address protocol */ + __IFA_MAX, +}; + +#define IFA_MAX (__IFA_MAX - 1) + +/* ifa_flags */ +#define IFA_F_SECONDARY 0x01 +#define IFA_F_TEMPORARY IFA_F_SECONDARY + +#define IFA_F_NODAD 0x02 +#define IFA_F_OPTIMISTIC 0x04 +#define IFA_F_DADFAILED 0x08 +#define IFA_F_HOMEADDRESS 0x10 +#define IFA_F_DEPRECATED 0x20 +#define IFA_F_TENTATIVE 0x40 +#define IFA_F_PERMANENT 0x80 +#define IFA_F_MANAGETEMPADDR 0x100 +#define IFA_F_NOPREFIXROUTE 0x200 +#define IFA_F_MCAUTOJOIN 0x400 +#define IFA_F_STABLE_PRIVACY 0x800 + +struct ifa_cacheinfo { + __u32 ifa_prefered; + __u32 ifa_valid; + __u32 cstamp; /* created timestamp, hundredths of seconds */ + __u32 tstamp; /* updated timestamp, hundredths of seconds */ +}; + +/* backwards compatibility for userspace */ +#ifndef __KERNEL__ +#define IFA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct ifaddrmsg)))) +#define IFA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct ifaddrmsg)) +#endif + +/* ifa_proto */ +#define IFAPROT_UNSPEC 0 +#define IFAPROT_KERNEL_LO 1 /* loopback */ +#define IFAPROT_KERNEL_RA 2 /* set by kernel from router announcement */ +#define IFAPROT_KERNEL_LL 3 /* link-local set by kernel */ + +#endif diff --git a/tools/include/uapi/linux/neighbour.h b/tools/include/uapi/linux/neighbour.h new file mode 100644 index 000000000000..c34a81245f87 --- /dev/null +++ b/tools/include/uapi/linux/neighbour.h @@ -0,0 +1,229 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI__LINUX_NEIGHBOUR_H +#define _UAPI__LINUX_NEIGHBOUR_H + +#include <linux/types.h> +#include <linux/netlink.h> + +struct ndmsg { + __u8 ndm_family; + __u8 ndm_pad1; + __u16 ndm_pad2; + __s32 ndm_ifindex; + __u16 ndm_state; + __u8 ndm_flags; + __u8 ndm_type; +}; + +enum { + NDA_UNSPEC, + NDA_DST, + NDA_LLADDR, + NDA_CACHEINFO, + NDA_PROBES, + NDA_VLAN, + NDA_PORT, + NDA_VNI, + NDA_IFINDEX, + NDA_MASTER, + NDA_LINK_NETNSID, + NDA_SRC_VNI, + NDA_PROTOCOL, /* Originator of entry */ + NDA_NH_ID, + NDA_FDB_EXT_ATTRS, + NDA_FLAGS_EXT, + NDA_NDM_STATE_MASK, + NDA_NDM_FLAGS_MASK, + __NDA_MAX +}; + +#define NDA_MAX (__NDA_MAX - 1) + +/* + * Neighbor Cache Entry Flags + */ + +#define NTF_USE (1 << 0) +#define NTF_SELF (1 << 1) +#define NTF_MASTER (1 << 2) +#define NTF_PROXY (1 << 3) /* == ATF_PUBL */ +#define NTF_EXT_LEARNED (1 << 4) +#define NTF_OFFLOADED (1 << 5) +#define NTF_STICKY (1 << 6) +#define NTF_ROUTER (1 << 7) +/* Extended flags under NDA_FLAGS_EXT: */ +#define NTF_EXT_MANAGED (1 << 0) +#define NTF_EXT_LOCKED (1 << 1) +#define NTF_EXT_EXT_VALIDATED (1 << 2) + +/* + * Neighbor Cache Entry States. + */ + +#define NUD_INCOMPLETE 0x01 +#define NUD_REACHABLE 0x02 +#define NUD_STALE 0x04 +#define NUD_DELAY 0x08 +#define NUD_PROBE 0x10 +#define NUD_FAILED 0x20 + +/* Dummy states */ +#define NUD_NOARP 0x40 +#define NUD_PERMANENT 0x80 +#define NUD_NONE 0x00 + +/* NUD_NOARP & NUD_PERMANENT are pseudostates, they never change and make no + * address resolution or NUD. + * + * NUD_PERMANENT also cannot be deleted by garbage collectors. This holds true + * for dynamic entries with NTF_EXT_LEARNED flag as well. However, upon carrier + * down event, NUD_PERMANENT entries are not flushed whereas NTF_EXT_LEARNED + * flagged entries explicitly are (which is also consistent with the routing + * subsystem). + * + * When NTF_EXT_LEARNED is set for a bridge fdb entry the different cache entry + * states don't make sense and thus are ignored. Such entries don't age and + * can roam. + * + * NTF_EXT_MANAGED flagged neigbor entries are managed by the kernel on behalf + * of a user space control plane, and automatically refreshed so that (if + * possible) they remain in NUD_REACHABLE state. + * + * NTF_EXT_LOCKED flagged bridge FDB entries are entries generated by the + * bridge in response to a host trying to communicate via a locked bridge port + * with MAB enabled. Their purpose is to notify user space that a host requires + * authentication. + * + * NTF_EXT_EXT_VALIDATED flagged neighbor entries were externally validated by + * a user space control plane. The kernel will not remove or invalidate them, + * but it can probe them and notify user space when they become reachable. + */ + +struct nda_cacheinfo { + __u32 ndm_confirmed; + __u32 ndm_used; + __u32 ndm_updated; + __u32 ndm_refcnt; +}; + +/***************************************************************** + * Neighbour tables specific messages. + * + * To retrieve the neighbour tables send RTM_GETNEIGHTBL with the + * NLM_F_DUMP flag set. Every neighbour table configuration is + * spread over multiple messages to avoid running into message + * size limits on systems with many interfaces. The first message + * in the sequence transports all not device specific data such as + * statistics, configuration, and the default parameter set. + * This message is followed by 0..n messages carrying device + * specific parameter sets. + * Although the ordering should be sufficient, NDTA_NAME can be + * used to identify sequences. The initial message can be identified + * by checking for NDTA_CONFIG. The device specific messages do + * not contain this TLV but have NDTPA_IFINDEX set to the + * corresponding interface index. + * + * To change neighbour table attributes, send RTM_SETNEIGHTBL + * with NDTA_NAME set. Changeable attribute include NDTA_THRESH[1-3], + * NDTA_GC_INTERVAL, and all TLVs in NDTA_PARMS unless marked + * otherwise. Device specific parameter sets can be changed by + * setting NDTPA_IFINDEX to the interface index of the corresponding + * device. + ****/ + +struct ndt_stats { + __u64 ndts_allocs; + __u64 ndts_destroys; + __u64 ndts_hash_grows; + __u64 ndts_res_failed; + __u64 ndts_lookups; + __u64 ndts_hits; + __u64 ndts_rcv_probes_mcast; + __u64 ndts_rcv_probes_ucast; + __u64 ndts_periodic_gc_runs; + __u64 ndts_forced_gc_runs; + __u64 ndts_table_fulls; +}; + +enum { + NDTPA_UNSPEC, + NDTPA_IFINDEX, /* u32, unchangeable */ + NDTPA_REFCNT, /* u32, read-only */ + NDTPA_REACHABLE_TIME, /* u64, read-only, msecs */ + NDTPA_BASE_REACHABLE_TIME, /* u64, msecs */ + NDTPA_RETRANS_TIME, /* u64, msecs */ + NDTPA_GC_STALETIME, /* u64, msecs */ + NDTPA_DELAY_PROBE_TIME, /* u64, msecs */ + NDTPA_QUEUE_LEN, /* u32 */ + NDTPA_APP_PROBES, /* u32 */ + NDTPA_UCAST_PROBES, /* u32 */ + NDTPA_MCAST_PROBES, /* u32 */ + NDTPA_ANYCAST_DELAY, /* u64, msecs */ + NDTPA_PROXY_DELAY, /* u64, msecs */ + NDTPA_PROXY_QLEN, /* u32 */ + NDTPA_LOCKTIME, /* u64, msecs */ + NDTPA_QUEUE_LENBYTES, /* u32 */ + NDTPA_MCAST_REPROBES, /* u32 */ + NDTPA_PAD, + NDTPA_INTERVAL_PROBE_TIME_MS, /* u64, msecs */ + __NDTPA_MAX +}; +#define NDTPA_MAX (__NDTPA_MAX - 1) + +struct ndtmsg { + __u8 ndtm_family; + __u8 ndtm_pad1; + __u16 ndtm_pad2; +}; + +struct ndt_config { + __u16 ndtc_key_len; + __u16 ndtc_entry_size; + __u32 ndtc_entries; + __u32 ndtc_last_flush; /* delta to now in msecs */ + __u32 ndtc_last_rand; /* delta to now in msecs */ + __u32 ndtc_hash_rnd; + __u32 ndtc_hash_mask; + __u32 ndtc_hash_chain_gc; + __u32 ndtc_proxy_qlen; +}; + +enum { + NDTA_UNSPEC, + NDTA_NAME, /* char *, unchangeable */ + NDTA_THRESH1, /* u32 */ + NDTA_THRESH2, /* u32 */ + NDTA_THRESH3, /* u32 */ + NDTA_CONFIG, /* struct ndt_config, read-only */ + NDTA_PARMS, /* nested TLV NDTPA_* */ + NDTA_STATS, /* struct ndt_stats, read-only */ + NDTA_GC_INTERVAL, /* u64, msecs */ + NDTA_PAD, + __NDTA_MAX +}; +#define NDTA_MAX (__NDTA_MAX - 1) + + /* FDB activity notification bits used in NFEA_ACTIVITY_NOTIFY: + * - FDB_NOTIFY_BIT - notify on activity/expire for any entry + * - FDB_NOTIFY_INACTIVE_BIT - mark as inactive to avoid multiple notifications + */ +enum { + FDB_NOTIFY_BIT = (1 << 0), + FDB_NOTIFY_INACTIVE_BIT = (1 << 1) +}; + +/* embedded into NDA_FDB_EXT_ATTRS: + * [NDA_FDB_EXT_ATTRS] = { + * [NFEA_ACTIVITY_NOTIFY] + * ... + * } + */ +enum { + NFEA_UNSPEC, + NFEA_ACTIVITY_NOTIFY, + NFEA_DONT_REFRESH, + __NFEA_MAX +}; +#define NFEA_MAX (__NFEA_MAX - 1) + +#endif diff --git a/tools/include/uapi/linux/netfilter.h b/tools/include/uapi/linux/netfilter.h new file mode 100644 index 000000000000..5a79ccb76701 --- /dev/null +++ b/tools/include/uapi/linux/netfilter.h @@ -0,0 +1,80 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI__LINUX_NETFILTER_H +#define _UAPI__LINUX_NETFILTER_H + +#include <linux/types.h> +#include <linux/compiler.h> +#include <linux/in.h> +#include <linux/in6.h> + +/* Responses from hook functions. */ +#define NF_DROP 0 +#define NF_ACCEPT 1 +#define NF_STOLEN 2 +#define NF_QUEUE 3 +#define NF_REPEAT 4 +#define NF_STOP 5 /* Deprecated, for userspace nf_queue compatibility. */ +#define NF_MAX_VERDICT NF_STOP + +/* we overload the higher bits for encoding auxiliary data such as the queue + * number or errno values. Not nice, but better than additional function + * arguments. */ +#define NF_VERDICT_MASK 0x000000ff + +/* extra verdict flags have mask 0x0000ff00 */ +#define NF_VERDICT_FLAG_QUEUE_BYPASS 0x00008000 + +/* queue number (NF_QUEUE) or errno (NF_DROP) */ +#define NF_VERDICT_QMASK 0xffff0000 +#define NF_VERDICT_QBITS 16 + +#define NF_QUEUE_NR(x) ((((x) << 16) & NF_VERDICT_QMASK) | NF_QUEUE) + +#define NF_DROP_ERR(x) (((-x) << 16) | NF_DROP) + +/* only for userspace compatibility */ +#ifndef __KERNEL__ + +/* NF_VERDICT_BITS should be 8 now, but userspace might break if this changes */ +#define NF_VERDICT_BITS 16 +#endif + +enum nf_inet_hooks { + NF_INET_PRE_ROUTING, + NF_INET_LOCAL_IN, + NF_INET_FORWARD, + NF_INET_LOCAL_OUT, + NF_INET_POST_ROUTING, + NF_INET_NUMHOOKS, + NF_INET_INGRESS = NF_INET_NUMHOOKS, +}; + +enum nf_dev_hooks { + NF_NETDEV_INGRESS, + NF_NETDEV_EGRESS, + NF_NETDEV_NUMHOOKS +}; + +enum { + NFPROTO_UNSPEC = 0, + NFPROTO_INET = 1, + NFPROTO_IPV4 = 2, + NFPROTO_ARP = 3, + NFPROTO_NETDEV = 5, + NFPROTO_BRIDGE = 7, + NFPROTO_IPV6 = 10, +#ifndef __KERNEL__ /* no longer supported by kernel */ + NFPROTO_DECNET = 12, +#endif + NFPROTO_NUMPROTO, +}; + +union nf_inet_addr { + __u32 all[4]; + __be32 ip; + __be32 ip6[4]; + struct in_addr in; + struct in6_addr in6; +}; + +#endif /* _UAPI__LINUX_NETFILTER_H */ diff --git a/tools/include/uapi/linux/netfilter_arp.h b/tools/include/uapi/linux/netfilter_arp.h new file mode 100644 index 000000000000..791dfc5ae907 --- /dev/null +++ b/tools/include/uapi/linux/netfilter_arp.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-1.0+ WITH Linux-syscall-note */ +#ifndef __LINUX_ARP_NETFILTER_H +#define __LINUX_ARP_NETFILTER_H + +/* ARP-specific defines for netfilter. + * (C)2002 Rusty Russell IBM -- This code is GPL. + */ + +#include <linux/netfilter.h> + +/* There is no PF_ARP. */ +#define NF_ARP 0 + +/* ARP Hooks */ +#define NF_ARP_IN 0 +#define NF_ARP_OUT 1 +#define NF_ARP_FORWARD 2 + +#ifndef __KERNEL__ +#define NF_ARP_NUMHOOKS 3 +#endif + +#endif /* __LINUX_ARP_NETFILTER_H */ diff --git a/tools/include/uapi/linux/rtnetlink.h b/tools/include/uapi/linux/rtnetlink.h new file mode 100644 index 000000000000..dab9493c791b --- /dev/null +++ b/tools/include/uapi/linux/rtnetlink.h @@ -0,0 +1,848 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +#ifndef _UAPI__LINUX_RTNETLINK_H +#define _UAPI__LINUX_RTNETLINK_H + +#include <linux/types.h> +#include <linux/netlink.h> +#include <linux/if_link.h> +#include <linux/if_addr.h> +#include <linux/neighbour.h> + +/* rtnetlink families. Values up to 127 are reserved for real address + * families, values above 128 may be used arbitrarily. + */ +#define RTNL_FAMILY_IPMR 128 +#define RTNL_FAMILY_IP6MR 129 +#define RTNL_FAMILY_MAX 129 + +/**** + * Routing/neighbour discovery messages. + ****/ + +/* Types of messages */ + +enum { + RTM_BASE = 16, +#define RTM_BASE RTM_BASE + + RTM_NEWLINK = 16, +#define RTM_NEWLINK RTM_NEWLINK + RTM_DELLINK, +#define RTM_DELLINK RTM_DELLINK + RTM_GETLINK, +#define RTM_GETLINK RTM_GETLINK + RTM_SETLINK, +#define RTM_SETLINK RTM_SETLINK + + RTM_NEWADDR = 20, +#define RTM_NEWADDR RTM_NEWADDR + RTM_DELADDR, +#define RTM_DELADDR RTM_DELADDR + RTM_GETADDR, +#define RTM_GETADDR RTM_GETADDR + + RTM_NEWROUTE = 24, +#define RTM_NEWROUTE RTM_NEWROUTE + RTM_DELROUTE, +#define RTM_DELROUTE RTM_DELROUTE + RTM_GETROUTE, +#define RTM_GETROUTE RTM_GETROUTE + + RTM_NEWNEIGH = 28, +#define RTM_NEWNEIGH RTM_NEWNEIGH + RTM_DELNEIGH, +#define RTM_DELNEIGH RTM_DELNEIGH + RTM_GETNEIGH, +#define RTM_GETNEIGH RTM_GETNEIGH + + RTM_NEWRULE = 32, +#define RTM_NEWRULE RTM_NEWRULE + RTM_DELRULE, +#define RTM_DELRULE RTM_DELRULE + RTM_GETRULE, +#define RTM_GETRULE RTM_GETRULE + + RTM_NEWQDISC = 36, +#define RTM_NEWQDISC RTM_NEWQDISC + RTM_DELQDISC, +#define RTM_DELQDISC RTM_DELQDISC + RTM_GETQDISC, +#define RTM_GETQDISC RTM_GETQDISC + + RTM_NEWTCLASS = 40, +#define RTM_NEWTCLASS RTM_NEWTCLASS + RTM_DELTCLASS, +#define RTM_DELTCLASS RTM_DELTCLASS + RTM_GETTCLASS, +#define RTM_GETTCLASS RTM_GETTCLASS + + RTM_NEWTFILTER = 44, +#define RTM_NEWTFILTER RTM_NEWTFILTER + RTM_DELTFILTER, +#define RTM_DELTFILTER RTM_DELTFILTER + RTM_GETTFILTER, +#define RTM_GETTFILTER RTM_GETTFILTER + + RTM_NEWACTION = 48, +#define RTM_NEWACTION RTM_NEWACTION + RTM_DELACTION, +#define RTM_DELACTION RTM_DELACTION + RTM_GETACTION, +#define RTM_GETACTION RTM_GETACTION + + RTM_NEWPREFIX = 52, +#define RTM_NEWPREFIX RTM_NEWPREFIX + + RTM_NEWMULTICAST = 56, +#define RTM_NEWMULTICAST RTM_NEWMULTICAST + RTM_DELMULTICAST, +#define RTM_DELMULTICAST RTM_DELMULTICAST + RTM_GETMULTICAST, +#define RTM_GETMULTICAST RTM_GETMULTICAST + + RTM_NEWANYCAST = 60, +#define RTM_NEWANYCAST RTM_NEWANYCAST + RTM_DELANYCAST, +#define RTM_DELANYCAST RTM_DELANYCAST + RTM_GETANYCAST, +#define RTM_GETANYCAST RTM_GETANYCAST + + RTM_NEWNEIGHTBL = 64, +#define RTM_NEWNEIGHTBL RTM_NEWNEIGHTBL + RTM_GETNEIGHTBL = 66, +#define RTM_GETNEIGHTBL RTM_GETNEIGHTBL + RTM_SETNEIGHTBL, +#define RTM_SETNEIGHTBL RTM_SETNEIGHTBL + + RTM_NEWNDUSEROPT = 68, +#define RTM_NEWNDUSEROPT RTM_NEWNDUSEROPT + + RTM_NEWADDRLABEL = 72, +#define RTM_NEWADDRLABEL RTM_NEWADDRLABEL + RTM_DELADDRLABEL, +#define RTM_DELADDRLABEL RTM_DELADDRLABEL + RTM_GETADDRLABEL, +#define RTM_GETADDRLABEL RTM_GETADDRLABEL + + RTM_GETDCB = 78, +#define RTM_GETDCB RTM_GETDCB + RTM_SETDCB, +#define RTM_SETDCB RTM_SETDCB + + RTM_NEWNETCONF = 80, +#define RTM_NEWNETCONF RTM_NEWNETCONF + RTM_DELNETCONF, +#define RTM_DELNETCONF RTM_DELNETCONF + RTM_GETNETCONF = 82, +#define RTM_GETNETCONF RTM_GETNETCONF + + RTM_NEWMDB = 84, +#define RTM_NEWMDB RTM_NEWMDB + RTM_DELMDB = 85, +#define RTM_DELMDB RTM_DELMDB + RTM_GETMDB = 86, +#define RTM_GETMDB RTM_GETMDB + + RTM_NEWNSID = 88, +#define RTM_NEWNSID RTM_NEWNSID + RTM_DELNSID = 89, +#define RTM_DELNSID RTM_DELNSID + RTM_GETNSID = 90, +#define RTM_GETNSID RTM_GETNSID + + RTM_NEWSTATS = 92, +#define RTM_NEWSTATS RTM_NEWSTATS + RTM_GETSTATS = 94, +#define RTM_GETSTATS RTM_GETSTATS + RTM_SETSTATS, +#define RTM_SETSTATS RTM_SETSTATS + + RTM_NEWCACHEREPORT = 96, +#define RTM_NEWCACHEREPORT RTM_NEWCACHEREPORT + + RTM_NEWCHAIN = 100, +#define RTM_NEWCHAIN RTM_NEWCHAIN + RTM_DELCHAIN, +#define RTM_DELCHAIN RTM_DELCHAIN + RTM_GETCHAIN, +#define RTM_GETCHAIN RTM_GETCHAIN + + RTM_NEWNEXTHOP = 104, +#define RTM_NEWNEXTHOP RTM_NEWNEXTHOP + RTM_DELNEXTHOP, +#define RTM_DELNEXTHOP RTM_DELNEXTHOP + RTM_GETNEXTHOP, +#define RTM_GETNEXTHOP RTM_GETNEXTHOP + + RTM_NEWLINKPROP = 108, +#define RTM_NEWLINKPROP RTM_NEWLINKPROP + RTM_DELLINKPROP, +#define RTM_DELLINKPROP RTM_DELLINKPROP + RTM_GETLINKPROP, +#define RTM_GETLINKPROP RTM_GETLINKPROP + + RTM_NEWVLAN = 112, +#define RTM_NEWVLAN RTM_NEWVLAN + RTM_DELVLAN, +#define RTM_DELVLAN RTM_DELVLAN + RTM_GETVLAN, +#define RTM_GETVLAN RTM_GETVLAN + + RTM_NEWNEXTHOPBUCKET = 116, +#define RTM_NEWNEXTHOPBUCKET RTM_NEWNEXTHOPBUCKET + RTM_DELNEXTHOPBUCKET, +#define RTM_DELNEXTHOPBUCKET RTM_DELNEXTHOPBUCKET + RTM_GETNEXTHOPBUCKET, +#define RTM_GETNEXTHOPBUCKET RTM_GETNEXTHOPBUCKET + + RTM_NEWTUNNEL = 120, +#define RTM_NEWTUNNEL RTM_NEWTUNNEL + RTM_DELTUNNEL, +#define RTM_DELTUNNEL RTM_DELTUNNEL + RTM_GETTUNNEL, +#define RTM_GETTUNNEL RTM_GETTUNNEL + + __RTM_MAX, +#define RTM_MAX (((__RTM_MAX + 3) & ~3) - 1) +}; + +#define RTM_NR_MSGTYPES (RTM_MAX + 1 - RTM_BASE) +#define RTM_NR_FAMILIES (RTM_NR_MSGTYPES >> 2) +#define RTM_FAM(cmd) (((cmd) - RTM_BASE) >> 2) + +/* + Generic structure for encapsulation of optional route information. + It is reminiscent of sockaddr, but with sa_family replaced + with attribute type. + */ + +struct rtattr { + unsigned short rta_len; + unsigned short rta_type; +}; + +/* Macros to handle rtattributes */ + +#define RTA_ALIGNTO 4U +#define RTA_ALIGN(len) ( ((len)+RTA_ALIGNTO-1) & ~(RTA_ALIGNTO-1) ) +#define RTA_OK(rta,len) ((len) >= (int)sizeof(struct rtattr) && \ + (rta)->rta_len >= sizeof(struct rtattr) && \ + (rta)->rta_len <= (len)) +#define RTA_NEXT(rta,attrlen) ((attrlen) -= RTA_ALIGN((rta)->rta_len), \ + (struct rtattr*)(((char*)(rta)) + RTA_ALIGN((rta)->rta_len))) +#define RTA_LENGTH(len) (RTA_ALIGN(sizeof(struct rtattr)) + (len)) +#define RTA_SPACE(len) RTA_ALIGN(RTA_LENGTH(len)) +#define RTA_DATA(rta) ((void*)(((char*)(rta)) + RTA_LENGTH(0))) +#define RTA_PAYLOAD(rta) ((int)((rta)->rta_len) - RTA_LENGTH(0)) + + + + +/****************************************************************************** + * Definitions used in routing table administration. + ****/ + +struct rtmsg { + unsigned char rtm_family; + unsigned char rtm_dst_len; + unsigned char rtm_src_len; + unsigned char rtm_tos; + + unsigned char rtm_table; /* Routing table id */ + unsigned char rtm_protocol; /* Routing protocol; see below */ + unsigned char rtm_scope; /* See below */ + unsigned char rtm_type; /* See below */ + + unsigned rtm_flags; +}; + +/* rtm_type */ + +enum { + RTN_UNSPEC, + RTN_UNICAST, /* Gateway or direct route */ + RTN_LOCAL, /* Accept locally */ + RTN_BROADCAST, /* Accept locally as broadcast, + send as broadcast */ + RTN_ANYCAST, /* Accept locally as broadcast, + but send as unicast */ + RTN_MULTICAST, /* Multicast route */ + RTN_BLACKHOLE, /* Drop */ + RTN_UNREACHABLE, /* Destination is unreachable */ + RTN_PROHIBIT, /* Administratively prohibited */ + RTN_THROW, /* Not in this table */ + RTN_NAT, /* Translate this address */ + RTN_XRESOLVE, /* Use external resolver */ + __RTN_MAX +}; + +#define RTN_MAX (__RTN_MAX - 1) + + +/* rtm_protocol */ + +#define RTPROT_UNSPEC 0 +#define RTPROT_REDIRECT 1 /* Route installed by ICMP redirects; + not used by current IPv4 */ +#define RTPROT_KERNEL 2 /* Route installed by kernel */ +#define RTPROT_BOOT 3 /* Route installed during boot */ +#define RTPROT_STATIC 4 /* Route installed by administrator */ + +/* Values of protocol >= RTPROT_STATIC are not interpreted by kernel; + they are just passed from user and back as is. + It will be used by hypothetical multiple routing daemons. + Note that protocol values should be standardized in order to + avoid conflicts. + */ + +#define RTPROT_GATED 8 /* Apparently, GateD */ +#define RTPROT_RA 9 /* RDISC/ND router advertisements */ +#define RTPROT_MRT 10 /* Merit MRT */ +#define RTPROT_ZEBRA 11 /* Zebra */ +#define RTPROT_BIRD 12 /* BIRD */ +#define RTPROT_DNROUTED 13 /* DECnet routing daemon */ +#define RTPROT_XORP 14 /* XORP */ +#define RTPROT_NTK 15 /* Netsukuku */ +#define RTPROT_DHCP 16 /* DHCP client */ +#define RTPROT_MROUTED 17 /* Multicast daemon */ +#define RTPROT_KEEPALIVED 18 /* Keepalived daemon */ +#define RTPROT_BABEL 42 /* Babel daemon */ +#define RTPROT_OVN 84 /* OVN daemon */ +#define RTPROT_OPENR 99 /* Open Routing (Open/R) Routes */ +#define RTPROT_BGP 186 /* BGP Routes */ +#define RTPROT_ISIS 187 /* ISIS Routes */ +#define RTPROT_OSPF 188 /* OSPF Routes */ +#define RTPROT_RIP 189 /* RIP Routes */ +#define RTPROT_EIGRP 192 /* EIGRP Routes */ + +/* rtm_scope + + Really it is not scope, but sort of distance to the destination. + NOWHERE are reserved for not existing destinations, HOST is our + local addresses, LINK are destinations, located on directly attached + link and UNIVERSE is everywhere in the Universe. + + Intermediate values are also possible f.e. interior routes + could be assigned a value between UNIVERSE and LINK. +*/ + +enum rt_scope_t { + RT_SCOPE_UNIVERSE=0, +/* User defined values */ + RT_SCOPE_SITE=200, + RT_SCOPE_LINK=253, + RT_SCOPE_HOST=254, + RT_SCOPE_NOWHERE=255 +}; + +/* rtm_flags */ + +#define RTM_F_NOTIFY 0x100 /* Notify user of route change */ +#define RTM_F_CLONED 0x200 /* This route is cloned */ +#define RTM_F_EQUALIZE 0x400 /* Multipath equalizer: NI */ +#define RTM_F_PREFIX 0x800 /* Prefix addresses */ +#define RTM_F_LOOKUP_TABLE 0x1000 /* set rtm_table to FIB lookup result */ +#define RTM_F_FIB_MATCH 0x2000 /* return full fib lookup match */ +#define RTM_F_OFFLOAD 0x4000 /* route is offloaded */ +#define RTM_F_TRAP 0x8000 /* route is trapping packets */ +#define RTM_F_OFFLOAD_FAILED 0x20000000 /* route offload failed, this value + * is chosen to avoid conflicts with + * other flags defined in + * include/uapi/linux/ipv6_route.h + */ + +/* Reserved table identifiers */ + +enum rt_class_t { + RT_TABLE_UNSPEC=0, +/* User defined values */ + RT_TABLE_COMPAT=252, + RT_TABLE_DEFAULT=253, + RT_TABLE_MAIN=254, + RT_TABLE_LOCAL=255, + RT_TABLE_MAX=0xFFFFFFFF +}; + + +/* Routing message attributes */ + +enum rtattr_type_t { + RTA_UNSPEC, + RTA_DST, + RTA_SRC, + RTA_IIF, + RTA_OIF, + RTA_GATEWAY, + RTA_PRIORITY, + RTA_PREFSRC, + RTA_METRICS, + RTA_MULTIPATH, + RTA_PROTOINFO, /* no longer used */ + RTA_FLOW, + RTA_CACHEINFO, + RTA_SESSION, /* no longer used */ + RTA_MP_ALGO, /* no longer used */ + RTA_TABLE, + RTA_MARK, + RTA_MFC_STATS, + RTA_VIA, + RTA_NEWDST, + RTA_PREF, + RTA_ENCAP_TYPE, + RTA_ENCAP, + RTA_EXPIRES, + RTA_PAD, + RTA_UID, + RTA_TTL_PROPAGATE, + RTA_IP_PROTO, + RTA_SPORT, + RTA_DPORT, + RTA_NH_ID, + RTA_FLOWLABEL, + __RTA_MAX +}; + +#define RTA_MAX (__RTA_MAX - 1) + +#define RTM_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct rtmsg)))) +#define RTM_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct rtmsg)) + +/* RTM_MULTIPATH --- array of struct rtnexthop. + * + * "struct rtnexthop" describes all necessary nexthop information, + * i.e. parameters of path to a destination via this nexthop. + * + * At the moment it is impossible to set different prefsrc, mtu, window + * and rtt for different paths from multipath. + */ + +struct rtnexthop { + unsigned short rtnh_len; + unsigned char rtnh_flags; + unsigned char rtnh_hops; + int rtnh_ifindex; +}; + +/* rtnh_flags */ + +#define RTNH_F_DEAD 1 /* Nexthop is dead (used by multipath) */ +#define RTNH_F_PERVASIVE 2 /* Do recursive gateway lookup */ +#define RTNH_F_ONLINK 4 /* Gateway is forced on link */ +#define RTNH_F_OFFLOAD 8 /* Nexthop is offloaded */ +#define RTNH_F_LINKDOWN 16 /* carrier-down on nexthop */ +#define RTNH_F_UNRESOLVED 32 /* The entry is unresolved (ipmr) */ +#define RTNH_F_TRAP 64 /* Nexthop is trapping packets */ + +#define RTNH_COMPARE_MASK (RTNH_F_DEAD | RTNH_F_LINKDOWN | \ + RTNH_F_OFFLOAD | RTNH_F_TRAP) + +/* Macros to handle hexthops */ + +#define RTNH_ALIGNTO 4 +#define RTNH_ALIGN(len) ( ((len)+RTNH_ALIGNTO-1) & ~(RTNH_ALIGNTO-1) ) +#define RTNH_OK(rtnh,len) ((rtnh)->rtnh_len >= sizeof(struct rtnexthop) && \ + ((int)(rtnh)->rtnh_len) <= (len)) +#define RTNH_NEXT(rtnh) ((struct rtnexthop*)(((char*)(rtnh)) + RTNH_ALIGN((rtnh)->rtnh_len))) +#define RTNH_LENGTH(len) (RTNH_ALIGN(sizeof(struct rtnexthop)) + (len)) +#define RTNH_SPACE(len) RTNH_ALIGN(RTNH_LENGTH(len)) +#define RTNH_DATA(rtnh) ((struct rtattr*)(((char*)(rtnh)) + RTNH_LENGTH(0))) + +/* RTA_VIA */ +struct rtvia { + __kernel_sa_family_t rtvia_family; + __u8 rtvia_addr[]; +}; + +/* RTM_CACHEINFO */ + +struct rta_cacheinfo { + __u32 rta_clntref; + __u32 rta_lastuse; + __s32 rta_expires; + __u32 rta_error; + __u32 rta_used; + +#define RTNETLINK_HAVE_PEERINFO 1 + __u32 rta_id; + __u32 rta_ts; + __u32 rta_tsage; +}; + +/* RTM_METRICS --- array of struct rtattr with types of RTAX_* */ + +enum { + RTAX_UNSPEC, +#define RTAX_UNSPEC RTAX_UNSPEC + RTAX_LOCK, +#define RTAX_LOCK RTAX_LOCK + RTAX_MTU, +#define RTAX_MTU RTAX_MTU + RTAX_WINDOW, +#define RTAX_WINDOW RTAX_WINDOW + RTAX_RTT, +#define RTAX_RTT RTAX_RTT + RTAX_RTTVAR, +#define RTAX_RTTVAR RTAX_RTTVAR + RTAX_SSTHRESH, +#define RTAX_SSTHRESH RTAX_SSTHRESH + RTAX_CWND, +#define RTAX_CWND RTAX_CWND + RTAX_ADVMSS, +#define RTAX_ADVMSS RTAX_ADVMSS + RTAX_REORDERING, +#define RTAX_REORDERING RTAX_REORDERING + RTAX_HOPLIMIT, +#define RTAX_HOPLIMIT RTAX_HOPLIMIT + RTAX_INITCWND, +#define RTAX_INITCWND RTAX_INITCWND + RTAX_FEATURES, +#define RTAX_FEATURES RTAX_FEATURES + RTAX_RTO_MIN, +#define RTAX_RTO_MIN RTAX_RTO_MIN + RTAX_INITRWND, +#define RTAX_INITRWND RTAX_INITRWND + RTAX_QUICKACK, +#define RTAX_QUICKACK RTAX_QUICKACK + RTAX_CC_ALGO, +#define RTAX_CC_ALGO RTAX_CC_ALGO + RTAX_FASTOPEN_NO_COOKIE, +#define RTAX_FASTOPEN_NO_COOKIE RTAX_FASTOPEN_NO_COOKIE + __RTAX_MAX +}; + +#define RTAX_MAX (__RTAX_MAX - 1) + +#define RTAX_FEATURE_ECN (1 << 0) +#define RTAX_FEATURE_SACK (1 << 1) /* unused */ +#define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ +#define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */ +#define RTAX_FEATURE_TCP_USEC_TS (1 << 4) + +#define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \ + RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | \ + RTAX_FEATURE_ALLFRAG | \ + RTAX_FEATURE_TCP_USEC_TS) + +struct rta_session { + __u8 proto; + __u8 pad1; + __u16 pad2; + + union { + struct { + __u16 sport; + __u16 dport; + } ports; + + struct { + __u8 type; + __u8 code; + __u16 ident; + } icmpt; + + __u32 spi; + } u; +}; + +struct rta_mfc_stats { + __u64 mfcs_packets; + __u64 mfcs_bytes; + __u64 mfcs_wrong_if; +}; + +/**** + * General form of address family dependent message. + ****/ + +struct rtgenmsg { + unsigned char rtgen_family; +}; + +/***************************************************************** + * Link layer specific messages. + ****/ + +/* struct ifinfomsg + * passes link level specific information, not dependent + * on network protocol. + */ + +struct ifinfomsg { + unsigned char ifi_family; + unsigned char __ifi_pad; + unsigned short ifi_type; /* ARPHRD_* */ + int ifi_index; /* Link index */ + unsigned ifi_flags; /* IFF_* flags */ + unsigned ifi_change; /* IFF_* change mask */ +}; + +/******************************************************************** + * prefix information + ****/ + +struct prefixmsg { + unsigned char prefix_family; + unsigned char prefix_pad1; + unsigned short prefix_pad2; + int prefix_ifindex; + unsigned char prefix_type; + unsigned char prefix_len; + unsigned char prefix_flags; + unsigned char prefix_pad3; +}; + +enum +{ + PREFIX_UNSPEC, + PREFIX_ADDRESS, + PREFIX_CACHEINFO, + __PREFIX_MAX +}; + +#define PREFIX_MAX (__PREFIX_MAX - 1) + +struct prefix_cacheinfo { + __u32 preferred_time; + __u32 valid_time; +}; + + +/***************************************************************** + * Traffic control messages. + ****/ + +struct tcmsg { + unsigned char tcm_family; + unsigned char tcm__pad1; + unsigned short tcm__pad2; + int tcm_ifindex; + __u32 tcm_handle; + __u32 tcm_parent; +/* tcm_block_index is used instead of tcm_parent + * in case tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK + */ +#define tcm_block_index tcm_parent + __u32 tcm_info; +}; + +/* For manipulation of filters in shared block, tcm_ifindex is set to + * TCM_IFINDEX_MAGIC_BLOCK, and tcm_parent is aliased to tcm_block_index + * which is the block index. + */ +#define TCM_IFINDEX_MAGIC_BLOCK (0xFFFFFFFFU) + +enum { + TCA_UNSPEC, + TCA_KIND, + TCA_OPTIONS, + TCA_STATS, + TCA_XSTATS, + TCA_RATE, + TCA_FCNT, + TCA_STATS2, + TCA_STAB, + TCA_PAD, + TCA_DUMP_INVISIBLE, + TCA_CHAIN, + TCA_HW_OFFLOAD, + TCA_INGRESS_BLOCK, + TCA_EGRESS_BLOCK, + TCA_DUMP_FLAGS, + TCA_EXT_WARN_MSG, + __TCA_MAX +}; + +#define TCA_MAX (__TCA_MAX - 1) + +#define TCA_DUMP_FLAGS_TERSE (1 << 0) /* Means that in dump user gets only basic + * data necessary to identify the objects + * (handle, cookie, etc.) and stats. + */ + +#define TCA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcmsg)))) +#define TCA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcmsg)) + +/******************************************************************** + * Neighbor Discovery userland options + ****/ + +struct nduseroptmsg { + unsigned char nduseropt_family; + unsigned char nduseropt_pad1; + unsigned short nduseropt_opts_len; /* Total length of options */ + int nduseropt_ifindex; + __u8 nduseropt_icmp_type; + __u8 nduseropt_icmp_code; + unsigned short nduseropt_pad2; + unsigned int nduseropt_pad3; + /* Followed by one or more ND options */ +}; + +enum { + NDUSEROPT_UNSPEC, + NDUSEROPT_SRCADDR, + __NDUSEROPT_MAX +}; + +#define NDUSEROPT_MAX (__NDUSEROPT_MAX - 1) + +#ifndef __KERNEL__ +/* RTnetlink multicast groups - backwards compatibility for userspace */ +#define RTMGRP_LINK 1 +#define RTMGRP_NOTIFY 2 +#define RTMGRP_NEIGH 4 +#define RTMGRP_TC 8 + +#define RTMGRP_IPV4_IFADDR 0x10 +#define RTMGRP_IPV4_MROUTE 0x20 +#define RTMGRP_IPV4_ROUTE 0x40 +#define RTMGRP_IPV4_RULE 0x80 + +#define RTMGRP_IPV6_IFADDR 0x100 +#define RTMGRP_IPV6_MROUTE 0x200 +#define RTMGRP_IPV6_ROUTE 0x400 +#define RTMGRP_IPV6_IFINFO 0x800 + +#define RTMGRP_DECnet_IFADDR 0x1000 +#define RTMGRP_DECnet_ROUTE 0x4000 + +#define RTMGRP_IPV6_PREFIX 0x20000 +#endif + +/* RTnetlink multicast groups */ +enum rtnetlink_groups { + RTNLGRP_NONE, +#define RTNLGRP_NONE RTNLGRP_NONE + RTNLGRP_LINK, +#define RTNLGRP_LINK RTNLGRP_LINK + RTNLGRP_NOTIFY, +#define RTNLGRP_NOTIFY RTNLGRP_NOTIFY + RTNLGRP_NEIGH, +#define RTNLGRP_NEIGH RTNLGRP_NEIGH + RTNLGRP_TC, +#define RTNLGRP_TC RTNLGRP_TC + RTNLGRP_IPV4_IFADDR, +#define RTNLGRP_IPV4_IFADDR RTNLGRP_IPV4_IFADDR + RTNLGRP_IPV4_MROUTE, +#define RTNLGRP_IPV4_MROUTE RTNLGRP_IPV4_MROUTE + RTNLGRP_IPV4_ROUTE, +#define RTNLGRP_IPV4_ROUTE RTNLGRP_IPV4_ROUTE + RTNLGRP_IPV4_RULE, +#define RTNLGRP_IPV4_RULE RTNLGRP_IPV4_RULE + RTNLGRP_IPV6_IFADDR, +#define RTNLGRP_IPV6_IFADDR RTNLGRP_IPV6_IFADDR + RTNLGRP_IPV6_MROUTE, +#define RTNLGRP_IPV6_MROUTE RTNLGRP_IPV6_MROUTE + RTNLGRP_IPV6_ROUTE, +#define RTNLGRP_IPV6_ROUTE RTNLGRP_IPV6_ROUTE + RTNLGRP_IPV6_IFINFO, +#define RTNLGRP_IPV6_IFINFO RTNLGRP_IPV6_IFINFO + RTNLGRP_DECnet_IFADDR, +#define RTNLGRP_DECnet_IFADDR RTNLGRP_DECnet_IFADDR + RTNLGRP_NOP2, + RTNLGRP_DECnet_ROUTE, +#define RTNLGRP_DECnet_ROUTE RTNLGRP_DECnet_ROUTE + RTNLGRP_DECnet_RULE, +#define RTNLGRP_DECnet_RULE RTNLGRP_DECnet_RULE + RTNLGRP_NOP4, + RTNLGRP_IPV6_PREFIX, +#define RTNLGRP_IPV6_PREFIX RTNLGRP_IPV6_PREFIX + RTNLGRP_IPV6_RULE, +#define RTNLGRP_IPV6_RULE RTNLGRP_IPV6_RULE + RTNLGRP_ND_USEROPT, +#define RTNLGRP_ND_USEROPT RTNLGRP_ND_USEROPT + RTNLGRP_PHONET_IFADDR, +#define RTNLGRP_PHONET_IFADDR RTNLGRP_PHONET_IFADDR + RTNLGRP_PHONET_ROUTE, +#define RTNLGRP_PHONET_ROUTE RTNLGRP_PHONET_ROUTE + RTNLGRP_DCB, +#define RTNLGRP_DCB RTNLGRP_DCB + RTNLGRP_IPV4_NETCONF, +#define RTNLGRP_IPV4_NETCONF RTNLGRP_IPV4_NETCONF + RTNLGRP_IPV6_NETCONF, +#define RTNLGRP_IPV6_NETCONF RTNLGRP_IPV6_NETCONF + RTNLGRP_MDB, +#define RTNLGRP_MDB RTNLGRP_MDB + RTNLGRP_MPLS_ROUTE, +#define RTNLGRP_MPLS_ROUTE RTNLGRP_MPLS_ROUTE + RTNLGRP_NSID, +#define RTNLGRP_NSID RTNLGRP_NSID + RTNLGRP_MPLS_NETCONF, +#define RTNLGRP_MPLS_NETCONF RTNLGRP_MPLS_NETCONF + RTNLGRP_IPV4_MROUTE_R, +#define RTNLGRP_IPV4_MROUTE_R RTNLGRP_IPV4_MROUTE_R + RTNLGRP_IPV6_MROUTE_R, +#define RTNLGRP_IPV6_MROUTE_R RTNLGRP_IPV6_MROUTE_R + RTNLGRP_NEXTHOP, +#define RTNLGRP_NEXTHOP RTNLGRP_NEXTHOP + RTNLGRP_BRVLAN, +#define RTNLGRP_BRVLAN RTNLGRP_BRVLAN + RTNLGRP_MCTP_IFADDR, +#define RTNLGRP_MCTP_IFADDR RTNLGRP_MCTP_IFADDR + RTNLGRP_TUNNEL, +#define RTNLGRP_TUNNEL RTNLGRP_TUNNEL + RTNLGRP_STATS, +#define RTNLGRP_STATS RTNLGRP_STATS + RTNLGRP_IPV4_MCADDR, +#define RTNLGRP_IPV4_MCADDR RTNLGRP_IPV4_MCADDR + RTNLGRP_IPV6_MCADDR, +#define RTNLGRP_IPV6_MCADDR RTNLGRP_IPV6_MCADDR + RTNLGRP_IPV6_ACADDR, +#define RTNLGRP_IPV6_ACADDR RTNLGRP_IPV6_ACADDR + __RTNLGRP_MAX +}; +#define RTNLGRP_MAX (__RTNLGRP_MAX - 1) + +/* TC action piece */ +struct tcamsg { + unsigned char tca_family; + unsigned char tca__pad1; + unsigned short tca__pad2; +}; + +enum { + TCA_ROOT_UNSPEC, + TCA_ROOT_TAB, +#define TCA_ACT_TAB TCA_ROOT_TAB +#define TCAA_MAX TCA_ROOT_TAB + TCA_ROOT_FLAGS, + TCA_ROOT_COUNT, + TCA_ROOT_TIME_DELTA, /* in msecs */ + TCA_ROOT_EXT_WARN_MSG, + __TCA_ROOT_MAX, +#define TCA_ROOT_MAX (__TCA_ROOT_MAX - 1) +}; + +#define TA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcamsg)))) +#define TA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcamsg)) +/* tcamsg flags stored in attribute TCA_ROOT_FLAGS + * + * TCA_ACT_FLAG_LARGE_DUMP_ON user->kernel to request for larger than + * TCA_ACT_MAX_PRIO actions in a dump. All dump responses will contain the + * number of actions being dumped stored in for user app's consumption in + * TCA_ROOT_COUNT + * + * TCA_ACT_FLAG_TERSE_DUMP user->kernel to request terse (brief) dump that only + * includes essential action info (kind, index, etc.) + * + */ +#define TCA_FLAG_LARGE_DUMP_ON (1 << 0) +#define TCA_ACT_FLAG_LARGE_DUMP_ON TCA_FLAG_LARGE_DUMP_ON +#define TCA_ACT_FLAG_TERSE_DUMP (1 << 1) + +/* New extended info filters for IFLA_EXT_MASK */ +#define RTEXT_FILTER_VF (1 << 0) +#define RTEXT_FILTER_BRVLAN (1 << 1) +#define RTEXT_FILTER_BRVLAN_COMPRESSED (1 << 2) +#define RTEXT_FILTER_SKIP_STATS (1 << 3) +#define RTEXT_FILTER_MRP (1 << 4) +#define RTEXT_FILTER_CFM_CONFIG (1 << 5) +#define RTEXT_FILTER_CFM_STATUS (1 << 6) +#define RTEXT_FILTER_MST (1 << 7) + +/* End of information exported to user level */ + + + +#endif /* _UAPI__LINUX_RTNETLINK_H */ diff --git a/tools/lib/perf/include/perf/event.h b/tools/lib/perf/include/perf/event.h index 6608f1e3701b..aa1e91c97a22 100644 --- a/tools/lib/perf/include/perf/event.h +++ b/tools/lib/perf/include/perf/event.h @@ -291,6 +291,7 @@ struct perf_record_header_event_type { struct perf_record_header_tracing_data { struct perf_event_header header; __u32 size; + __u32 pad; }; #define PERF_RECORD_MISC_BUILD_ID_SIZE (1 << 15) diff --git a/tools/lib/perf/mmap.c b/tools/lib/perf/mmap.c index c1a51d925e0e..ec124eb0ec0a 100644 --- a/tools/lib/perf/mmap.c +++ b/tools/lib/perf/mmap.c @@ -508,7 +508,7 @@ int perf_mmap__read_self(struct perf_mmap *map, struct perf_counts_values *count idx = READ_ONCE(pc->index); cnt = READ_ONCE(pc->offset); if (pc->cap_user_rdpmc && idx) { - s64 evcnt = read_perf_counter(idx - 1); + u64 evcnt = read_perf_counter(idx - 1); u16 width = READ_ONCE(pc->pmc_width); evcnt <<= 64 - width; diff --git a/tools/perf/Documentation/Build.txt b/tools/perf/Documentation/Build.txt index 83dc87c662b6..57b226e7fc2f 100644 --- a/tools/perf/Documentation/Build.txt +++ b/tools/perf/Documentation/Build.txt @@ -99,3 +99,18 @@ configuration paths for cross building: In this case, the variable PKG_CONFIG_SYSROOT_DIR can be used alongside the variable PKG_CONFIG_LIBDIR or PKG_CONFIG_PATH to prepend the sysroot path to the library paths for cross compilation. + +5) Build with Clang +=================== +By default, the makefile uses GCC as compiler. With specifying environment +variables HOSTCC, CC and CXX, it allows to build perf with Clang. + +Using Clang for a native build: + + $ HOSTCC=clang CC=clang CXX=clang++ make -C tools/perf + +Specifying ARCH and CROSS_COMPILE for cross compilation: + + $ HOSTCC=clang CC=clang CXX=clang++ \ + ARCH=arm64 CROSS_COMPILE=aarch64-linux-gnu- \ + make -C tools/perf diff --git a/tools/perf/Documentation/android.txt b/tools/perf/Documentation/android.txt index 24a59998fc91..3f3cc7ac3d13 100644 --- a/tools/perf/Documentation/android.txt +++ b/tools/perf/Documentation/android.txt @@ -1,78 +1,10 @@ How to compile perf for Android -========================================= +=============================== -I. Set the Android NDK environment ------------------------------------------------- +There are two ways to build perf and run it on Android: -(a). Use the Android NDK ------------------------------------------------- -1. You need to download and install the Android Native Development Kit (NDK). -Set the NDK variable to point to the path where you installed the NDK: - export NDK=/path/to/android-ndk +- Method 1: Build perf with static linking. See Build.txt, section + "4) Cross compilation" for how to build a static perf binary. -2. Set cross-compiling environment variables for NDK toolchain and sysroot. -For arm: - export NDK_TOOLCHAIN=${NDK}/toolchains/arm-linux-androideabi-4.9/prebuilt/linux-x86_64/bin/arm-linux-androideabi- - export NDK_SYSROOT=${NDK}/platforms/android-24/arch-arm -For x86: - export NDK_TOOLCHAIN=${NDK}/toolchains/x86-4.9/prebuilt/linux-x86_64/bin/i686-linux-android- - export NDK_SYSROOT=${NDK}/platforms/android-24/arch-x86 - -This method is only tested for Android NDK versions Revision 11b and later. -perf uses some bionic enhancements that are not included in prior NDK versions. -You can use method (b) described below instead. - -(b). Use the Android source tree ------------------------------------------------ -1. Download the master branch of the Android source tree. -Set the environment for the target you want using: - source build/envsetup.sh - lunch - -2. Build your own NDK sysroot to contain latest bionic changes and set the -NDK sysroot environment variable. - cd ${ANDROID_BUILD_TOP}/ndk -For arm: - ./build/tools/build-ndk-sysroot.sh --abi=arm - export NDK_SYSROOT=${ANDROID_BUILD_TOP}/ndk/build/platforms/android-3/arch-arm -For x86: - ./build/tools/build-ndk-sysroot.sh --abi=x86 - export NDK_SYSROOT=${ANDROID_BUILD_TOP}/ndk/build/platforms/android-3/arch-x86 - -3. Set the NDK toolchain environment variable. -For arm: - export NDK_TOOLCHAIN=${ANDROID_TOOLCHAIN}/arm-linux-androideabi- -For x86: - export NDK_TOOLCHAIN=${ANDROID_TOOLCHAIN}/i686-linux-android- - -II. Compile perf for Android ------------------------------------------------- -You need to run make with the NDK toolchain and sysroot defined above: -For arm: - make WERROR=0 ARCH=arm CROSS_COMPILE=${NDK_TOOLCHAIN} EXTRA_CFLAGS="-pie --sysroot=${NDK_SYSROOT}" -For x86: - make WERROR=0 ARCH=x86 CROSS_COMPILE=${NDK_TOOLCHAIN} EXTRA_CFLAGS="-pie --sysroot=${NDK_SYSROOT}" - -III. Install perf ------------------------------------------------ -You need to connect to your Android device/emulator using adb. -Install perf using: - adb push perf /data/perf - -If you also want to use perf-archive you need busybox tools for Android. -For installing perf-archive, you first need to replace #!/bin/bash with #!/system/bin/sh: - sed 's/#!\/bin\/bash/#!\/system\/bin\/sh/g' perf-archive >> /tmp/perf-archive - chmod +x /tmp/perf-archive - adb push /tmp/perf-archive /data/perf-archive - -IV. Environment settings for running perf ------------------------------------------------- -Some perf features need environment variables to run properly. -You need to set these before running perf on the target: - adb shell - # PERF_PAGER=cat - -IV. Run perf ------------------------------------------------- -Run perf on your device/emulator to which you previously connected using adb: - # ./data/perf +- Method 2: Download the Android NDK and use the bundled Clang to + build perf. See Build.txt, section "5) Build with clang" for details. diff --git a/tools/perf/Documentation/intel-acr.txt b/tools/perf/Documentation/intel-acr.txt new file mode 100644 index 000000000000..72654fdd9a52 --- /dev/null +++ b/tools/perf/Documentation/intel-acr.txt @@ -0,0 +1,53 @@ +Intel Auto Counter Reload Support +--------------------------------- +Support for Intel Auto Counter Reload in perf tools + +Auto counter reload provides a means for software to specify to hardware +that certain counters, if supported, should be automatically reloaded +upon overflow of chosen counters. By taking a sample only if the rate of +one event exceeds some threshold relative to the rate of another event, +this feature enables software to sample based on the relative rate of +two or more events. To enable this, the user must provide a sample period +term and a bitmask ("acr_mask") for each relevant event specifying the +counters in an event group to reload if the event's specified sample +period is exceeded. + +For example, if the user desires to measure a scenario when IPC > 2, +the event group might look like the one below: + + perf record -e {cpu_atom/instructions,period=200000,acr_mask=0x2/, \ + cpu_atom/cycles,period=100000,acr_mask=0x3/} -- true + +In this case, if the "instructions" counter exceeds the sample period of +200000, the second counter, "cycles", will be reset and a sample will be +taken. If "cycles" is exceeded first, both counters in the group will be +reset. In this way, samples will only be taken for cases where IPC > 2. + +The acr_mask term is a hexadecimal value representing a bitmask of the +events in the group to be reset when the period is exceeded. In the +example above, "instructions" is assigned an acr_mask of 0x2, meaning +only the second event in the group is reloaded and a sample is taken +for the first event. "cycles" is assigned an acr_mask of 0x3, meaning +that both event counters will be reset if the sample period is exceeded +first. + +ratio-to-prev Event Term +------------------------ +To simplify this, an event term "ratio-to-prev" is provided which is used +alongside the sample period term n or the -c/--count option. This would +allow users to specify the desired relative rate between events as a +ratio. Note: Both events compared must belong to the same PMU. + +The command above would then become + + perf record -e {cpu_atom/instructions/, \ + cpu_atom/cycles,period=100000,ratio-to-prev=0.5/} -- true + +ratio-to-prev is the ratio of the event using the term relative +to the previous event in the group, which will always be 1, +for a 1:0.5 or 2:1 ratio. + +To sample for IPC < 2 for example, the events need to be reordered: + + perf record -e {cpu_atom/cycles/, \ + cpu_atom/instructions,period=200000,ratio-to-prev=2.0/} -- true diff --git a/tools/perf/Documentation/perf-annotate.txt b/tools/perf/Documentation/perf-annotate.txt index 46090c5b42b4..547f1a268018 100644 --- a/tools/perf/Documentation/perf-annotate.txt +++ b/tools/perf/Documentation/perf-annotate.txt @@ -170,7 +170,6 @@ include::itrace.txt[] --code-with-type:: Show data type info in code annotation (for memory instructions only). - Currently it only works with --stdio option. SEE ALSO diff --git a/tools/perf/Documentation/perf-arm-spe.txt b/tools/perf/Documentation/perf-arm-spe.txt index 37afade4f1b2..cda8dd47fc4d 100644 --- a/tools/perf/Documentation/perf-arm-spe.txt +++ b/tools/perf/Documentation/perf-arm-spe.txt @@ -191,14 +191,20 @@ groups: 36 branch 0 remote-access 900 memory + 1800 instructions The arm_spe// and dummy:u events are implementation details and are expected to be empty. -To get a full list of unique samples that are not sorted into groups, set the itrace option to -generate 'instruction' samples. The period option is also taken into account, so set it to 1 -instruction unless you want to further downsample the already sampled SPE data: +The instructions group contains the full list of unique samples that are not +sorted into other groups. To generate only this group use --itrace=i1i. - perf report --itrace=i1i +1i (1 instruction interval) signifies no further downsampling. Rather than an +instruction interval, this generates a sample every n SPE samples. For example +to generate the default set of events for every 100 SPE samples: + + perf report --itrace==bxofmtMai100i + +Other period types, for example nanoseconds (ns) are not currently supported. Memory access details are also stored on the samples and this can be viewed with: diff --git a/tools/perf/Documentation/perf-bench.txt b/tools/perf/Documentation/perf-bench.txt index 8331bd28b10e..1160224cb718 100644 --- a/tools/perf/Documentation/perf-bench.txt +++ b/tools/perf/Documentation/perf-bench.txt @@ -177,11 +177,21 @@ Suite for evaluating performance of simple memory copy in various ways. Options of *memcpy* ^^^^^^^^^^^^^^^^^^^ --l:: +-s:: --size:: Specify size of memory to copy (default: 1MB). Available units are B, KB, MB, GB and TB (case insensitive). +-p:: +--page:: +Specify page-size for mapping memory buffers (default: 4KB). +Available values are 4KB, 2MB, 1GB (case insensitive). + +-k:: +--chunk:: +Specify the chunk-size for each invocation. (default: 0, or full-extent) +Available units are B, KB, MB, GB and TB (case insensitive). + -f:: --function:: Specify function to copy (default: default). @@ -201,11 +211,21 @@ Suite for evaluating performance of simple memory set in various ways. Options of *memset* ^^^^^^^^^^^^^^^^^^^ --l:: +-s:: --size:: Specify size of memory to set (default: 1MB). Available units are B, KB, MB, GB and TB (case insensitive). +-p:: +--page:: +Specify page-size for mapping memory buffers (default: 4KB). +Available values are 4KB, 2MB, 1GB (case insensitive). + +-k:: +--chunk:: +Specify the chunk-size for each invocation. (default: 0, or full-extent) +Available units are B, KB, MB, GB and TB (case insensitive). + -f:: --function:: Specify function to set (default: default). @@ -220,6 +240,40 @@ Repeat memset invocation this number of times. --cycles:: Use perf's cpu-cycles event instead of gettimeofday syscall. +*mmap*:: +Suite for evaluating memory subsystem performance for mmap()'d memory. + +Options of *mmap* +^^^^^^^^^^^^^^^^^ +-s:: +--size:: +Specify size of memory to set (default: 1MB). +Available units are B, KB, MB, GB and TB (case insensitive). + +-p:: +--page:: +Specify page-size for mapping memory buffers (default: 4KB). +Available values are 4KB, 2MB, 1GB (case insensitive). + +-r:: +--randomize:: +Specify seed to randomize page access offset (default: 0, or not randomized). + +-f:: +--function:: +Specify function to set (default: all). +Available functions are 'demand' and 'populate', with the first +demand faulting pages in the region and the second using an eager +mapping. + +-l:: +--nr_loops:: +Repeat mmap() invocation this number of times. + +-c:: +--cycles:: +Use perf's cpu-cycles event instead of gettimeofday syscall. + SUITES FOR 'numa' ~~~~~~~~~~~~~~~~~ *mem*:: diff --git a/tools/perf/Documentation/perf-check.txt b/tools/perf/Documentation/perf-check.txt index ee92042082f7..4c9ccda6ce91 100644 --- a/tools/perf/Documentation/perf-check.txt +++ b/tools/perf/Documentation/perf-check.txt @@ -56,6 +56,7 @@ feature:: libcapstone / HAVE_LIBCAPSTONE_SUPPORT libdw-dwarf-unwind / HAVE_LIBDW_SUPPORT libelf / HAVE_LIBELF_SUPPORT + libLLVM / HAVE_LIBLLVM_SUPPORT libnuma / HAVE_LIBNUMA_SUPPORT libopencsd / HAVE_CSTRACE_SUPPORT libperl / HAVE_LIBPERL_SUPPORT diff --git a/tools/perf/Documentation/perf-diff.txt b/tools/perf/Documentation/perf-diff.txt index f3067a4af294..58efab72d2e5 100644 --- a/tools/perf/Documentation/perf-diff.txt +++ b/tools/perf/Documentation/perf-diff.txt @@ -285,7 +285,7 @@ If specified the 'Weighted diff' column is displayed with value 'd' computed as: - period being the hist entry period value - - WEIGHT-A/WEIGHT-B being user supplied weights in the the '-c' option + - WEIGHT-A/WEIGHT-B being user supplied weights in the '-c' option behind ':' separator like '-c wdiff:1,2'. - WEIGHT-A being the weight of the data file - WEIGHT-B being the weight of the baseline data file diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt index 28215306a78a..a4378a0cd914 100644 --- a/tools/perf/Documentation/perf-list.txt +++ b/tools/perf/Documentation/perf-list.txt @@ -73,6 +73,7 @@ counted. The following modifiers exist: e - group or event are exclusive and do not share the PMU b - use BPF aggregration (see perf stat --bpf-counters) R - retire latency value of the event + X - don't regroup the event to match PMUs The 'p' modifier can be used for specifying how precise the instruction address should be. The 'p' modifier can be specified multiple times: @@ -392,6 +393,8 @@ Support raw format: . '--raw-dump [hw|sw|cache|tracepoint|pmu|event_glob]', shows the raw-dump of a certain kind of events. +include::intel-acr.txt[] + SEE ALSO -------- linkperf:perf-stat[1], linkperf:perf-top[1], diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt index 973fede403a0..892c82a9bf40 100644 --- a/tools/perf/Documentation/perf-trace.txt +++ b/tools/perf/Documentation/perf-trace.txt @@ -249,6 +249,10 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs. works well with -s/--summary option where no argument information is required. +--max-summary=N:: + Maximum number of lines in the summary mode. Note that this applies to + each entry (thread or cgroup). + PAGEFAULTS ---------- diff --git a/tools/perf/Documentation/perf.data-file-format.txt b/tools/perf/Documentation/perf.data-file-format.txt index cd95ba09f727..c9d4dec65344 100644 --- a/tools/perf/Documentation/perf.data-file-format.txt +++ b/tools/perf/Documentation/perf.data-file-format.txt @@ -348,6 +348,16 @@ to special needs. struct perf_bpil, which contains detailed information about a BPF program, including type, id, tag, jited/xlated instructions, etc. +The format of data in HEADER_BPF_PROG_INFO is as follows: + u32 count + + struct perf_bpil { + u32 info_len; /* size of struct bpf_prog_info, when the tool is compiled */ + u32 data_len; /* total bytes allocated for data, round up to 8 bytes */ + u64 arrays; /* which arrays are included in data */ + struct bpf_prog_info info; + u8 data[]; + }[count]; HEADER_BPF_BTF = 26, diff --git a/tools/perf/Makefile.config b/tools/perf/Makefile.config index 5a5832ee7b53..5700516aa84a 100644 --- a/tools/perf/Makefile.config +++ b/tools/perf/Makefile.config @@ -23,8 +23,39 @@ HOSTCFLAGS := $(filter-out -Wnested-externs,$(EXTRA_WARNINGS)) # borrowed from kernel headers depends on it, e.g. put_unaligned_*(). CFLAGS += -fno-strict-aliasing -# Enabled Wthread-safety analysis for clang builds. +# Set target flag and options when using clang as compiler. ifeq ($(CC_NO_CLANG), 0) + CLANG_TARGET_FLAGS_arm := arm-linux-gnueabi + CLANG_TARGET_FLAGS_arm64 := aarch64-linux-gnu + CLANG_TARGET_FLAGS_m68k := m68k-linux-gnu + CLANG_TARGET_FLAGS_mips := mipsel-linux-gnu + CLANG_TARGET_FLAGS_powerpc := powerpc64le-linux-gnu + CLANG_TARGET_FLAGS_riscv := riscv64-linux-gnu + CLANG_TARGET_FLAGS_s390 := s390x-linux-gnu + CLANG_TARGET_FLAGS_x86 := x86_64-linux-gnu + CLANG_TARGET_FLAGS_x86_64 := x86_64-linux-gnu + + # Default to host architecture if ARCH is not explicitly given. + ifeq ($(ARCH), $(HOSTARCH)) + CLANG_TARGET_FLAGS := $(shell $(CLANG) -print-target-triple) + else + CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH)) + endif + + ifeq ($(CROSS_COMPILE),) + ifeq ($(CLANG_TARGET_FLAGS),) + $(error Specify CROSS_COMPILE or add CLANG_TARGET_FLAGS for $(ARCH)) + else + CLANG_FLAGS += --target=$(CLANG_TARGET_FLAGS) + endif # CLANG_TARGET_FLAGS + else + CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%)) + endif # CROSS_COMPILE + + CC := $(CLANG) $(CLANG_FLAGS) -fintegrated-as + CXX := $(CXX) $(CLANG_FLAGS) -fintegrated-as + + # Enabled Wthread-safety analysis for clang builds. CFLAGS += -Wthread-safety endif @@ -417,10 +448,6 @@ ifeq ($(feature-eventfd), 1) CFLAGS += -DHAVE_EVENTFD_SUPPORT endif -ifeq ($(feature-get_current_dir_name), 1) - CFLAGS += -DHAVE_GET_CURRENT_DIR_NAME -endif - ifeq ($(feature-gettid), 1) CFLAGS += -DHAVE_GETTID endif @@ -600,13 +627,6 @@ ifndef NO_LIBELF LIBBPF_INCLUDE = $(LIBBPF_DIR)/.. endif endif - - FEATURE_CHECK_CFLAGS-libbpf-strings="-I$(LIBBPF_INCLUDE)" - $(call feature_check,libbpf-strings) - ifeq ($(feature-libbpf-strings), 1) - $(call detected,CONFIG_LIBBPF_STRINGS) - CFLAGS += -DHAVE_LIBBPF_STRINGS_SUPPORT - endif endif endif # NO_LIBBPF endif # NO_LIBELF @@ -784,15 +804,10 @@ endif ifndef NO_SLANG ifneq ($(feature-libslang), 1) - ifneq ($(feature-libslang-include-subdir), 1) - $(warning slang not found, disables TUI support. Please install slang-devel, libslang-dev or libslang2-dev) - NO_SLANG := 1 - else - CFLAGS += -DHAVE_SLANG_INCLUDE_SUBDIR - endif + $(warning slang not found, disables TUI support. Please install slang-devel, libslang-dev or libslang2-dev) + NO_SLANG := 1 endif ifndef NO_SLANG - # Fedora has /usr/include/slang/slang.h, but ubuntu /usr/include/slang.h CFLAGS += -DHAVE_SLANG_SUPPORT EXTLIBS += -lslang $(call detected,CONFIG_SLANG) @@ -817,9 +832,7 @@ ifdef GTK2 endif endif -ifdef NO_LIBPERL - CFLAGS += -DNO_LIBPERL -else +ifdef LIBPERL PERL_EMBED_LDOPTS = $(shell perl -MExtUtils::Embed -e ldopts 2>/dev/null) PERL_EMBED_LDFLAGS = $(call strip-libs,$(PERL_EMBED_LDOPTS)) PERL_EMBED_LIBADD = $(call grep-libs,$(PERL_EMBED_LDOPTS)) @@ -829,17 +842,13 @@ else PERL_EMBED_LDOPTS := $(filter-out -specs=%,$(PERL_EMBED_LDOPTS)) FLAGS_PERL_EMBED=$(PERL_EMBED_CCOPTS) $(PERL_EMBED_LDOPTS) + $(call feature_check,libperl) ifneq ($(feature-libperl), 1) - CFLAGS += -DNO_LIBPERL - NO_LIBPERL := 1 - $(warning Missing perl devel files. Disabling perl scripting support, please install perl-ExtUtils-Embed/libperl-dev) + $(error Missing perl devel files. Please install perl-ExtUtils-Embed/libperl-dev) else LDFLAGS += $(PERL_EMBED_LDFLAGS) EXTLIBS += $(PERL_EMBED_LIBADD) CFLAGS += -DHAVE_LIBPERL_SUPPORT - ifeq ($(CC_NO_CLANG), 0) - CFLAGS += -Wno-compound-token-split-by-macro - endif $(call detected,CONFIG_LIBPERL) endif endif @@ -947,6 +956,7 @@ ifdef BUILD_NONDISTRO CFLAGS += -DHAVE_LIBBFD_SUPPORT CXXFLAGS += -DHAVE_LIBBFD_SUPPORT + $(call detected,CONFIG_LIBBFD) $(call feature_check,libbfd-buildid) @@ -955,6 +965,14 @@ ifdef BUILD_NONDISTRO else $(warning Old version of libbfd/binutils things like PE executable profiling will not be available) endif + + ifeq ($(feature-disassembler-four-args), 1) + CFLAGS += -DDISASM_FOUR_ARGS_SIGNATURE + endif + + ifeq ($(feature-disassembler-init-styled), 1) + CFLAGS += -DDISASM_INIT_STYLED + endif endif ifndef NO_LIBLLVM @@ -1046,14 +1064,6 @@ ifdef HAVE_KVM_STAT_SUPPORT CFLAGS += -DHAVE_KVM_STAT_SUPPORT endif -ifeq ($(feature-disassembler-four-args), 1) - CFLAGS += -DDISASM_FOUR_ARGS_SIGNATURE -endif - -ifeq ($(feature-disassembler-init-styled), 1) - CFLAGS += -DDISASM_INIT_STYLED -endif - ifeq (${IS_64_BIT}, 1) ifndef NO_PERF_READ_VDSO32 $(call feature_check,compile-32) @@ -1181,20 +1191,6 @@ ifneq ($(NO_LIBTRACEEVENT),1) else $(error ERROR: libtraceevent is missing. Please install libtraceevent-dev/libtraceevent-devel and/or set LIBTRACEEVENT_DIR or build with NO_LIBTRACEEVENT=1) endif - - ifeq ($(feature-libtracefs), 1) - CFLAGS += $(shell $(PKG_CONFIG) --cflags libtracefs) - LDFLAGS += $(shell $(PKG_CONFIG) --libs-only-L libtracefs) - EXTLIBS += $(shell $(PKG_CONFIG) --libs-only-l libtracefs) - LIBTRACEFS_VERSION := $(shell $(PKG_CONFIG) --modversion libtracefs).0.0 - LIBTRACEFS_VERSION_1 := $(word 1, $(subst ., ,$(LIBTRACEFS_VERSION))) - LIBTRACEFS_VERSION_2 := $(word 2, $(subst ., ,$(LIBTRACEFS_VERSION))) - LIBTRACEFS_VERSION_3 := $(word 3, $(subst ., ,$(LIBTRACEFS_VERSION))) - LIBTRACEFS_VERSION_CPP := $(shell expr $(LIBTRACEFS_VERSION_1) \* 255 \* 255 + $(LIBTRACEFS_VERSION_2) \* 255 + $(LIBTRACEFS_VERSION_3)) - CFLAGS += -DLIBTRACEFS_VERSION=$(LIBTRACEFS_VERSION_CPP) - else - $(warning libtracefs is missing. Please install libtracefs-dev/libtracefs-devel) - endif endif # Among the variables below, these: diff --git a/tools/perf/Makefile.perf b/tools/perf/Makefile.perf index e2150acc2c13..47c906b807ef 100644 --- a/tools/perf/Makefile.perf +++ b/tools/perf/Makefile.perf @@ -17,7 +17,7 @@ include ../scripts/utilities.mak # # Define CROSS_COMPILE as prefix name of compiler if you want cross-builds. # -# Define NO_LIBPERL to disable perl script extension. +# Define LIBPERL to enable perl script extension. # # Define NO_LIBPYTHON to disable python script extension. # @@ -194,7 +194,7 @@ else # paths are used instead. ifdef CROSS_COMPILE ifeq ($(PKG_CONFIG_LIBDIR)$(PKG_CONFIG_PATH)$(PKG_CONFIG_SYSROOT_DIR),) - CROSS_ARCH = $(shell $(CC) -dumpmachine) + CROSS_ARCH = $(notdir $(CROSS_COMPILE:%-=%)) PKG_CONFIG_LIBDIR := /usr/local/$(CROSS_ARCH)/lib/pkgconfig/ PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/local/lib/$(CROSS_ARCH)/pkgconfig/ PKG_CONFIG_LIBDIR := $(PKG_CONFIG_LIBDIR):/usr/lib/$(CROSS_ARCH)/pkgconfig/ @@ -941,7 +941,7 @@ $(OUTPUT)dlfilters/%.so: $(OUTPUT)dlfilters/%.o ifndef NO_JVMTI LIBJVMTI_IN := $(OUTPUT)jvmti/jvmti-in.o -$(LIBJVMTI_IN): FORCE +$(LIBJVMTI_IN): prepare FORCE $(Q)$(MAKE) -f $(srctree)/tools/build/Makefile.build dir=jvmti obj=jvmti $(OUTPUT)$(LIBJVMTI): $(LIBJVMTI_IN) @@ -1103,7 +1103,7 @@ endif $(INSTALL) $(OUTPUT)perf-archive -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)' $(call QUIET_INSTALL, perf-iostat) \ $(INSTALL) $(OUTPUT)perf-iostat -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)' -ifndef NO_LIBPERL +ifdef LIBPERL $(call QUIET_INSTALL, perl-scripts) \ $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/Perf-Trace-Util/lib/Perf/Trace'; \ $(INSTALL) scripts/perl/Perf-Trace-Util/lib/Perf/Trace/* -m 644 -t '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/scripts/perl/Perf-Trace-Util/lib/Perf/Trace'; \ diff --git a/tools/perf/arch/arm64/util/arm-spe.c b/tools/perf/arch/arm64/util/arm-spe.c index 4f2833b62ff5..cac43cde7dbe 100644 --- a/tools/perf/arch/arm64/util/arm-spe.c +++ b/tools/perf/arch/arm64/util/arm-spe.c @@ -121,12 +121,17 @@ static int arm_spe_save_cpu_header(struct auxtrace_record *itr, /* No Arm SPE PMU is found */ data[ARM_SPE_CPU_PMU_TYPE] = ULLONG_MAX; data[ARM_SPE_CAP_MIN_IVAL] = 0; + data[ARM_SPE_CAP_EVENT_FILTER] = 0; } else { data[ARM_SPE_CPU_PMU_TYPE] = pmu->type; if (perf_pmu__scan_file(pmu, "caps/min_interval", "%lu", &val) != 1) val = 0; data[ARM_SPE_CAP_MIN_IVAL] = val; + + if (perf_pmu__scan_file(pmu, "caps/event_filter", "%lx", &val) != 1) + val = 0; + data[ARM_SPE_CAP_EVENT_FILTER] = val; } free(cpuid); diff --git a/tools/perf/arch/arm64/util/arm64_exception_types.h b/tools/perf/arch/arm64/util/arm64_exception_types.h index 27c981ebe401..bf827f19ace0 100644 --- a/tools/perf/arch/arm64/util/arm64_exception_types.h +++ b/tools/perf/arch/arm64/util/arm64_exception_types.h @@ -31,9 +31,10 @@ #define ESR_ELx_EC_FP_ASIMD (0x07) #define ESR_ELx_EC_CP10_ID (0x08) /* EL2 only */ #define ESR_ELx_EC_PAC (0x09) /* EL2 and above */ -/* Unallocated EC: 0x0A - 0x0B */ +#define ESR_ELx_EC_OTHER (0x0A) +/* Unallocated EC: 0x0B */ #define ESR_ELx_EC_CP14_64 (0x0C) -/* Unallocated EC: 0x0d */ +#define ESR_ELx_EC_BTI (0x0D) #define ESR_ELx_EC_ILL (0x0E) /* Unallocated EC: 0x0F - 0x10 */ #define ESR_ELx_EC_SVC32 (0x11) @@ -46,7 +47,10 @@ #define ESR_ELx_EC_SYS64 (0x18) #define ESR_ELx_EC_SVE (0x19) #define ESR_ELx_EC_ERET (0x1a) /* EL2 only */ -/* Unallocated EC: 0x1b - 0x1E */ +/* Unallocated EC: 0x1B */ +#define ESR_ELx_EC_FPAC (0x1C) /* EL1 and above */ +#define ESR_ELx_EC_SME (0x1D) +/* Unallocated EC: 0x1E */ #define ESR_ELx_EC_IMP_DEF (0x1f) /* EL3 only */ #define ESR_ELx_EC_IABT_LOW (0x20) #define ESR_ELx_EC_IABT_CUR (0x21) @@ -55,11 +59,12 @@ #define ESR_ELx_EC_DABT_LOW (0x24) #define ESR_ELx_EC_DABT_CUR (0x25) #define ESR_ELx_EC_SP_ALIGN (0x26) -/* Unallocated EC: 0x27 */ +#define ESR_ELx_EC_MOPS (0x27) #define ESR_ELx_EC_FP_EXC32 (0x28) /* Unallocated EC: 0x29 - 0x2B */ #define ESR_ELx_EC_FP_EXC64 (0x2C) -/* Unallocated EC: 0x2D - 0x2E */ +#define ESR_ELx_EC_GCS (0x2D) +/* Unallocated EC: 0x2E */ #define ESR_ELx_EC_SERROR (0x2F) #define ESR_ELx_EC_BREAKPT_LOW (0x30) #define ESR_ELx_EC_BREAKPT_CUR (0x31) diff --git a/tools/perf/arch/powerpc/util/Build b/tools/perf/arch/powerpc/util/Build index fdd6a77a3432..a5b0babd307e 100644 --- a/tools/perf/arch/powerpc/util/Build +++ b/tools/perf/arch/powerpc/util/Build @@ -10,3 +10,4 @@ perf-util-$(CONFIG_LIBDW) += skip-callchain-idx.o perf-util-$(CONFIG_LIBUNWIND) += unwind-libunwind.o perf-util-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o +perf-util-$(CONFIG_AUXTRACE) += auxtrace.o diff --git a/tools/perf/arch/powerpc/util/auxtrace.c b/tools/perf/arch/powerpc/util/auxtrace.c new file mode 100644 index 000000000000..62c6f67f1bbe --- /dev/null +++ b/tools/perf/arch/powerpc/util/auxtrace.c @@ -0,0 +1,103 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * VPA support + */ + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/string.h> + +#include "../../util/evlist.h" +#include "../../util/debug.h" +#include "../../util/auxtrace.h" +#include "../../util/powerpc-vpadtl.h" +#include "../../util/record.h" +#include <internal/lib.h> // page_size + +#define KiB(x) ((x) * 1024) + +static int +powerpc_vpadtl_recording_options(struct auxtrace_record *ar __maybe_unused, + struct evlist *evlist __maybe_unused, + struct record_opts *opts) +{ + opts->full_auxtrace = true; + + /* + * Set auxtrace_mmap_pages to minimum + * two pages + */ + if (!opts->auxtrace_mmap_pages) { + opts->auxtrace_mmap_pages = KiB(128) / page_size; + if (opts->mmap_pages == UINT_MAX) + opts->mmap_pages = KiB(256) / page_size; + } + + return 0; +} + +static size_t powerpc_vpadtl_info_priv_size(struct auxtrace_record *itr __maybe_unused, + struct evlist *evlist __maybe_unused) +{ + return VPADTL_AUXTRACE_PRIV_SIZE; +} + +static int +powerpc_vpadtl_info_fill(struct auxtrace_record *itr __maybe_unused, + struct perf_session *session __maybe_unused, + struct perf_record_auxtrace_info *auxtrace_info, + size_t priv_size __maybe_unused) +{ + auxtrace_info->type = PERF_AUXTRACE_VPA_DTL; + + return 0; +} + +static void powerpc_vpadtl_free(struct auxtrace_record *itr) +{ + free(itr); +} + +static u64 powerpc_vpadtl_reference(struct auxtrace_record *itr __maybe_unused) +{ + return 0; +} + +struct auxtrace_record *auxtrace_record__init(struct evlist *evlist, + int *err) +{ + struct auxtrace_record *aux; + struct evsel *pos; + int found = 0; + + evlist__for_each_entry(evlist, pos) { + if (strstarts(pos->name, "vpa_dtl")) { + found = 1; + pos->needs_auxtrace_mmap = true; + break; + } + } + + if (!found) + return NULL; + + /* + * To obtain the auxtrace buffer file descriptor, the auxtrace event + * must come first. + */ + evlist__to_front(pos->evlist, pos); + + aux = zalloc(sizeof(*aux)); + if (aux == NULL) { + pr_debug("aux record is NULL\n"); + *err = -ENOMEM; + return NULL; + } + + aux->recording_options = powerpc_vpadtl_recording_options; + aux->info_priv_size = powerpc_vpadtl_info_priv_size; + aux->info_fill = powerpc_vpadtl_info_fill; + aux->free = powerpc_vpadtl_free; + aux->reference = powerpc_vpadtl_reference; + return aux; +} diff --git a/tools/perf/arch/x86/annotate/instructions.c b/tools/perf/arch/x86/annotate/instructions.c index c6d403eae744..da98a4e3c52c 100644 --- a/tools/perf/arch/x86/annotate/instructions.c +++ b/tools/perf/arch/x86/annotate/instructions.c @@ -301,7 +301,7 @@ static void update_insn_state_x86(struct type_state *state, * as a pointer. */ tsr->type = type_die; - tsr->kind = TSR_KIND_POINTER; + tsr->kind = TSR_KIND_PERCPU_POINTER; tsr->ok = true; pr_debug_dtp("add [%x] percpu %#"PRIx64" -> reg%d", @@ -521,7 +521,7 @@ retry: } /* And then dereference the calculated pointer if it has one */ else if (has_reg_type(state, sreg) && state->regs[sreg].ok && - state->regs[sreg].kind == TSR_KIND_POINTER && + state->regs[sreg].kind == TSR_KIND_PERCPU_POINTER && die_get_member_type(&state->regs[sreg].type, src->offset, &type_die)) { tsr->type = type_die; diff --git a/tools/perf/arch/x86/util/evsel.c b/tools/perf/arch/x86/util/evsel.c index 9bc80fff3aa0..23a8e662a912 100644 --- a/tools/perf/arch/x86/util/evsel.c +++ b/tools/perf/arch/x86/util/evsel.c @@ -1,10 +1,15 @@ // SPDX-License-Identifier: GPL-2.0 +#include <errno.h> #include <stdio.h> #include <stdlib.h> +#include "util/evlist.h" #include "util/evsel.h" +#include "util/evsel_config.h" #include "util/env.h" #include "util/pmu.h" #include "util/pmus.h" +#include "util/stat.h" +#include "util/strbuf.h" #include "linux/string.h" #include "topdown.h" #include "evsel.h" @@ -67,6 +72,57 @@ int arch_evsel__hw_name(struct evsel *evsel, char *bf, size_t size) event_name); } +void arch_evsel__apply_ratio_to_prev(struct evsel *evsel, + struct perf_event_attr *attr) +{ + struct perf_event_attr *prev_attr = NULL; + struct evsel *evsel_prev = NULL; + const char *name = "acr_mask"; + int evsel_idx = 0; + __u64 ev_mask, pr_ev_mask; + + if (!perf_pmu__has_format(evsel->pmu, name)) { + pr_err("'%s' does not have acr_mask format support\n", evsel->pmu->name); + return; + } + if (perf_pmu__format_type(evsel->pmu, name) != + PERF_PMU_FORMAT_VALUE_CONFIG2) { + pr_err("'%s' does not have config2 format support\n", evsel->pmu->name); + return; + } + + evsel_prev = evsel__prev(evsel); + if (!evsel_prev) { + pr_err("Previous event does not exist.\n"); + return; + } + + prev_attr = &evsel_prev->core.attr; + + if (prev_attr->config2) { + pr_err("'%s' has set config2 (acr_mask?) already, configuration not supported\n", evsel_prev->name); + return; + } + + /* + * acr_mask (config2) is calculated using the event's index in + * the event group. The first event will use the index of the + * second event as its mask (e.g., 0x2), indicating that the + * second event counter will be reset and a sample taken for + * the first event if its counter overflows. The second event + * will use the mask consisting of the first and second bits + * (e.g., 0x3), meaning both counters will be reset if the + * second event counter overflows. + */ + + evsel_idx = evsel__group_idx(evsel); + ev_mask = 1ull << evsel_idx; + pr_ev_mask = 1ull << (evsel_idx - 1); + + prev_attr->config2 = ev_mask; + attr->config2 = ev_mask | pr_ev_mask; +} + static void ibs_l3miss_warn(void) { pr_warning( @@ -102,13 +158,15 @@ void arch__post_evsel_config(struct evsel *evsel, struct perf_event_attr *attr) } } -int arch_evsel__open_strerror(struct evsel *evsel, char *msg, size_t size) +static int amd_evsel__open_strerror(struct evsel *evsel, char *msg, size_t size) { - if (!x86__is_amd_cpu()) + struct perf_pmu *pmu; + + if (evsel->core.attr.precise_ip == 0) return 0; - if (!evsel->core.attr.precise_ip && - !(evsel->pmu && !strncmp(evsel->pmu->name, "ibs", 3))) + pmu = evsel__find_pmu(evsel); + if (!pmu || strncmp(pmu->name, "ibs", 3)) return 0; /* More verbose IBS errors. */ @@ -118,6 +176,54 @@ int arch_evsel__open_strerror(struct evsel *evsel, char *msg, size_t size) return scnprintf(msg, size, "AMD IBS doesn't support privilege filtering. Try " "again without the privilege modifiers (like 'k') at the end."); } + return 0; +} + +static int intel_evsel__open_strerror(struct evsel *evsel, int err, char *msg, size_t size) +{ + struct strbuf sb = STRBUF_INIT; + int ret; + + if (err != EINVAL) + return 0; + if (!topdown_sys_has_perf_metrics()) + return 0; + + if (arch_is_topdown_slots(evsel)) { + if (!evsel__is_group_leader(evsel)) { + evlist__uniquify_evsel_names(evsel->evlist, &stat_config); + evlist__format_evsels(evsel->evlist, &sb, 2048); + ret = scnprintf(msg, size, "Topdown slots event can only be group leader " + "in '%s'.", sb.buf); + strbuf_release(&sb); + return ret; + } + } else if (arch_is_topdown_metrics(evsel)) { + struct evsel *pos; + + evlist__for_each_entry(evsel->evlist, pos) { + if (pos == evsel || !arch_is_topdown_metrics(pos)) + continue; + + if (pos->core.attr.config != evsel->core.attr.config) + continue; + + evlist__uniquify_evsel_names(evsel->evlist, &stat_config); + evlist__format_evsels(evsel->evlist, &sb, 2048); + ret = scnprintf(msg, size, "Perf metric event '%s' is duplicated " + "in the same group (only one event is allowed) in '%s'.", + evsel__name(evsel), sb.buf); + strbuf_release(&sb); + return ret; + } + } return 0; } + +int arch_evsel__open_strerror(struct evsel *evsel, int err, char *msg, size_t size) +{ + return x86__is_amd_cpu() + ? amd_evsel__open_strerror(evsel, msg, size) + : intel_evsel__open_strerror(evsel, err, msg, size); +} diff --git a/tools/perf/arch/x86/util/kvm-stat.c b/tools/perf/arch/x86/util/kvm-stat.c index 424716518b75..bff36f9345ea 100644 --- a/tools/perf/arch/x86/util/kvm-stat.c +++ b/tools/perf/arch/x86/util/kvm-stat.c @@ -3,9 +3,11 @@ #include <string.h> #include "../../../util/kvm-stat.h" #include "../../../util/evsel.h" +#include "../../../util/env.h" #include <asm/svm.h> #include <asm/vmx.h> #include <asm/kvm.h> +#include <subcmd/parse-options.h> define_exit_reasons_table(vmx_exit_reasons, VMX_EXIT_REASONS); define_exit_reasons_table(svm_exit_reasons, SVM_EXIT_REASONS); @@ -211,3 +213,52 @@ int cpu_isa_init(struct perf_kvm_stat *kvm, const char *cpuid) return 0; } + +/* + * After KVM supports PEBS for guest on Intel platforms + * (https://lore.kernel.org/all/20220411101946.20262-1-likexu@tencent.com/), + * host loses the capability to sample guest with PEBS since all PEBS related + * MSRs are switched to guest value after vm-entry, like IA32_DS_AREA MSR is + * switched to guest GVA at vm-entry. This would lead to "perf kvm record" + * fails to sample guest on Intel platforms since "cycles:P" event is used to + * sample guest by default. + * + * So, to avoid this issue explicitly use "cycles" instead of "cycles:P" event + * by default to sample guest on Intel platforms. + */ +int kvm_add_default_arch_event(int *argc, const char **argv) +{ + const char **tmp; + bool event = false; + int ret = 0, i, j = *argc; + + const struct option event_options[] = { + OPT_BOOLEAN('e', "event", &event, NULL), + OPT_BOOLEAN(0, "pfm-events", &event, NULL), + OPT_END() + }; + + if (!x86__is_intel_cpu()) + return 0; + + tmp = calloc(j + 1, sizeof(char *)); + if (!tmp) + return -ENOMEM; + + for (i = 0; i < j; i++) + tmp[i] = argv[i]; + + parse_options(j, tmp, event_options, NULL, PARSE_OPT_KEEP_UNKNOWN); + if (!event) { + argv[j++] = STRDUP_FAIL_EXIT("-e"); + argv[j++] = STRDUP_FAIL_EXIT("cycles"); + *argc += 2; + } + + free(tmp); + return 0; + +EXIT: + free(tmp); + return ret; +} diff --git a/tools/perf/bench/bench.h b/tools/perf/bench/bench.h index 9f736423af53..8519eb5a42fa 100644 --- a/tools/perf/bench/bench.h +++ b/tools/perf/bench/bench.h @@ -28,6 +28,7 @@ int bench_syscall_fork(int argc, const char **argv); int bench_syscall_execve(int argc, const char **argv); int bench_mem_memcpy(int argc, const char **argv); int bench_mem_memset(int argc, const char **argv); +int bench_mem_mmap(int argc, const char **argv); int bench_mem_find_bit(int argc, const char **argv); int bench_futex_hash(int argc, const char **argv); int bench_futex_wake(int argc, const char **argv); diff --git a/tools/perf/bench/futex.h b/tools/perf/bench/futex.h index dd295d27044a..fcb72d682cf8 100644 --- a/tools/perf/bench/futex.h +++ b/tools/perf/bench/futex.h @@ -8,6 +8,7 @@ #ifndef _FUTEX_H #define _FUTEX_H +#include <stdbool.h> #include <unistd.h> #include <sys/syscall.h> #include <sys/types.h> diff --git a/tools/perf/bench/mem-functions.c b/tools/perf/bench/mem-functions.c index 19d45c377ac1..2908a3a796c9 100644 --- a/tools/perf/bench/mem-functions.c +++ b/tools/perf/bench/mem-functions.c @@ -22,27 +22,39 @@ #include <string.h> #include <unistd.h> #include <sys/time.h> +#include <sys/mman.h> #include <errno.h> #include <linux/time64.h> -#include <linux/zalloc.h> +#include <linux/log2.h> #define K 1024 +#define PAGE_SHIFT_4KB 12 +#define PAGE_SHIFT_2MB 21 +#define PAGE_SHIFT_1GB 30 + static const char *size_str = "1MB"; static const char *function_str = "all"; -static int nr_loops = 1; +static const char *page_size_str = "4KB"; +static const char *chunk_size_str = "0"; +static unsigned int nr_loops = 1; static bool use_cycles; static int cycles_fd; +static unsigned int seed; -static const struct option options[] = { +static const struct option bench_common_options[] = { OPT_STRING('s', "size", &size_str, "1MB", "Specify the size of the memory buffers. " "Available units: B, KB, MB, GB and TB (case insensitive)"), + OPT_STRING('p', "page", &page_size_str, "4KB", + "Specify page-size for mapping memory buffers. " + "Available sizes: 4KB, 2MB, 1GB (case insensitive)"), + OPT_STRING('f', "function", &function_str, "all", "Specify the function to run, \"all\" runs all available functions, \"help\" lists them"), - OPT_INTEGER('l', "nr_loops", &nr_loops, + OPT_UINTEGER('l', "nr_loops", &nr_loops, "Specify the number of loops to run. (default: 1)"), OPT_BOOLEAN('c', "cycles", &use_cycles, @@ -51,15 +63,56 @@ static const struct option options[] = { OPT_END() }; +static const struct option bench_mem_options[] = { + OPT_STRING('k', "chunk", &chunk_size_str, "0", + "Specify the chunk-size for each invocation. " + "Available units: B, KB, MB, GB and TB (case insensitive)"), + OPT_PARENT(bench_common_options), + OPT_END() +}; + +union bench_clock { + u64 cycles; + struct timeval tv; +}; + +struct bench_params { + size_t size; + size_t size_total; + size_t chunk_size; + unsigned int nr_loops; + unsigned int page_shift; + unsigned int seed; +}; + +struct bench_mem_info { + const struct function *functions; + int (*do_op)(const struct function *r, struct bench_params *p, + void *src, void *dst, union bench_clock *rt); + const char *const *usage; + const struct option *options; + bool alloc_src; +}; + +typedef bool (*mem_init_t)(struct bench_mem_info *, struct bench_params *, + void **, void **); +typedef void (*mem_fini_t)(struct bench_mem_info *, struct bench_params *, + void **, void **); typedef void *(*memcpy_t)(void *, const void *, size_t); typedef void *(*memset_t)(void *, int, size_t); +typedef void (*mmap_op_t)(void *, size_t, unsigned int, bool); struct function { const char *name; const char *desc; - union { - memcpy_t memcpy; - memset_t memset; + struct { + mem_init_t init; + mem_fini_t fini; + union { + memcpy_t memcpy; + memset_t memset; + mmap_op_t mmap_op; + }; } fn; }; @@ -91,6 +144,34 @@ static u64 get_cycles(void) return clk; } +static void clock_get(union bench_clock *t) +{ + if (use_cycles) + t->cycles = get_cycles(); + else + BUG_ON(gettimeofday(&t->tv, NULL)); +} + +static union bench_clock clock_diff(union bench_clock *s, union bench_clock *e) +{ + union bench_clock t; + + if (use_cycles) + t.cycles = e->cycles - s->cycles; + else + timersub(&e->tv, &s->tv, &t.tv); + + return t; +} + +static void clock_accum(union bench_clock *a, union bench_clock *b) +{ + if (use_cycles) + a->cycles += b->cycles; + else + timeradd(&a->tv, &b->tv, &a->tv); +} + static double timeval2double(struct timeval *ts) { return (double)ts->tv_sec + (double)ts->tv_usec / (double)USEC_PER_SEC; @@ -107,54 +188,40 @@ static double timeval2double(struct timeval *ts) printf(" %14lf GB/sec\n", x / K / K / K); \ } while (0) -struct bench_mem_info { - const struct function *functions; - u64 (*do_cycles)(const struct function *r, size_t size, void *src, void *dst); - double (*do_gettimeofday)(const struct function *r, size_t size, void *src, void *dst); - const char *const *usage; - bool alloc_src; -}; - -static void __bench_mem_function(struct bench_mem_info *info, int r_idx, size_t size, double size_total) +static void __bench_mem_function(struct bench_mem_info *info, struct bench_params *p, + int r_idx) { const struct function *r = &info->functions[r_idx]; double result_bps = 0.0; - u64 result_cycles = 0; - void *src = NULL, *dst = zalloc(size); + union bench_clock rt = { 0 }; + void *src = NULL, *dst = NULL; printf("# function '%s' (%s)\n", r->name, r->desc); - if (dst == NULL) - goto out_alloc_failed; - - if (info->alloc_src) { - src = zalloc(size); - if (src == NULL) - goto out_alloc_failed; - } + if (r->fn.init && r->fn.init(info, p, &src, &dst)) + goto out_init_failed; if (bench_format == BENCH_FORMAT_DEFAULT) printf("# Copying %s bytes ...\n\n", size_str); - if (use_cycles) { - result_cycles = info->do_cycles(r, size, src, dst); - } else { - result_bps = info->do_gettimeofday(r, size, src, dst); - } + if (info->do_op(r, p, src, dst, &rt)) + goto out_test_failed; switch (bench_format) { case BENCH_FORMAT_DEFAULT: if (use_cycles) { - printf(" %14lf cycles/byte\n", (double)result_cycles/size_total); + printf(" %14lf cycles/byte\n", (double)rt.cycles/(double)p->size_total); } else { + result_bps = (double)p->size_total/timeval2double(&rt.tv); print_bps(result_bps); } break; case BENCH_FORMAT_SIMPLE: if (use_cycles) { - printf("%lf\n", (double)result_cycles/size_total); + printf("%lf\n", (double)rt.cycles/(double)p->size_total); } else { + result_bps = (double)p->size_total/timeval2double(&rt.tv); printf("%lf\n", result_bps); } break; @@ -164,22 +231,23 @@ static void __bench_mem_function(struct bench_mem_info *info, int r_idx, size_t break; } +out_test_failed: out_free: - free(src); - free(dst); + if (r->fn.fini) r->fn.fini(info, p, &src, &dst); return; -out_alloc_failed: - printf("# Memory allocation failed - maybe size (%s) is too large?\n", size_str); +out_init_failed: + printf("# Memory allocation failed - maybe size (%s) %s?\n", size_str, + p->page_shift != PAGE_SHIFT_4KB ? "has insufficient hugepages" : "is too large"); goto out_free; } static int bench_mem_common(int argc, const char **argv, struct bench_mem_info *info) { int i; - size_t size; - double size_total; + struct bench_params p = { 0 }; + unsigned int page_size; - argc = parse_options(argc, argv, options, info->usage, 0); + argc = parse_options(argc, argv, info->options, info->usage, 0); if (use_cycles) { i = init_cycles(); @@ -189,17 +257,37 @@ static int bench_mem_common(int argc, const char **argv, struct bench_mem_info * } } - size = (size_t)perf_atoll((char *)size_str); - size_total = (double)size * nr_loops; + p.nr_loops = nr_loops; + p.size = (size_t)perf_atoll((char *)size_str); - if ((s64)size <= 0) { + if ((s64)p.size <= 0) { fprintf(stderr, "Invalid size:%s\n", size_str); return 1; } + p.size_total = p.size * p.nr_loops; + + p.chunk_size = (size_t)perf_atoll((char *)chunk_size_str); + if ((s64)p.chunk_size < 0 || (s64)p.chunk_size > (s64)p.size) { + fprintf(stderr, "Invalid chunk_size:%s\n", chunk_size_str); + return 1; + } + if (!p.chunk_size) + p.chunk_size = p.size; + + page_size = (unsigned int)perf_atoll((char *)page_size_str); + if (page_size != (1 << PAGE_SHIFT_4KB) && + page_size != (1 << PAGE_SHIFT_2MB) && + page_size != (1 << PAGE_SHIFT_1GB)) { + fprintf(stderr, "Invalid page-size:%s\n", page_size_str); + return 1; + } + p.page_shift = ilog2(page_size); + + p.seed = seed; if (!strncmp(function_str, "all", 3)) { for (i = 0; info->functions[i].name; i++) - __bench_mem_function(info, i, size, size_total); + __bench_mem_function(info, &p, i); return 0; } @@ -218,7 +306,7 @@ static int bench_mem_common(int argc, const char **argv, struct bench_mem_info * return 1; } - __bench_mem_function(info, i, size, size_total); + __bench_mem_function(info, &p, i); return 0; } @@ -235,47 +323,81 @@ static void memcpy_prefault(memcpy_t fn, size_t size, void *src, void *dst) fn(dst, src, size); } -static u64 do_memcpy_cycles(const struct function *r, size_t size, void *src, void *dst) +static int do_memcpy(const struct function *r, struct bench_params *p, + void *src, void *dst, union bench_clock *rt) { - u64 cycle_start = 0ULL, cycle_end = 0ULL; + union bench_clock start, end; memcpy_t fn = r->fn.memcpy; - int i; - memcpy_prefault(fn, size, src, dst); + memcpy_prefault(fn, p->size, src, dst); + + clock_get(&start); + for (unsigned int i = 0; i < p->nr_loops; ++i) + for (size_t off = 0; off < p->size; off += p->chunk_size) + fn(dst + off, src + off, min(p->chunk_size, p->size - off)); + clock_get(&end); - cycle_start = get_cycles(); - for (i = 0; i < nr_loops; ++i) - fn(dst, src, size); - cycle_end = get_cycles(); + *rt = clock_diff(&start, &end); - return cycle_end - cycle_start; + return 0; } -static double do_memcpy_gettimeofday(const struct function *r, size_t size, void *src, void *dst) +static void *bench_mmap(size_t size, bool populate, unsigned int page_shift) { - struct timeval tv_start, tv_end, tv_diff; - memcpy_t fn = r->fn.memcpy; - int i; + void *p; + int extra = populate ? MAP_POPULATE : 0; + + if (page_shift != PAGE_SHIFT_4KB) + extra |= MAP_HUGETLB | (page_shift << MAP_HUGE_SHIFT); + + p = mmap(NULL, size, PROT_READ|PROT_WRITE, + extra | MAP_PRIVATE | MAP_ANONYMOUS, 0, 0); + + return p == MAP_FAILED ? NULL : p; +} + +static void bench_munmap(void *p, size_t size) +{ + if (p) + munmap(p, size); +} + +static bool mem_alloc(struct bench_mem_info *info, struct bench_params *p, + void **src, void **dst) +{ + bool failed; - memcpy_prefault(fn, size, src, dst); + *dst = bench_mmap(p->size, true, p->page_shift); + failed = *dst == NULL; - BUG_ON(gettimeofday(&tv_start, NULL)); - for (i = 0; i < nr_loops; ++i) - fn(dst, src, size); - BUG_ON(gettimeofday(&tv_end, NULL)); + if (info->alloc_src) { + *src = bench_mmap(p->size, true, p->page_shift); + failed = failed || *src == NULL; + } + + return failed; +} - timersub(&tv_end, &tv_start, &tv_diff); +static void mem_free(struct bench_mem_info *info __maybe_unused, + struct bench_params *p __maybe_unused, + void **src, void **dst) +{ + bench_munmap(*dst, p->size); + bench_munmap(*src, p->size); - return (double)(((double)size * nr_loops) / timeval2double(&tv_diff)); + *dst = *src = NULL; } struct function memcpy_functions[] = { { .name = "default", .desc = "Default memcpy() provided by glibc", + .fn.init = mem_alloc, + .fn.fini = mem_free, .fn.memcpy = memcpy }, #ifdef HAVE_ARCH_X86_64_SUPPORT -# define MEMCPY_FN(_fn, _name, _desc) {.name = _name, .desc = _desc, .fn.memcpy = _fn}, +# define MEMCPY_FN(_fn, _init, _fini, _name, _desc) \ + {.name = _name, .desc = _desc, .fn.memcpy = _fn, .fn.init = _init, .fn.fini = _fini }, # include "mem-memcpy-x86-64-asm-def.h" # undef MEMCPY_FN #endif @@ -292,55 +414,36 @@ int bench_mem_memcpy(int argc, const char **argv) { struct bench_mem_info info = { .functions = memcpy_functions, - .do_cycles = do_memcpy_cycles, - .do_gettimeofday = do_memcpy_gettimeofday, + .do_op = do_memcpy, .usage = bench_mem_memcpy_usage, + .options = bench_mem_options, .alloc_src = true, }; return bench_mem_common(argc, argv, &info); } -static u64 do_memset_cycles(const struct function *r, size_t size, void *src __maybe_unused, void *dst) -{ - u64 cycle_start = 0ULL, cycle_end = 0ULL; - memset_t fn = r->fn.memset; - int i; - - /* - * We prefault the freshly allocated memory range here, - * to not measure page fault overhead: - */ - fn(dst, -1, size); - - cycle_start = get_cycles(); - for (i = 0; i < nr_loops; ++i) - fn(dst, i, size); - cycle_end = get_cycles(); - - return cycle_end - cycle_start; -} - -static double do_memset_gettimeofday(const struct function *r, size_t size, void *src __maybe_unused, void *dst) +static int do_memset(const struct function *r, struct bench_params *p, + void *src __maybe_unused, void *dst, union bench_clock *rt) { - struct timeval tv_start, tv_end, tv_diff; + union bench_clock start, end; memset_t fn = r->fn.memset; - int i; /* * We prefault the freshly allocated memory range here, * to not measure page fault overhead: */ - fn(dst, -1, size); + fn(dst, -1, p->size); - BUG_ON(gettimeofday(&tv_start, NULL)); - for (i = 0; i < nr_loops; ++i) - fn(dst, i, size); - BUG_ON(gettimeofday(&tv_end, NULL)); + clock_get(&start); + for (unsigned int i = 0; i < p->nr_loops; ++i) + for (size_t off = 0; off < p->size; off += p->chunk_size) + fn(dst + off, i, min(p->chunk_size, p->size - off)); + clock_get(&end); - timersub(&tv_end, &tv_start, &tv_diff); + *rt = clock_diff(&start, &end); - return (double)(((double)size * nr_loops) / timeval2double(&tv_diff)); + return 0; } static const char * const bench_mem_memset_usage[] = { @@ -351,10 +454,13 @@ static const char * const bench_mem_memset_usage[] = { static const struct function memset_functions[] = { { .name = "default", .desc = "Default memset() provided by glibc", + .fn.init = mem_alloc, + .fn.fini = mem_free, .fn.memset = memset }, #ifdef HAVE_ARCH_X86_64_SUPPORT -# define MEMSET_FN(_fn, _name, _desc) { .name = _name, .desc = _desc, .fn.memset = _fn }, +# define MEMSET_FN(_fn, _init, _fini, _name, _desc) \ + {.name = _name, .desc = _desc, .fn.memset = _fn, .fn.init = _init, .fn.fini = _fini }, # include "mem-memset-x86-64-asm-def.h" # undef MEMSET_FN #endif @@ -366,9 +472,91 @@ int bench_mem_memset(int argc, const char **argv) { struct bench_mem_info info = { .functions = memset_functions, - .do_cycles = do_memset_cycles, - .do_gettimeofday = do_memset_gettimeofday, + .do_op = do_memset, .usage = bench_mem_memset_usage, + .options = bench_mem_options, + }; + + return bench_mem_common(argc, argv, &info); +} + +static void mmap_page_touch(void *dst, size_t size, unsigned int page_shift, bool random) +{ + unsigned long npages = size / (1 << page_shift); + unsigned long offset = 0, r = 0; + + for (unsigned long i = 0; i < npages; i++) { + if (random) + r = rand() % (1 << page_shift); + + *((char *)dst + offset + r) = *(char *)(dst + offset + r) + i; + offset += 1 << page_shift; + } +} + +static int do_mmap(const struct function *r, struct bench_params *p, + void *src __maybe_unused, void *dst __maybe_unused, + union bench_clock *accum) +{ + union bench_clock start, end, diff; + mmap_op_t fn = r->fn.mmap_op; + bool populate = strcmp(r->name, "populate") == 0; + + if (p->seed) + srand(p->seed); + + for (unsigned int i = 0; i < p->nr_loops; i++) { + clock_get(&start); + dst = bench_mmap(p->size, populate, p->page_shift); + if (!dst) + goto out; + + fn(dst, p->size, p->page_shift, p->seed); + clock_get(&end); + diff = clock_diff(&start, &end); + clock_accum(accum, &diff); + + bench_munmap(dst, p->size); + } + + return 0; +out: + printf("# Memory allocation failed - maybe size (%s) %s?\n", size_str, + p->page_shift != PAGE_SHIFT_4KB ? "has insufficient hugepages" : "is too large"); + return -1; +} + +static const char * const bench_mem_mmap_usage[] = { + "perf bench mem mmap <options>", + NULL +}; + +static const struct function mmap_functions[] = { + { .name = "demand", + .desc = "Demand loaded mmap()", + .fn.mmap_op = mmap_page_touch }, + + { .name = "populate", + .desc = "Eagerly populated mmap()", + .fn.mmap_op = mmap_page_touch }, + + { .name = NULL, } +}; + +int bench_mem_mmap(int argc, const char **argv) +{ + static const struct option bench_mmap_options[] = { + OPT_UINTEGER('r', "randomize", &seed, + "Seed to randomize page access offset."), + OPT_PARENT(bench_common_options), + OPT_END() + }; + + struct bench_mem_info info = { + .functions = mmap_functions, + .do_op = do_mmap, + .usage = bench_mem_mmap_usage, + .options = bench_mmap_options, }; return bench_mem_common(argc, argv, &info); diff --git a/tools/perf/bench/mem-memcpy-arch.h b/tools/perf/bench/mem-memcpy-arch.h index 5bcaec5601a8..852e48cfd8fe 100644 --- a/tools/perf/bench/mem-memcpy-arch.h +++ b/tools/perf/bench/mem-memcpy-arch.h @@ -2,7 +2,7 @@ #ifdef HAVE_ARCH_X86_64_SUPPORT -#define MEMCPY_FN(fn, name, desc) \ +#define MEMCPY_FN(fn, init, fini, name, desc) \ void *fn(void *, const void *, size_t); #include "mem-memcpy-x86-64-asm-def.h" diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h index 6188e19d3129..f43038f4448b 100644 --- a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h +++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h @@ -1,9 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ MEMCPY_FN(memcpy_orig, + mem_alloc, + mem_free, "x86-64-unrolled", "unrolled memcpy() in arch/x86/lib/memcpy_64.S") MEMCPY_FN(__memcpy, + mem_alloc, + mem_free, "x86-64-movsq", "movsq-based memcpy() in arch/x86/lib/memcpy_64.S") diff --git a/tools/perf/bench/mem-memset-arch.h b/tools/perf/bench/mem-memset-arch.h index 53f45482663f..278c5da12d63 100644 --- a/tools/perf/bench/mem-memset-arch.h +++ b/tools/perf/bench/mem-memset-arch.h @@ -2,7 +2,7 @@ #ifdef HAVE_ARCH_X86_64_SUPPORT -#define MEMSET_FN(fn, name, desc) \ +#define MEMSET_FN(fn, init, fini, name, desc) \ void *fn(void *, int, size_t); #include "mem-memset-x86-64-asm-def.h" diff --git a/tools/perf/bench/mem-memset-x86-64-asm-def.h b/tools/perf/bench/mem-memset-x86-64-asm-def.h index 247c72fdfb9d..80ad1b7ea770 100644 --- a/tools/perf/bench/mem-memset-x86-64-asm-def.h +++ b/tools/perf/bench/mem-memset-x86-64-asm-def.h @@ -1,9 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ MEMSET_FN(memset_orig, + mem_alloc, + mem_free, "x86-64-unrolled", "unrolled memset() in arch/x86/lib/memset_64.S") MEMSET_FN(__memset, + mem_alloc, + mem_free, "x86-64-stosq", "movsq-based memset() in arch/x86/lib/memset_64.S") diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c index 5d57d2913f3d..646f43b0f7c4 100644 --- a/tools/perf/builtin-annotate.c +++ b/tools/perf/builtin-annotate.c @@ -917,11 +917,6 @@ int cmd_annotate(int argc, const char **argv) symbol_conf.annotate_data_sample = true; } else if (annotate_opts.code_with_type) { symbol_conf.annotate_data_member = true; - - if (!annotate.use_stdio) { - pr_err("--code-with-type only works with --stdio.\n"); - goto out_delete; - } } setup_browser(true); diff --git a/tools/perf/builtin-bench.c b/tools/perf/builtin-bench.c index 2c1a9f3d847a..02dea1b88228 100644 --- a/tools/perf/builtin-bench.c +++ b/tools/perf/builtin-bench.c @@ -65,6 +65,7 @@ static struct bench mem_benchmarks[] = { { "memcpy", "Benchmark for memcpy() functions", bench_mem_memcpy }, { "memset", "Benchmark for memset() functions", bench_mem_memset }, { "find_bit", "Benchmark for find_bit() functions", bench_mem_find_bit }, + { "mmap", "Benchmark for mmap() mappings", bench_mem_mmap }, { "all", "Run all memory access benchmarks", NULL }, { NULL, NULL, NULL } }; diff --git a/tools/perf/builtin-check.c b/tools/perf/builtin-check.c index b1e205871ab1..9ce2e71999df 100644 --- a/tools/perf/builtin-check.c +++ b/tools/perf/builtin-check.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include "builtin.h" #include "color.h" +#include "util/bpf-utils.h" #include "util/debug.h" #include "util/header.h" #include <tools/config.h> @@ -47,9 +48,10 @@ struct feature_status supported_features[] = { FEATURE_STATUS("libcapstone", HAVE_LIBCAPSTONE_SUPPORT), FEATURE_STATUS("libdw-dwarf-unwind", HAVE_LIBDW_SUPPORT), FEATURE_STATUS("libelf", HAVE_LIBELF_SUPPORT), + FEATURE_STATUS("libLLVM", HAVE_LIBLLVM_SUPPORT), FEATURE_STATUS("libnuma", HAVE_LIBNUMA_SUPPORT), FEATURE_STATUS("libopencsd", HAVE_CSTRACE_SUPPORT), - FEATURE_STATUS("libperl", HAVE_LIBPERL_SUPPORT), + FEATURE_STATUS_TIP("libperl", HAVE_LIBPERL_SUPPORT, "Deprecated, use LIBPERL=1 and install perl-ExtUtils-Embed/libperl-dev to build with it"), FEATURE_STATUS("libpfm4", HAVE_LIBPFM), FEATURE_STATUS("libpython", HAVE_LIBPYTHON_SUPPORT), FEATURE_STATUS("libslang", HAVE_SLANG_SUPPORT), diff --git a/tools/perf/builtin-kvm.c b/tools/perf/builtin-kvm.c index 7b15b4a705e4..f0f285763f19 100644 --- a/tools/perf/builtin-kvm.c +++ b/tools/perf/builtin-kvm.c @@ -1636,14 +1636,6 @@ exit: return ret; } -#define STRDUP_FAIL_EXIT(s) \ - ({ char *_p; \ - _p = strdup(s); \ - if (!_p) \ - return -ENOMEM; \ - _p; \ - }) - int __weak setup_kvm_events_tp(struct perf_kvm_stat *kvm __maybe_unused) { return 0; @@ -1688,7 +1680,7 @@ kvm_events_record(struct perf_kvm_stat *kvm, int argc, const char **argv) rec_argv[i] = STRDUP_FAIL_EXIT(record_args[i]); for (j = 0; j < events_tp_size; j++) { - rec_argv[i++] = "-e"; + rec_argv[i++] = STRDUP_FAIL_EXIT("-e"); rec_argv[i++] = STRDUP_FAIL_EXIT(kvm_events_tp[j]); } @@ -1696,7 +1688,7 @@ kvm_events_record(struct perf_kvm_stat *kvm, int argc, const char **argv) rec_argv[i++] = STRDUP_FAIL_EXIT(kvm->file_name); for (j = 1; j < (unsigned int)argc; j++, i++) - rec_argv[i] = argv[j]; + rec_argv[i] = STRDUP_FAIL_EXIT(argv[j]); set_option_flag(record_options, 'e', "event", PARSE_OPT_HIDDEN); set_option_flag(record_options, 0, "filter", PARSE_OPT_HIDDEN); @@ -1719,7 +1711,13 @@ kvm_events_record(struct perf_kvm_stat *kvm, int argc, const char **argv) set_option_flag(record_options, 0, "transaction", PARSE_OPT_DISABLED); record_usage = kvm_stat_record_usage; - return cmd_record(i, rec_argv); + ret = cmd_record(i, rec_argv); + +EXIT: + for (i = 0; i < rec_argc; i++) + free((void *)rec_argv[i]); + free(rec_argv); + return ret; } static int @@ -2000,58 +1998,122 @@ static int __cmd_record(const char *file_name, int argc, const char **argv) int rec_argc, i = 0, j, ret; const char **rec_argv; - ret = kvm_add_default_arch_event(&argc, argv); - if (ret) - return -EINVAL; - - rec_argc = argc + 2; + /* + * Besides the 2 more options "-o" and "filename", + * kvm_add_default_arch_event() may add 2 extra options, + * so allocate 4 more items. + */ + rec_argc = argc + 2 + 2; rec_argv = calloc(rec_argc + 1, sizeof(char *)); - rec_argv[i++] = strdup("record"); - rec_argv[i++] = strdup("-o"); - rec_argv[i++] = strdup(file_name); + if (!rec_argv) + return -ENOMEM; + + rec_argv[i++] = STRDUP_FAIL_EXIT("record"); + rec_argv[i++] = STRDUP_FAIL_EXIT("-o"); + rec_argv[i++] = STRDUP_FAIL_EXIT(file_name); for (j = 1; j < argc; j++, i++) - rec_argv[i] = argv[j]; + rec_argv[i] = STRDUP_FAIL_EXIT(argv[j]); BUG_ON(i != rec_argc); - return cmd_record(i, rec_argv); + ret = kvm_add_default_arch_event(&i, rec_argv); + if (ret) + goto EXIT; + + ret = cmd_record(i, rec_argv); + +EXIT: + for (i = 0; i < rec_argc; i++) + free((void *)rec_argv[i]); + free(rec_argv); + return ret; } static int __cmd_report(const char *file_name, int argc, const char **argv) { - int rec_argc, i = 0, j; + int rec_argc, i = 0, j, ret; const char **rec_argv; rec_argc = argc + 2; rec_argv = calloc(rec_argc + 1, sizeof(char *)); - rec_argv[i++] = strdup("report"); - rec_argv[i++] = strdup("-i"); - rec_argv[i++] = strdup(file_name); + if (!rec_argv) + return -ENOMEM; + + rec_argv[i++] = STRDUP_FAIL_EXIT("report"); + rec_argv[i++] = STRDUP_FAIL_EXIT("-i"); + rec_argv[i++] = STRDUP_FAIL_EXIT(file_name); for (j = 1; j < argc; j++, i++) - rec_argv[i] = argv[j]; + rec_argv[i] = STRDUP_FAIL_EXIT(argv[j]); BUG_ON(i != rec_argc); - return cmd_report(i, rec_argv); + ret = cmd_report(i, rec_argv); + +EXIT: + for (i = 0; i < rec_argc; i++) + free((void *)rec_argv[i]); + free(rec_argv); + return ret; } static int __cmd_buildid_list(const char *file_name, int argc, const char **argv) { - int rec_argc, i = 0, j; + int rec_argc, i = 0, j, ret; const char **rec_argv; rec_argc = argc + 2; rec_argv = calloc(rec_argc + 1, sizeof(char *)); - rec_argv[i++] = strdup("buildid-list"); - rec_argv[i++] = strdup("-i"); - rec_argv[i++] = strdup(file_name); + if (!rec_argv) + return -ENOMEM; + + rec_argv[i++] = STRDUP_FAIL_EXIT("buildid-list"); + rec_argv[i++] = STRDUP_FAIL_EXIT("-i"); + rec_argv[i++] = STRDUP_FAIL_EXIT(file_name); for (j = 1; j < argc; j++, i++) - rec_argv[i] = argv[j]; + rec_argv[i] = STRDUP_FAIL_EXIT(argv[j]); BUG_ON(i != rec_argc); - return cmd_buildid_list(i, rec_argv); + ret = cmd_buildid_list(i, rec_argv); + +EXIT: + for (i = 0; i < rec_argc; i++) + free((void *)rec_argv[i]); + free(rec_argv); + return ret; +} + +static int __cmd_top(int argc, const char **argv) +{ + int rec_argc, i = 0, ret; + const char **rec_argv; + + /* + * kvm_add_default_arch_event() may add 2 extra options, so + * allocate 2 more pointers in adavance. + */ + rec_argc = argc + 2; + rec_argv = calloc(rec_argc + 1, sizeof(char *)); + if (!rec_argv) + return -ENOMEM; + + for (i = 0; i < argc; i++) + rec_argv[i] = STRDUP_FAIL_EXIT(argv[i]); + + BUG_ON(i != argc); + + ret = kvm_add_default_arch_event(&i, rec_argv); + if (ret) + goto EXIT; + + ret = cmd_top(i, rec_argv); + +EXIT: + for (i = 0; i < rec_argc; i++) + free((void *)rec_argv[i]); + free(rec_argv); + return ret; } int cmd_kvm(int argc, const char **argv) @@ -2114,7 +2176,7 @@ int cmd_kvm(int argc, const char **argv) else if (strlen(argv[0]) > 2 && strstarts("diff", argv[0])) return cmd_diff(argc, argv); else if (!strcmp(argv[0], "top")) - return cmd_top(argc, argv); + return __cmd_top(argc, argv); else if (strlen(argv[0]) > 2 && strstarts("buildid-list", argv[0])) return __cmd_buildid_list(file_name, argc, argv); #if defined(HAVE_KVM_STAT_SUPPORT) && defined(HAVE_LIBTRACEEVENT) diff --git a/tools/perf/builtin-kwork.c b/tools/perf/builtin-kwork.c index d2e08de5976d..7f3068264568 100644 --- a/tools/perf/builtin-kwork.c +++ b/tools/perf/builtin-kwork.c @@ -2273,12 +2273,23 @@ static void setup_event_list(struct perf_kwork *kwork, pr_debug("\n"); } +#define STRDUP_FAIL_EXIT(s) \ + ({ char *_p; \ + _p = strdup(s); \ + if (!_p) { \ + ret = -ENOMEM; \ + goto EXIT; \ + } \ + _p; \ + }) + static int perf_kwork__record(struct perf_kwork *kwork, int argc, const char **argv) { const char **rec_argv; unsigned int rec_argc, i, j; struct kwork_class *class; + int ret; const char *const record_args[] = { "record", @@ -2298,17 +2309,17 @@ static int perf_kwork__record(struct perf_kwork *kwork, return -ENOMEM; for (i = 0; i < ARRAY_SIZE(record_args); i++) - rec_argv[i] = strdup(record_args[i]); + rec_argv[i] = STRDUP_FAIL_EXIT(record_args[i]); list_for_each_entry(class, &kwork->class_list, list) { for (j = 0; j < class->nr_tracepoints; j++) { - rec_argv[i++] = strdup("-e"); - rec_argv[i++] = strdup(class->tp_handlers[j].name); + rec_argv[i++] = STRDUP_FAIL_EXIT("-e"); + rec_argv[i++] = STRDUP_FAIL_EXIT(class->tp_handlers[j].name); } } for (j = 1; j < (unsigned int)argc; j++, i++) - rec_argv[i] = argv[j]; + rec_argv[i] = STRDUP_FAIL_EXIT(argv[j]); BUG_ON(i != rec_argc); @@ -2317,7 +2328,13 @@ static int perf_kwork__record(struct perf_kwork *kwork, pr_debug("%s ", rec_argv[j]); pr_debug("\n"); - return cmd_record(i, rec_argv); + ret = cmd_record(i, rec_argv); + +EXIT: + for (i = 0; i < rec_argc; i++) + free((void *)rec_argv[i]); + free(rec_argv); + return ret; } int cmd_kwork(int argc, const char **argv) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 7ea3a11aca70..d76f01956e33 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -1408,8 +1408,6 @@ try_again: ui__error("%s\n", msg); goto out; } - - pos->supported = true; } if (symbol_conf.kptr_restrict && !evlist__exclude_kernel(evlist)) { diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index f166d6cbc083..eca3b1c58c4b 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1532,35 +1532,24 @@ static int process_sched_wakeup_ignore(const struct perf_tool *tool __maybe_unus return 0; } -union map_priv { - void *ptr; - bool color; -}; - static bool thread__has_color(struct thread *thread) { - union map_priv priv = { - .ptr = thread__priv(thread), - }; - - return priv.color; + return thread__priv(thread) != NULL; } static struct thread* map__findnew_thread(struct perf_sched *sched, struct machine *machine, pid_t pid, pid_t tid) { struct thread *thread = machine__findnew_thread(machine, pid, tid); - union map_priv priv = { - .color = false, - }; + bool color = false; if (!sched->map.color_pids || !thread || thread__priv(thread)) return thread; if (thread_map__has(sched->map.color_pids, tid)) - priv.color = true; + color = true; - thread__set_priv(thread, priv.ptr); + thread__set_priv(thread, color ? ((void*)1) : NULL); return thread; } diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index d9fbdcf72f25..8124fcb51da9 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -43,6 +43,7 @@ #include <linux/stringify.h> #include <linux/time64.h> #include <linux/zalloc.h> +#include <linux/unaligned.h> #include <sys/utsname.h> #include "asm/bug.h" #include "util/mem-events.h" @@ -223,7 +224,7 @@ enum { OUTPUT_TYPE_MAX }; -// We need to refactor the evsel->priv use in in 'perf script' to allow for +// We need to refactor the evsel->priv use in 'perf script' to allow for // using that area, that is being used only in some cases. #define OUTPUT_TYPE_UNSET -1 @@ -1224,7 +1225,6 @@ static int any_dump_insn(struct evsel *evsel __maybe_unused, u8 *inbuf, int inlen, int *lenp, FILE *fp) { -#ifdef HAVE_LIBCAPSTONE_SUPPORT if (PRINT_FIELD(BRSTACKDISASM)) { int printed = fprintf_insn_asm(x->machine, x->thread, x->cpumode, x->is64bit, (uint8_t *)inbuf, inlen, ip, lenp, @@ -1233,7 +1233,6 @@ static int any_dump_insn(struct evsel *evsel __maybe_unused, if (printed > 0) return printed; } -#endif return fprintf(fp, "%s", dump_insn(x, ip, inbuf, inlen, lenp)); } @@ -2003,6 +2002,33 @@ static int perf_sample__fprintf_synth_iflag_chg(struct perf_sample *sample, FILE return len + perf_sample__fprintf_pt_spacing(len, fp); } +#ifdef HAVE_AUXTRACE_SUPPORT +static int perf_sample__fprintf_synth_vpadtl(struct perf_sample *data, FILE *fp) +{ + struct powerpc_vpadtl_entry *dtl = (struct powerpc_vpadtl_entry *)data->raw_data; + int len; + + len = fprintf(fp, "timebase: %" PRIu64 " dispatch_reason:%s, preempt_reason:%s,\n" + "enqueue_to_dispatch_time:%d, ready_to_enqueue_time:%d," + "waiting_to_ready_time:%d, processor_id: %d", + get_unaligned_be64(&dtl->timebase), + dispatch_reasons[dtl->dispatch_reason], + preempt_reasons[dtl->preempt_reason], + be32_to_cpu(dtl->enqueue_to_dispatch_time), + be32_to_cpu(dtl->ready_to_enqueue_time), + be32_to_cpu(dtl->waiting_to_ready_time), + be16_to_cpu(dtl->processor_id)); + + return len; +} +#else +static int perf_sample__fprintf_synth_vpadtl(struct perf_sample *data __maybe_unused, + FILE *fp __maybe_unused) +{ + return 0; +} +#endif + static int perf_sample__fprintf_synth(struct perf_sample *sample, struct evsel *evsel, FILE *fp) { @@ -2025,6 +2051,8 @@ static int perf_sample__fprintf_synth(struct perf_sample *sample, return perf_sample__fprintf_synth_evt(sample, fp); case PERF_SYNTH_INTEL_IFLAG_CHG: return perf_sample__fprintf_synth_iflag_chg(sample, fp); + case PERF_SYNTH_POWERPC_VPA_DTL: + return perf_sample__fprintf_synth_vpadtl(sample, fp); default: break; } diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 2c38dd98f6ca..7006f848f87a 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -610,38 +610,33 @@ static int dispatch_events(bool forks, int timeout, int interval, int *times) enum counter_recovery { COUNTER_SKIP, COUNTER_RETRY, - COUNTER_FATAL, }; -static enum counter_recovery stat_handle_error(struct evsel *counter) +static enum counter_recovery stat_handle_error(struct evsel *counter, int err) { char msg[BUFSIZ]; + + assert(!counter->supported); + /* * PPC returns ENXIO for HW counters until 2.6.37 * (behavior changed with commit b0a873e). */ - if (errno == EINVAL || errno == ENOSYS || - errno == ENOENT || errno == ENXIO) { - if (verbose > 0) + if (err == EINVAL || err == ENOSYS || err == ENOENT || err == ENXIO) { + if (verbose > 0) { ui__warning("%s event is not supported by the kernel.\n", evsel__name(counter)); - counter->supported = false; - /* - * errored is a sticky flag that means one of the counter's - * cpu event had a problem and needs to be reexamined. - */ - counter->errored = true; - - if ((evsel__leader(counter) != counter) || - !(counter->core.leader->nr_members > 1)) - return COUNTER_SKIP; - } else if (evsel__fallback(counter, &target, errno, msg, sizeof(msg))) { + } + return COUNTER_SKIP; + } + if (evsel__fallback(counter, &target, err, msg, sizeof(msg))) { if (verbose > 0) ui__warning("%s\n", msg); + counter->supported = true; return COUNTER_RETRY; - } else if (target__has_per_thread(&target) && errno != EOPNOTSUPP && - evsel_list->core.threads && - evsel_list->core.threads->err_thread != -1) { + } + if (target__has_per_thread(&target) && err != EOPNOTSUPP && + evsel_list->core.threads && evsel_list->core.threads->err_thread != -1) { /* * For global --per-thread case, skip current * error thread. @@ -649,37 +644,73 @@ static enum counter_recovery stat_handle_error(struct evsel *counter) if (!thread_map__remove(evsel_list->core.threads, evsel_list->core.threads->err_thread)) { evsel_list->core.threads->err_thread = -1; + counter->supported = true; return COUNTER_RETRY; } - } else if (counter->skippable) { - if (verbose > 0) - ui__warning("skipping event %s that kernel failed to open .\n", - evsel__name(counter)); - counter->supported = false; - counter->errored = true; - return COUNTER_SKIP; } + if (verbose > 0) { + ui__warning(err == EOPNOTSUPP + ? "%s event is not supported by the kernel.\n" + : "skipping event %s that kernel failed to open.\n", + evsel__name(counter)); + } + return COUNTER_SKIP; +} - if (errno == EOPNOTSUPP) { - if (verbose > 0) { - ui__warning("%s event is not supported by the kernel.\n", - evsel__name(counter)); - } - counter->supported = false; - counter->errored = true; +static int create_perf_stat_counter(struct evsel *evsel, + struct perf_stat_config *config, + int cpu_map_idx) +{ + struct perf_event_attr *attr = &evsel->core.attr; + struct evsel *leader = evsel__leader(evsel); + + /* Reset supported flag as creating a stat counter is retried. */ + attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | + PERF_FORMAT_TOTAL_TIME_RUNNING; + + /* + * The event is part of non trivial group, let's enable + * the group read (for leader) and ID retrieval for all + * members. + */ + if (leader->core.nr_members > 1) + attr->read_format |= PERF_FORMAT_ID|PERF_FORMAT_GROUP; - if ((evsel__leader(counter) != counter) || - !(counter->core.leader->nr_members > 1)) - return COUNTER_SKIP; + attr->inherit = !config->no_inherit && list_empty(&evsel->bpf_counter_list); + + /* + * Some events get initialized with sample_(period/type) set, + * like tracepoints. Clear it up for counting. + */ + attr->sample_period = 0; + + if (config->identifier) + attr->sample_type = PERF_SAMPLE_IDENTIFIER; + + if (config->all_user) { + attr->exclude_kernel = 1; + attr->exclude_user = 0; } - evsel__open_strerror(counter, &target, errno, msg, sizeof(msg)); - ui__error("%s\n", msg); + if (config->all_kernel) { + attr->exclude_kernel = 0; + attr->exclude_user = 1; + } - if (child_pid != -1) - kill(child_pid, SIGTERM); + /* + * Disabling all counters initially, they will be enabled + * either manually by us or by kernel via enable_on_exec + * set later. + */ + if (evsel__is_group_leader(evsel)) { + attr->disabled = 1; - return COUNTER_FATAL; + if (target__enable_on_exec(&target)) + attr->enable_on_exec = 1; + } + + return evsel__open_per_cpu_and_thread(evsel, evsel__cpus(evsel), cpu_map_idx, + evsel->core.threads); } static int __run_perf_stat(int argc, const char **argv, int run_idx) @@ -696,8 +727,8 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) bool is_pipe = STAT_RECORD ? perf_stat.data.is_pipe : false; struct evlist_cpu_iterator evlist_cpu_itr; struct affinity saved_affinity, *affinity = NULL; - int err; - bool second_pass = false; + int err, open_err = 0; + bool second_pass = false, has_supported_counters; if (forks) { if (evlist__prepare_workload(evsel_list, &target, argv, is_pipe, workload_exec_failed_signal) < 0) { @@ -737,14 +768,17 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) if (target.use_bpf) break; - if (counter->reset_group || counter->errored) + if (counter->reset_group || !counter->supported) continue; if (evsel__is_bperf(counter)) continue; -try_again: - if (create_perf_stat_counter(counter, &stat_config, &target, - evlist_cpu_itr.cpu_map_idx) < 0) { + while (true) { + if (create_perf_stat_counter(counter, &stat_config, + evlist_cpu_itr.cpu_map_idx) == 0) + break; + + open_err = errno; /* * Weak group failed. We cannot just undo this here * because earlier CPUs might be in group mode, and the kernel @@ -752,29 +786,19 @@ try_again: * it to later. * Don't close here because we're in the wrong affinity. */ - if ((errno == EINVAL || errno == EBADF) && + if ((open_err == EINVAL || open_err == EBADF) && evsel__leader(counter) != counter && counter->weak_group) { evlist__reset_weak_group(evsel_list, counter, false); assert(counter->reset_group); + counter->supported = true; second_pass = true; - continue; - } - - switch (stat_handle_error(counter)) { - case COUNTER_FATAL: - err = -1; - goto err_out; - case COUNTER_RETRY: - goto try_again; - case COUNTER_SKIP: - continue; - default: break; } + if (stat_handle_error(counter, open_err) != COUNTER_RETRY) + break; } - counter->supported = true; } if (second_pass) { @@ -787,7 +811,7 @@ try_again: evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) { counter = evlist_cpu_itr.evsel; - if (!counter->reset_group && !counter->errored) + if (!counter->reset_group && counter->supported) continue; perf_evsel__close_cpu(&counter->core, evlist_cpu_itr.cpu_map_idx); @@ -798,34 +822,29 @@ try_again: if (!counter->reset_group) continue; -try_again_reset: - pr_debug2("reopening weak %s\n", evsel__name(counter)); - if (create_perf_stat_counter(counter, &stat_config, &target, - evlist_cpu_itr.cpu_map_idx) < 0) { - - switch (stat_handle_error(counter)) { - case COUNTER_FATAL: - err = -1; - goto err_out; - case COUNTER_RETRY: - goto try_again_reset; - case COUNTER_SKIP: - continue; - default: + + while (true) { + pr_debug2("reopening weak %s\n", evsel__name(counter)); + if (create_perf_stat_counter(counter, &stat_config, + evlist_cpu_itr.cpu_map_idx) == 0) + break; + + open_err = errno; + if (stat_handle_error(counter, open_err) != COUNTER_RETRY) break; - } } - counter->supported = true; } } affinity__cleanup(affinity); affinity = NULL; + has_supported_counters = false; evlist__for_each_entry(evsel_list, counter) { if (!counter->supported) { perf_evsel__free_fd(&counter->core); continue; } + has_supported_counters = true; l = strlen(counter->unit); if (l > stat_config.unit_width) @@ -837,6 +856,16 @@ try_again_reset: goto err_out; } } + if (!has_supported_counters) { + evsel__open_strerror(evlist__first(evsel_list), &target, open_err, + msg, sizeof(msg)); + ui__error("No supported events found.\n%s\n", msg); + + if (child_pid != -1) + kill(child_pid, SIGTERM); + err = -1; + goto err_out; + } if (evlist__apply_filters(evsel_list, &counter, &target)) { pr_err("failed to set filter \"%s\" on event %s with %d (%s)\n", diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index fe737b3ac6e6..c607f39b8c8b 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -196,6 +196,7 @@ struct trace { unsigned int max_stack; unsigned int min_stack; enum trace_summary_mode summary_mode; + int max_summary; int raw_augmented_syscalls_args_size; bool raw_augmented_syscalls; bool fd_path_disabled; @@ -4440,7 +4441,7 @@ create_maps: if (trace->summary_mode == SUMMARY__BY_TOTAL && !trace->summary_bpf) { trace->syscall_stats = alloc_syscall_stats(); - if (trace->syscall_stats == NULL) + if (IS_ERR(trace->syscall_stats)) goto out_delete_evlist; } @@ -4599,7 +4600,7 @@ out_disable: if (!err) { if (trace->summary) { if (trace->summary_bpf) - trace_print_bpf_summary(trace->output); + trace_print_bpf_summary(trace->output, trace->max_summary); else if (trace->summary_mode == SUMMARY__BY_TOTAL) trace__fprintf_total_summary(trace, trace->output); else @@ -4748,7 +4749,7 @@ static int trace__replay(struct trace *trace) if (trace->summary_mode == SUMMARY__BY_TOTAL) { trace->syscall_stats = alloc_syscall_stats(); - if (trace->syscall_stats == NULL) + if (IS_ERR(trace->syscall_stats)) goto out; } @@ -4822,6 +4823,7 @@ static size_t syscall__dump_stats(struct trace *trace, int e_machine, FILE *fp, struct hashmap *syscall_stats) { size_t printed = 0; + int lines = 0; struct syscall *sc; struct syscall_entry *entries; @@ -4866,7 +4868,11 @@ static size_t syscall__dump_stats(struct trace *trace, int e_machine, FILE *fp, fprintf(fp, "\t\t\t\t%s: %d\n", perf_env__arch_strerrno(trace->host->env, e + 1), stats->errnos[e]); } } + lines++; } + + if (trace->max_summary && trace->max_summary <= lines) + break; } free(entries); @@ -5443,6 +5449,8 @@ int cmd_trace(int argc, const char **argv) OPT_BOOLEAN(0, "force-btf", &trace.force_btf, "Prefer btf_dump general pretty printer" "to customized ones"), OPT_BOOLEAN(0, "bpf-summary", &trace.summary_bpf, "Summary syscall stats in BPF"), + OPT_INTEGER(0, "max-summary", &trace.max_summary, + "Max number of entries in the summary."), OPTS_EVSWITCH(&trace.evswitch), OPT_END() }; diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index be519c433ce4..e0537f275da2 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -11,10 +11,16 @@ declare -a FILES=( "include/uapi/linux/bits.h" "include/uapi/linux/fadvise.h" "include/uapi/linux/fscrypt.h" + "include/uapi/linux/genetlink.h" + "include/uapi/linux/if_addr.h" + "include/uapi/linux/in.h" "include/uapi/linux/kcmp.h" "include/uapi/linux/kvm.h" - "include/uapi/linux/in.h" + "include/uapi/linux/neighbour.h" + "include/uapi/linux/netfilter.h" + "include/uapi/linux/netfilter_arp.h" "include/uapi/linux/perf_event.h" + "include/uapi/linux/rtnetlink.h" "include/uapi/linux/seccomp.h" "include/uapi/linux/stat.h" "include/linux/bits.h" @@ -23,6 +29,7 @@ declare -a FILES=( "include/linux/const.h" "include/vdso/const.h" "include/vdso/unaligned.h" + "include/linux/gfp_types.h" "include/linux/hash.h" "include/linux/list-sort.h" "include/uapi/linux/hw_breakpoint.h" @@ -40,15 +47,12 @@ declare -a FILES=( "arch/s390/include/uapi/asm/perf_regs.h" "arch/x86/include/uapi/asm/perf_regs.h" "arch/x86/include/uapi/asm/kvm.h" - "arch/x86/include/uapi/asm/kvm_perf.h" "arch/x86/include/uapi/asm/svm.h" "arch/x86/include/uapi/asm/unistd.h" "arch/x86/include/uapi/asm/vmx.h" "arch/powerpc/include/uapi/asm/kvm.h" "arch/s390/include/uapi/asm/kvm.h" - "arch/s390/include/uapi/asm/kvm_perf.h" "arch/s390/include/uapi/asm/sie.h" - "arch/arm/include/uapi/asm/kvm.h" "arch/arm64/include/uapi/asm/kvm.h" "arch/arm64/include/uapi/asm/unistd.h" "arch/alpha/include/uapi/asm/errno.h" diff --git a/tools/perf/perf.h b/tools/perf/perf.h index 3cb40965549f..e004178472d9 100644 --- a/tools/perf/perf.h +++ b/tools/perf/perf.h @@ -2,9 +2,7 @@ #ifndef _PERF_PERF_H #define _PERF_PERF_H -#ifndef MAX_NR_CPUS #define MAX_NR_CPUS 4096 -#endif enum perf_affinity { PERF_AFFINITY_SYS = 0, diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereone/metrics.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/metrics.json index afcdad58ef89..324104438e78 100644 --- a/tools/perf/pmu-events/arch/arm64/ampere/ampereone/metrics.json +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereone/metrics.json @@ -113,7 +113,7 @@ { "MetricName": "load_store_spec_rate", "MetricExpr": "((LDST_SPEC / INST_SPEC) * 100)", - "BriefDescription": "The rate of load or store instructions speculatively executed to overall instructions speclatively executed", + "BriefDescription": "The rate of load or store instructions speculatively executed to overall instructions speculatively executed", "MetricGroup": "Operation_Mix", "ScaleUnit": "1percent of operations" }, @@ -132,7 +132,7 @@ { "MetricName": "pc_write_spec_rate", "MetricExpr": "((PC_WRITE_SPEC / INST_SPEC) * 100)", - "BriefDescription": "The rate of software change of the PC speculatively executed to overall instructions speclatively executed", + "BriefDescription": "The rate of software change of the PC speculatively executed to overall instructions speculatively executed", "MetricGroup": "Operation_Mix", "ScaleUnit": "1percent of operations" }, @@ -195,14 +195,14 @@ { "MetricName": "stall_frontend_cache_rate", "MetricExpr": "((STALL_FRONTEND_CACHE / CPU_CYCLES) * 100)", - "BriefDescription": "Proportion of cycles stalled and no ops delivered from frontend and cache miss", + "BriefDescription": "Proportion of cycles stalled and no operations delivered from frontend and cache miss", "MetricGroup": "Stall", "ScaleUnit": "1percent of cycles" }, { "MetricName": "stall_frontend_tlb_rate", "MetricExpr": "((STALL_FRONTEND_TLB / CPU_CYCLES) * 100)", - "BriefDescription": "Proportion of cycles stalled and no ops delivered from frontend and TLB miss", + "BriefDescription": "Proportion of cycles stalled and no operations delivered from frontend and TLB miss", "MetricGroup": "Stall", "ScaleUnit": "1percent of cycles" }, diff --git a/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/metrics.json b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/metrics.json index 5228f94a793f..6817cac149e0 100644 --- a/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/metrics.json +++ b/tools/perf/pmu-events/arch/arm64/ampere/ampereonex/metrics.json @@ -113,7 +113,7 @@ { "MetricName": "load_store_spec_rate", "MetricExpr": "LDST_SPEC / INST_SPEC", - "BriefDescription": "The rate of load or store instructions speculatively executed to overall instructions speclatively executed", + "BriefDescription": "The rate of load or store instructions speculatively executed to overall instructions speculatively executed", "MetricGroup": "Operation_Mix", "ScaleUnit": "100percent of operations" }, @@ -132,7 +132,7 @@ { "MetricName": "pc_write_spec_rate", "MetricExpr": "PC_WRITE_SPEC / INST_SPEC", - "BriefDescription": "The rate of software change of the PC speculatively executed to overall instructions speclatively executed", + "BriefDescription": "The rate of software change of the PC speculatively executed to overall instructions speculatively executed", "MetricGroup": "Operation_Mix", "ScaleUnit": "100percent of operations" }, @@ -195,14 +195,14 @@ { "MetricName": "stall_frontend_cache_rate", "MetricExpr": "STALL_FRONTEND_CACHE / CPU_CYCLES", - "BriefDescription": "Proportion of cycles stalled and no ops delivered from frontend and cache miss", + "BriefDescription": "Proportion of cycles stalled and no operations delivered from frontend and cache miss", "MetricGroup": "Stall", "ScaleUnit": "100percent of cycles" }, { "MetricName": "stall_frontend_tlb_rate", "MetricExpr": "STALL_FRONTEND_TLB / CPU_CYCLES", - "BriefDescription": "Proportion of cycles stalled and no ops delivered from frontend and TLB miss", + "BriefDescription": "Proportion of cycles stalled and no operations delivered from frontend and TLB miss", "MetricGroup": "Stall", "ScaleUnit": "100percent of cycles" }, @@ -391,7 +391,7 @@ "ScaleUnit": "100percent of cache acceses" }, { - "MetricName": "l1d_cache_access_prefetces", + "MetricName": "l1d_cache_access_prefetches", "MetricExpr": "L1D_CACHE_PRFM / L1D_CACHE", "BriefDescription": "L1D cache access - prefetch", "MetricGroup": "Cache", diff --git a/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json b/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json index 377dfecd96bd..cae7c0cf02f2 100644 --- a/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/alderlake/adl-metrics.json @@ -1,56 +1,56 @@ [ { "BriefDescription": "C10 residency percent per package", - "MetricExpr": "cstate_pkg@c10\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c10\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C10_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C1 residency percent per core", - "MetricExpr": "cstate_core@c1\\-residency@ / TSC", + "MetricExpr": "cstate_core@c1\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C1_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C2 residency percent per package", - "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c2\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C2_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per package", - "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per core", - "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricExpr": "cstate_core@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per package", - "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per core", - "MetricExpr": "cstate_core@c7\\-residency@ / TSC", + "MetricExpr": "cstate_core@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C8 residency percent per package", - "MetricExpr": "cstate_pkg@c8\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c8\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C8_Pkg_Residency", "ScaleUnit": "100%" @@ -552,7 +552,7 @@ }, { "BriefDescription": "Average CPU Utilization", - "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.REF_TSC@ / TSC", + "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.REF_TSC@ / msr@tsc\\,cpu=cpu_atom@", "MetricName": "tma_info_system_cpu_utilization", "Unit": "cpu_atom" }, @@ -751,7 +751,7 @@ { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", "DefaultMetricgroupName": "TopdownL1", - "MetricExpr": "cpu_core@topdown\\-be\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricExpr": "cpu_core@topdown\\-be\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@)", "MetricGroup": "BvOB;Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.2", @@ -790,11 +790,20 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", + "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", + "MetricGroup": "BvCB;Cor;tma_issueComp", + "MetricName": "tma_bottleneck_compute_bound_est", + "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", + "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: ", + "Unit": "cpu_core" + }, + { "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks", "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", "MetricGroup": "BvMB;Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_bottleneck_cache_memory_bandwidth", - "MetricThreshold": "tma_bottleneck_cache_memory_bandwidth > 20", + "MetricName": "tma_bottleneck_data_cache_memory_bandwidth", + "MetricThreshold": "tma_bottleneck_data_cache_memory_bandwidth > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full", "Unit": "cpu_core" }, @@ -802,23 +811,14 @@ "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks", "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_dependency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_lock_latency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_loads / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_stores / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "BvML;Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_bottleneck_cache_memory_latency", - "MetricThreshold": "tma_bottleneck_cache_memory_latency > 20", + "MetricName": "tma_bottleneck_data_cache_memory_latency", + "MetricThreshold": "tma_bottleneck_data_cache_memory_latency > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency", "Unit": "cpu_core" }, { - "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", - "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", - "MetricGroup": "BvCB;Cor;tma_issueComp", - "MetricName": "tma_bottleneck_compute_bound_est", - "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", - "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: ", - "Unit": "cpu_core" - }, - { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end)", - "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - (1 - cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_ms / (tma_dsb + tma_lsd + tma_mite + tma_ms))) - tma_bottleneck_big_code", + "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - (1 - cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_ms)) - tma_bottleneck_big_code", "MetricGroup": "BvFB;Fed;FetchBW;Frontend", "MetricName": "tma_bottleneck_instruction_fetch_bw", "MetricThreshold": "tma_bottleneck_instruction_fetch_bw > 20", @@ -826,7 +826,7 @@ }, { "BriefDescription": "Total pipeline cost of irregular execution (e.g", - "MetricExpr": "100 * ((1 - cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_ms / (tma_dsb + tma_lsd + tma_mite + tma_ms)) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + cpu_core@RS.EMPTY_RESOURCE@ / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricExpr": "100 * ((1 - cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_ms) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + cpu_core@RS.EMPTY_RESOURCE@ / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", "MetricGroup": "Bad;BvIO;Cor;Ret;tma_issueMS", "MetricName": "tma_bottleneck_irregular_overhead", "MetricThreshold": "tma_bottleneck_irregular_overhead > 10", @@ -862,7 +862,7 @@ }, { "BriefDescription": "Total pipeline cost of remaining bottlenecks in the back-end", - "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_cache_memory_bandwidth + tma_bottleneck_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", + "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_data_cache_memory_bandwidth + tma_bottleneck_data_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", "MetricGroup": "BvOB;Cor;Offcore", "MetricName": "tma_bottleneck_other_bottlenecks", "MetricThreshold": "tma_bottleneck_other_bottlenecks > 20", @@ -879,7 +879,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", - "MetricExpr": "cpu_core@topdown\\-br\\-mispredict@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricExpr": "cpu_core@topdown\\-br\\-mispredict@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@)", "MetricGroup": "BadSpec;BrMispredicts;BvMP;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", @@ -992,7 +992,6 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(25 * tma_info_system_core_frequency * (cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD@ * (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ / (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ + cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD@))) + 24 * tma_info_system_core_frequency * cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS@) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks", "MetricGroup": "BvMS;DataSharing;LockCont;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", @@ -1109,7 +1108,7 @@ "MetricGroup": "BvMB;MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -1238,7 +1237,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences", - "MetricExpr": "cpu_core@topdown\\-heavy\\-ops@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricExpr": "cpu_core@topdown\\-heavy\\-ops@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@)", "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", @@ -1851,7 +1850,7 @@ "Unit": "cpu_core" }, { - "BriefDescription": "", + "BriefDescription": "Mem;Backend;CacheHits", "MetricExpr": "cpu_core@UOPS_EXECUTED.THREAD@ / (cpu_core@UOPS_EXECUTED.CORE_CYCLES_GE_1@ / 2 if #SMT_on else cpu_core@UOPS_EXECUTED.THREAD\\,cmask\\=1@)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute", @@ -1912,7 +1911,7 @@ }, { "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / tma_info_system_time", + "MetricExpr": "tma_info_system_turbo_utilization * msr@tsc\\,cpu=cpu_core@ / 1e9 / tma_info_system_time", "MetricGroup": "Power;Summary", "MetricName": "tma_info_system_core_frequency", "Unit": "cpu_core" @@ -1926,7 +1925,7 @@ }, { "BriefDescription": "Average number of utilized CPUs", - "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.REF_TSC@ / TSC", + "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.REF_TSC@ / msr@tsc\\,cpu=cpu_core@", "MetricGroup": "Summary", "MetricName": "tma_info_system_cpus_utilized", "Unit": "cpu_core" @@ -1936,7 +1935,7 @@ "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / tma_info_system_time / 1e3", "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_mem_bandwidth, tma_sq_full", + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_mem_bandwidth, tma_sq_full", "Unit": "cpu_core" }, { @@ -1980,7 +1979,6 @@ }, { "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(UNC_ARB_TRK_OCCUPANCY.RD + UNC_ARB_DAT_OCCUPANCY.RD) / UNC_ARB_TRK_REQUESTS.RD", "MetricGroup": "Mem;MemoryLat;SoC", "MetricName": "tma_info_system_mem_read_latency", @@ -2032,6 +2030,13 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Measured Average Uncore Frequency for the SoC [GHz]", + "MetricExpr": "tma_info_system_socket_clks / 1e9 / tma_info_system_time", + "MetricGroup": "SoC", + "MetricName": "tma_info_system_uncore_frequency", + "Unit": "cpu_core" + }, + { "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.THREAD@", "MetricGroup": "Pipeline", @@ -2150,12 +2155,12 @@ "Unit": "cpu_core" }, { - "BriefDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", + "BriefDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", "MetricExpr": "min(2 * (cpu_core@MEM_INST_RETIRED.ALL_LOADS@ - cpu_core@MEM_LOAD_RETIRED.FB_HIT@ - cpu_core@MEM_LOAD_RETIRED.L1_MISS@) * 20 / 100, max(cpu_core@CYCLE_ACTIVITY.CYCLES_MEM_ANY@ - cpu_core@MEMORY_ACTIVITY.CYCLES_L1D_MISS@, 0)) / tma_info_thread_clks", "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_l1_latency_dependency", "MetricThreshold": "tma_l1_latency_dependency > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", + "PublicDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -2171,7 +2176,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L2 cache under unloaded scenarios (possibly L2 latency limited)", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "3 * tma_info_system_core_frequency * cpu_core@MEM_LOAD_RETIRED.L2_HIT@ * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_l2_bound_group", "MetricName": "tma_l2_hit_latency", @@ -2192,12 +2196,11 @@ }, { "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "9 * tma_info_system_core_frequency * (cpu_core@MEM_LOAD_RETIRED.L3_HIT@ * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2)) / tma_info_thread_clks", "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_cache_memory_latency, tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_data_cache_memory_latency, tma_mem_latency", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -2279,6 +2282,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(16 * max(0, cpu_core@MEM_INST_RETIRED.LOCK_LOADS@ - cpu_core@L2_RQSTS.ALL_RFO@) + cpu_core@MEM_INST_RETIRED.LOCK_LOADS@ / cpu_core@MEM_INST_RETIRED.ALL_STORES@ * (10 * cpu_core@L2_RQSTS.RFO_HIT@ + min(cpu_core@CPU_CLK_UNHALTED.THREAD@, cpu_core@OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO@))) / tma_info_thread_clks", "MetricGroup": "LockCont;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", "MetricName": "tma_lock_latency", @@ -2314,7 +2318,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -2324,13 +2328,13 @@ "MetricGroup": "BvML;MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%", "Unit": "cpu_core" }, { "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", - "MetricExpr": "cpu_core@topdown\\-mem\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricExpr": "cpu_core@topdown\\-mem\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@)", "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricName": "tma_memory_bound", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", @@ -2341,7 +2345,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to LFENCE Instructions.", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "13 * cpu_core@MISC2_RETIRED.LFENCE@ / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group", "MetricName": "tma_memory_fence", @@ -2400,7 +2403,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the Microcode Sequencer (MS) unit - see Microcode_Sequencer node for details.", - "MetricExpr": "max(cpu_core@IDQ.MS_CYCLES_ANY@, cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@ / (cpu_core@UOPS_RETIRED.SLOTS@ / cpu_core@UOPS_ISSUED.ANY@)) / tma_info_core_core_clks / 2", + "MetricExpr": "max(cpu_core@IDQ.MS_CYCLES_ANY@, cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@ / (cpu_core@UOPS_RETIRED.SLOTS@ / cpu_core@UOPS_ISSUED.ANY@)) / tma_info_core_core_clks / 2.4", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_ms", "MetricThreshold": "tma_ms > 0.05 & tma_fetch_bandwidth > 0.2", @@ -2439,6 +2442,7 @@ }, { "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_int_operations + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches))", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_other_light_ops", @@ -2507,6 +2511,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@EXE_ACTIVITY.2_3_PORTS_UTIL@)) / tma_info_thread_clks if cpu_core@ARITH.DIV_ACTIVE@ < cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@ else (cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@EXE_ACTIVITY.2_3_PORTS_UTIL@) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", @@ -2517,6 +2522,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricConstraint": "NO_THRESHOLD_AND_NMI", "MetricExpr": "(cpu_core@EXE_ACTIVITY.EXE_BOUND_0_PORTS@ + max(cpu_core@RS.EMPTY_RESOURCE@ - cpu_core@RESOURCE_STALLS.SCOREBOARD@, 0)) / tma_info_thread_clks * (cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", @@ -2527,6 +2533,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricConstraint": "NO_THRESHOLD_AND_NMI", "MetricExpr": "cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", @@ -2537,7 +2544,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "cpu_core@EXE_ACTIVITY.2_PORTS_UTIL@ / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", @@ -2548,7 +2554,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "cpu_core@UOPS_EXECUTED.CYCLES_GE_3@ / tma_info_thread_clks", "MetricGroup": "BvCB;PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", @@ -2560,7 +2565,7 @@ { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", "DefaultMetricgroupName": "TopdownL1", - "MetricExpr": "cpu_core@topdown\\-retiring@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricExpr": "cpu_core@topdown\\-retiring@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@)", "MetricGroup": "BvUW;Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -2591,7 +2596,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.PAUSE@ / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group", "MetricName": "tma_slow_pause", @@ -2626,7 +2630,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%", "Unit": "cpu_core" }, diff --git a/tools/perf/pmu-events/arch/x86/alderlake/cache.json b/tools/perf/pmu-events/arch/x86/alderlake/cache.json index 5461576dafc7..4cd535baf703 100644 --- a/tools/perf/pmu-events/arch/x86/alderlake/cache.json +++ b/tools/perf/pmu-events/arch/x86/alderlake/cache.json @@ -4,7 +4,6 @@ "Counter": "0,1,2,3", "EventCode": "0x51", "EventName": "L1D.HWPF_MISS", - "PublicDescription": "L1D.HWPF_MISS Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x20", "Unit": "cpu_core" @@ -14,7 +13,7 @@ "Counter": "0,1,2,3", "EventCode": "0x51", "EventName": "L1D.REPLACEMENT", - "PublicDescription": "Counts L1D data line replacements including opportunistic replacements, and replacements that require stall-for-replace or block-for-replace. Available PDIST counters: 0", + "PublicDescription": "Counts L1D data line replacements including opportunistic replacements, and replacements that require stall-for-replace or block-for-replace.", "SampleAfterValue": "100003", "UMask": "0x1", "Unit": "cpu_core" @@ -24,7 +23,7 @@ "Counter": "0,1,2,3", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.FB_FULL", - "PublicDescription": "Counts number of cycles a demand request has waited due to L1D Fill Buffer (FB) unavailability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses. Available PDIST counters: 0", + "PublicDescription": "Counts number of cycles a demand request has waited due to L1D Fill Buffer (FB) unavailability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.", "SampleAfterValue": "1000003", "UMask": "0x2", "Unit": "cpu_core" @@ -36,7 +35,7 @@ "EdgeDetect": "1", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.FB_FULL_PERIODS", - "PublicDescription": "Counts number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses. Available PDIST counters: 0", + "PublicDescription": "Counts number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.", "SampleAfterValue": "1000003", "UMask": "0x2", "Unit": "cpu_core" @@ -47,7 +46,6 @@ "Deprecated": "1", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.L2_STALL", - "PublicDescription": "This event is deprecated. Refer to new event L1D_PEND_MISS.L2_STALLS Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x4", "Unit": "cpu_core" @@ -57,7 +55,7 @@ "Counter": "0,1,2,3", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.L2_STALLS", - "PublicDescription": "Counts number of cycles a demand request has waited due to L1D due to lack of L2 resources. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses. Available PDIST counters: 0", + "PublicDescription": "Counts number of cycles a demand request has waited due to L1D due to lack of L2 resources. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.", "SampleAfterValue": "1000003", "UMask": "0x4", "Unit": "cpu_core" @@ -67,7 +65,7 @@ "Counter": "0,1,2,3", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.PENDING", - "PublicDescription": "Counts number of L1D misses that are outstanding in each cycle, that is each cycle the number of Fill Buffers (FB) outstanding required by Demand Reads. FB either is held by demand loads, or it is held by non-demand loads and gets hit at least once by demand. The valid outstanding interval is defined until the FB deallocation by one of the following ways: from FB allocation, if FB is allocated by demand from the demand Hit FB, if it is allocated by hardware or software prefetch. Note: In the L1D, a Demand Read contains cacheable or noncacheable demand loads, including ones causing cache-line splits and reads due to page walks resulted from any request type. Available PDIST counters: 0", + "PublicDescription": "Counts number of L1D misses that are outstanding in each cycle, that is each cycle the number of Fill Buffers (FB) outstanding required by Demand Reads. FB either is held by demand loads, or it is held by non-demand loads and gets hit at least once by demand. The valid outstanding interval is defined until the FB deallocation by one of the following ways: from FB allocation, if FB is allocated by demand from the demand Hit FB, if it is allocated by hardware or software prefetch. Note: In the L1D, a Demand Read contains cacheable or noncacheable demand loads, including ones causing cache-line splits and reads due to page walks resulted from any request type.", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -78,7 +76,7 @@ "CounterMask": "1", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.PENDING_CYCLES", - "PublicDescription": "Counts duration of L1D miss outstanding in cycles. Available PDIST counters: 0", + "PublicDescription": "Counts duration of L1D miss outstanding in cycles.", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -88,7 +86,7 @@ "Counter": "0,1,2,3", "EventCode": "0x25", "EventName": "L2_LINES_IN.ALL", - "PublicDescription": "Counts the number of L2 cache lines filling the L2. Counting does not cover rejects. Available PDIST counters: 0", + "PublicDescription": "Counts the number of L2 cache lines filling the L2. Counting does not cover rejects.", "SampleAfterValue": "100003", "UMask": "0x1f", "Unit": "cpu_core" @@ -98,7 +96,7 @@ "Counter": "0,1,2,3", "EventCode": "0x26", "EventName": "L2_LINES_OUT.NON_SILENT", - "PublicDescription": "Counts the number of lines that are evicted by L2 cache when triggered by an L2 cache fill. Those lines are in Modified state. Modified lines are written back to L3 Available PDIST counters: 0", + "PublicDescription": "Counts the number of lines that are evicted by L2 cache when triggered by an L2 cache fill. Those lines are in Modified state. Modified lines are written back to L3", "SampleAfterValue": "200003", "UMask": "0x2", "Unit": "cpu_core" @@ -108,7 +106,7 @@ "Counter": "0,1,2,3", "EventCode": "0x26", "EventName": "L2_LINES_OUT.SILENT", - "PublicDescription": "Counts the number of lines that are silently dropped by L2 cache. These lines are typically in Shared or Exclusive state. A non-threaded event. Available PDIST counters: 0", + "PublicDescription": "Counts the number of lines that are silently dropped by L2 cache. These lines are typically in Shared or Exclusive state. A non-threaded event.", "SampleAfterValue": "200003", "UMask": "0x1", "Unit": "cpu_core" @@ -118,7 +116,7 @@ "Counter": "0,1,2,3", "EventCode": "0x26", "EventName": "L2_LINES_OUT.USELESS_HWPF", - "PublicDescription": "Counts the number of cache lines that have been prefetched by the L2 hardware prefetcher but not used by demand access when evicted from the L2 cache Available PDIST counters: 0", + "PublicDescription": "Counts the number of cache lines that have been prefetched by the L2 hardware prefetcher but not used by demand access when evicted from the L2 cache", "SampleAfterValue": "200003", "UMask": "0x4", "Unit": "cpu_core" @@ -137,7 +135,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_REQUEST.ALL", - "PublicDescription": "Counts all requests that were hit or true misses in L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. [This event is alias to L2_RQSTS.REFERENCES] Available PDIST counters: 0", + "PublicDescription": "Counts all requests that were hit or true misses in L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. [This event is alias to L2_RQSTS.REFERENCES]", "SampleAfterValue": "200003", "UMask": "0xff", "Unit": "cpu_core" @@ -167,7 +165,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_REQUEST.MISS", - "PublicDescription": "Counts read requests of any type with true-miss in the L2 cache. True-miss excludes L2 misses that were merged with ongoing L2 misses. [This event is alias to L2_RQSTS.MISS] Available PDIST counters: 0", + "PublicDescription": "Counts read requests of any type with true-miss in the L2 cache. True-miss excludes L2 misses that were merged with ongoing L2 misses. [This event is alias to L2_RQSTS.MISS]", "SampleAfterValue": "200003", "UMask": "0x3f", "Unit": "cpu_core" @@ -177,7 +175,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_CODE_RD", - "PublicDescription": "Counts the total number of L2 code requests. Available PDIST counters: 0", + "PublicDescription": "Counts the total number of L2 code requests.", "SampleAfterValue": "200003", "UMask": "0xe4", "Unit": "cpu_core" @@ -187,7 +185,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_DEMAND_DATA_RD", - "PublicDescription": "Counts Demand Data Read requests accessing the L2 cache. These requests may hit or miss L2 cache. True-miss exclude misses that were merged with ongoing L2 misses. An access is counted once. Available PDIST counters: 0", + "PublicDescription": "Counts Demand Data Read requests accessing the L2 cache. These requests may hit or miss L2 cache. True-miss exclude misses that were merged with ongoing L2 misses. An access is counted once.", "SampleAfterValue": "200003", "UMask": "0xe1", "Unit": "cpu_core" @@ -197,7 +195,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_DEMAND_MISS", - "PublicDescription": "Counts demand requests that miss L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts demand requests that miss L2 cache.", "SampleAfterValue": "200003", "UMask": "0x27", "Unit": "cpu_core" @@ -207,7 +205,6 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_HWPF", - "PublicDescription": "L2_RQSTS.ALL_HWPF Available PDIST counters: 0", "SampleAfterValue": "200003", "UMask": "0xf0", "Unit": "cpu_core" @@ -217,7 +214,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_RFO", - "PublicDescription": "Counts the total number of RFO (read for ownership) requests to L2 cache. L2 RFO requests include both L1D demand RFO misses as well as L1D RFO prefetches. Available PDIST counters: 0", + "PublicDescription": "Counts the total number of RFO (read for ownership) requests to L2 cache. L2 RFO requests include both L1D demand RFO misses as well as L1D RFO prefetches.", "SampleAfterValue": "200003", "UMask": "0xe2", "Unit": "cpu_core" @@ -227,7 +224,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.CODE_RD_HIT", - "PublicDescription": "Counts L2 cache hits when fetching instructions, code reads. Available PDIST counters: 0", + "PublicDescription": "Counts L2 cache hits when fetching instructions, code reads.", "SampleAfterValue": "200003", "UMask": "0xc4", "Unit": "cpu_core" @@ -237,7 +234,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.CODE_RD_MISS", - "PublicDescription": "Counts L2 cache misses when fetching instructions. Available PDIST counters: 0", + "PublicDescription": "Counts L2 cache misses when fetching instructions.", "SampleAfterValue": "200003", "UMask": "0x24", "Unit": "cpu_core" @@ -247,7 +244,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.DEMAND_DATA_RD_HIT", - "PublicDescription": "Counts the number of demand Data Read requests initiated by load instructions that hit L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts the number of demand Data Read requests initiated by load instructions that hit L2 cache.", "SampleAfterValue": "200003", "UMask": "0xc1", "Unit": "cpu_core" @@ -257,7 +254,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.DEMAND_DATA_RD_MISS", - "PublicDescription": "Counts demand Data Read requests with true-miss in the L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. An access is counted once. Available PDIST counters: 0", + "PublicDescription": "Counts demand Data Read requests with true-miss in the L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. An access is counted once.", "SampleAfterValue": "200003", "UMask": "0x21", "Unit": "cpu_core" @@ -267,7 +264,6 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.HWPF_MISS", - "PublicDescription": "L2_RQSTS.HWPF_MISS Available PDIST counters: 0", "SampleAfterValue": "200003", "UMask": "0x30", "Unit": "cpu_core" @@ -277,7 +273,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.MISS", - "PublicDescription": "Counts read requests of any type with true-miss in the L2 cache. True-miss excludes L2 misses that were merged with ongoing L2 misses. [This event is alias to L2_REQUEST.MISS] Available PDIST counters: 0", + "PublicDescription": "Counts read requests of any type with true-miss in the L2 cache. True-miss excludes L2 misses that were merged with ongoing L2 misses. [This event is alias to L2_REQUEST.MISS]", "SampleAfterValue": "200003", "UMask": "0x3f", "Unit": "cpu_core" @@ -287,7 +283,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.REFERENCES", - "PublicDescription": "Counts all requests that were hit or true misses in L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. [This event is alias to L2_REQUEST.ALL] Available PDIST counters: 0", + "PublicDescription": "Counts all requests that were hit or true misses in L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. [This event is alias to L2_REQUEST.ALL]", "SampleAfterValue": "200003", "UMask": "0xff", "Unit": "cpu_core" @@ -297,7 +293,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.RFO_HIT", - "PublicDescription": "Counts the RFO (Read-for-Ownership) requests that hit L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts the RFO (Read-for-Ownership) requests that hit L2 cache.", "SampleAfterValue": "200003", "UMask": "0xc2", "Unit": "cpu_core" @@ -307,7 +303,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.RFO_MISS", - "PublicDescription": "Counts the RFO (Read-for-Ownership) requests that miss L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts the RFO (Read-for-Ownership) requests that miss L2 cache.", "SampleAfterValue": "200003", "UMask": "0x22", "Unit": "cpu_core" @@ -317,7 +313,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.SWPF_HIT", - "PublicDescription": "Counts Software prefetch requests that hit the L2 cache. Accounts for PREFETCHNTA and PREFETCHT0/1/2 instructions when FB is not full. Available PDIST counters: 0", + "PublicDescription": "Counts Software prefetch requests that hit the L2 cache. Accounts for PREFETCHNTA and PREFETCHT0/1/2 instructions when FB is not full.", "SampleAfterValue": "200003", "UMask": "0xc8", "Unit": "cpu_core" @@ -327,7 +323,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.SWPF_MISS", - "PublicDescription": "Counts Software prefetch requests that miss the L2 cache. Accounts for PREFETCHNTA and PREFETCHT0/1/2 instructions when FB is not full. Available PDIST counters: 0", + "PublicDescription": "Counts Software prefetch requests that miss the L2 cache. Accounts for PREFETCHNTA and PREFETCHT0/1/2 instructions when FB is not full.", "SampleAfterValue": "200003", "UMask": "0x28", "Unit": "cpu_core" @@ -337,7 +333,7 @@ "Counter": "0,1,2,3", "EventCode": "0x23", "EventName": "L2_TRANS.L2_WB", - "PublicDescription": "Counts L2 writebacks that access L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts L2 writebacks that access L2 cache.", "SampleAfterValue": "200003", "UMask": "0x40", "Unit": "cpu_core" @@ -357,7 +353,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x2e", "EventName": "LONGEST_LAT_CACHE.MISS", - "PublicDescription": "Counts core-originated cacheable requests that miss the L3 cache (Longest Latency cache). Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches to the L1 and L2. It does not include hardware prefetches to the L3, and may not count other types of requests to the L3. Available PDIST counters: 0", + "PublicDescription": "Counts core-originated cacheable requests that miss the L3 cache (Longest Latency cache). Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches to the L1 and L2. It does not include hardware prefetches to the L3, and may not count other types of requests to the L3.", "SampleAfterValue": "100003", "UMask": "0x41", "Unit": "cpu_core" @@ -377,7 +373,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x2e", "EventName": "LONGEST_LAT_CACHE.REFERENCE", - "PublicDescription": "Counts core-originated cacheable requests to the L3 cache (Longest Latency cache). Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches to the L1 and L2. It does not include hardware prefetches to the L3, and may not count other types of requests to the L3. Available PDIST counters: 0", + "PublicDescription": "Counts core-originated cacheable requests to the L3 cache (Longest Latency cache). Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches to the L1 and L2. It does not include hardware prefetches to the L3, and may not count other types of requests to the L3.", "SampleAfterValue": "100003", "UMask": "0x4f", "Unit": "cpu_core" @@ -552,7 +548,7 @@ "Counter": "0,1,2,3", "EventCode": "0x43", "EventName": "MEM_LOAD_COMPLETED.L1_MISS_ANY", - "PublicDescription": "Number of completed demand load requests that missed the L1 data cache including shadow misses (FB hits, merge to an ongoing L1D miss) Available PDIST counters: 0", + "PublicDescription": "Number of completed demand load requests that missed the L1 data cache including shadow misses (FB hits, merge to an ongoing L1D miss)", "SampleAfterValue": "1000003", "UMask": "0xfd", "Unit": "cpu_core" @@ -853,7 +849,6 @@ "Counter": "0,1,2,3", "EventCode": "0x44", "EventName": "MEM_STORE_RETIRED.L2_HIT", - "PublicDescription": "MEM_STORE_RETIRED.L2_HIT Available PDIST counters: 0", "SampleAfterValue": "200003", "UMask": "0x1", "Unit": "cpu_core" @@ -1050,7 +1045,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe5", "EventName": "MEM_UOP_RETIRED.ANY", - "PublicDescription": "Number of retired micro-operations (uops) for load or store memory accesses Available PDIST counters: 0", + "PublicDescription": "Number of retired micro-operations (uops) for load or store memory accesses", "SampleAfterValue": "1000003", "UMask": "0x3", "Unit": "cpu_core" @@ -1068,6 +1063,30 @@ "Unit": "cpu_atom" }, { + "BriefDescription": "Counts writebacks of modified cachelines that hit in the L3 or were snooped from another core's caches.", + "Counter": "0,1,2,3", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.COREWB_M.L3_HIT", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x1F803C0008", + "PublicDescription": "Counts writebacks of modified cachelines that hit in the L3 or were snooped from another core's caches. Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts writebacks of non-modified cachelines that hit in the L3 or were snooped from another core's caches.", + "Counter": "0,1,2,3", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.COREWB_NONM.L3_HIT", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x1F803C1000", + "PublicDescription": "Counts writebacks of non-modified cachelines that hit in the L3 or were snooped from another core's caches. Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { "BriefDescription": "Counts demand instruction fetches and L1 instruction cache prefetches that have any type of response.", "Counter": "0,1,2,3,4,5", "EventCode": "0xB7", @@ -1308,6 +1327,18 @@ "Unit": "cpu_atom" }, { + "BriefDescription": "Counts all data read, code read, RFO and ITOM requests including demands and prefetches to the core caches (L1 or L2) that hit in the L3 or were snooped from another core's caches.", + "Counter": "0,1,2,3", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.READS_TO_CORE.L3_HIT", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x1F803C4477", + "PublicDescription": "Counts all data read, code read, RFO and ITOM requests including demands and prefetches to the core caches (L1 or L2) that hit in the L3 or were snooped from another core's caches. Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { "BriefDescription": "Counts L1 data cache software prefetches which include T0/T1/T2 and NTA (except PREFETCHW) that have any type of response.", "Counter": "0,1,2,3,4,5", "EventCode": "0xB7", @@ -1372,7 +1403,6 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.ALL_REQUESTS", - "PublicDescription": "OFFCORE_REQUESTS.ALL_REQUESTS Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x80", "Unit": "cpu_core" @@ -1382,7 +1412,7 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.DATA_RD", - "PublicDescription": "Counts the demand and prefetch data reads. All Core Data Reads include cacheable 'Demands' and L2 prefetchers (not L3 prefetchers). Counting also covers reads due to page walks resulted from any request type. Available PDIST counters: 0", + "PublicDescription": "Counts the demand and prefetch data reads. All Core Data Reads include cacheable 'Demands' and L2 prefetchers (not L3 prefetchers). Counting also covers reads due to page walks resulted from any request type.", "SampleAfterValue": "100003", "UMask": "0x8", "Unit": "cpu_core" @@ -1392,7 +1422,7 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.DEMAND_CODE_RD", - "PublicDescription": "Counts both cacheable and non-cacheable code read requests. Available PDIST counters: 0", + "PublicDescription": "Counts both cacheable and non-cacheable code read requests.", "SampleAfterValue": "100003", "UMask": "0x2", "Unit": "cpu_core" @@ -1402,7 +1432,7 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.DEMAND_DATA_RD", - "PublicDescription": "Counts the Demand Data Read requests sent to uncore. Use it in conjunction with OFFCORE_REQUESTS_OUTSTANDING to determine average latency in the uncore. Available PDIST counters: 0", + "PublicDescription": "Counts the Demand Data Read requests sent to uncore. Use it in conjunction with OFFCORE_REQUESTS_OUTSTANDING to determine average latency in the uncore.", "SampleAfterValue": "100003", "UMask": "0x1", "Unit": "cpu_core" @@ -1412,7 +1442,7 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.DEMAND_RFO", - "PublicDescription": "Counts the demand RFO (read for ownership) requests including regular RFOs, locks, ItoM. Available PDIST counters: 0", + "PublicDescription": "Counts the demand RFO (read for ownership) requests including regular RFOs, locks, ItoM.", "SampleAfterValue": "100003", "UMask": "0x4", "Unit": "cpu_core" @@ -1424,7 +1454,6 @@ "Errata": "ADL038", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD", - "PublicDescription": "This event is deprecated. Refer to new event OFFCORE_REQUESTS_OUTSTANDING.DATA_RD Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8", "Unit": "cpu_core" @@ -1436,7 +1465,6 @@ "Errata": "ADL038", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", - "PublicDescription": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8", "Unit": "cpu_core" @@ -1447,7 +1475,7 @@ "CounterMask": "1", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_CODE_RD", - "PublicDescription": "Counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS. Available PDIST counters: 0", + "PublicDescription": "Counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS.", "SampleAfterValue": "1000003", "UMask": "0x2", "Unit": "cpu_core" @@ -1458,7 +1486,6 @@ "CounterMask": "1", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", - "PublicDescription": "Cycles where at least 1 outstanding demand data read request is pending. Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1469,7 +1496,7 @@ "CounterMask": "1", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO", - "PublicDescription": "Counts the number of offcore outstanding demand rfo Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS. Available PDIST counters: 0", + "PublicDescription": "Counts the number of offcore outstanding demand rfo Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS.", "SampleAfterValue": "1000003", "UMask": "0x4", "Unit": "cpu_core" @@ -1480,7 +1507,6 @@ "Errata": "ADL038", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DATA_RD", - "PublicDescription": "OFFCORE_REQUESTS_OUTSTANDING.DATA_RD Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8", "Unit": "cpu_core" @@ -1490,7 +1516,7 @@ "Counter": "0,1,2,3", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_CODE_RD", - "PublicDescription": "Counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS. Available PDIST counters: 0", + "PublicDescription": "Counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS.", "SampleAfterValue": "1000003", "UMask": "0x2", "Unit": "cpu_core" @@ -1500,7 +1526,7 @@ "Counter": "0,1,2,3", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD", - "PublicDescription": "For every cycle, increments by the number of outstanding demand data read requests pending. Requests are considered outstanding from the time they miss the core's L2 cache until the transaction completion message is sent to the requestor. Available PDIST counters: 0", + "PublicDescription": "For every cycle, increments by the number of outstanding demand data read requests pending. Requests are considered outstanding from the time they miss the core's L2 cache until the transaction completion message is sent to the requestor.", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1510,7 +1536,7 @@ "Counter": "0,1,2,3", "EventCode": "0x2c", "EventName": "SQ_MISC.BUS_LOCK", - "PublicDescription": "Counts the more expensive bus lock needed to enforce cache coherency for certain memory accesses that need to be done atomically. Can be created by issuing an atomic instruction (via the LOCK prefix) which causes a cache line split or accesses uncacheable memory. Available PDIST counters: 0", + "PublicDescription": "Counts the more expensive bus lock needed to enforce cache coherency for certain memory accesses that need to be done atomically. Can be created by issuing an atomic instruction (via the LOCK prefix) which causes a cache line split or accesses uncacheable memory.", "SampleAfterValue": "100003", "UMask": "0x10", "Unit": "cpu_core" @@ -1520,7 +1546,6 @@ "Counter": "0,1,2,3", "EventCode": "0x40", "EventName": "SW_PREFETCH_ACCESS.ANY", - "PublicDescription": "Counts the number of PREFETCHNTA, PREFETCHW, PREFETCHT0, PREFETCHT1 or PREFETCHT2 instructions executed. Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0xf", "Unit": "cpu_core" @@ -1530,7 +1555,7 @@ "Counter": "0,1,2,3", "EventCode": "0x40", "EventName": "SW_PREFETCH_ACCESS.NTA", - "PublicDescription": "Counts the number of PREFETCHNTA instructions executed. Available PDIST counters: 0", + "PublicDescription": "Counts the number of PREFETCHNTA instructions executed.", "SampleAfterValue": "100003", "UMask": "0x1", "Unit": "cpu_core" @@ -1540,7 +1565,7 @@ "Counter": "0,1,2,3", "EventCode": "0x40", "EventName": "SW_PREFETCH_ACCESS.PREFETCHW", - "PublicDescription": "Counts the number of PREFETCHW instructions executed. Available PDIST counters: 0", + "PublicDescription": "Counts the number of PREFETCHW instructions executed.", "SampleAfterValue": "100003", "UMask": "0x8", "Unit": "cpu_core" @@ -1550,7 +1575,7 @@ "Counter": "0,1,2,3", "EventCode": "0x40", "EventName": "SW_PREFETCH_ACCESS.T0", - "PublicDescription": "Counts the number of PREFETCHT0 instructions executed. Available PDIST counters: 0", + "PublicDescription": "Counts the number of PREFETCHT0 instructions executed.", "SampleAfterValue": "100003", "UMask": "0x2", "Unit": "cpu_core" @@ -1560,7 +1585,7 @@ "Counter": "0,1,2,3", "EventCode": "0x40", "EventName": "SW_PREFETCH_ACCESS.T1_T2", - "PublicDescription": "Counts the number of PREFETCHT1 or PREFETCHT2 instructions executed. Available PDIST counters: 0", + "PublicDescription": "Counts the number of PREFETCHT1 or PREFETCHT2 instructions executed.", "SampleAfterValue": "100003", "UMask": "0x4", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/alderlake/floating-point.json b/tools/perf/pmu-events/arch/x86/alderlake/floating-point.json index d01f1b163ed8..62fd70f220e5 100644 --- a/tools/perf/pmu-events/arch/x86/alderlake/floating-point.json +++ b/tools/perf/pmu-events/arch/x86/alderlake/floating-point.json @@ -14,7 +14,6 @@ "CounterMask": "1", "EventCode": "0xb0", "EventName": "ARITH.FPDIV_ACTIVE", - "PublicDescription": "ARITH.FPDIV_ACTIVE Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -33,7 +32,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc1", "EventName": "ASSISTS.FP", - "PublicDescription": "Counts all microcode Floating Point assists. Available PDIST counters: 0", + "PublicDescription": "Counts all microcode Floating Point assists.", "SampleAfterValue": "100003", "UMask": "0x2", "Unit": "cpu_core" @@ -43,7 +42,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc1", "EventName": "ASSISTS.SSE_AVX_MIX", - "PublicDescription": "ASSISTS.SSE_AVX_MIX Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x10", "Unit": "cpu_core" @@ -53,7 +51,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.PORT_0", - "PublicDescription": "FP_ARITH_DISPATCHED.PORT_0 [This event is alias to FP_ARITH_DISPATCHED.V0] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -63,7 +60,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.PORT_1", - "PublicDescription": "FP_ARITH_DISPATCHED.PORT_1 [This event is alias to FP_ARITH_DISPATCHED.V1] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -73,7 +69,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.PORT_5", - "PublicDescription": "FP_ARITH_DISPATCHED.PORT_5 [This event is alias to FP_ARITH_DISPATCHED.V2] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x4", "Unit": "cpu_core" @@ -83,7 +78,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.V0", - "PublicDescription": "FP_ARITH_DISPATCHED.V0 [This event is alias to FP_ARITH_DISPATCHED.PORT_0] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -93,7 +87,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.V1", - "PublicDescription": "FP_ARITH_DISPATCHED.V1 [This event is alias to FP_ARITH_DISPATCHED.PORT_1] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -103,7 +96,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.V2", - "PublicDescription": "FP_ARITH_DISPATCHED.V2 [This event is alias to FP_ARITH_DISPATCHED.PORT_5] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x4", "Unit": "cpu_core" @@ -113,7 +105,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", - "PublicDescription": "Number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x4", "Unit": "cpu_core" @@ -123,7 +115,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", - "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x8", "Unit": "cpu_core" @@ -133,7 +125,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", - "PublicDescription": "Number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x10", "Unit": "cpu_core" @@ -143,7 +135,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", - "PublicDescription": "Number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x20", "Unit": "cpu_core" @@ -153,7 +145,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.4_FLOPS", - "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision and 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point and packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision and 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point and packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x18", "Unit": "cpu_core" @@ -163,7 +155,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.SCALAR", - "PublicDescription": "Number of SSE/AVX computational scalar single precision and double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational scalar single precision and double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "1000003", "UMask": "0x3", "Unit": "cpu_core" @@ -173,7 +165,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", - "PublicDescription": "Number of SSE/AVX computational scalar double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational scalar double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x1", "Unit": "cpu_core" @@ -183,7 +175,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.SCALAR_SINGLE", - "PublicDescription": "Number of SSE/AVX computational scalar single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational scalar single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x2", "Unit": "cpu_core" @@ -193,7 +185,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.VECTOR", - "PublicDescription": "Number of any Vector retired FP arithmetic instructions. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of any Vector retired FP arithmetic instructions. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "1000003", "UMask": "0xfc", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/alderlake/frontend.json b/tools/perf/pmu-events/arch/x86/alderlake/frontend.json index dae3174a74fb..ff3b30c2619a 100644 --- a/tools/perf/pmu-events/arch/x86/alderlake/frontend.json +++ b/tools/perf/pmu-events/arch/x86/alderlake/frontend.json @@ -14,7 +14,7 @@ "Counter": "0,1,2,3", "EventCode": "0x60", "EventName": "BACLEARS.ANY", - "PublicDescription": "Number of times the front-end is resteered when it finds a branch instruction in a fetch line. This is called Unknown Branch which occurs for the first time a branch instruction is fetched or when the branch is not tracked by the BPU (Branch Prediction Unit) anymore. Available PDIST counters: 0", + "PublicDescription": "Number of times the front-end is resteered when it finds a branch instruction in a fetch line. This is called Unknown Branch which occurs for the first time a branch instruction is fetched or when the branch is not tracked by the BPU (Branch Prediction Unit) anymore.", "SampleAfterValue": "100003", "UMask": "0x1", "Unit": "cpu_core" @@ -24,7 +24,7 @@ "Counter": "0,1,2,3", "EventCode": "0x87", "EventName": "DECODE.LCP", - "PublicDescription": "Counts cycles that the Instruction Length decoder (ILD) stalls occurred due to dynamically changing prefix length of the decoded instruction (by operand size prefix instruction 0x66, address size prefix instruction 0x67 or REX.W for Intel64). Count is proportional to the number of prefixes in a 16B-line. This may result in a three-cycle penalty for each LCP (Length changing prefix) in a 16-byte chunk. Available PDIST counters: 0", + "PublicDescription": "Counts cycles that the Instruction Length decoder (ILD) stalls occurred due to dynamically changing prefix length of the decoded instruction (by operand size prefix instruction 0x66, address size prefix instruction 0x67 or REX.W for Intel64). Count is proportional to the number of prefixes in a 16B-line. This may result in a three-cycle penalty for each LCP (Length changing prefix) in a 16-byte chunk.", "SampleAfterValue": "500009", "UMask": "0x1", "Unit": "cpu_core" @@ -34,7 +34,6 @@ "Counter": "0,1,2,3", "EventCode": "0x87", "EventName": "DECODE.MS_BUSY", - "PublicDescription": "Cycles the Microcode Sequencer is busy. Available PDIST counters: 0", "SampleAfterValue": "500009", "UMask": "0x2", "Unit": "cpu_core" @@ -44,7 +43,7 @@ "Counter": "0,1,2,3", "EventCode": "0x61", "EventName": "DSB2MITE_SWITCHES.PENALTY_CYCLES", - "PublicDescription": "Decode Stream Buffer (DSB) is a Uop-cache that holds translations of previously fetched instructions that were decoded by the legacy x86 decode pipeline (MITE). This event counts fetch penalty cycles when a transition occurs from DSB to MITE. Available PDIST counters: 0", + "PublicDescription": "Decode Stream Buffer (DSB) is a Uop-cache that holds translations of previously fetched instructions that were decoded by the legacy x86 decode pipeline (MITE). This event counts fetch penalty cycles when a transition occurs from DSB to MITE.", "SampleAfterValue": "100003", "UMask": "0x2", "Unit": "cpu_core" @@ -302,7 +301,7 @@ "Counter": "0,1,2,3", "EventCode": "0x80", "EventName": "ICACHE_DATA.STALLS", - "PublicDescription": "Counts cycles where a code line fetch is stalled due to an L1 instruction cache miss. The decode pipeline works at a 32 Byte granularity. Available PDIST counters: 0", + "PublicDescription": "Counts cycles where a code line fetch is stalled due to an L1 instruction cache miss. The decode pipeline works at a 32 Byte granularity.", "SampleAfterValue": "500009", "UMask": "0x4", "Unit": "cpu_core" @@ -314,7 +313,6 @@ "EdgeDetect": "1", "EventCode": "0x80", "EventName": "ICACHE_DATA.STALL_PERIODS", - "PublicDescription": "ICACHE_DATA.STALL_PERIODS Available PDIST counters: 0", "SampleAfterValue": "500009", "UMask": "0x4", "Unit": "cpu_core" @@ -324,7 +322,7 @@ "Counter": "0,1,2,3", "EventCode": "0x83", "EventName": "ICACHE_TAG.STALLS", - "PublicDescription": "Counts cycles where a code fetch is stalled due to L1 instruction cache tag miss. Available PDIST counters: 0", + "PublicDescription": "Counts cycles where a code fetch is stalled due to L1 instruction cache tag miss.", "SampleAfterValue": "200003", "UMask": "0x4", "Unit": "cpu_core" @@ -335,7 +333,7 @@ "CounterMask": "1", "EventCode": "0x79", "EventName": "IDQ.DSB_CYCLES_ANY", - "PublicDescription": "Counts the number of cycles uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path.", "SampleAfterValue": "2000003", "UMask": "0x8", "Unit": "cpu_core" @@ -346,7 +344,7 @@ "CounterMask": "6", "EventCode": "0x79", "EventName": "IDQ.DSB_CYCLES_OK", - "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the DSB (Decode Stream Buffer) path. Count includes uops that may 'bypass' the IDQ. Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the DSB (Decode Stream Buffer) path. Count includes uops that may 'bypass' the IDQ.", "SampleAfterValue": "2000003", "UMask": "0x8", "Unit": "cpu_core" @@ -356,7 +354,7 @@ "Counter": "0,1,2,3", "EventCode": "0x79", "EventName": "IDQ.DSB_UOPS", - "PublicDescription": "Counts the number of uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path.", "SampleAfterValue": "2000003", "UMask": "0x8", "Unit": "cpu_core" @@ -367,7 +365,7 @@ "CounterMask": "1", "EventCode": "0x79", "EventName": "IDQ.MITE_CYCLES_ANY", - "PublicDescription": "Counts the number of cycles uops were delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB). Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles uops were delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB).", "SampleAfterValue": "2000003", "UMask": "0x4", "Unit": "cpu_core" @@ -378,7 +376,7 @@ "CounterMask": "6", "EventCode": "0x79", "EventName": "IDQ.MITE_CYCLES_OK", - "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB). Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB).", "SampleAfterValue": "2000003", "UMask": "0x4", "Unit": "cpu_core" @@ -388,7 +386,7 @@ "Counter": "0,1,2,3", "EventCode": "0x79", "EventName": "IDQ.MITE_UOPS", - "PublicDescription": "Counts the number of uops delivered to Instruction Decode Queue (IDQ) from the MITE path. This also means that uops are not being delivered from the Decode Stream Buffer (DSB). Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops delivered to Instruction Decode Queue (IDQ) from the MITE path. This also means that uops are not being delivered from the Decode Stream Buffer (DSB).", "SampleAfterValue": "2000003", "UMask": "0x4", "Unit": "cpu_core" @@ -399,7 +397,7 @@ "CounterMask": "1", "EventCode": "0x79", "EventName": "IDQ.MS_CYCLES_ANY", - "PublicDescription": "Counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Uops maybe initiated by Decode Stream Buffer (DSB) or MITE. Available PDIST counters: 0", + "PublicDescription": "Counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Uops maybe initiated by Decode Stream Buffer (DSB) or MITE.", "SampleAfterValue": "2000003", "UMask": "0x20", "Unit": "cpu_core" @@ -411,7 +409,7 @@ "EdgeDetect": "1", "EventCode": "0x79", "EventName": "IDQ.MS_SWITCHES", - "PublicDescription": "Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to the Microcode Sequencer. Available PDIST counters: 0", + "PublicDescription": "Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to the Microcode Sequencer.", "SampleAfterValue": "100003", "UMask": "0x20", "Unit": "cpu_core" @@ -421,7 +419,7 @@ "Counter": "0,1,2,3", "EventCode": "0x79", "EventName": "IDQ.MS_UOPS", - "PublicDescription": "Counts the total number of uops delivered by the Microcode Sequencer (MS). Available PDIST counters: 0", + "PublicDescription": "Counts the total number of uops delivered by the Microcode Sequencer (MS).", "SampleAfterValue": "1000003", "UMask": "0x20", "Unit": "cpu_core" @@ -431,7 +429,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x9c", "EventName": "IDQ_BUBBLES.CORE", - "PublicDescription": "Counts the number of uops not delivered to by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_UOPS_NOT_DELIVERED.CORE] Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops not delivered to by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_UOPS_NOT_DELIVERED.CORE]", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -442,7 +440,7 @@ "CounterMask": "6", "EventCode": "0x9c", "EventName": "IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE", - "PublicDescription": "Counts the number of cycles when no uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE] Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles when no uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE]", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -454,7 +452,7 @@ "EventCode": "0x9c", "EventName": "IDQ_BUBBLES.CYCLES_FE_WAS_OK", "Invert": "1", - "PublicDescription": "Counts the number of cycles when the optimal number of uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK] Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles when the optimal number of uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK]", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -464,7 +462,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x9c", "EventName": "IDQ_UOPS_NOT_DELIVERED.CORE", - "PublicDescription": "Counts the number of uops not delivered to by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_BUBBLES.CORE] Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops not delivered to by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_BUBBLES.CORE]", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -475,7 +473,7 @@ "CounterMask": "6", "EventCode": "0x9c", "EventName": "IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", - "PublicDescription": "Counts the number of cycles when no uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE] Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles when no uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE]", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -487,7 +485,7 @@ "EventCode": "0x9c", "EventName": "IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK", "Invert": "1", - "PublicDescription": "Counts the number of cycles when the optimal number of uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_BUBBLES.CYCLES_FE_WAS_OK] Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles when the optimal number of uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_BUBBLES.CYCLES_FE_WAS_OK]", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/alderlake/memory.json b/tools/perf/pmu-events/arch/x86/alderlake/memory.json index 07f5786bdbc0..a0260d5b8619 100644 --- a/tools/perf/pmu-events/arch/x86/alderlake/memory.json +++ b/tools/perf/pmu-events/arch/x86/alderlake/memory.json @@ -5,7 +5,6 @@ "CounterMask": "6", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.STALLS_L3_MISS", - "PublicDescription": "Execution stalls while L3 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x6", "Unit": "cpu_core" @@ -79,7 +78,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc3", "EventName": "MACHINE_CLEARS.MEMORY_ORDERING", - "PublicDescription": "Counts the number of Machine Clears detected dye to memory ordering. Memory Ordering Machine Clears may apply when a memory read may not conform to the memory ordering rules of the x86 architecture Available PDIST counters: 0", + "PublicDescription": "Counts the number of Machine Clears detected dye to memory ordering. Memory Ordering Machine Clears may apply when a memory read may not conform to the memory ordering rules of the x86 architecture", "SampleAfterValue": "100003", "UMask": "0x2", "Unit": "cpu_core" @@ -90,7 +89,6 @@ "CounterMask": "2", "EventCode": "0x47", "EventName": "MEMORY_ACTIVITY.CYCLES_L1D_MISS", - "PublicDescription": "Cycles while L1 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x2", "Unit": "cpu_core" @@ -101,7 +99,6 @@ "CounterMask": "3", "EventCode": "0x47", "EventName": "MEMORY_ACTIVITY.STALLS_L1D_MISS", - "PublicDescription": "Execution stalls while L1 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x3", "Unit": "cpu_core" @@ -112,7 +109,7 @@ "CounterMask": "5", "EventCode": "0x47", "EventName": "MEMORY_ACTIVITY.STALLS_L2_MISS", - "PublicDescription": "Execution stalls while L2 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock). Available PDIST counters: 0", + "PublicDescription": "Execution stalls while L2 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock).", "SampleAfterValue": "1000003", "UMask": "0x5", "Unit": "cpu_core" @@ -123,7 +120,7 @@ "CounterMask": "9", "EventCode": "0x47", "EventName": "MEMORY_ACTIVITY.STALLS_L3_MISS", - "PublicDescription": "Execution stalls while L3 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock). Available PDIST counters: 0", + "PublicDescription": "Execution stalls while L3 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock).", "SampleAfterValue": "1000003", "UMask": "0x9", "Unit": "cpu_core" @@ -417,7 +414,6 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", - "PublicDescription": "Counts demand data read requests that miss the L3 cache. Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x10", "Unit": "cpu_core" @@ -427,7 +423,7 @@ "Counter": "0,1,2,3", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD", - "PublicDescription": "For every cycle, increments by the number of demand data read requests pending that are known to have missed the L3 cache. Note that this does not capture all elapsed cycles while requests are outstanding - only cycles from when the requests were known by the requesting core to have missed the L3 cache. Available PDIST counters: 0", + "PublicDescription": "For every cycle, increments by the number of demand data read requests pending that are known to have missed the L3 cache. Note that this does not capture all elapsed cycles while requests are outstanding - only cycles from when the requests were known by the requesting core to have missed the L3 cache.", "SampleAfterValue": "2000003", "UMask": "0x10", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/alderlake/other.json b/tools/perf/pmu-events/arch/x86/alderlake/other.json index 5f64138edfe4..af46cde26b54 100644 --- a/tools/perf/pmu-events/arch/x86/alderlake/other.json +++ b/tools/perf/pmu-events/arch/x86/alderlake/other.json @@ -4,7 +4,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc1", "EventName": "ASSISTS.HARDWARE", - "PublicDescription": "Count all other hardware assists or traps that are not necessarily architecturally exposed (through a software handler) beyond FP; SSE-AVX mix and A/D assists who are counted by dedicated sub-events. This includes, but not limited to, assists at EXE or MEM uop writeback like AVX* load/store/gather/scatter (non-FP GSSE-assist ) , assists generated by ROB like PEBS and RTIT, Uncore trap, RAR (Remote Action Request) and CET (Control flow Enforcement Technology) assists. the event also counts for Machine Ordering count. Available PDIST counters: 0", + "PublicDescription": "Count all other hardware assists or traps that are not necessarily architecturally exposed (through a software handler) beyond FP; SSE-AVX mix and A/D assists who are counted by dedicated sub-events. This includes, but not limited to, assists at EXE or MEM uop writeback like AVX* load/store/gather/scatter (non-FP GSSE-assist ) , assists generated by ROB like PEBS and RTIT, Uncore trap, RAR (Remote Action Request) and CET (Control flow Enforcement Technology) assists. the event also counts for Machine Ordering count.", "SampleAfterValue": "100003", "UMask": "0x4", "Unit": "cpu_core" @@ -14,7 +14,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc1", "EventName": "ASSISTS.PAGE_FAULT", - "PublicDescription": "ASSISTS.PAGE_FAULT Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8", "Unit": "cpu_core" @@ -24,7 +23,6 @@ "Counter": "0,1,2,3", "EventCode": "0x28", "EventName": "CORE_POWER.LICENSE_1", - "PublicDescription": "CORE_POWER.LICENSE_1 Available PDIST counters: 0", "SampleAfterValue": "200003", "UMask": "0x2", "Unit": "cpu_core" @@ -34,7 +32,6 @@ "Counter": "0,1,2,3", "EventCode": "0x28", "EventName": "CORE_POWER.LICENSE_2", - "PublicDescription": "CORE_POWER.LICENSE_2 Available PDIST counters: 0", "SampleAfterValue": "200003", "UMask": "0x4", "Unit": "cpu_core" @@ -44,7 +41,6 @@ "Counter": "0,1,2,3", "EventCode": "0x28", "EventName": "CORE_POWER.LICENSE_3", - "PublicDescription": "CORE_POWER.LICENSE_3 Available PDIST counters: 0", "SampleAfterValue": "200003", "UMask": "0x8", "Unit": "cpu_core" @@ -113,7 +109,7 @@ "CounterMask": "1", "EventCode": "0x2d", "EventName": "XQ.FULL_CYCLES", - "PublicDescription": "number of cycles when the thread is active and the uncore cannot take any further requests (for example prefetches, loads or stores initiated by the Core that miss the L2 cache). Available PDIST counters: 0", + "PublicDescription": "number of cycles when the thread is active and the uncore cannot take any further requests (for example prefetches, loads or stores initiated by the Core that miss the L2 cache).", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json b/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json index 48ef2a8cc49a..33d1f39e441f 100644 --- a/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/alderlake/pipeline.json @@ -6,7 +6,6 @@ "Deprecated": "1", "EventCode": "0xb0", "EventName": "ARITH.DIVIDER_ACTIVE", - "PublicDescription": "This event is deprecated. Refer to new event ARITH.DIV_ACTIVE Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x9", "Unit": "cpu_core" @@ -27,7 +26,7 @@ "CounterMask": "1", "EventCode": "0xb0", "EventName": "ARITH.DIV_ACTIVE", - "PublicDescription": "Counts cycles when divide unit is busy executing divide or square root operations. Accounts for integer and floating-point operations. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when divide unit is busy executing divide or square root operations. Accounts for integer and floating-point operations.", "SampleAfterValue": "1000003", "UMask": "0x9", "Unit": "cpu_core" @@ -57,7 +56,6 @@ "Deprecated": "1", "EventCode": "0xb0", "EventName": "ARITH.FP_DIVIDER_ACTIVE", - "PublicDescription": "This event is deprecated. Refer to new event ARITH.FPDIV_ACTIVE Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -78,7 +76,6 @@ "CounterMask": "1", "EventCode": "0xb0", "EventName": "ARITH.IDIV_ACTIVE", - "PublicDescription": "This event counts the cycles the integer divider is busy. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8", "Unit": "cpu_core" @@ -108,7 +105,6 @@ "Deprecated": "1", "EventCode": "0xb0", "EventName": "ARITH.INT_DIVIDER_ACTIVE", - "PublicDescription": "This event is deprecated. Refer to new event ARITH.IDIV_ACTIVE Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8", "Unit": "cpu_core" @@ -118,7 +114,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc1", "EventName": "ASSISTS.ANY", - "PublicDescription": "Counts the number of occurrences where a microcode assist is invoked by hardware. Examples include AD (page Access Dirty), FP and AVX related assists. Available PDIST counters: 0", + "PublicDescription": "Counts the number of occurrences where a microcode assist is invoked by hardware. Examples include AD (page Access Dirty), FP and AVX related assists.", "SampleAfterValue": "100003", "UMask": "0x1b", "Unit": "cpu_core" @@ -549,7 +545,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.C01", - "PublicDescription": "Counts core clocks when the thread is in the C0.1 light-weight slower wakeup time but more power saving optimized state. This state can be entered via the TPAUSE or UMWAIT instructions. Available PDIST counters: 0", + "PublicDescription": "Counts core clocks when the thread is in the C0.1 light-weight slower wakeup time but more power saving optimized state. This state can be entered via the TPAUSE or UMWAIT instructions.", "SampleAfterValue": "2000003", "UMask": "0x10", "Unit": "cpu_core" @@ -559,7 +555,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.C02", - "PublicDescription": "Counts core clocks when the thread is in the C0.2 light-weight faster wakeup time but less power saving optimized state. This state can be entered via the TPAUSE or UMWAIT instructions. Available PDIST counters: 0", + "PublicDescription": "Counts core clocks when the thread is in the C0.2 light-weight faster wakeup time but less power saving optimized state. This state can be entered via the TPAUSE or UMWAIT instructions.", "SampleAfterValue": "2000003", "UMask": "0x20", "Unit": "cpu_core" @@ -569,7 +565,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.C0_WAIT", - "PublicDescription": "Counts core clocks when the thread is in the C0.1 or C0.2 power saving optimized states (TPAUSE or UMWAIT instructions) or running the PAUSE instruction. Available PDIST counters: 0", + "PublicDescription": "Counts core clocks when the thread is in the C0.1 or C0.2 power saving optimized states (TPAUSE or UMWAIT instructions) or running the PAUSE instruction.", "SampleAfterValue": "2000003", "UMask": "0x70", "Unit": "cpu_core" @@ -597,7 +593,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.DISTRIBUTED", - "PublicDescription": "This event distributes cycle counts between active hyperthreads, i.e., those in C0. A hyperthread becomes inactive when it executes the HLT or MWAIT instructions. If all other hyperthreads are inactive (or disabled or do not exist), all counts are attributed to this hyperthread. To obtain the full count when the Core is active, sum the counts from each hyperthread. Available PDIST counters: 0", + "PublicDescription": "This event distributes cycle counts between active hyperthreads, i.e., those in C0. A hyperthread becomes inactive when it executes the HLT or MWAIT instructions. If all other hyperthreads are inactive (or disabled or do not exist), all counts are attributed to this hyperthread. To obtain the full count when the Core is active, sum the counts from each hyperthread.", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -607,7 +603,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x3c", "EventName": "CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE", - "PublicDescription": "Counts Core crystal clock cycles when current thread is unhalted and the other thread is halted. Available PDIST counters: 0", + "PublicDescription": "Counts Core crystal clock cycles when current thread is unhalted and the other thread is halted.", "SampleAfterValue": "25003", "UMask": "0x2", "Unit": "cpu_core" @@ -617,7 +613,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.PAUSE", - "PublicDescription": "CPU_CLK_UNHALTED.PAUSE Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x40", "Unit": "cpu_core" @@ -629,7 +624,6 @@ "EdgeDetect": "1", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.PAUSE_INST", - "PublicDescription": "CPU_CLK_UNHALTED.PAUSE_INST Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x40", "Unit": "cpu_core" @@ -649,7 +643,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x3c", "EventName": "CPU_CLK_UNHALTED.REF_DISTRIBUTED", - "PublicDescription": "This event distributes Core crystal clock cycle counts between active hyperthreads, i.e., those in C0 sleep-state. A hyperthread becomes inactive when it executes the HLT or MWAIT instructions. If one thread is active in a core, all counts are attributed to this hyperthread. To obtain the full count when the Core is active, sum the counts from each hyperthread. Available PDIST counters: 0", + "PublicDescription": "This event distributes Core crystal clock cycle counts between active hyperthreads, i.e., those in C0 sleep-state. A hyperthread becomes inactive when it executes the HLT or MWAIT instructions. If one thread is active in a core, all counts are attributed to this hyperthread. To obtain the full count when the Core is active, sum the counts from each hyperthread.", "SampleAfterValue": "2000003", "UMask": "0x8", "Unit": "cpu_core" @@ -687,7 +681,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x3c", "EventName": "CPU_CLK_UNHALTED.REF_TSC_P", - "PublicDescription": "Counts the number of reference cycles when the core is not in a halt state. The core enters the halt state when it is running the HLT instruction or the MWAIT instruction. This event is not affected by core frequency changes (for example, P states, TM2 transitions) but has the same incrementing frequency as the time stamp counter. This event can approximate elapsed time while the core was not in a halt state. It is counted on a dedicated fixed counter, leaving the four (eight when Hyperthreading is disabled) programmable counters available for other events. Note: On all current platforms this event stops counting during 'throttling (TM)' states duty off periods the processor is 'halted'. The counter update is done at a lower clock rate then the core clock the overflow status bit for this counter may appear 'sticky'. After the counter has overflowed and software clears the overflow status bit and resets the counter to less than MAX. The reset value to the counter is not clocked immediately so the overflow status bit will flip 'high (1)' and generate another PMI (if enabled) after which the reset value gets clocked into the counter. Therefore, software will get the interrupt, read the overflow status bit '1 for bit 34 while the counter value is less than MAX. Software should ignore this case. Available PDIST counters: 0", + "PublicDescription": "Counts the number of reference cycles when the core is not in a halt state. The core enters the halt state when it is running the HLT instruction or the MWAIT instruction. This event is not affected by core frequency changes (for example, P states, TM2 transitions) but has the same incrementing frequency as the time stamp counter. This event can approximate elapsed time while the core was not in a halt state. It is counted on a dedicated fixed counter, leaving the four (eight when Hyperthreading is disabled) programmable counters available for other events. Note: On all current platforms this event stops counting during 'throttling (TM)' states duty off periods the processor is 'halted'. The counter update is done at a lower clock rate then the core clock the overflow status bit for this counter may appear 'sticky'. After the counter has overflowed and software clears the overflow status bit and resets the counter to less than MAX. The reset value to the counter is not clocked immediately so the overflow status bit will flip 'high (1)' and generate another PMI (if enabled) after which the reset value gets clocked into the counter. Therefore, software will get the interrupt, read the overflow status bit '1 for bit 34 while the counter value is less than MAX. Software should ignore this case.", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -724,7 +718,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x3c", "EventName": "CPU_CLK_UNHALTED.THREAD_P", - "PublicDescription": "This is an architectural event that counts the number of thread cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. The core frequency may change from time to time due to power or thermal throttling. For this reason, this event may have a changing ratio with regards to wall clock time. Available PDIST counters: 0", + "PublicDescription": "This is an architectural event that counts the number of thread cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. The core frequency may change from time to time due to power or thermal throttling. For this reason, this event may have a changing ratio with regards to wall clock time.", "SampleAfterValue": "2000003", "Unit": "cpu_core" }, @@ -734,7 +728,6 @@ "CounterMask": "8", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.CYCLES_L1D_MISS", - "PublicDescription": "Cycles while L1 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8", "Unit": "cpu_core" @@ -745,7 +738,6 @@ "CounterMask": "1", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.CYCLES_L2_MISS", - "PublicDescription": "Cycles while L2 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -756,7 +748,6 @@ "CounterMask": "16", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.CYCLES_MEM_ANY", - "PublicDescription": "Cycles while memory subsystem has an outstanding load. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x10", "Unit": "cpu_core" @@ -767,7 +758,6 @@ "CounterMask": "12", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.STALLS_L1D_MISS", - "PublicDescription": "Execution stalls while L1 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0xc", "Unit": "cpu_core" @@ -778,7 +768,6 @@ "CounterMask": "5", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.STALLS_L2_MISS", - "PublicDescription": "Execution stalls while L2 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x5", "Unit": "cpu_core" @@ -789,7 +778,6 @@ "CounterMask": "4", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.STALLS_TOTAL", - "PublicDescription": "Total execution stalls. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x4", "Unit": "cpu_core" @@ -799,7 +787,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.1_PORTS_UTIL", - "PublicDescription": "Counts cycles during which a total of 1 uop was executed on all ports and Reservation Station (RS) was not empty. Available PDIST counters: 0", + "PublicDescription": "Counts cycles during which a total of 1 uop was executed on all ports and Reservation Station (RS) was not empty.", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -809,7 +797,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.2_3_PORTS_UTIL", - "PublicDescription": "Cycles total of 2 or 3 uops are executed on all ports and Reservation Station (RS) was not empty. Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0xc", "Unit": "cpu_core" @@ -819,7 +806,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.2_PORTS_UTIL", - "PublicDescription": "Counts cycles during which a total of 2 uops were executed on all ports and Reservation Station (RS) was not empty. Available PDIST counters: 0", + "PublicDescription": "Counts cycles during which a total of 2 uops were executed on all ports and Reservation Station (RS) was not empty.", "SampleAfterValue": "2000003", "UMask": "0x4", "Unit": "cpu_core" @@ -829,7 +816,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.3_PORTS_UTIL", - "PublicDescription": "Cycles total of 3 uops are executed on all ports and Reservation Station (RS) was not empty. Available PDIST counters: 0", + "PublicDescription": "Cycles total of 3 uops are executed on all ports and Reservation Station (RS) was not empty.", "SampleAfterValue": "2000003", "UMask": "0x8", "Unit": "cpu_core" @@ -839,7 +826,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.4_PORTS_UTIL", - "PublicDescription": "Cycles total of 4 uops are executed on all ports and Reservation Station (RS) was not empty. Available PDIST counters: 0", + "PublicDescription": "Cycles total of 4 uops are executed on all ports and Reservation Station (RS) was not empty.", "SampleAfterValue": "2000003", "UMask": "0x10", "Unit": "cpu_core" @@ -850,7 +837,6 @@ "CounterMask": "5", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.BOUND_ON_LOADS", - "PublicDescription": "Execution stalls while memory subsystem has an outstanding load. Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x21", "Unit": "cpu_core" @@ -861,7 +847,7 @@ "CounterMask": "2", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.BOUND_ON_STORES", - "PublicDescription": "Counts cycles where the Store Buffer was full and no loads caused an execution stall. Available PDIST counters: 0", + "PublicDescription": "Counts cycles where the Store Buffer was full and no loads caused an execution stall.", "SampleAfterValue": "1000003", "UMask": "0x40", "Unit": "cpu_core" @@ -871,7 +857,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.EXE_BOUND_0_PORTS", - "PublicDescription": "Number of cycles total of 0 uops executed on all ports, Reservation Station (RS) was not empty, the Store Buffer (SB) was not full and there was no outstanding load. Available PDIST counters: 0", + "PublicDescription": "Number of cycles total of 0 uops executed on all ports, Reservation Station (RS) was not empty, the Store Buffer (SB) was not full and there was no outstanding load.", "SampleAfterValue": "1000003", "UMask": "0x80", "Unit": "cpu_core" @@ -881,7 +867,7 @@ "Counter": "0,1,2,3", "EventCode": "0x75", "EventName": "INST_DECODED.DECODERS", - "PublicDescription": "Number of decoders utilized in a cycle when the MITE (legacy decode pipeline) fetches instructions. Available PDIST counters: 0", + "PublicDescription": "Number of decoders utilized in a cycle when the MITE (legacy decode pipeline) fetches instructions.", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -927,7 +913,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc0", "EventName": "INST_RETIRED.MACRO_FUSED", - "PublicDescription": "INST_RETIRED.MACRO_FUSED Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x10", "Unit": "cpu_core" @@ -937,7 +922,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc0", "EventName": "INST_RETIRED.NOP", - "PublicDescription": "Counts all retired NOP or ENDBR32/64 instructions Available PDIST counters: 0", + "PublicDescription": "Counts all retired NOP or ENDBR32/64 instructions", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -956,7 +941,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc0", "EventName": "INST_RETIRED.REP_ITERATION", - "PublicDescription": "Number of iterations of Repeat (REP) string retired instructions such as MOVS, CMPS, and SCAS. Each has a byte, word, and doubleword version and string instructions can be repeated using a repetition prefix, REP, that allows their architectural execution to be repeated a number of times as specified by the RCX register. Note the number of iterations is implementation-dependent. Available PDIST counters: 0", + "PublicDescription": "Number of iterations of Repeat (REP) string retired instructions such as MOVS, CMPS, and SCAS. Each has a byte, word, and doubleword version and string instructions can be repeated using a repetition prefix, REP, that allows their architectural execution to be repeated a number of times as specified by the RCX register. Note the number of iterations is implementation-dependent.", "SampleAfterValue": "2000003", "UMask": "0x8", "Unit": "cpu_core" @@ -968,7 +953,7 @@ "EdgeDetect": "1", "EventCode": "0xad", "EventName": "INT_MISC.CLEARS_COUNT", - "PublicDescription": "Counts the number of speculative clears due to any type of branch misprediction or machine clears Available PDIST counters: 0", + "PublicDescription": "Counts the number of speculative clears due to any type of branch misprediction or machine clears", "SampleAfterValue": "500009", "UMask": "0x1", "Unit": "cpu_core" @@ -978,7 +963,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xad", "EventName": "INT_MISC.CLEAR_RESTEER_CYCLES", - "PublicDescription": "Cycles after recovery from a branch misprediction or machine clear till the first uop is issued from the resteered path. Available PDIST counters: 0", + "PublicDescription": "Cycles after recovery from a branch misprediction or machine clear till the first uop is issued from the resteered path.", "SampleAfterValue": "500009", "UMask": "0x80", "Unit": "cpu_core" @@ -988,7 +973,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xad", "EventName": "INT_MISC.RECOVERY_CYCLES", - "PublicDescription": "Counts core cycles when the Resource allocator was stalled due to recovery from an earlier branch misprediction or machine clear event. Available PDIST counters: 0", + "PublicDescription": "Counts core cycles when the Resource allocator was stalled due to recovery from an earlier branch misprediction or machine clear event.", "SampleAfterValue": "500009", "UMask": "0x1", "Unit": "cpu_core" @@ -1000,7 +985,6 @@ "EventName": "INT_MISC.UNKNOWN_BRANCH_CYCLES", "MSRIndex": "0x3F7", "MSRValue": "0x7", - "PublicDescription": "Bubble cycles of BAClear (Unknown Branch). Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x40", "Unit": "cpu_core" @@ -1010,7 +994,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xad", "EventName": "INT_MISC.UOP_DROPPING", - "PublicDescription": "Estimated number of Top-down Microarchitecture Analysis slots that got dropped due to non front-end reasons Available PDIST counters: 0", + "PublicDescription": "Estimated number of Top-down Microarchitecture Analysis slots that got dropped due to non front-end reasons", "SampleAfterValue": "1000003", "UMask": "0x10", "Unit": "cpu_core" @@ -1020,7 +1004,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.128BIT", - "PublicDescription": "INT_VEC_RETIRED.128BIT Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x13", "Unit": "cpu_core" @@ -1030,7 +1013,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.256BIT", - "PublicDescription": "INT_VEC_RETIRED.256BIT Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0xac", "Unit": "cpu_core" @@ -1040,7 +1022,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.ADD_128", - "PublicDescription": "Number of retired integer ADD/SUB (regular or horizontal), SAD 128-bit vector instructions. Available PDIST counters: 0", + "PublicDescription": "Number of retired integer ADD/SUB (regular or horizontal), SAD 128-bit vector instructions.", "SampleAfterValue": "1000003", "UMask": "0x3", "Unit": "cpu_core" @@ -1050,7 +1032,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.ADD_256", - "PublicDescription": "Number of retired integer ADD/SUB (regular or horizontal), SAD 256-bit vector instructions. Available PDIST counters: 0", + "PublicDescription": "Number of retired integer ADD/SUB (regular or horizontal), SAD 256-bit vector instructions.", "SampleAfterValue": "1000003", "UMask": "0xc", "Unit": "cpu_core" @@ -1060,7 +1042,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.MUL_256", - "PublicDescription": "INT_VEC_RETIRED.MUL_256 Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x80", "Unit": "cpu_core" @@ -1070,7 +1051,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.SHUFFLES", - "PublicDescription": "INT_VEC_RETIRED.SHUFFLES Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x40", "Unit": "cpu_core" @@ -1080,7 +1060,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.VNNI_128", - "PublicDescription": "INT_VEC_RETIRED.VNNI_128 Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x10", "Unit": "cpu_core" @@ -1090,7 +1069,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.VNNI_256", - "PublicDescription": "INT_VEC_RETIRED.VNNI_256 Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x20", "Unit": "cpu_core" @@ -1119,7 +1097,7 @@ "Counter": "0,1,2,3", "EventCode": "0x03", "EventName": "LD_BLOCKS.ADDRESS_ALIAS", - "PublicDescription": "Counts the number of times a load got blocked due to false dependencies in MOB due to partial compare on address. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times a load got blocked due to false dependencies in MOB due to partial compare on address.", "SampleAfterValue": "100003", "UMask": "0x4", "Unit": "cpu_core" @@ -1138,7 +1116,7 @@ "Counter": "0,1,2,3", "EventCode": "0x03", "EventName": "LD_BLOCKS.NO_SR", - "PublicDescription": "Counts the number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use.", "SampleAfterValue": "100003", "UMask": "0x88", "Unit": "cpu_core" @@ -1148,7 +1126,7 @@ "Counter": "0,1,2,3", "EventCode": "0x03", "EventName": "LD_BLOCKS.STORE_FORWARD", - "PublicDescription": "Counts the number of times where store forwarding was prevented for a load operation. The most common case is a load blocked due to the address of memory access (partially) overlapping with a preceding uncompleted store. Note: See the table of not supported store forwards in the Optimization Guide. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times where store forwarding was prevented for a load operation. The most common case is a load blocked due to the address of memory access (partially) overlapping with a preceding uncompleted store. Note: See the table of not supported store forwards in the Optimization Guide.", "SampleAfterValue": "100003", "UMask": "0x82", "Unit": "cpu_core" @@ -1158,7 +1136,7 @@ "Counter": "0,1,2,3", "EventCode": "0x4c", "EventName": "LOAD_HIT_PREFETCH.SWPF", - "PublicDescription": "Counts all software-prefetch load dispatches that hit the fill buffer (FB) allocated for the software prefetch. It can also be incremented by some lock instructions. So it should only be used with profiling so that the locks can be excluded by ASM (Assembly File) inspection of the nearby instructions. Available PDIST counters: 0", + "PublicDescription": "Counts all software-prefetch load dispatches that hit the fill buffer (FB) allocated for the software prefetch. It can also be incremented by some lock instructions. So it should only be used with profiling so that the locks can be excluded by ASM (Assembly File) inspection of the nearby instructions.", "SampleAfterValue": "100003", "UMask": "0x1", "Unit": "cpu_core" @@ -1169,7 +1147,7 @@ "CounterMask": "1", "EventCode": "0xa8", "EventName": "LSD.CYCLES_ACTIVE", - "PublicDescription": "Counts the cycles when at least one uop is delivered by the LSD (Loop-stream detector). Available PDIST counters: 0", + "PublicDescription": "Counts the cycles when at least one uop is delivered by the LSD (Loop-stream detector).", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1180,7 +1158,7 @@ "CounterMask": "6", "EventCode": "0xa8", "EventName": "LSD.CYCLES_OK", - "PublicDescription": "Counts the cycles when optimal number of uops is delivered by the LSD (Loop-stream detector). Available PDIST counters: 0", + "PublicDescription": "Counts the cycles when optimal number of uops is delivered by the LSD (Loop-stream detector).", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1190,7 +1168,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa8", "EventName": "LSD.UOPS", - "PublicDescription": "Counts the number of uops delivered to the back-end by the LSD(Loop Stream Detector). Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops delivered to the back-end by the LSD(Loop Stream Detector).", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1202,7 +1180,7 @@ "EdgeDetect": "1", "EventCode": "0xc3", "EventName": "MACHINE_CLEARS.COUNT", - "PublicDescription": "Counts the number of machine clears (nukes) of any type. Available PDIST counters: 0", + "PublicDescription": "Counts the number of machine clears (nukes) of any type.", "SampleAfterValue": "100003", "UMask": "0x1", "Unit": "cpu_core" @@ -1258,7 +1236,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc3", "EventName": "MACHINE_CLEARS.SMC", - "PublicDescription": "Counts self-modifying code (SMC) detected, which causes a machine clear. Available PDIST counters: 0", + "PublicDescription": "Counts self-modifying code (SMC) detected, which causes a machine clear.", "SampleAfterValue": "100003", "UMask": "0x4", "Unit": "cpu_core" @@ -1268,7 +1246,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe0", "EventName": "MISC2_RETIRED.LFENCE", - "PublicDescription": "number of LFENCE retired instructions Available PDIST counters: 0", + "PublicDescription": "number of LFENCE retired instructions", "SampleAfterValue": "400009", "UMask": "0x20", "Unit": "cpu_core" @@ -1288,7 +1266,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xcc", "EventName": "MISC_RETIRED.LBR_INSERTS", - "PublicDescription": "Increments when an entry is added to the Last Branch Record (LBR) array (or removed from the array in case of RETURNs in call stack mode). The event requires LBR enable via IA32_DEBUGCTL MSR and branch type selection via MSR_LBR_SELECT. Available PDIST counters: 0", + "PublicDescription": "Increments when an entry is added to the Last Branch Record (LBR) array (or removed from the array in case of RETURNs in call stack mode). The event requires LBR enable via IA32_DEBUGCTL MSR and branch type selection via MSR_LBR_SELECT.", "SampleAfterValue": "100003", "UMask": "0x20", "Unit": "cpu_core" @@ -1298,7 +1276,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa2", "EventName": "RESOURCE_STALLS.SB", - "PublicDescription": "Counts allocation stall cycles caused by the store buffer (SB) being full. This counts cycles that the pipeline back-end blocked uop delivery from the front-end. Available PDIST counters: 0", + "PublicDescription": "Counts allocation stall cycles caused by the store buffer (SB) being full. This counts cycles that the pipeline back-end blocked uop delivery from the front-end.", "SampleAfterValue": "100003", "UMask": "0x8", "Unit": "cpu_core" @@ -1308,7 +1286,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa2", "EventName": "RESOURCE_STALLS.SCOREBOARD", - "PublicDescription": "Counts cycles where the pipeline is stalled due to serializing operations. Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x2", "Unit": "cpu_core" @@ -1318,7 +1295,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa5", "EventName": "RS.EMPTY", - "PublicDescription": "Counts cycles during which the reservation station (RS) is empty for this logical processor. This is usually caused when the front-end pipeline runs into starvation periods (e.g. branch mispredictions or i-cache misses) Available PDIST counters: 0", + "PublicDescription": "Counts cycles during which the reservation station (RS) is empty for this logical processor. This is usually caused when the front-end pipeline runs into starvation periods (e.g. branch mispredictions or i-cache misses)", "SampleAfterValue": "1000003", "UMask": "0x7", "Unit": "cpu_core" @@ -1331,7 +1308,7 @@ "EventCode": "0xa5", "EventName": "RS.EMPTY_COUNT", "Invert": "1", - "PublicDescription": "Counts end of periods where the Reservation Station (RS) was empty. Could be useful to closely sample on front-end latency issues (see the FRONTEND_RETIRED event of designated precise events) Available PDIST counters: 0", + "PublicDescription": "Counts end of periods where the Reservation Station (RS) was empty. Could be useful to closely sample on front-end latency issues (see the FRONTEND_RETIRED event of designated precise events)", "SampleAfterValue": "100003", "UMask": "0x7", "Unit": "cpu_core" @@ -1341,7 +1318,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa5", "EventName": "RS.EMPTY_RESOURCE", - "PublicDescription": "Cycles when Reservation Station (RS) is empty due to a resource in the back-end Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1355,7 +1331,6 @@ "EventCode": "0xa5", "EventName": "RS_EMPTY.COUNT", "Invert": "1", - "PublicDescription": "This event is deprecated. Refer to new event RS.EMPTY_COUNT Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x7", "Unit": "cpu_core" @@ -1366,7 +1341,6 @@ "Deprecated": "1", "EventCode": "0xa5", "EventName": "RS_EMPTY.CYCLES", - "PublicDescription": "This event is deprecated. Refer to new event RS.EMPTY Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x7", "Unit": "cpu_core" @@ -1395,7 +1369,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa4", "EventName": "TOPDOWN.BACKEND_BOUND_SLOTS", - "PublicDescription": "Number of slots in TMA method where no micro-operations were being issued from front-end to back-end of the machine due to lack of back-end resources. Available PDIST counters: 0", + "PublicDescription": "Number of slots in TMA method where no micro-operations were being issued from front-end to back-end of the machine due to lack of back-end resources.", "SampleAfterValue": "10000003", "UMask": "0x2", "Unit": "cpu_core" @@ -1405,7 +1379,7 @@ "Counter": "0", "EventCode": "0xa4", "EventName": "TOPDOWN.BAD_SPEC_SLOTS", - "PublicDescription": "Number of slots of TMA method that were wasted due to incorrect speculation. It covers all types of control-flow or data-related mis-speculations. Available PDIST counters: 0", + "PublicDescription": "Number of slots of TMA method that were wasted due to incorrect speculation. It covers all types of control-flow or data-related mis-speculations.", "SampleAfterValue": "10000003", "UMask": "0x4", "Unit": "cpu_core" @@ -1415,7 +1389,7 @@ "Counter": "0", "EventCode": "0xa4", "EventName": "TOPDOWN.BR_MISPREDICT_SLOTS", - "PublicDescription": "Number of TMA slots that were wasted due to incorrect speculation by (any type of) branch mispredictions. This event estimates number of speculative operations that were issued but not retired as well as the out-of-order engine recovery past a branch misprediction. Available PDIST counters: 0", + "PublicDescription": "Number of TMA slots that were wasted due to incorrect speculation by (any type of) branch mispredictions. This event estimates number of speculative operations that were issued but not retired as well as the out-of-order engine recovery past a branch misprediction.", "SampleAfterValue": "10000003", "UMask": "0x8", "Unit": "cpu_core" @@ -1425,7 +1399,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa4", "EventName": "TOPDOWN.MEMORY_BOUND_SLOTS", - "PublicDescription": "TOPDOWN.MEMORY_BOUND_SLOTS Available PDIST counters: 0", "SampleAfterValue": "10000003", "UMask": "0x10", "Unit": "cpu_core" @@ -1444,7 +1417,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa4", "EventName": "TOPDOWN.SLOTS_P", - "PublicDescription": "Counts the number of available slots for an unhalted logical processor. The event increments by machine-width of the narrowest pipeline as employed by the Top-down Microarchitecture Analysis method. The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core. Available PDIST counters: 0", + "PublicDescription": "Counts the number of available slots for an unhalted logical processor. The event increments by machine-width of the narrowest pipeline as employed by the Top-down Microarchitecture Analysis method. The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core.", "SampleAfterValue": "10000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1661,7 +1634,6 @@ "Counter": "0,1,2,3", "EventCode": "0x76", "EventName": "UOPS_DECODED.DEC0_UOPS", - "PublicDescription": "UOPS_DECODED.DEC0_UOPS Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1671,7 +1643,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_0", - "PublicDescription": "Number of uops dispatch to execution port 0. Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution port 0.", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1681,7 +1653,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_1", - "PublicDescription": "Number of uops dispatch to execution port 1. Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution port 1.", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -1691,7 +1663,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_2_3_10", - "PublicDescription": "Number of uops dispatch to execution ports 2, 3 and 10 Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution ports 2, 3 and 10", "SampleAfterValue": "2000003", "UMask": "0x4", "Unit": "cpu_core" @@ -1701,7 +1673,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_4_9", - "PublicDescription": "Number of uops dispatch to execution ports 4 and 9 Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution ports 4 and 9", "SampleAfterValue": "2000003", "UMask": "0x10", "Unit": "cpu_core" @@ -1711,7 +1683,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_5_11", - "PublicDescription": "Number of uops dispatch to execution ports 5 and 11 Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution ports 5 and 11", "SampleAfterValue": "2000003", "UMask": "0x20", "Unit": "cpu_core" @@ -1721,7 +1693,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_6", - "PublicDescription": "Number of uops dispatch to execution port 6. Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution port 6.", "SampleAfterValue": "2000003", "UMask": "0x40", "Unit": "cpu_core" @@ -1731,7 +1703,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_7_8", - "PublicDescription": "Number of uops dispatch to execution ports 7 and 8. Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution ports 7 and 8.", "SampleAfterValue": "2000003", "UMask": "0x80", "Unit": "cpu_core" @@ -1742,7 +1714,7 @@ "CounterMask": "1", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_1", - "PublicDescription": "Counts cycles when at least 1 micro-op is executed from any thread on physical core. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least 1 micro-op is executed from any thread on physical core.", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -1753,7 +1725,7 @@ "CounterMask": "2", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_2", - "PublicDescription": "Counts cycles when at least 2 micro-ops are executed from any thread on physical core. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least 2 micro-ops are executed from any thread on physical core.", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -1764,7 +1736,7 @@ "CounterMask": "3", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_3", - "PublicDescription": "Counts cycles when at least 3 micro-ops are executed from any thread on physical core. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least 3 micro-ops are executed from any thread on physical core.", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -1775,7 +1747,7 @@ "CounterMask": "4", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_4", - "PublicDescription": "Counts cycles when at least 4 micro-ops are executed from any thread on physical core. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least 4 micro-ops are executed from any thread on physical core.", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -1786,7 +1758,7 @@ "CounterMask": "1", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CYCLES_GE_1", - "PublicDescription": "Cycles where at least 1 uop was executed per-thread. Available PDIST counters: 0", + "PublicDescription": "Cycles where at least 1 uop was executed per-thread.", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1797,7 +1769,7 @@ "CounterMask": "2", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CYCLES_GE_2", - "PublicDescription": "Cycles where at least 2 uops were executed per-thread. Available PDIST counters: 0", + "PublicDescription": "Cycles where at least 2 uops were executed per-thread.", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1808,7 +1780,7 @@ "CounterMask": "3", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CYCLES_GE_3", - "PublicDescription": "Cycles where at least 3 uops were executed per-thread. Available PDIST counters: 0", + "PublicDescription": "Cycles where at least 3 uops were executed per-thread.", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1819,7 +1791,7 @@ "CounterMask": "4", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CYCLES_GE_4", - "PublicDescription": "Cycles where at least 4 uops were executed per-thread. Available PDIST counters: 0", + "PublicDescription": "Cycles where at least 4 uops were executed per-thread.", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1831,7 +1803,7 @@ "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.STALLS", "Invert": "1", - "PublicDescription": "Counts cycles during which no uops were dispatched from the Reservation Station (RS) per thread. Available PDIST counters: 0", + "PublicDescription": "Counts cycles during which no uops were dispatched from the Reservation Station (RS) per thread.", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1844,7 +1816,6 @@ "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.STALL_CYCLES", "Invert": "1", - "PublicDescription": "This event is deprecated. Refer to new event UOPS_EXECUTED.STALLS Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1854,7 +1825,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.THREAD", - "PublicDescription": "Counts the number of uops to be executed per-thread each cycle. Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1864,7 +1834,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.X87", - "PublicDescription": "Counts the number of x87 uops executed. Available PDIST counters: 0", + "PublicDescription": "Counts the number of x87 uops executed.", "SampleAfterValue": "2000003", "UMask": "0x10", "Unit": "cpu_core" @@ -1883,7 +1853,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xae", "EventName": "UOPS_ISSUED.ANY", - "PublicDescription": "Counts the number of uops that the Resource Allocation Table (RAT) issues to the Reservation Station (RS). Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops that the Resource Allocation Table (RAT) issues to the Reservation Station (RS).", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1894,7 +1864,6 @@ "CounterMask": "1", "EventCode": "0xae", "EventName": "UOPS_ISSUED.CYCLES", - "PublicDescription": "UOPS_ISSUED.CYCLES Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1913,7 +1882,7 @@ "CounterMask": "1", "EventCode": "0xc2", "EventName": "UOPS_RETIRED.CYCLES", - "PublicDescription": "Counts cycles where at least one uop has retired. Available PDIST counters: 0", + "PublicDescription": "Counts cycles where at least one uop has retired.", "SampleAfterValue": "1000003", "UMask": "0x2", "Unit": "cpu_core" @@ -1923,7 +1892,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc2", "EventName": "UOPS_RETIRED.HEAVY", - "PublicDescription": "Counts the number of retired micro-operations (uops) except the last uop of each instruction. An instruction that is decoded into less than two uops does not contribute to the count. Available PDIST counters: 0", + "PublicDescription": "Counts the number of retired micro-operations (uops) except the last uop of each instruction. An instruction that is decoded into less than two uops does not contribute to the count.", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1954,7 +1923,6 @@ "EventName": "UOPS_RETIRED.MS", "MSRIndex": "0x3F7", "MSRValue": "0x8", - "PublicDescription": "UOPS_RETIRED.MS Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x4", "Unit": "cpu_core" @@ -1964,7 +1932,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc2", "EventName": "UOPS_RETIRED.SLOTS", - "PublicDescription": "Counts the retirement slots used each cycle. Available PDIST counters: 0", + "PublicDescription": "Counts the retirement slots used each cycle.", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -1976,7 +1944,7 @@ "EventCode": "0xc2", "EventName": "UOPS_RETIRED.STALLS", "Invert": "1", - "PublicDescription": "This event counts cycles without actually retired uops. Available PDIST counters: 0", + "PublicDescription": "This event counts cycles without actually retired uops.", "SampleAfterValue": "1000003", "UMask": "0x2", "Unit": "cpu_core" @@ -1989,7 +1957,6 @@ "EventCode": "0xc2", "EventName": "UOPS_RETIRED.STALL_CYCLES", "Invert": "1", - "PublicDescription": "This event is deprecated. Refer to new event UOPS_RETIRED.STALLS Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x2", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/alderlake/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/alderlake/uncore-interconnect.json index 7c0779c74154..b5604c7534e1 100644 --- a/tools/perf/pmu-events/arch/x86/alderlake/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/alderlake/uncore-interconnect.json @@ -65,7 +65,6 @@ "Counter": "0,1", "EventCode": "0x81", "EventName": "UNC_ARB_REQ_TRK_REQUEST.DRD", - "Experimental": "1", "PerPkg": "1", "UMask": "0x2", "Unit": "ARB" @@ -103,7 +102,6 @@ "Counter": "0,1", "EventCode": "0x81", "EventName": "UNC_ARB_TRK_REQUESTS.RD", - "Experimental": "1", "PerPkg": "1", "UMask": "0x2", "Unit": "ARB" diff --git a/tools/perf/pmu-events/arch/x86/alderlake/virtual-memory.json b/tools/perf/pmu-events/arch/x86/alderlake/virtual-memory.json index ffbbd08acc68..132ce48af6d9 100644 --- a/tools/perf/pmu-events/arch/x86/alderlake/virtual-memory.json +++ b/tools/perf/pmu-events/arch/x86/alderlake/virtual-memory.json @@ -4,7 +4,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.STLB_HIT", - "PublicDescription": "Counts loads that miss the DTLB (Data TLB) and hit the STLB (Second level TLB). Available PDIST counters: 0", + "PublicDescription": "Counts loads that miss the DTLB (Data TLB) and hit the STLB (Second level TLB).", "SampleAfterValue": "100003", "UMask": "0x20", "Unit": "cpu_core" @@ -15,7 +15,7 @@ "CounterMask": "1", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_ACTIVE", - "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a demand load. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a demand load.", "SampleAfterValue": "100003", "UMask": "0x10", "Unit": "cpu_core" @@ -35,7 +35,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED", - "PublicDescription": "Counts completed page walks (all page sizes) caused by demand data loads. This implies it missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (all page sizes) caused by demand data loads. This implies it missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0xe", "Unit": "cpu_core" @@ -45,7 +45,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_1G", - "PublicDescription": "Counts completed page walks (1G sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (1G sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x8", "Unit": "cpu_core" @@ -55,7 +55,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M", - "PublicDescription": "Counts completed page walks (2M/4M sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (2M/4M sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x4", "Unit": "cpu_core" @@ -65,7 +65,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_4K", - "PublicDescription": "Counts completed page walks (4K sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (4K sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x2", "Unit": "cpu_core" @@ -75,7 +75,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_PENDING", - "PublicDescription": "Counts the number of page walks outstanding for a demand load in the PMH (Page Miss Handler) each cycle. Available PDIST counters: 0", + "PublicDescription": "Counts the number of page walks outstanding for a demand load in the PMH (Page Miss Handler) each cycle.", "SampleAfterValue": "100003", "UMask": "0x10", "Unit": "cpu_core" @@ -85,7 +85,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.STLB_HIT", - "PublicDescription": "Counts stores that miss the DTLB (Data TLB) and hit the STLB (2nd Level TLB). Available PDIST counters: 0", + "PublicDescription": "Counts stores that miss the DTLB (Data TLB) and hit the STLB (2nd Level TLB).", "SampleAfterValue": "100003", "UMask": "0x20", "Unit": "cpu_core" @@ -96,7 +96,7 @@ "CounterMask": "1", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_ACTIVE", - "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a store. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a store.", "SampleAfterValue": "100003", "UMask": "0x10", "Unit": "cpu_core" @@ -116,7 +116,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED", - "PublicDescription": "Counts completed page walks (all page sizes) caused by demand data stores. This implies it missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (all page sizes) caused by demand data stores. This implies it missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0xe", "Unit": "cpu_core" @@ -126,7 +126,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_1G", - "PublicDescription": "Counts completed page walks (1G sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (1G sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x8", "Unit": "cpu_core" @@ -136,7 +136,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M", - "PublicDescription": "Counts completed page walks (2M/4M sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (2M/4M sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x4", "Unit": "cpu_core" @@ -146,7 +146,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_4K", - "PublicDescription": "Counts completed page walks (4K sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (4K sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x2", "Unit": "cpu_core" @@ -156,7 +156,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_PENDING", - "PublicDescription": "Counts the number of page walks outstanding for a store in the PMH (Page Miss Handler) each cycle. Available PDIST counters: 0", + "PublicDescription": "Counts the number of page walks outstanding for a store in the PMH (Page Miss Handler) each cycle.", "SampleAfterValue": "100003", "UMask": "0x10", "Unit": "cpu_core" @@ -184,7 +184,7 @@ "Counter": "0,1,2,3", "EventCode": "0x11", "EventName": "ITLB_MISSES.STLB_HIT", - "PublicDescription": "Counts instruction fetch requests that miss the ITLB (Instruction TLB) and hit the STLB (Second-level TLB). Available PDIST counters: 0", + "PublicDescription": "Counts instruction fetch requests that miss the ITLB (Instruction TLB) and hit the STLB (Second-level TLB).", "SampleAfterValue": "100003", "UMask": "0x20", "Unit": "cpu_core" @@ -195,7 +195,7 @@ "CounterMask": "1", "EventCode": "0x11", "EventName": "ITLB_MISSES.WALK_ACTIVE", - "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a code (instruction fetch) request. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a code (instruction fetch) request.", "SampleAfterValue": "100003", "UMask": "0x10", "Unit": "cpu_core" @@ -215,7 +215,7 @@ "Counter": "0,1,2,3", "EventCode": "0x11", "EventName": "ITLB_MISSES.WALK_COMPLETED", - "PublicDescription": "Counts completed page walks (all page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (all page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0xe", "Unit": "cpu_core" @@ -225,7 +225,7 @@ "Counter": "0,1,2,3", "EventCode": "0x11", "EventName": "ITLB_MISSES.WALK_COMPLETED_2M_4M", - "PublicDescription": "Counts completed page walks (2M/4M page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (2M/4M page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x4", "Unit": "cpu_core" @@ -235,7 +235,7 @@ "Counter": "0,1,2,3", "EventCode": "0x11", "EventName": "ITLB_MISSES.WALK_COMPLETED_4K", - "PublicDescription": "Counts completed page walks (4K page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (4K page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x2", "Unit": "cpu_core" @@ -245,7 +245,7 @@ "Counter": "0,1,2,3", "EventCode": "0x11", "EventName": "ITLB_MISSES.WALK_PENDING", - "PublicDescription": "Counts the number of page walks outstanding for an outstanding code (instruction fetch) request in the PMH (Page Miss Handler) each cycle. Available PDIST counters: 0", + "PublicDescription": "Counts the number of page walks outstanding for an outstanding code (instruction fetch) request in the PMH (Page Miss Handler) each cycle.", "SampleAfterValue": "100003", "UMask": "0x10", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/alderlaken/adln-metrics.json b/tools/perf/pmu-events/arch/x86/alderlaken/adln-metrics.json index ce93648043ef..0f72c9192df6 100644 --- a/tools/perf/pmu-events/arch/x86/alderlaken/adln-metrics.json +++ b/tools/perf/pmu-events/arch/x86/alderlaken/adln-metrics.json @@ -1,56 +1,56 @@ [ { "BriefDescription": "C10 residency percent per package", - "MetricExpr": "cstate_pkg@c10\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c10\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C10_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C1 residency percent per core", - "MetricExpr": "cstate_core@c1\\-residency@ / TSC", + "MetricExpr": "cstate_core@c1\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C1_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C2 residency percent per package", - "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c2\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C2_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per package", - "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per core", - "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricExpr": "cstate_core@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per package", - "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per core", - "MetricExpr": "cstate_core@c7\\-residency@ / TSC", + "MetricExpr": "cstate_core@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C8 residency percent per package", - "MetricExpr": "cstate_pkg@c8\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c8\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C8_Pkg_Residency", "ScaleUnit": "100%" @@ -460,12 +460,12 @@ }, { "BriefDescription": "Average CPU Utilization", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", "MetricName": "tma_info_system_cpu_utilization" }, { "BriefDescription": "Fraction of cycles spent in Kernel mode", - "MetricExpr": "cpu@CPU_CLK_UNHALTED.CORE_P@k / CPU_CLK_UNHALTED.CORE", + "MetricExpr": "CPU_CLK_UNHALTED.CORE_P:k / CPU_CLK_UNHALTED.CORE", "MetricGroup": "Summary", "MetricName": "tma_info_system_kernel_utilization" }, diff --git a/tools/perf/pmu-events/arch/x86/alderlaken/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/alderlaken/uncore-interconnect.json index 7c0779c74154..b5604c7534e1 100644 --- a/tools/perf/pmu-events/arch/x86/alderlaken/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/alderlaken/uncore-interconnect.json @@ -65,7 +65,6 @@ "Counter": "0,1", "EventCode": "0x81", "EventName": "UNC_ARB_REQ_TRK_REQUEST.DRD", - "Experimental": "1", "PerPkg": "1", "UMask": "0x2", "Unit": "ARB" @@ -103,7 +102,6 @@ "Counter": "0,1", "EventCode": "0x81", "EventName": "UNC_ARB_TRK_REQUESTS.RD", - "Experimental": "1", "PerPkg": "1", "UMask": "0x2", "Unit": "ARB" diff --git a/tools/perf/pmu-events/arch/x86/arrowlake/arl-metrics.json b/tools/perf/pmu-events/arch/x86/arrowlake/arl-metrics.json index b22a02450e6c..4f1f77404943 100644 --- a/tools/perf/pmu-events/arch/x86/arrowlake/arl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/arrowlake/arl-metrics.json @@ -1,56 +1,56 @@ [ { "BriefDescription": "C10 residency percent per package", - "MetricExpr": "cstate_pkg@c10\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c10\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C10_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C1 residency percent per core", - "MetricExpr": "cstate_core@c1\\-residency@ / TSC", + "MetricExpr": "cstate_core@c1\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C1_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C2 residency percent per package", - "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c2\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C2_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per package", - "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per core", - "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricExpr": "cstate_core@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per package", - "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per core", - "MetricExpr": "cstate_core@c7\\-residency@ / TSC", + "MetricExpr": "cstate_core@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C8 residency percent per package", - "MetricExpr": "cstate_pkg@c8\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c8\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C8_Pkg_Residency", "ScaleUnit": "100%" @@ -567,7 +567,7 @@ }, { "BriefDescription": "Average CPU Utilization", - "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.REF_TSC@ / TSC", + "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.REF_TSC@ / msr@tsc\\,cpu=cpu_atom@", "MetricName": "tma_info_system_cpu_utilization", "Unit": "cpu_atom" }, @@ -774,7 +774,7 @@ { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", "DefaultMetricgroupName": "TopdownL1", - "MetricExpr": "cpu_core@topdown\\-be\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricExpr": "cpu_core@topdown\\-be\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@)", "MetricGroup": "BvOB;Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.2", @@ -786,7 +786,7 @@ { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", "DefaultMetricgroupName": "TopdownL1", - "MetricExpr": "cpu_core@topdown\\-bad\\-spec@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricExpr": "cpu_core@topdown\\-bad\\-spec@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@)", "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_bad_speculation", "MetricThreshold": "tma_bad_speculation > 0.15", @@ -813,35 +813,35 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", + "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", + "MetricGroup": "BvCB;Cor;tma_issueComp", + "MetricName": "tma_bottleneck_compute_bound_est", + "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", + "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: ", + "Unit": "cpu_core" + }, + { "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks", - "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_early_blk + tma_store_fwd_blk)))", "MetricGroup": "BvMB;Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_bottleneck_cache_memory_bandwidth", - "MetricThreshold": "tma_bottleneck_cache_memory_bandwidth > 20", + "MetricName": "tma_bottleneck_data_cache_memory_bandwidth", + "MetricThreshold": "tma_bottleneck_data_cache_memory_bandwidth > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full", "Unit": "cpu_core" }, { "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks", - "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_dependency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_capacity / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_lock_latency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_loads / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_stores / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_dependency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_early_blk + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_capacity / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_early_blk + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_lock_latency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_early_blk + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_loads / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_early_blk + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_stores / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "BvML;Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_bottleneck_cache_memory_latency", - "MetricThreshold": "tma_bottleneck_cache_memory_latency > 20", + "MetricName": "tma_bottleneck_data_cache_memory_latency", + "MetricThreshold": "tma_bottleneck_data_cache_memory_latency > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency", "Unit": "cpu_core" }, { - "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", - "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", - "MetricGroup": "BvCB;Cor;tma_issueComp", - "MetricName": "tma_bottleneck_compute_bound_est", - "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", - "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: ", - "Unit": "cpu_core" - }, - { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end)", - "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - (1 - cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_ms / (tma_dsb + tma_lsd + tma_mite + tma_ms))) - tma_bottleneck_big_code", + "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - (1 - cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_ms)) - tma_bottleneck_big_code", "MetricGroup": "BvFB;Fed;FetchBW;Frontend", "MetricName": "tma_bottleneck_instruction_fetch_bw", "MetricThreshold": "tma_bottleneck_instruction_fetch_bw > 20", @@ -849,7 +849,7 @@ }, { "BriefDescription": "Total pipeline cost of irregular execution (e.g", - "MetricExpr": "100 * ((1 - cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_ms / (tma_dsb + tma_lsd + tma_mite + tma_ms)) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + cpu_core@RS.EMPTY_RESOURCE@ / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_microcode_sequencer + tma_few_uops_instructions) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricExpr": "100 * ((1 - cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_ms) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + cpu_core@RS.EMPTY_RESOURCE@ / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_microcode_sequencer + tma_few_uops_instructions) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", "MetricGroup": "Bad;BvIO;Cor;Ret;tma_issueMS", "MetricName": "tma_bottleneck_irregular_overhead", "MetricThreshold": "tma_bottleneck_irregular_overhead > 10", @@ -858,7 +858,7 @@ }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", - "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_early_blk + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "BvMT;Mem;MemoryTLB;Offcore;tma_issueTLB", "MetricName": "tma_bottleneck_memory_data_tlbs", "MetricThreshold": "tma_bottleneck_memory_data_tlbs > 20", @@ -885,7 +885,7 @@ }, { "BriefDescription": "Total pipeline cost of remaining bottlenecks in the back-end", - "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_cache_memory_bandwidth + tma_bottleneck_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", + "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_data_cache_memory_bandwidth + tma_bottleneck_data_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", "MetricGroup": "BvOB;Cor;Offcore", "MetricName": "tma_bottleneck_other_bottlenecks", "MetricThreshold": "tma_bottleneck_other_bottlenecks > 20", @@ -902,7 +902,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", - "MetricExpr": "cpu_core@topdown\\-br\\-mispredict@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricExpr": "cpu_core@topdown\\-br\\-mispredict@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@)", "MetricGroup": "BadSpec;BrMispredicts;BvMP;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", @@ -1042,7 +1042,6 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS@ * min(cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS@R, 24 * tma_info_system_core_frequency) + cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM@ * min(cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM@R, 25 * tma_info_system_core_frequency)) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks", "MetricGroup": "BvMS;DataSharing;LockCont;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", @@ -1095,7 +1094,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", - "MetricExpr": "(cpu_core@IDQ.DSB_UOPS\\,cmask\\=0x8\\,inv\\=0x1@ + cpu_core@IDQ.DSB_UOPS@ / (cpu_core@IDQ.DSB_UOPS@ + cpu_core@IDQ.MITE_UOPS@) * (cpu_core@IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE@ - cpu_core@IDQ_BUBBLES.FETCH_LATENCY@)) / tma_info_thread_clks", + "MetricExpr": "(cpu_core@IDQ.DSB_UOPS\\,cmask\\=0x8\\,inv\\=0x1@ / 2 + cpu_core@IDQ.DSB_UOPS@ / (cpu_core@IDQ.DSB_UOPS@ + cpu_core@IDQ.MITE_UOPS@) * (cpu_core@IDQ_BUBBLES.STARVATION_CYCLES@ - cpu_core@IDQ_BUBBLES.FETCH_LATENCY@)) / tma_info_thread_clks", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2", @@ -1149,7 +1148,7 @@ "MetricGroup": "BvMB;MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -1166,7 +1165,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", - "MetricExpr": "cpu_core@topdown\\-fetch\\-lat@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricExpr": "cpu_core@topdown\\-fetch\\-lat@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@)", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricName": "tma_fetch_latency", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", @@ -1216,7 +1215,7 @@ }, { "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", - "MetricExpr": "cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ / (tma_retiring * tma_info_thread_slots)", + "MetricExpr": "cpu_core@FP_ARITH_OPS_RETIRED.SCALAR@ / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_scalar", "MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", @@ -1226,7 +1225,7 @@ }, { "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", - "MetricExpr": "cpu_core@FP_ARITH_INST_RETIRED.VECTOR@ / (tma_retiring * tma_info_thread_slots)", + "MetricExpr": "cpu_core@FP_ARITH_OPS_RETIRED.VECTOR@ / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_vector", "MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", @@ -1236,7 +1235,7 @@ }, { "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", - "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@) / (tma_retiring * tma_info_thread_slots)", + "MetricExpr": "(cpu_core@FP_ARITH_OPS_RETIRED.128B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_OPS_RETIRED.128B_PACKED_SINGLE@) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", "MetricName": "tma_fp_vector_128b", "MetricThreshold": "tma_fp_vector_128b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", @@ -1246,7 +1245,7 @@ }, { "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", - "MetricExpr": "cpu_core@FP_ARITH_INST_RETIRED.VECTOR\\,umask\\=0x30@ / (tma_retiring * tma_info_thread_slots)", + "MetricExpr": "cpu_core@FP_ARITH_OPS_RETIRED.VECTOR\\,umask\\=0x30@ / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", "MetricName": "tma_fp_vector_256b", "MetricThreshold": "tma_fp_vector_256b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", @@ -1257,7 +1256,7 @@ { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", "DefaultMetricgroupName": "TopdownL1", - "MetricExpr": "cpu_core@topdown\\-fe\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricExpr": "cpu_core@topdown\\-fe\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@)", "MetricGroup": "BvFB;BvIO;Default;PGO;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.15", @@ -1278,7 +1277,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences", - "MetricExpr": "cpu_core@topdown\\-heavy\\-ops@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricExpr": "cpu_core@topdown\\-heavy\\-ops@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@)", "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", @@ -1456,7 +1455,7 @@ }, { "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@FP_ARITH_INST_RETIRED.4_FLOPS@ + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / tma_info_thread_clks", + "MetricExpr": "(cpu_core@FP_ARITH_OPS_RETIRED.SCALAR@ + 2 * cpu_core@FP_ARITH_OPS_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@FP_ARITH_OPS_RETIRED.4_FLOPS@ + 8 * cpu_core@FP_ARITH_OPS_RETIRED.256B_PACKED_SINGLE@) / tma_info_thread_clks", "MetricGroup": "Flops;Ret", "MetricName": "tma_info_core_flopc", "Unit": "cpu_core" @@ -1597,7 +1596,7 @@ }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", - "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ + cpu_core@FP_ARITH_INST_RETIRED.VECTOR@)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_OPS_RETIRED.SCALAR@ + cpu_core@FP_ARITH_OPS_RETIRED.VECTOR@)", "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_iparith", "MetricThreshold": "tma_info_inst_mix_iparith < 10", @@ -1606,7 +1605,7 @@ }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_OPS_RETIRED.128B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_OPS_RETIRED.128B_PACKED_SINGLE@)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx128", "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", @@ -1615,7 +1614,7 @@ }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_OPS_RETIRED.256B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_OPS_RETIRED.256B_PACKED_SINGLE@)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx256", "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", @@ -1624,7 +1623,7 @@ }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", - "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@FP_ARITH_INST_RETIRED.SCALAR_DOUBLE@", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@FP_ARITH_OPS_RETIRED.SCALAR_DOUBLE@", "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_dp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", @@ -1633,7 +1632,7 @@ }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", - "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@FP_ARITH_INST_RETIRED.SCALAR_SINGLE@", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@FP_ARITH_OPS_RETIRED.SCALAR_SINGLE@", "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_sp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", @@ -1658,7 +1657,7 @@ }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", - "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@FP_ARITH_INST_RETIRED.4_FLOPS@ + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_OPS_RETIRED.SCALAR@ + 2 * cpu_core@FP_ARITH_OPS_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@FP_ARITH_OPS_RETIRED.4_FLOPS@ + 8 * cpu_core@FP_ARITH_OPS_RETIRED.256B_PACKED_SINGLE@)", "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_ipflop", "MetricThreshold": "tma_info_inst_mix_ipflop < 10", @@ -1713,7 +1712,7 @@ }, { "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * cpu_core@L1D.REPLACEMENT@ / 1e9 / tma_info_system_time", + "MetricExpr": "64 * cpu_core@L1D.L1_REPLACEMENT@ / 1e9 / tma_info_system_time", "MetricGroup": "Mem;MemoryBW", "MetricName": "tma_info_memory_l1d_cache_fill_bw", "Unit": "cpu_core" @@ -1726,6 +1725,13 @@ "Unit": "cpu_core" }, { + "BriefDescription": "L0 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * (cpu_core@MEM_LOAD_RETIRED.L1_MISS@ + cpu_core@MEM_LOAD_RETIRED.L1_HIT_L1@) / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "CacheHits;Mem", + "MetricName": "tma_info_memory_l1dl0_mpki", + "Unit": "cpu_core" + }, + { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / cpu_core@INST_RETIRED.ANY@", "MetricGroup": "CacheHits;Mem", @@ -1941,6 +1947,13 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Average number of uops fetched from MS per cycle", + "MetricExpr": "cpu_core@IDQ.MS_UOPS@ / cpu_core@IDQ.MS_UOPS\\,cmask\\=1@", + "MetricGroup": "Fed;FetchLat;MicroSeq", + "MetricName": "tma_info_pipeline_fetch_ms", + "Unit": "cpu_core" + }, + { "BriefDescription": "Instructions per a microcode Assist invocation", "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@ASSISTS.ANY@", "MetricGroup": "MicroSeq;Pipeline;Ret;Retire", @@ -1974,7 +1987,7 @@ }, { "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / tma_info_system_time", + "MetricExpr": "tma_info_system_turbo_utilization * msr@tsc\\,cpu=cpu_core@ / 1e9 / tma_info_system_time", "MetricGroup": "Power;Summary", "MetricName": "tma_info_system_core_frequency", "Unit": "cpu_core" @@ -1988,14 +2001,22 @@ }, { "BriefDescription": "Average number of utilized CPUs", - "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.REF_TSC@ / TSC", + "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.REF_TSC@ / msr@tsc\\,cpu=cpu_core@", "MetricGroup": "Summary", "MetricName": "tma_info_system_cpus_utilized", "Unit": "cpu_core" }, { + "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", + "MetricExpr": "32 * UNC_M_TOTAL_DATA / 1e9 / tma_info_system_time", + "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", + "MetricName": "tma_info_system_dram_bw_use", + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_mem_bandwidth, tma_sq_full", + "Unit": "cpu_core" + }, + { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@FP_ARITH_INST_RETIRED.4_FLOPS@ + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / 1e9 / tma_info_system_time", + "MetricExpr": "(cpu_core@FP_ARITH_OPS_RETIRED.SCALAR@ + 2 * cpu_core@FP_ARITH_OPS_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@FP_ARITH_OPS_RETIRED.4_FLOPS@ + 8 * cpu_core@FP_ARITH_OPS_RETIRED.256B_PACKED_SINGLE@) / 1e9 / tma_info_system_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "tma_info_system_gflops", "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width", @@ -2062,6 +2083,13 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Measured Average Uncore Frequency for the SoC [GHz]", + "MetricExpr": "tma_info_system_socket_clks / 1e9 / tma_info_system_time", + "MetricGroup": "SoC", + "MetricName": "tma_info_system_uncore_frequency", + "Unit": "cpu_core" + }, + { "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.THREAD@", "MetricGroup": "Pipeline", @@ -2182,12 +2210,12 @@ "Unit": "cpu_core" }, { - "BriefDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", - "MetricExpr": "4 * cpu_core@DEPENDENT_LOADS.ANY@ / tma_info_thread_clks", + "BriefDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", + "MetricExpr": "4 * cpu_core@DEPENDENT_LOADS.ANY\\,cmask\\=1@ / tma_info_thread_clks", "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_l1_latency_dependency", "MetricThreshold": "tma_l1_latency_dependency > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", + "PublicDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -2203,7 +2231,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L2 cache under unloaded scenarios (possibly L2 latency limited)", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "cpu_core@MEM_LOAD_RETIRED.L2_HIT@ * min(cpu_core@MEM_LOAD_RETIRED.L2_HIT@R, 3 * tma_info_system_core_frequency) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_l2_bound_group", "MetricName": "tma_l2_hit_latency", @@ -2224,12 +2251,11 @@ }, { "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "cpu_core@MEM_LOAD_RETIRED.L3_HIT@ * min(cpu_core@MEM_LOAD_RETIRED.L3_HIT@R, 9 * tma_info_system_core_frequency) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks", "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_cache_memory_latency, tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_data_cache_memory_latency, tma_mem_latency", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -2311,6 +2337,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "cpu_core@MEM_INST_RETIRED.LOCK_LOADS@ * cpu_core@MEM_INST_RETIRED.LOCK_LOADS@R / tma_info_thread_clks", "MetricGroup": "LockCont;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", "MetricName": "tma_lock_latency", @@ -2321,7 +2348,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit", - "MetricExpr": "cpu_core@LSD.UOPS\\,cmask\\=0x8\\,inv\\=0x1@ / tma_info_thread_clks", + "MetricExpr": "cpu_core@LSD.UOPS\\,cmask\\=0x8\\,inv\\=0x1@ / tma_info_thread_clks / 2", "MetricGroup": "FetchBW;LSD;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_lsd", "MetricThreshold": "tma_lsd > 0.15 & tma_fetch_bandwidth > 0.2", @@ -2346,7 +2373,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -2356,13 +2383,13 @@ "MetricGroup": "BvML;MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%", "Unit": "cpu_core" }, { "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", - "MetricExpr": "cpu_core@topdown\\-mem\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricExpr": "cpu_core@topdown\\-mem\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@)", "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricName": "tma_memory_bound", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", @@ -2373,7 +2400,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to LFENCE Instructions.", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "13 * cpu_core@MISC2_RETIRED.LFENCE@ / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group", "MetricName": "tma_memory_fence", @@ -2412,7 +2438,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", - "MetricExpr": "(cpu_core@IDQ.MITE_UOPS\\,cmask\\=0x8\\,inv\\=0x1@ / 2 + cpu_core@IDQ.MITE_UOPS@ / (cpu_core@IDQ.DSB_UOPS@ + cpu_core@IDQ.MITE_UOPS@) * (cpu_core@IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE@ - cpu_core@IDQ_BUBBLES.FETCH_LATENCY@)) / tma_info_thread_clks", + "MetricExpr": "(cpu_core@IDQ.MITE_UOPS\\,cmask\\=0x8\\,inv\\=0x1@ / 2 + cpu_core@IDQ.MITE_UOPS@ / (cpu_core@IDQ.DSB_UOPS@ + cpu_core@IDQ.MITE_UOPS@) * (cpu_core@IDQ_BUBBLES.STARVATION_CYCLES@ - cpu_core@IDQ_BUBBLES.FETCH_LATENCY@)) / tma_info_thread_clks", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2", @@ -2432,7 +2458,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the Microcode Sequencer (MS) unit - see Microcode_Sequencer node for details.", - "MetricExpr": "cpu_core@IDQ.MS_CYCLES_ANY@ / tma_info_thread_clks", + "MetricExpr": "cpu_core@IDQ.MS_CYCLES_ANY@ / tma_info_thread_clks / 1.8", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_ms", "MetricThreshold": "tma_ms > 0.05 & tma_fetch_bandwidth > 0.2", @@ -2471,7 +2497,8 @@ }, { "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes", - "MetricExpr": "max(0, tma_light_operations - (tma_x87_use + (cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ + cpu_core@FP_ARITH_INST_RETIRED.VECTOR@) / (tma_retiring * tma_info_thread_slots) + (cpu_core@INT_VEC_RETIRED.ADD_128@ + cpu_core@INT_VEC_RETIRED.VNNI_128@ + cpu_core@INT_VEC_RETIRED.ADD_256@ + cpu_core@INT_VEC_RETIRED.MUL_256@ + cpu_core@INT_VEC_RETIRED.VNNI_256@) / (tma_retiring * tma_info_thread_slots) + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches))", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "max(0, tma_light_operations - (tma_x87_use + (cpu_core@FP_ARITH_OPS_RETIRED.SCALAR@ + cpu_core@FP_ARITH_OPS_RETIRED.VECTOR@) / (tma_retiring * tma_info_thread_slots) + (cpu_core@INT_VEC_RETIRED.ADD_128@ + cpu_core@INT_VEC_RETIRED.VNNI_128@ + cpu_core@INT_VEC_RETIRED.ADD_256@ + cpu_core@INT_VEC_RETIRED.MUL_256@ + cpu_core@INT_VEC_RETIRED.VNNI_256@) / (tma_retiring * tma_info_thread_slots) + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches))", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_other_light_ops", "MetricThreshold": "tma_other_light_ops > 0.3 & tma_light_operations > 0.6", @@ -2509,6 +2536,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "((cpu_core@EXE_ACTIVITY.EXE_BOUND_0_PORTS@ + (cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@EXE_ACTIVITY.2_3_PORTS_UTIL@)) / tma_info_thread_clks if cpu_core@ARITH.DIV_ACTIVE@ < cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@ else (cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@EXE_ACTIVITY.2_3_PORTS_UTIL@) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", @@ -2519,6 +2547,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricConstraint": "NO_THRESHOLD_AND_NMI", "MetricExpr": "cpu_core@EXE_ACTIVITY.EXE_BOUND_0_PORTS@ / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", @@ -2529,6 +2558,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricConstraint": "NO_THRESHOLD_AND_NMI", "MetricExpr": "cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", @@ -2539,7 +2569,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "cpu_core@EXE_ACTIVITY.2_PORTS_UTIL@ / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", @@ -2550,7 +2579,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "cpu_core@UOPS_EXECUTED.CYCLES_GE_3@ / tma_info_thread_clks", "MetricGroup": "BvCB;PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", @@ -2571,7 +2599,7 @@ { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", "DefaultMetricgroupName": "TopdownL1", - "MetricExpr": "cpu_core@topdown\\-retiring@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricExpr": "cpu_core@topdown\\-retiring@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@)", "MetricGroup": "BvUW;Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -2586,7 +2614,7 @@ "MetricGroup": "BvIO;PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group;tma_issueSO", "MetricName": "tma_serializing_operation", "MetricThreshold": "tma_serializing_operation > 0.1 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", - "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: RESOURCE_STALLS.SCOREBOARD. Related metrics: tma_ms_switches", + "PublicDescription": "This metric represents fraction of cycles the CPU issue-pipeline was stalled due to serializing operations. Instructions like CPUID; WRMSR or LFENCE serialize the out-of-order execution which may limit performance. Sample with: PARTIAL_RAT_STALLS.SCOREBOARD. Related metrics: tma_ms_switches", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -2602,7 +2630,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.PAUSE@ / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group", "MetricName": "tma_slow_pause", @@ -2637,7 +2664,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -2652,6 +2679,15 @@ "Unit": "cpu_core" }, { + "BriefDescription": "This metric estimates clocks wasted due to loads blocked due to unknown store address (did not do memory disambiguation) or due to unknown store data", + "MetricExpr": "7 * cpu_core@LD_BLOCKS.STORE_EARLY\\,cmask\\=1@ / tma_info_thread_clks", + "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", + "MetricName": "tma_store_early_blk", + "MetricThreshold": "tma_store_early_blk > 0.2", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", "MetricExpr": "13 * cpu_core@LD_BLOCKS.STORE_FORWARD@ / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", diff --git a/tools/perf/pmu-events/arch/x86/arrowlake/cache.json b/tools/perf/pmu-events/arch/x86/arrowlake/cache.json index 91929d8bcf47..30dd56b487ba 100644 --- a/tools/perf/pmu-events/arch/x86/arrowlake/cache.json +++ b/tools/perf/pmu-events/arch/x86/arrowlake/cache.json @@ -29,6 +29,16 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Cachelines replaced into the L1 d-cache. Successful replacements only (not blocked) and exclude WB-miss case", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x51", + "EventName": "L1D.L1_REPLACEMENT", + "PublicDescription": "Counts cachelines replaced into the L1 d-cache.", + "SampleAfterValue": "1000003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { "BriefDescription": "Cachelines replaced into the L0 and L1 d-cache. Successful replacements only (not blocked) and exclude WB-miss case", "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x51", @@ -540,7 +550,7 @@ "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_INST_RETIRED.ALL_LOADS", - "PublicDescription": "Counts Instructions with at least one architecturally visible load retired. Available PDIST counters: 0", + "PublicDescription": "Counts Instructions with at least one architecturally visible load retired. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x81", "Unit": "cpu_core" @@ -551,7 +561,7 @@ "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_INST_RETIRED.ALL_STORES", - "PublicDescription": "Counts all retired store instructions. Available PDIST counters: 0", + "PublicDescription": "Counts all retired store instructions. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x82", "Unit": "cpu_core" @@ -561,7 +571,7 @@ "Counter": "0,1,2,3", "EventCode": "0xd0", "EventName": "MEM_INST_RETIRED.ALL_SWPF", - "PublicDescription": "Counts all retired software prefetch instructions. Available PDIST counters: 0", + "PublicDescription": "Counts all retired software prefetch instructions. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x84", "Unit": "cpu_core" @@ -572,7 +582,7 @@ "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_INST_RETIRED.ANY", - "PublicDescription": "Counts all retired memory instructions - loads and stores. Available PDIST counters: 0", + "PublicDescription": "Counts all retired memory instructions - loads and stores. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x87", "Unit": "cpu_core" @@ -583,7 +593,7 @@ "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_INST_RETIRED.LOCK_LOADS", - "PublicDescription": "Counts retired load instructions with locked access. Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions with locked access. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x21", "Unit": "cpu_core" @@ -594,7 +604,7 @@ "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_INST_RETIRED.SPLIT_LOADS", - "PublicDescription": "Counts retired load instructions that split across a cacheline boundary. Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions that split across a cacheline boundary. Available PDIST counters: 0,1", "SampleAfterValue": "100003", "UMask": "0x41", "Unit": "cpu_core" @@ -605,18 +615,29 @@ "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_INST_RETIRED.SPLIT_STORES", - "PublicDescription": "Counts retired store instructions that split across a cacheline boundary. Available PDIST counters: 0", + "PublicDescription": "Counts retired store instructions that split across a cacheline boundary. Available PDIST counters: 0,1", "SampleAfterValue": "100003", "UMask": "0x42", "Unit": "cpu_core" }, { + "BriefDescription": "Retired instructions that hit the STLB.", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd0", + "EventName": "MEM_INST_RETIRED.STLB_HIT_ANY", + "PublicDescription": "Number of retired instructions with a clean hit in the 2nd-level TLB (STLB). Available PDIST counters: 0,1", + "SampleAfterValue": "100003", + "UMask": "0xf", + "Unit": "cpu_core" + }, + { "BriefDescription": "Retired load instructions that hit the STLB.", "Counter": "0,1,2,3", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_INST_RETIRED.STLB_HIT_LOADS", - "PublicDescription": "Number of retired load instructions with a clean hit in the 2nd-level TLB (STLB). Available PDIST counters: 0", + "PublicDescription": "Number of retired load instructions with a clean hit in the 2nd-level TLB (STLB). Available PDIST counters: 0,1", "SampleAfterValue": "100003", "UMask": "0x9", "Unit": "cpu_core" @@ -627,18 +648,39 @@ "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_INST_RETIRED.STLB_HIT_STORES", - "PublicDescription": "Number of retired store instructions that hit in the 2nd-level TLB (STLB). Available PDIST counters: 0", + "PublicDescription": "Number of retired store instructions that hit in the 2nd-level TLB (STLB). Available PDIST counters: 0,1", "SampleAfterValue": "100003", "UMask": "0xa", "Unit": "cpu_core" }, { + "BriefDescription": "Retired SWPF instructions that hit the STLB.", + "Counter": "0,1,2,3", + "EventCode": "0xd0", + "EventName": "MEM_INST_RETIRED.STLB_HIT_SWPF", + "PublicDescription": "Number of retired SWPF instructions that hit in the 2nd-level TLB (STLB). Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0xc", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired instructions that miss the STLB.", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd0", + "EventName": "MEM_INST_RETIRED.STLB_MISS_ANY", + "PublicDescription": "Retired instructions that miss the STLB. Available PDIST counters: 0,1", + "SampleAfterValue": "100003", + "UMask": "0x17", + "Unit": "cpu_core" + }, + { "BriefDescription": "Retired load instructions that miss the STLB.", "Counter": "0,1,2,3", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_INST_RETIRED.STLB_MISS_LOADS", - "PublicDescription": "Number of retired load instructions that (start a) miss in the 2nd-level TLB (STLB). Available PDIST counters: 0", + "PublicDescription": "Number of retired load instructions that (start a) miss in the 2nd-level TLB (STLB). Available PDIST counters: 0,1", "SampleAfterValue": "100003", "UMask": "0x11", "Unit": "cpu_core" @@ -649,18 +691,28 @@ "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_INST_RETIRED.STLB_MISS_STORES", - "PublicDescription": "Number of retired store instructions that (start a) miss in the 2nd-level TLB (STLB). Available PDIST counters: 0", + "PublicDescription": "Number of retired store instructions that (start a) miss in the 2nd-level TLB (STLB). Available PDIST counters: 0,1", "SampleAfterValue": "100003", "UMask": "0x12", "Unit": "cpu_core" }, { + "BriefDescription": "Retired SWPF instructions that miss the STLB.", + "Counter": "0,1,2,3", + "EventCode": "0xd0", + "EventName": "MEM_INST_RETIRED.STLB_MISS_SWPF", + "PublicDescription": "Number of retired SWPF instructions that (start a) miss in the 2nd-level TLB (STLB). Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x14", + "Unit": "cpu_core" + }, + { "BriefDescription": "Retired load instructions whose data sources were a cross-core Snoop hits and forwards data from an in on-package core cache (induced by NI$)", "Counter": "0,1,2,3", "Data_LA": "1", "EventCode": "0xd2", "EventName": "MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD", - "PublicDescription": "Counts retired load instructions whose data sources were a cross-core Snoop hits and forwards data from an in on-package core cache (induced by NI$) Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions whose data sources were a cross-core Snoop hits and forwards data from an in on-package core cache (induced by NI$) Available PDIST counters: 0,1", "SampleAfterValue": "20011", "UMask": "0x10", "Unit": "cpu_core" @@ -671,7 +723,7 @@ "Data_LA": "1", "EventCode": "0xd2", "EventName": "MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM", - "PublicDescription": "Counts retired load instructions whose data sources were HitM responses from shared L3, Hit-with-FWD is normally excluded. Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions whose data sources were HitM responses from shared L3, Hit-with-FWD is normally excluded. Available PDIST counters: 0,1", "SampleAfterValue": "20011", "UMask": "0x4", "Unit": "cpu_core" @@ -682,7 +734,7 @@ "Data_LA": "1", "EventCode": "0xd2", "EventName": "MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS", - "PublicDescription": "Counts the retired load instructions whose data sources were L3 hit and cross-core snoop missed in on-pkg core cache. Available PDIST counters: 0", + "PublicDescription": "Counts the retired load instructions whose data sources were L3 hit and cross-core snoop missed in on-pkg core cache. Available PDIST counters: 0,1", "SampleAfterValue": "20011", "UMask": "0x1", "Unit": "cpu_core" @@ -693,7 +745,7 @@ "Data_LA": "1", "EventCode": "0xd2", "EventName": "MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD", - "PublicDescription": "Counts retired load instructions whose data sources were L3 and cross-core snoop hits in on-pkg core cache. Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions whose data sources were L3 and cross-core snoop hits in on-pkg core cache. Available PDIST counters: 0,1", "SampleAfterValue": "20011", "UMask": "0x2", "Unit": "cpu_core" @@ -704,7 +756,7 @@ "Data_LA": "1", "EventCode": "0xd4", "EventName": "MEM_LOAD_MISC_RETIRED.UC", - "PublicDescription": "Retired instructions with at least one load to uncacheable memory-type, or at least one cache-line split locked access (Bus Lock). Available PDIST counters: 0", + "PublicDescription": "Retired instructions with at least one load to uncacheable memory-type, or at least one cache-line split locked access (Bus Lock). Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x4", "Unit": "cpu_core" @@ -715,7 +767,7 @@ "Data_LA": "1", "EventCode": "0xd1", "EventName": "MEM_LOAD_RETIRED.FB_HIT", - "PublicDescription": "Counts retired load instructions with at least one uop was load missed in L1 but hit FB (Fill Buffers) due to preceding miss to the same cache line with data not ready. Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions with at least one uop was load missed in L1 but hit FB (Fill Buffers) due to preceding miss to the same cache line with data not ready. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x40", "Unit": "cpu_core" @@ -726,7 +778,7 @@ "Data_LA": "1", "EventCode": "0xd1", "EventName": "MEM_LOAD_RETIRED.L1_HIT", - "PublicDescription": "Counts retired load instructions with at least one uop that hit in the L1 data cache. This event includes all SW prefetches and lock instructions regardless of the data source. Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions with at least one uop that hit in the L1 data cache. This event includes all SW prefetches and lock instructions regardless of the data source. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x101", "Unit": "cpu_core" @@ -737,7 +789,7 @@ "Data_LA": "1", "EventCode": "0xd1", "EventName": "MEM_LOAD_RETIRED.L1_HIT_L0", - "PublicDescription": "Counts retired load instructions with at least one uop that hit in the Level 0 of the L1 data cache. This event includes all SW prefetches and lock instructions regardless of the data source. Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions with at least one uop that hit in the Level 0 of the L1 data cache. This event includes all SW prefetches and lock instructions regardless of the data source. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -747,7 +799,7 @@ "Counter": "0,1,2,3", "EventCode": "0xd1", "EventName": "MEM_LOAD_RETIRED.L1_HIT_L1", - "PublicDescription": "Counts retired load instructions with at least one uop that hit in the Level 1 of the L1 data cache. Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions with at least one uop that hit in the Level 1 of the L1 data cache. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "Unit": "cpu_core" }, @@ -757,7 +809,7 @@ "Data_LA": "1", "EventCode": "0xd1", "EventName": "MEM_LOAD_RETIRED.L1_MISS", - "PublicDescription": "Counts retired load instructions with at least one uop that missed in the L1 cache. Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions with at least one uop that missed in the L1 cache. Available PDIST counters: 0,1", "SampleAfterValue": "200003", "UMask": "0x8", "Unit": "cpu_core" @@ -768,7 +820,7 @@ "Data_LA": "1", "EventCode": "0xd1", "EventName": "MEM_LOAD_RETIRED.L2_HIT", - "PublicDescription": "Counts retired load instructions with L2 cache hits as data sources. Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions with L2 cache hits as data sources. Available PDIST counters: 0,1", "SampleAfterValue": "200003", "UMask": "0x2", "Unit": "cpu_core" @@ -779,7 +831,7 @@ "Data_LA": "1", "EventCode": "0xd1", "EventName": "MEM_LOAD_RETIRED.L2_MISS", - "PublicDescription": "Counts retired load instructions missed L2 cache as data sources. Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions missed L2 cache as data sources. Available PDIST counters: 0,1", "SampleAfterValue": "100021", "UMask": "0x10", "Unit": "cpu_core" @@ -790,7 +842,7 @@ "Data_LA": "1", "EventCode": "0xd1", "EventName": "MEM_LOAD_RETIRED.L3_HIT", - "PublicDescription": "Counts retired load instructions with at least one uop that hit in the L3 cache. Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions with at least one uop that hit in the L3 cache. Available PDIST counters: 0,1", "SampleAfterValue": "100021", "UMask": "0x4", "Unit": "cpu_core" @@ -801,7 +853,7 @@ "Data_LA": "1", "EventCode": "0xd1", "EventName": "MEM_LOAD_RETIRED.L3_MISS", - "PublicDescription": "Counts retired load instructions with at least one uop that missed in the L3 cache. Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions with at least one uop that missed in the L3 cache. Available PDIST counters: 0,1", "SampleAfterValue": "50021", "UMask": "0x20", "Unit": "cpu_core" @@ -1029,7 +1081,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_1024", @@ -1053,7 +1105,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_128", @@ -1077,7 +1129,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_16", @@ -1089,7 +1141,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_2048", @@ -1113,7 +1165,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_256", @@ -1137,7 +1189,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_32", @@ -1161,7 +1213,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_4", @@ -1185,7 +1237,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_512", @@ -1209,7 +1261,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_64", @@ -1233,7 +1285,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_8", @@ -1384,8 +1436,32 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Counts writebacks of modified cachelines that hit in the L3 or were snooped from another core's caches.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.COREWB_M.L3_HIT", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x7E001E00008", + "PublicDescription": "Counts writebacks of modified cachelines that hit in the L3 or were snooped from another core's caches. Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts writebacks of non-modified cachelines that hit in the L3 or were snooped from another core's caches.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.COREWB_NONM.L3_HIT", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x7E001E01000", + "PublicDescription": "Counts writebacks of non-modified cachelines that hit in the L3 or were snooped from another core's caches. Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { "BriefDescription": "Counts demand data reads that have any type of response.", - "Counter": "0,1,2,3", + "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_DATA_RD.ANY_RESPONSE", "MSRIndex": "0x1a6,0x1a7", @@ -1397,7 +1473,7 @@ }, { "BriefDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop hit in another cores caches, data forwarding is required as the data is modified.", - "Counter": "0,1,2,3", + "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM", "MSRIndex": "0x1a6,0x1a7", @@ -1409,7 +1485,7 @@ }, { "BriefDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop hit in another cores caches which forwarded the unmodified data to the requesting core.", - "Counter": "0,1,2,3", + "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD", "MSRIndex": "0x1a6,0x1a7", @@ -1421,7 +1497,7 @@ }, { "BriefDescription": "Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that have any type of response.", - "Counter": "0,1,2,3", + "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_RFO.ANY_RESPONSE", "MSRIndex": "0x1a6,0x1a7", @@ -1433,7 +1509,7 @@ }, { "BriefDescription": "Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that were supplied by the L3 cache where a snoop hit in another cores caches, data forwarding is required as the data is modified.", - "Counter": "0,1,2,3", + "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM", "MSRIndex": "0x1a6,0x1a7", @@ -1444,6 +1520,18 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Counts all data read, code read, RFO and ITOM requests including demands and prefetches to the core caches (L1 or L2) that hit in the L3 or were snooped from another core's caches.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.READS_TO_CORE.L3_HIT", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x7E001E04477", + "PublicDescription": "Counts all data read, code read, RFO and ITOM requests including demands and prefetches to the core caches (L1 or L2) that hit in the L3 or were snooped from another core's caches. Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { "BriefDescription": "Any memory transaction that reached the SQ.", "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x21", diff --git a/tools/perf/pmu-events/arch/x86/arrowlake/frontend.json b/tools/perf/pmu-events/arch/x86/arrowlake/frontend.json index 56cf1ec63200..db2ef84ca041 100644 --- a/tools/perf/pmu-events/arch/x86/arrowlake/frontend.json +++ b/tools/perf/pmu-events/arch/x86/arrowlake/frontend.json @@ -81,7 +81,7 @@ "EventName": "FRONTEND_RETIRED.ANY_ANT", "MSRIndex": "0x3F7", "MSRValue": "0x9", - "PublicDescription": "Always Not Taken (ANT) conditional retired branches (no BTB entry and not mispredicted) Available PDIST counters: 0", + "PublicDescription": "Always Not Taken (ANT) conditional retired branches (no BTB entry and not mispredicted) Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -93,7 +93,7 @@ "EventName": "FRONTEND_RETIRED.ANY_DSB_MISS", "MSRIndex": "0x3F7", "MSRValue": "0x1", - "PublicDescription": "Counts retired Instructions that experienced DSB (Decode stream buffer i.e. the decoded instruction-cache) miss. Available PDIST counters: 0", + "PublicDescription": "Counts retired Instructions that experienced DSB (Decode stream buffer i.e. the decoded instruction-cache) miss. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -179,7 +179,7 @@ "EventName": "FRONTEND_RETIRED.DSB_MISS", "MSRIndex": "0x3F7", "MSRValue": "0x11", - "PublicDescription": "Number of retired Instructions that experienced a critical DSB (Decode stream buffer i.e. the decoded instruction-cache) miss. Critical means stalls were exposed to the back-end as a result of the DSB miss. Available PDIST counters: 0", + "PublicDescription": "Number of retired Instructions that experienced a critical DSB (Decode stream buffer i.e. the decoded instruction-cache) miss. Critical means stalls were exposed to the back-end as a result of the DSB miss. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -218,7 +218,7 @@ "EventName": "FRONTEND_RETIRED.ITLB_MISS", "MSRIndex": "0x3F7", "MSRValue": "0x14", - "PublicDescription": "Counts retired Instructions that experienced iTLB (Instruction TLB) true miss. Available PDIST counters: 0", + "PublicDescription": "Counts retired Instructions that experienced iTLB (Instruction TLB) true miss. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -239,7 +239,7 @@ "EventName": "FRONTEND_RETIRED.L1I_MISS", "MSRIndex": "0x3F7", "MSRValue": "0x12", - "PublicDescription": "Counts retired Instructions who experienced Instruction L1 Cache true miss. Available PDIST counters: 0", + "PublicDescription": "Counts retired Instructions who experienced Instruction L1 Cache true miss. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -251,7 +251,7 @@ "EventName": "FRONTEND_RETIRED.L2_MISS", "MSRIndex": "0x3F7", "MSRValue": "0x13", - "PublicDescription": "Counts retired Instructions who experienced Instruction L2 Cache true miss. Available PDIST counters: 0", + "PublicDescription": "Counts retired Instructions who experienced Instruction L2 Cache true miss. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -263,7 +263,7 @@ "EventName": "FRONTEND_RETIRED.LATENCY_GE_128", "MSRIndex": "0x3F7", "MSRValue": "0x608006", - "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 128 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0", + "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 128 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -275,7 +275,7 @@ "EventName": "FRONTEND_RETIRED.LATENCY_GE_16", "MSRIndex": "0x3F7", "MSRValue": "0x601006", - "PublicDescription": "Counts retired instructions that are delivered to the back-end after a front-end stall of at least 16 cycles. During this period the front-end delivered no uops. Available PDIST counters: 0", + "PublicDescription": "Counts retired instructions that are delivered to the back-end after a front-end stall of at least 16 cycles. During this period the front-end delivered no uops. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -287,7 +287,7 @@ "EventName": "FRONTEND_RETIRED.LATENCY_GE_2", "MSRIndex": "0x3F7", "MSRValue": "0x600206", - "PublicDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of at least 2 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0", + "PublicDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of at least 2 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -299,7 +299,7 @@ "EventName": "FRONTEND_RETIRED.LATENCY_GE_256", "MSRIndex": "0x3F7", "MSRValue": "0x610006", - "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 256 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0", + "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 256 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -311,7 +311,7 @@ "EventName": "FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1", "MSRIndex": "0x3F7", "MSRValue": "0x100206", - "PublicDescription": "Counts retired instructions that are delivered to the back-end after the front-end had at least 1 bubble-slot for a period of 2 cycles. A bubble-slot is an empty issue-pipeline slot while there was no RAT stall. Available PDIST counters: 0", + "PublicDescription": "Counts retired instructions that are delivered to the back-end after the front-end had at least 1 bubble-slot for a period of 2 cycles. A bubble-slot is an empty issue-pipeline slot while there was no RAT stall. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -323,7 +323,7 @@ "EventName": "FRONTEND_RETIRED.LATENCY_GE_32", "MSRIndex": "0x3F7", "MSRValue": "0x602006", - "PublicDescription": "Counts retired instructions that are delivered to the back-end after a front-end stall of at least 32 cycles. During this period the front-end delivered no uops. Available PDIST counters: 0", + "PublicDescription": "Counts retired instructions that are delivered to the back-end after a front-end stall of at least 32 cycles. During this period the front-end delivered no uops. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -335,7 +335,7 @@ "EventName": "FRONTEND_RETIRED.LATENCY_GE_4", "MSRIndex": "0x3F7", "MSRValue": "0x600406", - "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 4 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0", + "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 4 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -347,7 +347,7 @@ "EventName": "FRONTEND_RETIRED.LATENCY_GE_512", "MSRIndex": "0x3F7", "MSRValue": "0x620006", - "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 512 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0", + "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 512 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -359,7 +359,7 @@ "EventName": "FRONTEND_RETIRED.LATENCY_GE_64", "MSRIndex": "0x3F7", "MSRValue": "0x604006", - "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 64 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0", + "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 64 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -371,7 +371,7 @@ "EventName": "FRONTEND_RETIRED.LATENCY_GE_8", "MSRIndex": "0x3F7", "MSRValue": "0x600806", - "PublicDescription": "Counts retired instructions that are delivered to the back-end after a front-end stall of at least 8 cycles. During this period the front-end delivered no uops. Available PDIST counters: 0", + "PublicDescription": "Counts retired instructions that are delivered to the back-end after a front-end stall of at least 8 cycles. During this period the front-end delivered no uops. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -383,7 +383,7 @@ "EventName": "FRONTEND_RETIRED.MISP_ANT", "MSRIndex": "0x3F7", "MSRValue": "0x9", - "PublicDescription": "ANT retired branches that got just mispredicted Available PDIST counters: 0", + "PublicDescription": "ANT retired branches that got just mispredicted Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x2", "Unit": "cpu_core" @@ -395,7 +395,7 @@ "EventName": "FRONTEND_RETIRED.MS_FLOWS", "MSRIndex": "0x3F7", "MSRValue": "0x8", - "PublicDescription": "Counts flows delivered by the Microcode Sequencer Available PDIST counters: 0", + "PublicDescription": "Counts flows delivered by the Microcode Sequencer Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -443,7 +443,7 @@ "EventName": "FRONTEND_RETIRED.STLB_MISS", "MSRIndex": "0x3F7", "MSRValue": "0x15", - "PublicDescription": "Counts retired Instructions that experienced STLB (2nd level TLB) true miss. Available PDIST counters: 0", + "PublicDescription": "Counts retired Instructions that experienced STLB (2nd level TLB) true miss. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -455,7 +455,7 @@ "EventName": "FRONTEND_RETIRED.UNKNOWN_BRANCH", "MSRIndex": "0x3F7", "MSRValue": "0x17", - "PublicDescription": "Number retired branch instructions that caused the front-end to be resteered when it finds the instruction in a fetch line. This is called Unknown Branch which occurs for the first time a branch instruction is fetched or when the branch is not tracked by the BPU (Branch Prediction Unit) anymore. Available PDIST counters: 0", + "PublicDescription": "Number retired branch instructions that caused the front-end to be resteered when it finds the instruction in a fetch line. This is called Unknown Branch which occurs for the first time a branch instruction is fetched or when the branch is not tracked by the BPU (Branch Prediction Unit) anymore. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/arrowlake/memory.json b/tools/perf/pmu-events/arch/x86/arrowlake/memory.json index fb8d4ac69bda..aba1e27e5e37 100644 --- a/tools/perf/pmu-events/arch/x86/arrowlake/memory.json +++ b/tools/perf/pmu-events/arch/x86/arrowlake/memory.json @@ -163,7 +163,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_1024", "MSRIndex": "0x3F6", "MSRValue": "0x400", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 1024 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 1024 cycles. Reported latency may be longer than just the memory latency.", "SampleAfterValue": "53", "UMask": "0x1", "Unit": "cpu_core" @@ -176,7 +176,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_128", "MSRIndex": "0x3F6", "MSRValue": "0x80", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 128 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 128 cycles. Reported latency may be longer than just the memory latency.", "SampleAfterValue": "1009", "UMask": "0x1", "Unit": "cpu_core" @@ -189,7 +189,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_16", "MSRIndex": "0x3F6", "MSRValue": "0x10", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 16 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 16 cycles. Reported latency may be longer than just the memory latency.", "SampleAfterValue": "20011", "UMask": "0x1", "Unit": "cpu_core" @@ -202,7 +202,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_2048", "MSRIndex": "0x3F6", "MSRValue": "0x800", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 2048 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 2048 cycles. Reported latency may be longer than just the memory latency.", "SampleAfterValue": "23", "UMask": "0x1", "Unit": "cpu_core" @@ -215,7 +215,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_256", "MSRIndex": "0x3F6", "MSRValue": "0x100", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 256 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 256 cycles. Reported latency may be longer than just the memory latency.", "SampleAfterValue": "503", "UMask": "0x1", "Unit": "cpu_core" @@ -228,7 +228,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_32", "MSRIndex": "0x3F6", "MSRValue": "0x20", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 32 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 32 cycles. Reported latency may be longer than just the memory latency.", "SampleAfterValue": "100007", "UMask": "0x1", "Unit": "cpu_core" @@ -241,7 +241,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_4", "MSRIndex": "0x3F6", "MSRValue": "0x4", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 4 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 4 cycles. Reported latency may be longer than just the memory latency.", "SampleAfterValue": "100003", "UMask": "0x1", "Unit": "cpu_core" @@ -254,7 +254,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_512", "MSRIndex": "0x3F6", "MSRValue": "0x200", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 512 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 512 cycles. Reported latency may be longer than just the memory latency.", "SampleAfterValue": "101", "UMask": "0x1", "Unit": "cpu_core" @@ -267,7 +267,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_64", "MSRIndex": "0x3F6", "MSRValue": "0x40", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 64 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 64 cycles. Reported latency may be longer than just the memory latency.", "SampleAfterValue": "2003", "UMask": "0x1", "Unit": "cpu_core" @@ -280,7 +280,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_8", "MSRIndex": "0x3F6", "MSRValue": "0x8", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 8 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 8 cycles. Reported latency may be longer than just the memory latency.", "SampleAfterValue": "50021", "UMask": "0x1", "Unit": "cpu_core" @@ -291,7 +291,7 @@ "Data_LA": "1", "EventCode": "0xcd", "EventName": "MEM_TRANS_RETIRED.STORE_SAMPLE", - "PublicDescription": "Counts Retired memory accesses with at least 1 store operation. This PEBS event is the precisely-distributed (PDist) trigger covering all stores uops for sampling by the PEBS Store Latency Facility. The facility is described in Intel SDM Volume 3 section 19.9.8 Available PDIST counters: 0", + "PublicDescription": "Counts Retired memory accesses with at least 1 store operation. This PEBS event is the precisely-distributed (PDist) trigger covering all stores uops for sampling by the PEBS Store Latency Facility. The facility is described in Intel SDM Volume 3 section 19.9.8 Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x2", "Unit": "cpu_core" @@ -334,7 +334,7 @@ }, { "BriefDescription": "Counts demand data reads that were supplied by DRAM.", - "Counter": "0,1,2,3", + "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_DATA_RD.DRAM", "MSRIndex": "0x1a6,0x1a7", @@ -346,7 +346,7 @@ }, { "BriefDescription": "Counts demand data reads that were not supplied by the L3 cache.", - "Counter": "0,1,2,3", + "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_DATA_RD.L3_MISS", "MSRIndex": "0x1a6,0x1a7", @@ -358,7 +358,7 @@ }, { "BriefDescription": "Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that were not supplied by the L3 cache.", - "Counter": "0,1,2,3", + "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_RFO.L3_MISS", "MSRIndex": "0x1a6,0x1a7", diff --git a/tools/perf/pmu-events/arch/x86/arrowlake/other.json b/tools/perf/pmu-events/arch/x86/arrowlake/other.json index 51bc763a5887..ab7aac14e697 100644 --- a/tools/perf/pmu-events/arch/x86/arrowlake/other.json +++ b/tools/perf/pmu-events/arch/x86/arrowlake/other.json @@ -66,7 +66,7 @@ }, { "BriefDescription": "Counts streaming stores that have any type of response.", - "Counter": "0,1,2,3", + "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x2A,0x2B", "EventName": "OCR.STREAMING_WR.ANY_RESPONSE", "MSRIndex": "0x1a6,0x1a7", diff --git a/tools/perf/pmu-events/arch/x86/arrowlake/pipeline.json b/tools/perf/pmu-events/arch/x86/arrowlake/pipeline.json index 18a22368b99b..0651e2c4561e 100644 --- a/tools/perf/pmu-events/arch/x86/arrowlake/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/arrowlake/pipeline.json @@ -74,13 +74,14 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.ALL_BRANCHES", - "PublicDescription": "Counts all branch instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts all branch instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "Unit": "cpu_core" }, { "BriefDescription": "Counts the total number of branch instructions retired for all branch types.", "Counter": "0,1,2,3,4,5,6,7", + "Errata": "ARL010, ARL011", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.ALL_BRANCHES", "PublicDescription": "Counts the total number of instructions in which the instruction pointer (IP) of the processor is resteered due to a branch instruction and the branch instruction successfully retires. All branch type instructions are accounted for.", @@ -101,7 +102,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.COND", - "PublicDescription": "Counts conditional branch instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts conditional branch instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x111", "Unit": "cpu_core" @@ -109,6 +110,7 @@ { "BriefDescription": "Counts the number of retired JCC (Jump on Conditional Code) branch instructions retired, includes both taken and not taken branches.", "Counter": "0,1,2,3,4,5,6,7", + "Errata": "ARL011", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.COND", "SampleAfterValue": "200003", @@ -120,7 +122,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.COND_NTAKEN", - "PublicDescription": "Counts not taken branch instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts not taken branch instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x10", "Unit": "cpu_core" @@ -139,7 +141,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.COND_TAKEN", - "PublicDescription": "Counts taken conditional branch instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts taken conditional branch instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x101", "Unit": "cpu_core" @@ -158,7 +160,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.COND_TAKEN_BWD", - "PublicDescription": "Counts taken backward conditional branch instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts taken backward conditional branch instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x1", "Unit": "cpu_core" @@ -168,7 +170,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.COND_TAKEN_FWD", - "PublicDescription": "Counts taken forward conditional branch instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts taken forward conditional branch instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x102", "Unit": "cpu_core" @@ -187,7 +189,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.FAR_BRANCH", - "PublicDescription": "Counts far branch instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts far branch instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x40", "Unit": "cpu_core" @@ -195,6 +197,7 @@ { "BriefDescription": "Counts the number of far branch instructions retired, includes far jump, far call and return, and interrupt call and return.", "Counter": "0,1,2,3,4,5,6,7", + "Errata": "ARL011", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.FAR_BRANCH", "SampleAfterValue": "200003", @@ -215,7 +218,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.INDIRECT", - "PublicDescription": "Counts near indirect branch instructions retired excluding returns. TSX abort is an indirect branch. Available PDIST counters: 0", + "PublicDescription": "Counts near indirect branch instructions retired excluding returns. TSX abort is an indirect branch. Available PDIST counters: 0,1", "SampleAfterValue": "100003", "UMask": "0x80", "Unit": "cpu_core" @@ -223,6 +226,7 @@ { "BriefDescription": "Counts the number of near indirect JMP and near indirect CALL branch instructions retired.", "Counter": "0,1,2,3,4,5,6,7", + "Errata": "ARL011", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.INDIRECT", "SampleAfterValue": "200003", @@ -241,6 +245,7 @@ { "BriefDescription": "Counts the number of near indirect CALL branch instructions retired.", "Counter": "0,1,2,3,4,5,6,7", + "Errata": "ARL011", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.INDIRECT_CALL", "SampleAfterValue": "200003", @@ -270,7 +275,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.NEAR_CALL", - "PublicDescription": "Counts both direct and indirect near call instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts both direct and indirect near call instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x2", "Unit": "cpu_core" @@ -278,6 +283,7 @@ { "BriefDescription": "Counts the number of near CALL branch instructions retired.", "Counter": "0,1,2,3,4,5,6,7", + "Errata": "ARL010, ARL011", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.NEAR_CALL", "SampleAfterValue": "200003", @@ -298,7 +304,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.NEAR_RETURN", - "PublicDescription": "Counts return instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts return instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x8", "Unit": "cpu_core" @@ -317,7 +323,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.NEAR_TAKEN", - "PublicDescription": "Counts taken branch instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts taken branch instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x20", "Unit": "cpu_core" @@ -325,6 +331,7 @@ { "BriefDescription": "Counts the number of near taken branch instructions retired.", "Counter": "0,1,2,3,4,5,6,7", + "Errata": "ARL011", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.NEAR_TAKEN", "SampleAfterValue": "200003", @@ -372,7 +379,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.ALL_BRANCHES", - "PublicDescription": "Counts all the retired branch instructions that were mispredicted by the processor. A branch misprediction occurs when the processor incorrectly predicts the destination of the branch. When the misprediction is discovered at execution, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. Available PDIST counters: 0", + "PublicDescription": "Counts all the retired branch instructions that were mispredicted by the processor. A branch misprediction occurs when the processor incorrectly predicts the destination of the branch. When the misprediction is discovered at execution, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "Unit": "cpu_core" }, @@ -390,7 +397,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.ALL_BRANCHES_COST", - "PublicDescription": "All mispredicted branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0", + "PublicDescription": "All mispredicted branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x44", "Unit": "cpu_core" @@ -409,7 +416,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.COND", - "PublicDescription": "Counts mispredicted conditional branch instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts mispredicted conditional branch instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x111", "Unit": "cpu_core" @@ -428,7 +435,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.COND_COST", - "PublicDescription": "Mispredicted conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0", + "PublicDescription": "Mispredicted conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x151", "Unit": "cpu_core" @@ -438,7 +445,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.COND_NTAKEN", - "PublicDescription": "Counts the number of conditional branch instructions retired that were mispredicted and the branch direction was not taken. Available PDIST counters: 0", + "PublicDescription": "Counts the number of conditional branch instructions retired that were mispredicted and the branch direction was not taken. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x10", "Unit": "cpu_core" @@ -448,7 +455,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.COND_NTAKEN_COST", - "PublicDescription": "Mispredicted non-taken conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0", + "PublicDescription": "Mispredicted non-taken conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x50", "Unit": "cpu_core" @@ -467,7 +474,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.COND_TAKEN", - "PublicDescription": "Counts taken conditional mispredicted branch instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts taken conditional mispredicted branch instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x101", "Unit": "cpu_core" @@ -486,7 +493,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.COND_TAKEN_BWD", - "PublicDescription": "Counts taken backward conditional mispredicted branch instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts taken backward conditional mispredicted branch instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x1", "Unit": "cpu_core" @@ -496,7 +503,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.COND_TAKEN_BWD_COST", - "PublicDescription": "number of branch instructions retired that were mispredicted and taken backward. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0", + "PublicDescription": "number of branch instructions retired that were mispredicted and taken backward. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x8001", "Unit": "cpu_core" @@ -506,7 +513,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.COND_TAKEN_COST", - "PublicDescription": "Mispredicted taken conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0", + "PublicDescription": "Mispredicted taken conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x141", "Unit": "cpu_core" @@ -516,7 +523,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.COND_TAKEN_FWD", - "PublicDescription": "Counts taken forward conditional mispredicted branch instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts taken forward conditional mispredicted branch instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "Unit": "cpu_core" }, @@ -525,7 +532,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.COND_TAKEN_FWD_COST", - "PublicDescription": "number of branch instructions retired that were mispredicted and taken forward. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0", + "PublicDescription": "number of branch instructions retired that were mispredicted and taken forward. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x8002", "Unit": "cpu_core" @@ -544,7 +551,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.INDIRECT", - "PublicDescription": "Counts miss-predicted near indirect branch instructions retired excluding returns. TSX abort is an indirect branch. Available PDIST counters: 0", + "PublicDescription": "Counts miss-predicted near indirect branch instructions retired excluding returns. TSX abort is an indirect branch. Available PDIST counters: 0,1", "SampleAfterValue": "100003", "UMask": "0x80", "Unit": "cpu_core" @@ -572,7 +579,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.INDIRECT_CALL", - "PublicDescription": "Counts retired mispredicted indirect (near taken) CALL instructions, including both register and memory indirect. Available PDIST counters: 0", + "PublicDescription": "Counts retired mispredicted indirect (near taken) CALL instructions, including both register and memory indirect. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x2", "Unit": "cpu_core" @@ -591,7 +598,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.INDIRECT_CALL_COST", - "PublicDescription": "Mispredicted indirect CALL retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0", + "PublicDescription": "Mispredicted indirect CALL retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x42", "Unit": "cpu_core" @@ -601,7 +608,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.INDIRECT_COST", - "PublicDescription": "Mispredicted near indirect branch instructions retired (excluding returns). This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0", + "PublicDescription": "Mispredicted near indirect branch instructions retired (excluding returns). This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1", "SampleAfterValue": "100003", "UMask": "0xc0", "Unit": "cpu_core" @@ -620,7 +627,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.NEAR_TAKEN", - "PublicDescription": "Counts number of near branch instructions retired that were mispredicted and taken. Available PDIST counters: 0", + "PublicDescription": "Counts number of near branch instructions retired that were mispredicted and taken. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x20", "Unit": "cpu_core" @@ -639,7 +646,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.NEAR_TAKEN_COST", - "PublicDescription": "Mispredicted taken near branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0", + "PublicDescription": "Mispredicted taken near branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x60", "Unit": "cpu_core" @@ -649,7 +656,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.RET", - "PublicDescription": "This is a non-precise version (that is, does not use PEBS) of the event that counts mispredicted return instructions retired. Available PDIST counters: 0", + "PublicDescription": "This is a non-precise version (that is, does not use PEBS) of the event that counts mispredicted return instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x8", "Unit": "cpu_core" @@ -677,7 +684,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.RET_COST", - "PublicDescription": "Mispredicted ret instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0", + "PublicDescription": "Mispredicted ret instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x48", "Unit": "cpu_core" @@ -1046,7 +1053,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc0", "EventName": "INST_RETIRED.ANY_P", - "PublicDescription": "Counts the number of X86 instructions retired - an Architectural PerfMon event. Counting continues during hardware interrupts, traps, and inside interrupt handlers. Notes: INST_RETIRED.ANY is counted by a designated fixed counter freeing up programmable counters to count other events. INST_RETIRED.ANY_P is counted by a programmable counter. Available PDIST counters: 0", + "PublicDescription": "Counts the number of X86 instructions retired - an Architectural PerfMon event. Counting continues during hardware interrupts, traps, and inside interrupt handlers. Notes: INST_RETIRED.ANY is counted by a designated fixed counter freeing up programmable counters to count other events. INST_RETIRED.ANY_P is counted by a programmable counter. Available PDIST counters: 0,1", "SampleAfterValue": "2000003", "Unit": "cpu_core" }, @@ -1063,7 +1070,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc0", "EventName": "INST_RETIRED.BR_FUSED", - "PublicDescription": "retired macro-fused uops when there is a branch in the macro-fused pair (the two instructions that got macro-fused count once in this pmon) Available PDIST counters: 0", + "PublicDescription": "retired macro-fused uops when there is a branch in the macro-fused pair (the two instructions that got macro-fused count once in this pmon) Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x10", "Unit": "cpu_core" @@ -1073,7 +1080,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc0", "EventName": "INST_RETIRED.MACRO_FUSED", - "PublicDescription": "INST_RETIRED.MACRO_FUSED Available PDIST counters: 0", + "PublicDescription": "INST_RETIRED.MACRO_FUSED Available PDIST counters: 0,1", "SampleAfterValue": "2000003", "UMask": "0x30", "Unit": "cpu_core" @@ -1083,7 +1090,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc0", "EventName": "INST_RETIRED.NOP", - "PublicDescription": "Counts all retired NOP or ENDBR32/64 or PREFETCHIT0/1 instructions Available PDIST counters: 0", + "PublicDescription": "Counts all retired NOP or ENDBR32/64 or PREFETCHIT0/1 instructions Available PDIST counters: 0,1", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -1102,7 +1109,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc0", "EventName": "INST_RETIRED.REP_ITERATION", - "PublicDescription": "Number of iterations of Repeat (REP) string retired instructions such as MOVS, CMPS, and SCAS. Each has a byte, word, and doubleword version and string instructions can be repeated using a repetition prefix, REP, that allows their architectural execution to be repeated a number of times as specified by the RCX register. Note the number of iterations is implementation-dependent. Available PDIST counters: 0", + "PublicDescription": "Number of iterations of Repeat (REP) string retired instructions such as MOVS, CMPS, and SCAS. Each has a byte, word, and doubleword version and string instructions can be repeated using a repetition prefix, REP, that allows their architectural execution to be repeated a number of times as specified by the RCX register. Note the number of iterations is implementation-dependent. Available PDIST counters: 0,1", "SampleAfterValue": "2000003", "UMask": "0x8", "Unit": "cpu_core" @@ -1349,6 +1356,15 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Counts the number of times a load got early blocked due to preceding store operation with unknown address or unknown data. Excluding in-line (immediate) wakeups", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x03", + "EventName": "LD_BLOCKS.STORE_EARLY", + "SampleAfterValue": "100003", + "UMask": "0xa1", + "Unit": "cpu_core" + }, + { "BriefDescription": "Counts the number of occurrences a retired load gets blocked because its address partially overlaps with an older store (size mismatch) - unknown_sta/bad_forward", "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x03", @@ -1563,7 +1579,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xe4", "EventName": "MISC_RETIRED.LBR_INSERTS", - "PublicDescription": "LBR record is inserted Available PDIST counters: 0", + "PublicDescription": "LBR record is inserted Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1929,7 +1945,7 @@ }, { "BriefDescription": "Fixed Counter: Counts the number of retirement slots not consumed due to front end stalls.", - "Counter": "37", + "Counter": "Fixed counter 5", "EventName": "TOPDOWN_FE_BOUND.ALL", "SampleAfterValue": "1000003", "UMask": "0x6", @@ -2126,7 +2142,7 @@ }, { "BriefDescription": "Fixed Counter: Counts the number of consumed retirement slots.", - "Counter": "38", + "Counter": "Fixed counter 6", "EventName": "TOPDOWN_RETIRING.ALL", "SampleAfterValue": "1000003", "UMask": "0x7", diff --git a/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json b/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json index 89750117a7f6..1d8e910f5961 100644 --- a/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json +++ b/tools/perf/pmu-events/arch/x86/broadwell/bdw-metrics.json @@ -1,49 +1,49 @@ [ { "BriefDescription": "C2 residency percent per package", - "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c2\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C2_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per core", - "MetricExpr": "cstate_core@c3\\-residency@ / TSC", + "MetricExpr": "cstate_core@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per package", - "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per core", - "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricExpr": "cstate_core@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per package", - "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per core", - "MetricExpr": "cstate_core@c7\\-residency@ / TSC", + "MetricExpr": "cstate_core@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per package", - "MetricExpr": "cstate_pkg@c7\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Pkg_Residency", "ScaleUnit": "100%" @@ -80,7 +80,6 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", @@ -98,7 +97,6 @@ }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "1 - (tma_frontend_bound + tma_bad_speculation + tma_retiring)", "MetricGroup": "BvOB;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", @@ -139,7 +137,6 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_cisc", @@ -640,7 +637,7 @@ "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" }, { - "BriefDescription": "", + "BriefDescription": "Mem;Backend;CacheHits", "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" @@ -653,7 +650,7 @@ }, { "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / tma_info_system_time", + "MetricExpr": "tma_info_system_turbo_utilization * msr@tsc@ / 1e9 / tma_info_system_time", "MetricGroup": "Power;Summary", "MetricName": "tma_info_system_core_frequency" }, @@ -665,7 +662,7 @@ }, { "BriefDescription": "Average number of utilized CPUs", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", "MetricGroup": "Summary", "MetricName": "tma_info_system_cpus_utilized" }, @@ -854,7 +851,6 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_load_op_utilization", @@ -1032,7 +1028,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else (CYCLE_ACTIVITY.STALLS_TOTAL - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_core_clks)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else CYCLE_ACTIVITY.STALLS_TOTAL - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1041,7 +1037,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / tma_info_core_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1050,7 +1046,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", diff --git a/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json b/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json index 81175f0f2603..a5e408ca46a7 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json +++ b/tools/perf/pmu-events/arch/x86/broadwellde/bdwde-metrics.json @@ -1,49 +1,49 @@ [ { "BriefDescription": "C2 residency percent per package", - "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c2\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C2_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per core", - "MetricExpr": "cstate_core@c3\\-residency@ / TSC", + "MetricExpr": "cstate_core@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per package", - "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per core", - "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricExpr": "cstate_core@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per package", - "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per core", - "MetricExpr": "cstate_core@c7\\-residency@ / TSC", + "MetricExpr": "cstate_core@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per package", - "MetricExpr": "cstate_pkg@c7\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Pkg_Residency", "ScaleUnit": "100%" @@ -80,7 +80,6 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", @@ -98,7 +97,6 @@ }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "1 - (tma_frontend_bound + tma_bad_speculation + tma_retiring)", "MetricGroup": "BvOB;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", @@ -139,7 +137,6 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_cisc", @@ -632,7 +629,7 @@ "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" }, { - "BriefDescription": "", + "BriefDescription": "Mem;Backend;CacheHits", "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" @@ -645,7 +642,7 @@ }, { "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / tma_info_system_time", + "MetricExpr": "tma_info_system_turbo_utilization * msr@tsc@ / 1e9 / tma_info_system_time", "MetricGroup": "Power;Summary", "MetricName": "tma_info_system_core_frequency" }, @@ -657,7 +654,7 @@ }, { "BriefDescription": "Average number of utilized CPUs", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", "MetricGroup": "Summary", "MetricName": "tma_info_system_cpus_utilized" }, @@ -846,7 +843,6 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_load_op_utilization", @@ -1021,7 +1017,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else (CYCLE_ACTIVITY.STALLS_TOTAL - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_core_clks)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else CYCLE_ACTIVITY.STALLS_TOTAL - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1030,7 +1026,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / tma_info_core_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1039,7 +1035,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", diff --git a/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json b/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json index 5d06a3f72be2..5b83b040060c 100644 --- a/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/broadwellx/bdx-metrics.json @@ -1,49 +1,49 @@ [ { "BriefDescription": "C2 residency percent per package", - "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c2\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C2_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per core", - "MetricExpr": "cstate_core@c3\\-residency@ / TSC", + "MetricExpr": "cstate_core@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per package", - "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per core", - "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricExpr": "cstate_core@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per package", - "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per core", - "MetricExpr": "cstate_core@c7\\-residency@ / TSC", + "MetricExpr": "cstate_core@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per package", - "MetricExpr": "cstate_pkg@c7\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Pkg_Residency", "ScaleUnit": "100%" @@ -282,7 +282,6 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", @@ -300,7 +299,6 @@ }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "1 - (tma_frontend_bound + tma_bad_speculation + tma_retiring)", "MetricGroup": "BvOB;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", @@ -341,7 +339,6 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_cisc", @@ -842,7 +839,7 @@ "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" }, { - "BriefDescription": "", + "BriefDescription": "Mem;Backend;CacheHits", "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" @@ -855,7 +852,7 @@ }, { "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / tma_info_system_time", + "MetricExpr": "tma_info_system_turbo_utilization * msr@tsc@ / 1e9 / tma_info_system_time", "MetricGroup": "Power;Summary", "MetricName": "tma_info_system_core_frequency" }, @@ -867,7 +864,7 @@ }, { "BriefDescription": "Average number of utilized CPUs", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", "MetricGroup": "Summary", "MetricName": "tma_info_system_cpus_utilized" }, @@ -907,6 +904,7 @@ }, { "BriefDescription": "Average number of parallel data read requests to external memory", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182@ / UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182\\,thresh\\=1@", "MetricGroup": "Mem;MemoryBW;SoC", "MetricName": "tma_info_system_mem_parallel_reads", @@ -1076,7 +1074,6 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_load_op_utilization", @@ -1086,6 +1083,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "200 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks", "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group", "MetricName": "tma_local_mem", @@ -1263,7 +1261,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else (CYCLE_ACTIVITY.STALLS_TOTAL - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_core_clks)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else CYCLE_ACTIVITY.STALLS_TOTAL - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1272,7 +1270,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / tma_info_core_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1281,7 +1279,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1308,6 +1306,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "310 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks", "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group", "MetricName": "tma_remote_mem", diff --git a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json index 6485b565acbc..2e50a91b6728 100644 --- a/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/cascadelakex/clx-metrics.json @@ -1,49 +1,49 @@ [ { "BriefDescription": "C2 residency percent per package", - "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c2\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C2_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per core", - "MetricExpr": "cstate_core@c3\\-residency@ / TSC", + "MetricExpr": "cstate_core@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per package", - "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per core", - "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricExpr": "cstate_core@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per package", - "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per core", - "MetricExpr": "cstate_core@c7\\-residency@ / TSC", + "MetricExpr": "cstate_core@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per package", - "MetricExpr": "cstate_pkg@c7\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Pkg_Residency", "ScaleUnit": "100%" @@ -319,6 +319,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", @@ -356,6 +357,7 @@ }, { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", "MetricGroup": "BigFootprint;BvBC;Fed;Frontend;IcMiss;MemoryTLB", "MetricName": "tma_bottleneck_big_code", @@ -370,31 +372,35 @@ "PublicDescription": "Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound)" }, { + "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", + "MetricGroup": "BvCB;Cor;tma_issueComp", + "MetricName": "tma_bottleneck_compute_bound_est", + "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", + "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " + }, + { "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks", - "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", "MetricGroup": "BvMB;Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_bottleneck_cache_memory_bandwidth", - "MetricThreshold": "tma_bottleneck_cache_memory_bandwidth > 20", + "MetricName": "tma_bottleneck_data_cache_memory_bandwidth", + "MetricThreshold": "tma_bottleneck_data_cache_memory_bandwidth > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks", - "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_dependency / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_lock_latency / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_loads / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_stores / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_l1_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_dependency / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_lock_latency / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_loads / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_stores / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)) + tma_memory_bound * (tma_store_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))", "MetricGroup": "BvML;Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_bottleneck_cache_memory_latency", - "MetricThreshold": "tma_bottleneck_cache_memory_latency > 20", + "MetricName": "tma_bottleneck_data_cache_memory_latency", + "MetricThreshold": "tma_bottleneck_data_cache_memory_latency > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency" }, { - "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", - "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", - "MetricGroup": "BvCB;Cor;tma_issueComp", - "MetricName": "tma_bottleneck_compute_bound_est", - "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", - "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " - }, - { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_bottleneck_big_code", "MetricGroup": "BvFB;Fed;FetchBW;Frontend", "MetricName": "tma_bottleneck_instruction_fetch_bw", @@ -402,6 +408,7 @@ }, { "BriefDescription": "Total pipeline cost of irregular execution (e.g", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + tma_core_bound * RS_EVENTS.EMPTY_CYCLES / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", "MetricGroup": "Bad;BvIO;Cor;Ret;tma_issueMS", "MetricName": "tma_bottleneck_irregular_overhead", @@ -410,7 +417,8 @@ }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", - "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))", "MetricGroup": "BvMT;Mem;MemoryTLB;Offcore;tma_issueTLB", "MetricName": "tma_bottleneck_memory_data_tlbs", "MetricThreshold": "tma_bottleneck_memory_data_tlbs > 20", @@ -418,7 +426,8 @@ }, { "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)", - "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) * tma_remote_cache / (tma_local_mem + tma_remote_cache + tma_remote_mem) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) * tma_remote_cache / (tma_local_mem + tma_remote_cache + tma_remote_mem) + tma_l3_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))", "MetricGroup": "BvMS;LockCont;Mem;Offcore;tma_issueSyncxn", "MetricName": "tma_bottleneck_memory_synchronization", "MetricThreshold": "tma_bottleneck_memory_synchronization > 10", @@ -426,6 +435,7 @@ }, { "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", "MetricGroup": "Bad;BadSpec;BrMispredicts;BvMP;tma_issueBM", "MetricName": "tma_bottleneck_mispredictions", @@ -434,7 +444,8 @@ }, { "BriefDescription": "Total pipeline cost of remaining bottlenecks in the back-end", - "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_cache_memory_bandwidth + tma_bottleneck_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_data_cache_memory_bandwidth + tma_bottleneck_data_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", "MetricGroup": "BvOB;Cor;Offcore", "MetricName": "tma_bottleneck_other_bottlenecks", "MetricThreshold": "tma_bottleneck_other_bottlenecks > 20", @@ -442,6 +453,7 @@ }, { "BriefDescription": "Total pipeline cost of \"useful operations\" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_retiring - (BR_INST_RETIRED.ALL_BRANCHES + 2 * BR_INST_RETIRED.NEAR_CALL + INST_RETIRED.NOP) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", "MetricGroup": "BvUW;Ret", "MetricName": "tma_bottleneck_useful_work", @@ -469,6 +481,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_cisc", @@ -539,6 +552,15 @@ "ScaleUnit": "100%" }, { + "BriefDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external CXL Memory by loads (e.g", + "MetricExpr": "(((1 - ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0))) if #has_pmem > 0 else 1)) * (CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound) if 1e6 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS else 0) if #has_pmem > 0 else 0)", + "MetricGroup": "MemoryBound;Server;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricName": "tma_cxl_mem_bound", + "MetricThreshold": "tma_cxl_mem_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", + "PublicDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external CXL Memory by loads (e.g. 3D-Xpoint (Crystal Ridge, a.k.a. IXP) memory, PMM - Persistent Memory Module [from CLX to SPR] or any other CXL Type3 Memory [EMR onwards]).", + "ScaleUnit": "100%" + }, + { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "44 * tma_info_system_core_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (1 - OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE / (OCR.DEMAND_DATA_RD.L3_HIT.HITM_OTHER_CORE + OCR.DEMAND_DATA_RD.L3_HIT.HIT_OTHER_CORE_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", @@ -569,7 +591,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound - tma_cxl_mem_bound if #has_pmem > 0 else CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound)", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_dram_bound", "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -630,7 +652,7 @@ "MetricGroup": "BvMB;MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -693,7 +715,6 @@ }, { "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@ / UOPS_RETIRED.RETIRE_SLOTS", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_vector", @@ -768,6 +789,7 @@ }, { "BriefDescription": "Branch Misprediction Cost: Cycles representing fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "tma_bottleneck_mispredictions * tma_info_thread_slots / 4 / BR_MISP_RETIRED.ALL_BRANCHES / 100", "MetricGroup": "Bad;BrMispredicts;tma_issueBM", "MetricName": "tma_info_bad_spec_branch_misprediction_cost", @@ -803,6 +825,7 @@ }, { "BriefDescription": "Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_frontend_bound * (tma_fetch_bandwidth / (tma_fetch_bandwidth + tma_fetch_latency)) * (tma_dsb / (tma_dsb + tma_mite)))", "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", "MetricName": "tma_info_botlnk_l2_dsb_bandwidth", @@ -820,6 +843,7 @@ }, { "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL", "MetricName": "tma_info_botlnk_l2_ic_misses", @@ -961,7 +985,6 @@ }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@)", "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_iparith", @@ -1249,7 +1272,7 @@ "MetricName": "tma_info_memory_tlb_store_stlb_mpki" }, { - "BriefDescription": "", + "BriefDescription": "Mem;Backend;CacheHits", "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" @@ -1267,6 +1290,12 @@ "MetricName": "tma_info_pipeline_fetch_mite" }, { + "BriefDescription": "Average number of uops fetched from MS per cycle", + "MetricExpr": "IDQ.MS_UOPS / cpu@IDQ.MS_UOPS\\,cmask\\=1@", + "MetricGroup": "Fed;FetchLat;MicroSeq", + "MetricName": "tma_info_pipeline_fetch_ms" + }, + { "BriefDescription": "Instructions per a microcode Assist invocation", "MetricExpr": "INST_RETIRED.ANY / (FP_ASSIST.ANY + OTHER_ASSISTS.ANY)", "MetricGroup": "MicroSeq;Pipeline;Ret;Retire", @@ -1282,7 +1311,7 @@ }, { "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / tma_info_system_time", + "MetricExpr": "tma_info_system_turbo_utilization * msr@tsc@ / 1e9 / tma_info_system_time", "MetricGroup": "Power;Summary", "MetricName": "tma_info_system_core_frequency" }, @@ -1294,16 +1323,28 @@ }, { "BriefDescription": "Average number of utilized CPUs", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", "MetricGroup": "Summary", "MetricName": "tma_info_system_cpus_utilized" }, { + "BriefDescription": "Average 3DXP Memory Bandwidth Use for reads [GB / sec]", + "MetricExpr": "(64 * UNC_M_PMM_RPQ_INSERTS / 1e9 / tma_info_system_time if #has_pmem > 0 else 0)", + "MetricGroup": "MemOffcore;MemoryBW;Server;SoC", + "MetricName": "tma_info_system_cxl_mem_read_bw" + }, + { + "BriefDescription": "Average 3DXP Memory Bandwidth Use for Writes [GB / sec]", + "MetricExpr": "(64 * UNC_M_PMM_WPQ_INSERTS / 1e9 / tma_info_system_time if #has_pmem > 0 else 0)", + "MetricGroup": "MemOffcore;MemoryBW;Server;SoC", + "MetricName": "tma_info_system_cxl_mem_write_bw" + }, + { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / tma_info_system_time", "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_mem_bandwidth, tma_sq_full" + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Giga Floating Point Operations Per Second", @@ -1362,6 +1403,13 @@ "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches" }, { + "BriefDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]", + "MetricExpr": "(1e9 * (UNC_M_PMM_RPQ_OCCUPANCY.ALL / UNC_M_PMM_RPQ_INSERTS) / imc_0@event\\=0x0@ if #has_pmem > 0 else 0)", + "MetricGroup": "MemOffcore;MemoryLat;Server;SoC", + "MetricName": "tma_info_system_mem_pmm_read_latency", + "PublicDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" + }, + { "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD) / (tma_info_system_socket_clks / tma_info_system_time)", "MetricGroup": "Mem;MemoryLat;SoC", @@ -1499,12 +1547,13 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", + "BriefDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "min(2 * (MEM_INST_RETIRED.ALL_LOADS - MEM_LOAD_RETIRED.FB_HIT - MEM_LOAD_RETIRED.L1_MISS) * 20 / 100, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_thread_clks", "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_l1_latency_dependency", "MetricThreshold": "tma_l1_latency_dependency > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", + "PublicDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", "ScaleUnit": "100%" }, { @@ -1541,7 +1590,7 @@ "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_cache_memory_latency, tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_data_cache_memory_latency, tma_mem_latency", "ScaleUnit": "100%" }, { @@ -1565,6 +1614,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_load_op_utilization", @@ -1591,6 +1641,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles to walk the memory paging structures to cache translation of 1 GB pages for data load accesses.", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "tma_load_stlb_miss * DTLB_LOAD_MISSES.WALK_COMPLETED_1G / (DTLB_LOAD_MISSES.WALK_COMPLETED_4K + DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M + DTLB_LOAD_MISSES.WALK_COMPLETED_1G)", "MetricGroup": "MemoryTLB;TopdownL6;tma_L6_group;tma_load_stlb_miss_group", "MetricName": "tma_load_stlb_miss_1g", @@ -1599,6 +1650,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles to walk the memory paging structures to cache translation of 2 or 4 MB pages for data load accesses.", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "tma_load_stlb_miss * DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M / (DTLB_LOAD_MISSES.WALK_COMPLETED_4K + DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M + DTLB_LOAD_MISSES.WALK_COMPLETED_1G)", "MetricGroup": "MemoryTLB;TopdownL6;tma_L6_group;tma_load_stlb_miss_group", "MetricName": "tma_load_stlb_miss_2m", @@ -1607,6 +1659,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles to walk the memory paging structures to cache translation of 4 KB pages for data load accesses.", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "tma_load_stlb_miss * DTLB_LOAD_MISSES.WALK_COMPLETED_4K / (DTLB_LOAD_MISSES.WALK_COMPLETED_4K + DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M + DTLB_LOAD_MISSES.WALK_COMPLETED_1G)", "MetricGroup": "MemoryTLB;TopdownL6;tma_L6_group;tma_load_stlb_miss_group", "MetricName": "tma_load_stlb_miss_4k", @@ -1624,6 +1677,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(12 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (11 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_thread_clks", "MetricGroup": "LockCont;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", "MetricName": "tma_lock_latency", @@ -1648,7 +1702,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { @@ -1657,7 +1711,7 @@ "MetricGroup": "BvML;MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -1681,7 +1735,6 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", @@ -1691,6 +1744,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks", "MetricGroup": "BadSpec;BrMispredicts;BvMP;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueBM", "MetricName": "tma_mispredicts_resteers", @@ -1745,6 +1799,7 @@ }, { "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches))", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_other_light_ops", @@ -1754,6 +1809,7 @@ }, { "BriefDescription": "This metric estimates fraction of slots the CPU was stalled due to other cases of misprediction (non-retired x86 branches or other types).", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "max(tma_branch_mispredicts * (1 - BR_MISP_RETIRED.ALL_BRANCHES / (INT_MISC.CLEARS_COUNT - MACHINE_CLEARS.COUNT)), 0.0001)", "MetricGroup": "BrMispredicts;BvIO;TopdownL3;tma_L3_group;tma_branch_mispredicts_group", "MetricName": "tma_other_mispredicts", @@ -1762,6 +1818,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Nukes (Machine Clears) not related to memory ordering.", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "max(tma_machine_clears * (1 - MACHINE_CLEARS.MEMORY_ORDERING / MACHINE_CLEARS.COUNT), 0.0001)", "MetricGroup": "BvIO;Machine_Clears;TopdownL3;tma_L3_group;tma_machine_clears_group", "MetricName": "tma_other_nukes", @@ -1842,6 +1899,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", @@ -1956,7 +2014,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { @@ -2013,6 +2071,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles to walk the memory paging structures to cache translation of 1 GB pages for data store accesses.", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "tma_store_stlb_miss * DTLB_STORE_MISSES.WALK_COMPLETED_1G / (DTLB_STORE_MISSES.WALK_COMPLETED_4K + DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M + DTLB_STORE_MISSES.WALK_COMPLETED_1G)", "MetricGroup": "MemoryTLB;TopdownL6;tma_L6_group;tma_store_stlb_miss_group", "MetricName": "tma_store_stlb_miss_1g", @@ -2021,6 +2080,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles to walk the memory paging structures to cache translation of 2 or 4 MB pages for data store accesses.", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "tma_store_stlb_miss * DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M / (DTLB_STORE_MISSES.WALK_COMPLETED_4K + DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M + DTLB_STORE_MISSES.WALK_COMPLETED_1G)", "MetricGroup": "MemoryTLB;TopdownL6;tma_L6_group;tma_store_stlb_miss_group", "MetricName": "tma_store_stlb_miss_2m", @@ -2029,6 +2089,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles to walk the memory paging structures to cache translation of 4 KB pages for data store accesses.", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "tma_store_stlb_miss * DTLB_STORE_MISSES.WALK_COMPLETED_4K / (DTLB_STORE_MISSES.WALK_COMPLETED_4K + DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M + DTLB_STORE_MISSES.WALK_COMPLETED_1G)", "MetricGroup": "MemoryTLB;TopdownL6;tma_L6_group;tma_store_stlb_miss_group", "MetricName": "tma_store_stlb_miss_4k", diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/cache.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/cache.json index 10bdb193c16f..26568e4b77f7 100644 --- a/tools/perf/pmu-events/arch/x86/emeraldrapids/cache.json +++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/cache.json @@ -1,10 +1,72 @@ [ { + "BriefDescription": "Hit snoop reply with data, line invalidated.", + "Counter": "0,1,2,3", + "EventCode": "0x27", + "EventName": "CORE_SNOOP_RESPONSE.I_FWD_FE", + "PublicDescription": "Counts responses to snoops indicating the line will now be (I)nvalidated: removed from this core's cache, after the data is forwarded back to the requestor and indicating the data was found unmodified in the (FE) Forward or Exclusive State in this cores caches cache. A single snoop response from the core counts on all hyperthreads of the core.", + "SampleAfterValue": "1000003", + "UMask": "0x20" + }, + { + "BriefDescription": "HitM snoop reply with data, line invalidated.", + "Counter": "0,1,2,3", + "EventCode": "0x27", + "EventName": "CORE_SNOOP_RESPONSE.I_FWD_M", + "PublicDescription": "Counts responses to snoops indicating the line will now be (I)nvalidated: removed from this core's caches, after the data is forwarded back to the requestor, and indicating the data was found modified(M) in this cores caches cache (aka HitM response). A single snoop response from the core counts on all hyperthreads of the core.", + "SampleAfterValue": "1000003", + "UMask": "0x10" + }, + { + "BriefDescription": "Hit snoop reply without sending the data, line invalidated.", + "Counter": "0,1,2,3", + "EventCode": "0x27", + "EventName": "CORE_SNOOP_RESPONSE.I_HIT_FSE", + "PublicDescription": "Counts responses to snoops indicating the line will now be (I)nvalidated in this core's caches without forwarded back to the requestor. The line was in Forward, Shared or Exclusive (FSE) state in this cores caches. A single snoop response from the core counts on all hyperthreads of the core.", + "SampleAfterValue": "1000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Line not found snoop reply", + "Counter": "0,1,2,3", + "EventCode": "0x27", + "EventName": "CORE_SNOOP_RESPONSE.MISS", + "PublicDescription": "Counts responses to snoops indicating that the data was not found (IHitI) in this core's caches. A single snoop response from the core counts on all hyperthreads of the Core.", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { + "BriefDescription": "Hit snoop reply with data, line kept in Shared state.", + "Counter": "0,1,2,3", + "EventCode": "0x27", + "EventName": "CORE_SNOOP_RESPONSE.S_FWD_FE", + "PublicDescription": "Counts responses to snoops indicating the line may be kept on this core in the (S)hared state, after the data is forwarded back to the requestor, initially the data was found in the cache in the (FS) Forward or Shared state. A single snoop response from the core counts on all hyperthreads of the core.", + "SampleAfterValue": "1000003", + "UMask": "0x40" + }, + { + "BriefDescription": "HitM snoop reply with data, line kept in Shared state", + "Counter": "0,1,2,3", + "EventCode": "0x27", + "EventName": "CORE_SNOOP_RESPONSE.S_FWD_M", + "PublicDescription": "Counts responses to snoops indicating the line may be kept on this core in the (S)hared state, after the data is forwarded back to the requestor, initially the data was found in the cache in the (M)odified state. A single snoop response from the core counts on all hyperthreads of the core.", + "SampleAfterValue": "1000003", + "UMask": "0x8" + }, + { + "BriefDescription": "Hit snoop reply without sending the data, line kept in Shared state.", + "Counter": "0,1,2,3", + "EventCode": "0x27", + "EventName": "CORE_SNOOP_RESPONSE.S_HIT_FSE", + "PublicDescription": "Counts responses to snoops indicating the line was kept on this core in the (S)hared state, and that the data was found unmodified but not forwarded back to the requestor, initially the data was found in the cache in the (FSE) Forward, Shared state or Exclusive state. A single snoop response from the core counts on all hyperthreads of the core.", + "SampleAfterValue": "1000003", + "UMask": "0x4" + }, + { "BriefDescription": "L1D.HWPF_MISS", "Counter": "0,1,2,3", "EventCode": "0x51", "EventName": "L1D.HWPF_MISS", - "PublicDescription": "L1D.HWPF_MISS Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x20" }, @@ -13,7 +75,7 @@ "Counter": "0,1,2,3", "EventCode": "0x51", "EventName": "L1D.REPLACEMENT", - "PublicDescription": "Counts L1D data line replacements including opportunistic replacements, and replacements that require stall-for-replace or block-for-replace. Available PDIST counters: 0", + "PublicDescription": "Counts L1D data line replacements including opportunistic replacements, and replacements that require stall-for-replace or block-for-replace.", "SampleAfterValue": "100003", "UMask": "0x1" }, @@ -22,7 +84,7 @@ "Counter": "0,1,2,3", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.FB_FULL", - "PublicDescription": "Counts number of cycles a demand request has waited due to L1D Fill Buffer (FB) unavailability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses. Available PDIST counters: 0", + "PublicDescription": "Counts number of cycles a demand request has waited due to L1D Fill Buffer (FB) unavailability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.", "SampleAfterValue": "1000003", "UMask": "0x2" }, @@ -33,7 +95,7 @@ "EdgeDetect": "1", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.FB_FULL_PERIODS", - "PublicDescription": "Counts number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses. Available PDIST counters: 0", + "PublicDescription": "Counts number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.", "SampleAfterValue": "1000003", "UMask": "0x2" }, @@ -43,7 +105,6 @@ "Deprecated": "1", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.L2_STALL", - "PublicDescription": "This event is deprecated. Refer to new event L1D_PEND_MISS.L2_STALLS Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x4" }, @@ -52,7 +113,7 @@ "Counter": "0,1,2,3", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.L2_STALLS", - "PublicDescription": "Counts number of cycles a demand request has waited due to L1D due to lack of L2 resources. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses. Available PDIST counters: 0", + "PublicDescription": "Counts number of cycles a demand request has waited due to L1D due to lack of L2 resources. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.", "SampleAfterValue": "1000003", "UMask": "0x4" }, @@ -61,7 +122,7 @@ "Counter": "0,1,2,3", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.PENDING", - "PublicDescription": "Counts number of L1D misses that are outstanding in each cycle, that is each cycle the number of Fill Buffers (FB) outstanding required by Demand Reads. FB either is held by demand loads, or it is held by non-demand loads and gets hit at least once by demand. The valid outstanding interval is defined until the FB deallocation by one of the following ways: from FB allocation, if FB is allocated by demand from the demand Hit FB, if it is allocated by hardware or software prefetch. Note: In the L1D, a Demand Read contains cacheable or noncacheable demand loads, including ones causing cache-line splits and reads due to page walks resulted from any request type. Available PDIST counters: 0", + "PublicDescription": "Counts number of L1D misses that are outstanding in each cycle, that is each cycle the number of Fill Buffers (FB) outstanding required by Demand Reads. FB either is held by demand loads, or it is held by non-demand loads and gets hit at least once by demand. The valid outstanding interval is defined until the FB deallocation by one of the following ways: from FB allocation, if FB is allocated by demand from the demand Hit FB, if it is allocated by hardware or software prefetch. Note: In the L1D, a Demand Read contains cacheable or noncacheable demand loads, including ones causing cache-line splits and reads due to page walks resulted from any request type.", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -71,7 +132,7 @@ "CounterMask": "1", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.PENDING_CYCLES", - "PublicDescription": "Counts duration of L1D miss outstanding in cycles. Available PDIST counters: 0", + "PublicDescription": "Counts duration of L1D miss outstanding in cycles.", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -80,7 +141,7 @@ "Counter": "0,1,2,3", "EventCode": "0x25", "EventName": "L2_LINES_IN.ALL", - "PublicDescription": "Counts the number of L2 cache lines filling the L2. Counting does not cover rejects. Available PDIST counters: 0", + "PublicDescription": "Counts the number of L2 cache lines filling the L2. Counting does not cover rejects.", "SampleAfterValue": "100003", "UMask": "0x1f" }, @@ -89,7 +150,7 @@ "Counter": "0,1,2,3", "EventCode": "0x26", "EventName": "L2_LINES_OUT.NON_SILENT", - "PublicDescription": "Counts the number of lines that are evicted by L2 cache when triggered by an L2 cache fill. Those lines are in Modified state. Modified lines are written back to L3 Available PDIST counters: 0", + "PublicDescription": "Counts the number of lines that are evicted by L2 cache when triggered by an L2 cache fill. Those lines are in Modified state. Modified lines are written back to L3", "SampleAfterValue": "200003", "UMask": "0x2" }, @@ -98,7 +159,7 @@ "Counter": "0,1,2,3", "EventCode": "0x26", "EventName": "L2_LINES_OUT.SILENT", - "PublicDescription": "Counts the number of lines that are silently dropped by L2 cache. These lines are typically in Shared or Exclusive state. A non-threaded event. Available PDIST counters: 0", + "PublicDescription": "Counts the number of lines that are silently dropped by L2 cache. These lines are typically in Shared or Exclusive state. A non-threaded event.", "SampleAfterValue": "200003", "UMask": "0x1" }, @@ -107,7 +168,7 @@ "Counter": "0,1,2,3", "EventCode": "0x26", "EventName": "L2_LINES_OUT.USELESS_HWPF", - "PublicDescription": "Counts the number of cache lines that have been prefetched by the L2 hardware prefetcher but not used by demand access when evicted from the L2 cache Available PDIST counters: 0", + "PublicDescription": "Counts the number of cache lines that have been prefetched by the L2 hardware prefetcher but not used by demand access when evicted from the L2 cache", "SampleAfterValue": "200003", "UMask": "0x4" }, @@ -116,7 +177,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_REQUEST.ALL", - "PublicDescription": "Counts all requests that were hit or true misses in L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. [This event is alias to L2_RQSTS.REFERENCES] Available PDIST counters: 0", + "PublicDescription": "Counts all requests that were hit or true misses in L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. [This event is alias to L2_RQSTS.REFERENCES]", "SampleAfterValue": "200003", "UMask": "0xff" }, @@ -125,7 +186,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_REQUEST.MISS", - "PublicDescription": "Counts read requests of any type with true-miss in the L2 cache. True-miss excludes L2 misses that were merged with ongoing L2 misses. [This event is alias to L2_RQSTS.MISS] Available PDIST counters: 0", + "PublicDescription": "Counts read requests of any type with true-miss in the L2 cache. True-miss excludes L2 misses that were merged with ongoing L2 misses. [This event is alias to L2_RQSTS.MISS]", "SampleAfterValue": "200003", "UMask": "0x3f" }, @@ -134,7 +195,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_CODE_RD", - "PublicDescription": "Counts the total number of L2 code requests. Available PDIST counters: 0", + "PublicDescription": "Counts the total number of L2 code requests.", "SampleAfterValue": "200003", "UMask": "0xe4" }, @@ -143,7 +204,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_DEMAND_DATA_RD", - "PublicDescription": "Counts Demand Data Read requests accessing the L2 cache. These requests may hit or miss L2 cache. True-miss exclude misses that were merged with ongoing L2 misses. An access is counted once. Available PDIST counters: 0", + "PublicDescription": "Counts Demand Data Read requests accessing the L2 cache. These requests may hit or miss L2 cache. True-miss exclude misses that were merged with ongoing L2 misses. An access is counted once.", "SampleAfterValue": "200003", "UMask": "0xe1" }, @@ -152,7 +213,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_DEMAND_MISS", - "PublicDescription": "Counts demand requests that miss L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts demand requests that miss L2 cache.", "SampleAfterValue": "200003", "UMask": "0x27" }, @@ -161,7 +222,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_DEMAND_REFERENCES", - "PublicDescription": "Counts demand requests to L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts demand requests to L2 cache.", "SampleAfterValue": "200003", "UMask": "0xe7" }, @@ -170,7 +231,6 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_HWPF", - "PublicDescription": "L2_RQSTS.ALL_HWPF Available PDIST counters: 0", "SampleAfterValue": "200003", "UMask": "0xf0" }, @@ -179,7 +239,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_RFO", - "PublicDescription": "Counts the total number of RFO (read for ownership) requests to L2 cache. L2 RFO requests include both L1D demand RFO misses as well as L1D RFO prefetches. Available PDIST counters: 0", + "PublicDescription": "Counts the total number of RFO (read for ownership) requests to L2 cache. L2 RFO requests include both L1D demand RFO misses as well as L1D RFO prefetches.", "SampleAfterValue": "200003", "UMask": "0xe2" }, @@ -188,7 +248,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.CODE_RD_HIT", - "PublicDescription": "Counts L2 cache hits when fetching instructions, code reads. Available PDIST counters: 0", + "PublicDescription": "Counts L2 cache hits when fetching instructions, code reads.", "SampleAfterValue": "200003", "UMask": "0xc4" }, @@ -197,7 +257,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.CODE_RD_MISS", - "PublicDescription": "Counts L2 cache misses when fetching instructions. Available PDIST counters: 0", + "PublicDescription": "Counts L2 cache misses when fetching instructions.", "SampleAfterValue": "200003", "UMask": "0x24" }, @@ -206,7 +266,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.DEMAND_DATA_RD_HIT", - "PublicDescription": "Counts the number of demand Data Read requests initiated by load instructions that hit L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts the number of demand Data Read requests initiated by load instructions that hit L2 cache.", "SampleAfterValue": "200003", "UMask": "0xc1" }, @@ -215,7 +275,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.DEMAND_DATA_RD_MISS", - "PublicDescription": "Counts demand Data Read requests with true-miss in the L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. An access is counted once. Available PDIST counters: 0", + "PublicDescription": "Counts demand Data Read requests with true-miss in the L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. An access is counted once.", "SampleAfterValue": "200003", "UMask": "0x21" }, @@ -224,7 +284,6 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.HWPF_MISS", - "PublicDescription": "L2_RQSTS.HWPF_MISS Available PDIST counters: 0", "SampleAfterValue": "200003", "UMask": "0x30" }, @@ -233,7 +292,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.MISS", - "PublicDescription": "Counts read requests of any type with true-miss in the L2 cache. True-miss excludes L2 misses that were merged with ongoing L2 misses. [This event is alias to L2_REQUEST.MISS] Available PDIST counters: 0", + "PublicDescription": "Counts read requests of any type with true-miss in the L2 cache. True-miss excludes L2 misses that were merged with ongoing L2 misses. [This event is alias to L2_REQUEST.MISS]", "SampleAfterValue": "200003", "UMask": "0x3f" }, @@ -242,7 +301,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.REFERENCES", - "PublicDescription": "Counts all requests that were hit or true misses in L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. [This event is alias to L2_REQUEST.ALL] Available PDIST counters: 0", + "PublicDescription": "Counts all requests that were hit or true misses in L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. [This event is alias to L2_REQUEST.ALL]", "SampleAfterValue": "200003", "UMask": "0xff" }, @@ -251,7 +310,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.RFO_HIT", - "PublicDescription": "Counts the RFO (Read-for-Ownership) requests that hit L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts the RFO (Read-for-Ownership) requests that hit L2 cache.", "SampleAfterValue": "200003", "UMask": "0xc2" }, @@ -260,7 +319,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.RFO_MISS", - "PublicDescription": "Counts the RFO (Read-for-Ownership) requests that miss L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts the RFO (Read-for-Ownership) requests that miss L2 cache.", "SampleAfterValue": "200003", "UMask": "0x22" }, @@ -269,7 +328,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.SWPF_HIT", - "PublicDescription": "Counts Software prefetch requests that hit the L2 cache. Accounts for PREFETCHNTA and PREFETCHT0/1/2 instructions when FB is not full. Available PDIST counters: 0", + "PublicDescription": "Counts Software prefetch requests that hit the L2 cache. Accounts for PREFETCHNTA and PREFETCHT0/1/2 instructions when FB is not full.", "SampleAfterValue": "200003", "UMask": "0xc8" }, @@ -278,7 +337,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.SWPF_MISS", - "PublicDescription": "Counts Software prefetch requests that miss the L2 cache. Accounts for PREFETCHNTA and PREFETCHT0/1/2 instructions when FB is not full. Available PDIST counters: 0", + "PublicDescription": "Counts Software prefetch requests that miss the L2 cache. Accounts for PREFETCHNTA and PREFETCHT0/1/2 instructions when FB is not full.", "SampleAfterValue": "200003", "UMask": "0x28" }, @@ -287,7 +346,7 @@ "Counter": "0,1,2,3", "EventCode": "0x23", "EventName": "L2_TRANS.L2_WB", - "PublicDescription": "Counts L2 writebacks that access L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts L2 writebacks that access L2 cache.", "SampleAfterValue": "200003", "UMask": "0x40" }, @@ -296,7 +355,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x2e", "EventName": "LONGEST_LAT_CACHE.MISS", - "PublicDescription": "Counts core-originated cacheable requests that miss the L3 cache (Longest Latency cache). Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches to the L1 and L2. It does not include hardware prefetches to the L3, and may not count other types of requests to the L3. Available PDIST counters: 0", + "PublicDescription": "Counts core-originated cacheable requests that miss the L3 cache (Longest Latency cache). Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches to the L1 and L2. It does not include hardware prefetches to the L3, and may not count other types of requests to the L3.", "SampleAfterValue": "100003", "UMask": "0x41" }, @@ -305,7 +364,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x2e", "EventName": "LONGEST_LAT_CACHE.REFERENCE", - "PublicDescription": "Counts core-originated cacheable requests to the L3 cache (Longest Latency cache). Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches to the L1 and L2. It does not include hardware prefetches to the L3, and may not count other types of requests to the L3. Available PDIST counters: 0", + "PublicDescription": "Counts core-originated cacheable requests to the L3 cache (Longest Latency cache). Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches to the L1 and L2. It does not include hardware prefetches to the L3, and may not count other types of requests to the L3.", "SampleAfterValue": "100003", "UMask": "0x4f" }, @@ -394,7 +453,7 @@ "Counter": "0,1,2,3", "EventCode": "0x43", "EventName": "MEM_LOAD_COMPLETED.L1_MISS_ANY", - "PublicDescription": "Number of completed demand load requests that missed the L1 data cache including shadow misses (FB hits, merge to an ongoing L1D miss) Available PDIST counters: 0", + "PublicDescription": "Number of completed demand load requests that missed the L1 data cache including shadow misses (FB hits, merge to an ongoing L1D miss)", "SampleAfterValue": "1000003", "UMask": "0xfd" }, @@ -563,7 +622,6 @@ "Counter": "0,1,2,3", "EventCode": "0x44", "EventName": "MEM_STORE_RETIRED.L2_HIT", - "PublicDescription": "MEM_STORE_RETIRED.L2_HIT Available PDIST counters: 0", "SampleAfterValue": "200003", "UMask": "0x1" }, @@ -572,7 +630,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe5", "EventName": "MEM_UOP_RETIRED.ANY", - "PublicDescription": "Number of retired micro-operations (uops) for load or store memory accesses Available PDIST counters: 0", + "PublicDescription": "Number of retired micro-operations (uops) for load or store memory accesses", "SampleAfterValue": "1000003", "UMask": "0x3" }, @@ -999,7 +1057,6 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.ALL_REQUESTS", - "PublicDescription": "OFFCORE_REQUESTS.ALL_REQUESTS Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x80" }, @@ -1008,7 +1065,7 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.DATA_RD", - "PublicDescription": "Counts the demand and prefetch data reads. All Core Data Reads include cacheable 'Demands' and L2 prefetchers (not L3 prefetchers). Counting also covers reads due to page walks resulted from any request type. Available PDIST counters: 0", + "PublicDescription": "Counts the demand and prefetch data reads. All Core Data Reads include cacheable 'Demands' and L2 prefetchers (not L3 prefetchers). Counting also covers reads due to page walks resulted from any request type.", "SampleAfterValue": "100003", "UMask": "0x8" }, @@ -1017,7 +1074,7 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.DEMAND_CODE_RD", - "PublicDescription": "Counts both cacheable and non-cacheable code read requests. Available PDIST counters: 0", + "PublicDescription": "Counts both cacheable and non-cacheable code read requests.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -1026,7 +1083,7 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.DEMAND_DATA_RD", - "PublicDescription": "Counts the Demand Data Read requests sent to uncore. Use it in conjunction with OFFCORE_REQUESTS_OUTSTANDING to determine average latency in the uncore. Available PDIST counters: 0", + "PublicDescription": "Counts the Demand Data Read requests sent to uncore. Use it in conjunction with OFFCORE_REQUESTS_OUTSTANDING to determine average latency in the uncore.", "SampleAfterValue": "100003", "UMask": "0x1" }, @@ -1035,7 +1092,7 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.DEMAND_RFO", - "PublicDescription": "Counts the demand RFO (read for ownership) requests including regular RFOs, locks, ItoM. Available PDIST counters: 0", + "PublicDescription": "Counts the demand RFO (read for ownership) requests including regular RFOs, locks, ItoM.", "SampleAfterValue": "100003", "UMask": "0x4" }, @@ -1045,7 +1102,6 @@ "Deprecated": "1", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD", - "PublicDescription": "This event is deprecated. Refer to new event OFFCORE_REQUESTS_OUTSTANDING.DATA_RD Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8" }, @@ -1055,7 +1111,6 @@ "CounterMask": "1", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", - "PublicDescription": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8" }, @@ -1065,7 +1120,7 @@ "CounterMask": "1", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_CODE_RD", - "PublicDescription": "Counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS. Available PDIST counters: 0", + "PublicDescription": "Counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS.", "SampleAfterValue": "1000003", "UMask": "0x2" }, @@ -1075,7 +1130,6 @@ "CounterMask": "1", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", - "PublicDescription": "Cycles where at least 1 outstanding demand data read request is pending. Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1085,7 +1139,6 @@ "CounterMask": "1", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO", - "PublicDescription": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x4" }, @@ -1094,7 +1147,6 @@ "Counter": "0,1,2,3", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DATA_RD", - "PublicDescription": "OFFCORE_REQUESTS_OUTSTANDING.DATA_RD Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8" }, @@ -1103,7 +1155,7 @@ "Counter": "0,1,2,3", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_CODE_RD", - "PublicDescription": "Counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS. Available PDIST counters: 0", + "PublicDescription": "Counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS.", "SampleAfterValue": "1000003", "UMask": "0x2" }, @@ -1112,7 +1164,7 @@ "Counter": "0,1,2,3", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD", - "PublicDescription": "For every cycle, increments by the number of outstanding demand data read requests pending. Requests are considered outstanding from the time they miss the core's L2 cache until the transaction completion message is sent to the requestor. Available PDIST counters: 0", + "PublicDescription": "For every cycle, increments by the number of outstanding demand data read requests pending. Requests are considered outstanding from the time they miss the core's L2 cache until the transaction completion message is sent to the requestor.", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -1121,7 +1173,7 @@ "Counter": "0,1,2,3", "EventCode": "0x2c", "EventName": "SQ_MISC.BUS_LOCK", - "PublicDescription": "Counts the more expensive bus lock needed to enforce cache coherency for certain memory accesses that need to be done atomically. Can be created by issuing an atomic instruction (via the LOCK prefix) which causes a cache line split or accesses uncacheable memory. Available PDIST counters: 0", + "PublicDescription": "Counts the more expensive bus lock needed to enforce cache coherency for certain memory accesses that need to be done atomically. Can be created by issuing an atomic instruction (via the LOCK prefix) which causes a cache line split or accesses uncacheable memory.", "SampleAfterValue": "100003", "UMask": "0x10" }, @@ -1130,7 +1182,6 @@ "Counter": "0,1,2,3", "EventCode": "0x40", "EventName": "SW_PREFETCH_ACCESS.ANY", - "PublicDescription": "Counts the number of PREFETCHNTA, PREFETCHW, PREFETCHT0, PREFETCHT1 or PREFETCHT2 instructions executed. Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0xf" }, @@ -1139,7 +1190,7 @@ "Counter": "0,1,2,3", "EventCode": "0x40", "EventName": "SW_PREFETCH_ACCESS.NTA", - "PublicDescription": "Counts the number of PREFETCHNTA instructions executed. Available PDIST counters: 0", + "PublicDescription": "Counts the number of PREFETCHNTA instructions executed.", "SampleAfterValue": "100003", "UMask": "0x1" }, @@ -1148,7 +1199,7 @@ "Counter": "0,1,2,3", "EventCode": "0x40", "EventName": "SW_PREFETCH_ACCESS.PREFETCHW", - "PublicDescription": "Counts the number of PREFETCHW instructions executed. Available PDIST counters: 0", + "PublicDescription": "Counts the number of PREFETCHW instructions executed.", "SampleAfterValue": "100003", "UMask": "0x8" }, @@ -1157,7 +1208,7 @@ "Counter": "0,1,2,3", "EventCode": "0x40", "EventName": "SW_PREFETCH_ACCESS.T0", - "PublicDescription": "Counts the number of PREFETCHT0 instructions executed. Available PDIST counters: 0", + "PublicDescription": "Counts the number of PREFETCHT0 instructions executed.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -1166,7 +1217,7 @@ "Counter": "0,1,2,3", "EventCode": "0x40", "EventName": "SW_PREFETCH_ACCESS.T1_T2", - "PublicDescription": "Counts the number of PREFETCHT1 or PREFETCHT2 instructions executed. Available PDIST counters: 0", + "PublicDescription": "Counts the number of PREFETCHT1 or PREFETCHT2 instructions executed.", "SampleAfterValue": "100003", "UMask": "0x4" } diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/emr-metrics.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/emr-metrics.json index 34e1cbcd722c..433ae5f50704 100644 --- a/tools/perf/pmu-events/arch/x86/emeraldrapids/emr-metrics.json +++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/emr-metrics.json @@ -1,28 +1,28 @@ [ { "BriefDescription": "C1 residency percent per core", - "MetricExpr": "cstate_core@c1\\-residency@ / TSC", + "MetricExpr": "cstate_core@c1\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C1_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C2 residency percent per package", - "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c2\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C2_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per core", - "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricExpr": "cstate_core@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per package", - "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Pkg_Residency", "ScaleUnit": "100%" @@ -40,6 +40,18 @@ "ScaleUnit": "1per_instr" }, { + "BriefDescription": "The average number of cores that are in cstate C0 as observed by the power control unit (PCU)", + "MetricExpr": "UNC_P_POWER_STATE_OCCUPANCY_CORES_C0 / UNC_P_CLOCKTICKS * #num_packages", + "MetricGroup": "cpu_cstate", + "MetricName": "cpu_cstate_c0" + }, + { + "BriefDescription": "The average number of cores are in cstate C6 as observed by the power control unit (PCU)", + "MetricExpr": "UNC_P_POWER_STATE_OCCUPANCY_CORES_C6 / UNC_P_CLOCKTICKS * #num_packages", + "MetricGroup": "cpu_cstate", + "MetricName": "cpu_cstate_c6" + }, + { "BriefDescription": "CPU operating frequency (in GHz)", "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC * #SYSTEM_TSC_FREQ / 1e9", "MetricName": "cpu_operating_frequency", @@ -79,6 +91,12 @@ "ScaleUnit": "1MB/s" }, { + "BriefDescription": "Bandwidth of inbound IO reads that are initiated by end device controllers that are requesting memory from the CPU and miss the L3 cache", + "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_MISS_PCIRDCUR * 64 / 1e6 / duration_time", + "MetricName": "io_bandwidth_read_l3_miss", + "ScaleUnit": "1MB/s" + }, + { "BriefDescription": "Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the local CPU socket", "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR_LOCAL * 64 / 1e6 / duration_time", "MetricName": "io_bandwidth_read_local", @@ -97,6 +115,12 @@ "ScaleUnit": "1MB/s" }, { + "BriefDescription": "Bandwidth of inbound IO writes that are initiated by end device controllers that are writing memory to the CPU", + "MetricExpr": "(UNC_CHA_TOR_INSERTS.IO_MISS_ITOM + UNC_CHA_TOR_INSERTS.IO_MISS_ITOMCACHENEAR) * 64 / 1e6 / duration_time", + "MetricName": "io_bandwidth_write_l3_miss", + "ScaleUnit": "1MB/s" + }, + { "BriefDescription": "Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the local CPU socket", "MetricExpr": "(UNC_CHA_TOR_INSERTS.IO_ITOM_LOCAL + UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR_LOCAL) * 64 / 1e6 / duration_time", "MetricName": "io_bandwidth_write_local", @@ -111,19 +135,19 @@ { "BriefDescription": "Percentage of inbound full cacheline writes initiated by end device controllers that miss the L3 cache", "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOM / UNC_CHA_TOR_INSERTS.IO_ITOM", - "MetricName": "io_percent_of_inbound_full_writes_that_miss_l3", + "MetricName": "io_full_write_l3_miss", "ScaleUnit": "100%" }, { "BriefDescription": "Percentage of inbound partial cacheline writes initiated by end device controllers that miss the L3 cache", "MetricExpr": "(UNC_CHA_TOR_INSERTS.IO_MISS_ITOMCACHENEAR + UNC_CHA_TOR_INSERTS.IO_MISS_RFO) / (UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR + UNC_CHA_TOR_INSERTS.IO_RFO)", - "MetricName": "io_percent_of_inbound_partial_writes_that_miss_l3", + "MetricName": "io_partial_write_l3_miss", "ScaleUnit": "100%" }, { "BriefDescription": "Percentage of inbound reads initiated by end device controllers that miss the L3 cache", "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_MISS_PCIRDCUR / UNC_CHA_TOR_INSERTS.IO_PCIRDCUR", - "MetricName": "io_percent_of_inbound_reads_that_miss_l3", + "MetricName": "io_read_l3_miss", "ScaleUnit": "100%" }, { @@ -335,7 +359,7 @@ { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", "DefaultMetricgroupName": "TopdownL1", - "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", + "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound)", "MetricGroup": "BvOB;Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.2", @@ -370,39 +394,39 @@ "PublicDescription": "Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound)" }, { + "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", + "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * tma_amx_busy / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", + "MetricGroup": "BvCB;Cor;tma_issueComp", + "MetricName": "tma_bottleneck_compute_bound_est", + "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", + "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " + }, + { "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks", - "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + 0 / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", "MetricGroup": "BvMB;Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_bottleneck_cache_memory_bandwidth", - "MetricThreshold": "tma_bottleneck_cache_memory_bandwidth > 20", + "MetricName": "tma_bottleneck_data_cache_memory_bandwidth", + "MetricThreshold": "tma_bottleneck_data_cache_memory_bandwidth > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks", - "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_dependency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_lock_latency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_loads / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_stores / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + 0 / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_dependency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_lock_latency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_loads / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_stores / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "BvML;Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_bottleneck_cache_memory_latency", - "MetricThreshold": "tma_bottleneck_cache_memory_latency > 20", + "MetricName": "tma_bottleneck_data_cache_memory_latency", + "MetricThreshold": "tma_bottleneck_data_cache_memory_latency > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency" }, { - "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", - "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * tma_amx_busy / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", - "MetricGroup": "BvCB;Cor;tma_issueComp", - "MetricName": "tma_bottleneck_compute_bound_est", - "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", - "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " - }, - { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end)", - "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - (1 - INST_RETIRED.REP_ITERATION / cpu@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_ms / (tma_dsb + tma_mite + tma_ms))) - tma_bottleneck_big_code", + "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - (1 - INST_RETIRED.REP_ITERATION / cpu@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_ms)) - tma_bottleneck_big_code", "MetricGroup": "BvFB;Fed;FetchBW;Frontend", "MetricName": "tma_bottleneck_instruction_fetch_bw", "MetricThreshold": "tma_bottleneck_instruction_fetch_bw > 20" }, { "BriefDescription": "Total pipeline cost of irregular execution (e.g", - "MetricExpr": "100 * ((1 - INST_RETIRED.REP_ITERATION / cpu@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_ms / (tma_dsb + tma_mite + tma_ms)) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + RS.EMPTY_RESOURCE / tma_info_thread_clks * tma_ports_utilized_0) / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricExpr": "100 * ((1 - INST_RETIRED.REP_ITERATION / cpu@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_ms) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + RS.EMPTY_RESOURCE / tma_info_thread_clks * tma_ports_utilized_0) / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", "MetricGroup": "Bad;BvIO;Cor;Ret;tma_issueMS", "MetricName": "tma_bottleneck_irregular_overhead", "MetricThreshold": "tma_bottleneck_irregular_overhead > 10", @@ -434,7 +458,7 @@ }, { "BriefDescription": "Total pipeline cost of remaining bottlenecks in the back-end", - "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_cache_memory_bandwidth + tma_bottleneck_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", + "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_data_cache_memory_bandwidth + tma_bottleneck_data_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", "MetricGroup": "BvOB;Cor;Offcore", "MetricName": "tma_bottleneck_other_bottlenecks", "MetricThreshold": "tma_bottleneck_other_bottlenecks > 20", @@ -450,7 +474,7 @@ { "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", "DefaultMetricgroupName": "TopdownL2", - "MetricExpr": "topdown\\-br\\-mispredict / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", + "MetricExpr": "topdown\\-br\\-mispredict / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound)", "MetricGroup": "BadSpec;BrMispredicts;BvMP;Default;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", @@ -551,7 +575,6 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(76.6 * tma_info_system_core_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 74.6 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "BvMS;DataSharing;LockCont;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", @@ -658,7 +681,7 @@ "MetricGroup": "BvMB;MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -786,7 +809,7 @@ { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences", "DefaultMetricgroupName": "TopdownL2", - "MetricExpr": "topdown\\-heavy\\-ops / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", + "MetricExpr": "topdown\\-heavy\\-ops / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound)", "MetricGroup": "Default;Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", @@ -1297,19 +1320,19 @@ { "BriefDescription": "Off-core accesses per kilo instruction for modified write requests", "MetricExpr": "1e3 * OCR.MODIFIED_WRITE.ANY_RESPONSE / tma_info_inst_mix_instructions", - "MetricGroup": "Offcore", + "MetricGroup": "Offcore;Server", "MetricName": "tma_info_memory_mix_offcore_mwrite_any_pki" }, { "BriefDescription": "Off-core accesses per kilo instruction for reads-to-core requests (speculative; including in-core HW prefetches)", "MetricExpr": "1e3 * OCR.READS_TO_CORE.ANY_RESPONSE / tma_info_inst_mix_instructions", - "MetricGroup": "CacheHits;Offcore", + "MetricGroup": "CacheHits;Offcore;Server", "MetricName": "tma_info_memory_mix_offcore_read_any_pki" }, { "BriefDescription": "L3 cache misses per kilo instruction for reads-to-core requests (speculative; including in-core HW prefetches)", "MetricExpr": "1e3 * OCR.READS_TO_CORE.L3_MISS / tma_info_inst_mix_instructions", - "MetricGroup": "Offcore", + "MetricGroup": "Offcore;Server", "MetricName": "tma_info_memory_mix_offcore_read_l3m_pki" }, { @@ -1335,21 +1358,21 @@ { "BriefDescription": "Average DRAM BW for Reads-to-Core (R2C) covering for memory attached to local- and remote-socket", "MetricExpr": "64 * OCR.READS_TO_CORE.DRAM / 1e9 / tma_info_system_time", - "MetricGroup": "HPC;Mem;MemoryBW;SoC", + "MetricGroup": "HPC;Mem;MemoryBW;Offcore;Server", "MetricName": "tma_info_memory_soc_r2c_dram_bw", "PublicDescription": "Average DRAM BW for Reads-to-Core (R2C) covering for memory attached to local- and remote-socket. See R2C_Offcore_BW." }, { "BriefDescription": "Average L3-cache miss BW for Reads-to-Core (R2C)", "MetricExpr": "64 * OCR.READS_TO_CORE.L3_MISS / 1e9 / tma_info_system_time", - "MetricGroup": "HPC;Mem;MemoryBW;SoC", + "MetricGroup": "HPC;Mem;MemoryBW;Offcore;Server", "MetricName": "tma_info_memory_soc_r2c_l3m_bw", "PublicDescription": "Average L3-cache miss BW for Reads-to-Core (R2C). This covering going to DRAM or other memory off-chip memory tears. See R2C_Offcore_BW." }, { "BriefDescription": "Average Off-core access BW for Reads-to-Core (R2C)", "MetricExpr": "64 * OCR.READS_TO_CORE.ANY_RESPONSE / 1e9 / tma_info_system_time", - "MetricGroup": "HPC;Mem;MemoryBW;SoC", + "MetricGroup": "HPC;Mem;MemoryBW;Offcore;Server", "MetricName": "tma_info_memory_soc_r2c_offcore_bw", "PublicDescription": "Average Off-core access BW for Reads-to-Core (R2C). R2C account for demand or prefetch load/RFO/code access that fill data into the Core caches." }, @@ -1379,7 +1402,7 @@ "MetricName": "tma_info_memory_tlb_store_stlb_mpki" }, { - "BriefDescription": "", + "BriefDescription": "Mem;Backend;CacheHits", "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" @@ -1426,7 +1449,7 @@ }, { "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / tma_info_system_time", + "MetricExpr": "tma_info_system_turbo_utilization * msr@tsc@ / 1e9 / tma_info_system_time", "MetricGroup": "Power;Summary", "MetricName": "tma_info_system_core_frequency" }, @@ -1438,16 +1461,28 @@ }, { "BriefDescription": "Average number of utilized CPUs", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", "MetricGroup": "Summary", "MetricName": "tma_info_system_cpus_utilized" }, { + "BriefDescription": "Average 3DXP Memory Bandwidth Use for reads [GB / sec]", + "MetricExpr": "(64 * UNC_M_PMM_RPQ_INSERTS / 1e9 / tma_info_system_time if #has_pmem > 0 else 0)", + "MetricGroup": "MemOffcore;MemoryBW;Server;SoC", + "MetricName": "tma_info_system_cxl_mem_read_bw" + }, + { + "BriefDescription": "Average 3DXP Memory Bandwidth Use for Writes [GB / sec]", + "MetricExpr": "(64 * UNC_M_PMM_WPQ_INSERTS / 1e9 / tma_info_system_time if #has_pmem > 0 else 0)", + "MetricGroup": "MemOffcore;MemoryBW;Server;SoC", + "MetricName": "tma_info_system_cxl_mem_write_bw" + }, + { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / tma_info_system_time", "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_mem_bandwidth, tma_sq_full" + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Giga Floating Point Operations Per Second", @@ -1513,7 +1548,6 @@ }, { "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD) / (tma_info_system_socket_clks / tma_info_system_time)", "MetricGroup": "Mem;MemoryLat;SoC", "MetricName": "tma_info_system_mem_read_latency", @@ -1674,12 +1708,12 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", + "BriefDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", "MetricExpr": "min(2 * (MEM_INST_RETIRED.ALL_LOADS - MEM_LOAD_RETIRED.FB_HIT - MEM_LOAD_RETIRED.L1_MISS) * 20 / 100, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - MEMORY_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_thread_clks", "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_l1_latency_dependency", "MetricThreshold": "tma_l1_latency_dependency > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", + "PublicDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", "ScaleUnit": "100%" }, { @@ -1693,7 +1727,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L2 cache under unloaded scenarios (possibly L2 latency limited)", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "4.4 * tma_info_system_core_frequency * MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_l2_bound_group", "MetricName": "tma_l2_hit_latency", @@ -1712,12 +1745,11 @@ }, { "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "32.6 * tma_info_system_core_frequency * (MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2)) / tma_info_thread_clks", "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_cache_memory_latency, tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_data_cache_memory_latency, tma_mem_latency", "ScaleUnit": "100%" }, { @@ -1800,6 +1832,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(16 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (10 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_thread_clks", "MetricGroup": "LockCont;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", "MetricName": "tma_lock_latency", @@ -1832,7 +1865,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { @@ -1841,13 +1874,13 @@ "MetricGroup": "BvML;MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", "DefaultMetricgroupName": "TopdownL2", - "MetricExpr": "topdown\\-mem\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", + "MetricExpr": "topdown\\-mem\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound)", "MetricGroup": "Backend;Default;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricName": "tma_memory_bound", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", @@ -1857,7 +1890,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to LFENCE Instructions.", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "13 * MISC2_RETIRED.LFENCE / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group", "MetricName": "tma_memory_fence", @@ -1910,7 +1942,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the Microcode Sequencer (MS) unit - see Microcode_Sequencer node for details.", - "MetricExpr": "max(IDQ.MS_CYCLES_ANY, cpu@UOPS_RETIRED.MS\\,cmask\\=1@ / (UOPS_RETIRED.SLOTS / UOPS_ISSUED.ANY)) / tma_info_core_core_clks / 2", + "MetricExpr": "max(IDQ.MS_CYCLES_ANY, cpu@UOPS_RETIRED.MS\\,cmask\\=1@ / (UOPS_RETIRED.SLOTS / UOPS_ISSUED.ANY)) / tma_info_core_core_clks / 2.4", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_ms", "MetricThreshold": "tma_ms > 0.05 & tma_fetch_bandwidth > 0.2", @@ -1945,6 +1977,7 @@ }, { "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_int_operations + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches))", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_other_light_ops", @@ -2006,6 +2039,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_3_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIV_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_3_PORTS_UTIL) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", @@ -2015,6 +2049,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricConstraint": "NO_THRESHOLD_AND_NMI", "MetricExpr": "(EXE_ACTIVITY.EXE_BOUND_0_PORTS + max(RS.EMPTY_RESOURCE - RESOURCE_STALLS.SCOREBOARD, 0)) / tma_info_thread_clks * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", @@ -2024,6 +2059,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricConstraint": "NO_THRESHOLD_AND_NMI", "MetricExpr": "EXE_ACTIVITY.1_PORTS_UTIL / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", @@ -2033,7 +2069,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "EXE_ACTIVITY.2_PORTS_UTIL / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", @@ -2043,7 +2078,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_thread_clks", "MetricGroup": "BvCB;PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", @@ -2072,7 +2106,7 @@ { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", "DefaultMetricgroupName": "TopdownL1", - "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", + "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound)", "MetricGroup": "BvUW;Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -2100,7 +2134,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "CPU_CLK_UNHALTED.PAUSE / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group", "MetricName": "tma_slow_pause", @@ -2132,7 +2165,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/floating-point.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/floating-point.json index 8c9207750c82..bc475e163227 100644 --- a/tools/perf/pmu-events/arch/x86/emeraldrapids/floating-point.json +++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/floating-point.json @@ -5,7 +5,6 @@ "CounterMask": "1", "EventCode": "0xb0", "EventName": "ARITH.FPDIV_ACTIVE", - "PublicDescription": "ARITH.FPDIV_ACTIVE Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -14,7 +13,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc1", "EventName": "ASSISTS.FP", - "PublicDescription": "Counts all microcode Floating Point assists. Available PDIST counters: 0", + "PublicDescription": "Counts all microcode Floating Point assists.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -23,7 +22,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc1", "EventName": "ASSISTS.SSE_AVX_MIX", - "PublicDescription": "ASSISTS.SSE_AVX_MIX Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x10" }, @@ -32,7 +30,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.PORT_0", - "PublicDescription": "FP_ARITH_DISPATCHED.PORT_0 [This event is alias to FP_ARITH_DISPATCHED.V0] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -41,7 +38,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.PORT_1", - "PublicDescription": "FP_ARITH_DISPATCHED.PORT_1 [This event is alias to FP_ARITH_DISPATCHED.V1] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -50,7 +46,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.PORT_5", - "PublicDescription": "FP_ARITH_DISPATCHED.PORT_5 [This event is alias to FP_ARITH_DISPATCHED.V2] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x4" }, @@ -59,7 +54,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.V0", - "PublicDescription": "FP_ARITH_DISPATCHED.V0 [This event is alias to FP_ARITH_DISPATCHED.PORT_0] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -68,7 +62,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.V1", - "PublicDescription": "FP_ARITH_DISPATCHED.V1 [This event is alias to FP_ARITH_DISPATCHED.PORT_1] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -77,7 +70,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.V2", - "PublicDescription": "FP_ARITH_DISPATCHED.V2 [This event is alias to FP_ARITH_DISPATCHED.PORT_5] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x4" }, @@ -86,7 +78,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", - "PublicDescription": "Number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x4" }, @@ -95,7 +87,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", - "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x8" }, @@ -104,7 +96,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", - "PublicDescription": "Number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x10" }, @@ -113,7 +105,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", - "PublicDescription": "Number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x20" }, @@ -122,7 +114,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.4_FLOPS", - "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision and 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point and packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision and 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point and packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x18" }, @@ -131,7 +123,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE", - "PublicDescription": "Number of SSE/AVX computational 512-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT14 RCP14 FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 512-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT14 RCP14 FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x40" }, @@ -140,7 +132,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", - "PublicDescription": "Number of SSE/AVX computational 512-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 16 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT14 RCP14 FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 512-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 16 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT14 RCP14 FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x80" }, @@ -149,7 +141,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.8_FLOPS", - "PublicDescription": "Number of SSE/AVX computational 256-bit packed single precision and 512-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed single precision and double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RSQRT14 RCP RCP14 DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 256-bit packed single precision and 512-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed single precision and double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RSQRT14 RCP RCP14 DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x60" }, @@ -158,7 +150,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.SCALAR", - "PublicDescription": "Number of SSE/AVX computational scalar single precision and double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational scalar single precision and double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "1000003", "UMask": "0x3" }, @@ -167,7 +159,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", - "PublicDescription": "Number of SSE/AVX computational scalar double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational scalar double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x1" }, @@ -176,7 +168,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.SCALAR_SINGLE", - "PublicDescription": "Number of SSE/AVX computational scalar single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational scalar single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -185,7 +177,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.VECTOR", - "PublicDescription": "Number of any Vector retired FP arithmetic instructions. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of any Vector retired FP arithmetic instructions. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "1000003", "UMask": "0xfc" }, @@ -194,7 +186,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xcf", "EventName": "FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", - "PublicDescription": "FP_ARITH_INST_RETIRED2.128B_PACKED_HALF Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x4" }, @@ -203,7 +194,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xcf", "EventName": "FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", - "PublicDescription": "FP_ARITH_INST_RETIRED2.256B_PACKED_HALF Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x8" }, @@ -212,7 +202,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xcf", "EventName": "FP_ARITH_INST_RETIRED2.512B_PACKED_HALF", - "PublicDescription": "FP_ARITH_INST_RETIRED2.512B_PACKED_HALF Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x10" }, @@ -221,7 +210,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xcf", "EventName": "FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF", - "PublicDescription": "FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -230,7 +218,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xcf", "EventName": "FP_ARITH_INST_RETIRED2.SCALAR", - "PublicDescription": "FP_ARITH_INST_RETIRED2.SCALAR Available PDIST counters: 0", + "PublicDescription": "FP_ARITH_INST_RETIRED2.SCALAR", "SampleAfterValue": "100003", "UMask": "0x3" }, @@ -239,7 +227,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xcf", "EventName": "FP_ARITH_INST_RETIRED2.SCALAR_HALF", - "PublicDescription": "FP_ARITH_INST_RETIRED2.SCALAR_HALF Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x1" }, @@ -248,7 +235,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xcf", "EventName": "FP_ARITH_INST_RETIRED2.VECTOR", - "PublicDescription": "FP_ARITH_INST_RETIRED2.VECTOR Available PDIST counters: 0", + "PublicDescription": "FP_ARITH_INST_RETIRED2.VECTOR", "SampleAfterValue": "100003", "UMask": "0x1c" } diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/frontend.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/frontend.json index 9fe9d62b867a..793c486ffabe 100644 --- a/tools/perf/pmu-events/arch/x86/emeraldrapids/frontend.json +++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/frontend.json @@ -4,7 +4,7 @@ "Counter": "0,1,2,3", "EventCode": "0x60", "EventName": "BACLEARS.ANY", - "PublicDescription": "Number of times the front-end is resteered when it finds a branch instruction in a fetch line. This is called Unknown Branch which occurs for the first time a branch instruction is fetched or when the branch is not tracked by the BPU (Branch Prediction Unit) anymore. Available PDIST counters: 0", + "PublicDescription": "Number of times the front-end is resteered when it finds a branch instruction in a fetch line. This is called Unknown Branch which occurs for the first time a branch instruction is fetched or when the branch is not tracked by the BPU (Branch Prediction Unit) anymore.", "SampleAfterValue": "100003", "UMask": "0x1" }, @@ -13,7 +13,7 @@ "Counter": "0,1,2,3", "EventCode": "0x87", "EventName": "DECODE.LCP", - "PublicDescription": "Counts cycles that the Instruction Length decoder (ILD) stalls occurred due to dynamically changing prefix length of the decoded instruction (by operand size prefix instruction 0x66, address size prefix instruction 0x67 or REX.W for Intel64). Count is proportional to the number of prefixes in a 16B-line. This may result in a three-cycle penalty for each LCP (Length changing prefix) in a 16-byte chunk. Available PDIST counters: 0", + "PublicDescription": "Counts cycles that the Instruction Length decoder (ILD) stalls occurred due to dynamically changing prefix length of the decoded instruction (by operand size prefix instruction 0x66, address size prefix instruction 0x67 or REX.W for Intel64). Count is proportional to the number of prefixes in a 16B-line. This may result in a three-cycle penalty for each LCP (Length changing prefix) in a 16-byte chunk.", "SampleAfterValue": "500009", "UMask": "0x1" }, @@ -22,7 +22,6 @@ "Counter": "0,1,2,3", "EventCode": "0x87", "EventName": "DECODE.MS_BUSY", - "PublicDescription": "Cycles the Microcode Sequencer is busy. Available PDIST counters: 0", "SampleAfterValue": "500009", "UMask": "0x2" }, @@ -31,7 +30,7 @@ "Counter": "0,1,2,3", "EventCode": "0x61", "EventName": "DSB2MITE_SWITCHES.PENALTY_CYCLES", - "PublicDescription": "Decode Stream Buffer (DSB) is a Uop-cache that holds translations of previously fetched instructions that were decoded by the legacy x86 decode pipeline (MITE). This event counts fetch penalty cycles when a transition occurs from DSB to MITE. Available PDIST counters: 0", + "PublicDescription": "Decode Stream Buffer (DSB) is a Uop-cache that holds translations of previously fetched instructions that were decoded by the legacy x86 decode pipeline (MITE). This event counts fetch penalty cycles when a transition occurs from DSB to MITE.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -249,7 +248,7 @@ "Counter": "0,1,2,3", "EventCode": "0x80", "EventName": "ICACHE_DATA.STALLS", - "PublicDescription": "Counts cycles where a code line fetch is stalled due to an L1 instruction cache miss. The decode pipeline works at a 32 Byte granularity. Available PDIST counters: 0", + "PublicDescription": "Counts cycles where a code line fetch is stalled due to an L1 instruction cache miss. The decode pipeline works at a 32 Byte granularity.", "SampleAfterValue": "500009", "UMask": "0x4" }, @@ -260,7 +259,6 @@ "EdgeDetect": "1", "EventCode": "0x80", "EventName": "ICACHE_DATA.STALL_PERIODS", - "PublicDescription": "ICACHE_DATA.STALL_PERIODS Available PDIST counters: 0", "SampleAfterValue": "500009", "UMask": "0x4" }, @@ -269,7 +267,7 @@ "Counter": "0,1,2,3", "EventCode": "0x83", "EventName": "ICACHE_TAG.STALLS", - "PublicDescription": "Counts cycles where a code fetch is stalled due to L1 instruction cache tag miss. Available PDIST counters: 0", + "PublicDescription": "Counts cycles where a code fetch is stalled due to L1 instruction cache tag miss.", "SampleAfterValue": "200003", "UMask": "0x4" }, @@ -279,7 +277,7 @@ "CounterMask": "1", "EventCode": "0x79", "EventName": "IDQ.DSB_CYCLES_ANY", - "PublicDescription": "Counts the number of cycles uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path.", "SampleAfterValue": "2000003", "UMask": "0x8" }, @@ -289,7 +287,7 @@ "CounterMask": "6", "EventCode": "0x79", "EventName": "IDQ.DSB_CYCLES_OK", - "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the DSB (Decode Stream Buffer) path. Count includes uops that may 'bypass' the IDQ. Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the DSB (Decode Stream Buffer) path. Count includes uops that may 'bypass' the IDQ.", "SampleAfterValue": "2000003", "UMask": "0x8" }, @@ -298,7 +296,7 @@ "Counter": "0,1,2,3", "EventCode": "0x79", "EventName": "IDQ.DSB_UOPS", - "PublicDescription": "Counts the number of uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path.", "SampleAfterValue": "2000003", "UMask": "0x8" }, @@ -308,7 +306,7 @@ "CounterMask": "1", "EventCode": "0x79", "EventName": "IDQ.MITE_CYCLES_ANY", - "PublicDescription": "Counts the number of cycles uops were delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB). Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles uops were delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB).", "SampleAfterValue": "2000003", "UMask": "0x4" }, @@ -318,7 +316,7 @@ "CounterMask": "6", "EventCode": "0x79", "EventName": "IDQ.MITE_CYCLES_OK", - "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB). Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB).", "SampleAfterValue": "2000003", "UMask": "0x4" }, @@ -327,7 +325,7 @@ "Counter": "0,1,2,3", "EventCode": "0x79", "EventName": "IDQ.MITE_UOPS", - "PublicDescription": "Counts the number of uops delivered to Instruction Decode Queue (IDQ) from the MITE path. This also means that uops are not being delivered from the Decode Stream Buffer (DSB). Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops delivered to Instruction Decode Queue (IDQ) from the MITE path. This also means that uops are not being delivered from the Decode Stream Buffer (DSB).", "SampleAfterValue": "2000003", "UMask": "0x4" }, @@ -337,7 +335,7 @@ "CounterMask": "1", "EventCode": "0x79", "EventName": "IDQ.MS_CYCLES_ANY", - "PublicDescription": "Counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Uops maybe initiated by Decode Stream Buffer (DSB) or MITE. Available PDIST counters: 0", + "PublicDescription": "Counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Uops maybe initiated by Decode Stream Buffer (DSB) or MITE.", "SampleAfterValue": "2000003", "UMask": "0x20" }, @@ -348,7 +346,7 @@ "EdgeDetect": "1", "EventCode": "0x79", "EventName": "IDQ.MS_SWITCHES", - "PublicDescription": "Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to the Microcode Sequencer. Available PDIST counters: 0", + "PublicDescription": "Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to the Microcode Sequencer.", "SampleAfterValue": "100003", "UMask": "0x20" }, @@ -357,7 +355,7 @@ "Counter": "0,1,2,3", "EventCode": "0x79", "EventName": "IDQ.MS_UOPS", - "PublicDescription": "Counts the total number of uops delivered by the Microcode Sequencer (MS). Available PDIST counters: 0", + "PublicDescription": "Counts the total number of uops delivered by the Microcode Sequencer (MS).", "SampleAfterValue": "1000003", "UMask": "0x20" }, @@ -366,7 +364,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x9c", "EventName": "IDQ_BUBBLES.CORE", - "PublicDescription": "Counts the number of uops not delivered to by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_UOPS_NOT_DELIVERED.CORE] Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops not delivered to by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_UOPS_NOT_DELIVERED.CORE]", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -376,7 +374,7 @@ "CounterMask": "6", "EventCode": "0x9c", "EventName": "IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE", - "PublicDescription": "Counts the number of cycles when no uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE] Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles when no uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE]", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -387,7 +385,7 @@ "EventCode": "0x9c", "EventName": "IDQ_BUBBLES.CYCLES_FE_WAS_OK", "Invert": "1", - "PublicDescription": "Counts the number of cycles when the optimal number of uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK] Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles when the optimal number of uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK]", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -396,7 +394,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x9c", "EventName": "IDQ_UOPS_NOT_DELIVERED.CORE", - "PublicDescription": "Counts the number of uops not delivered to by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_BUBBLES.CORE] Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops not delivered to by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_BUBBLES.CORE]", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -406,7 +404,7 @@ "CounterMask": "6", "EventCode": "0x9c", "EventName": "IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", - "PublicDescription": "Counts the number of cycles when no uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE] Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles when no uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE]", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -417,7 +415,7 @@ "EventCode": "0x9c", "EventName": "IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK", "Invert": "1", - "PublicDescription": "Counts the number of cycles when the optimal number of uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_BUBBLES.CYCLES_FE_WAS_OK] Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles when the optimal number of uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_BUBBLES.CYCLES_FE_WAS_OK]", "SampleAfterValue": "1000003", "UMask": "0x1" } diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/memory.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/memory.json index 7c3f9b76d367..5e6c1f05c981 100644 --- a/tools/perf/pmu-events/arch/x86/emeraldrapids/memory.json +++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/memory.json @@ -5,7 +5,6 @@ "CounterMask": "6", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.STALLS_L3_MISS", - "PublicDescription": "Execution stalls while L3 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x6" }, @@ -14,7 +13,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc3", "EventName": "MACHINE_CLEARS.MEMORY_ORDERING", - "PublicDescription": "Counts the number of Machine Clears detected dye to memory ordering. Memory Ordering Machine Clears may apply when a memory read may not conform to the memory ordering rules of the x86 architecture Available PDIST counters: 0", + "PublicDescription": "Counts the number of Machine Clears detected dye to memory ordering. Memory Ordering Machine Clears may apply when a memory read may not conform to the memory ordering rules of the x86 architecture", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -24,7 +23,6 @@ "CounterMask": "2", "EventCode": "0x47", "EventName": "MEMORY_ACTIVITY.CYCLES_L1D_MISS", - "PublicDescription": "Cycles while L1 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x2" }, @@ -34,7 +32,6 @@ "CounterMask": "3", "EventCode": "0x47", "EventName": "MEMORY_ACTIVITY.STALLS_L1D_MISS", - "PublicDescription": "Execution stalls while L1 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x3" }, @@ -44,7 +41,7 @@ "CounterMask": "5", "EventCode": "0x47", "EventName": "MEMORY_ACTIVITY.STALLS_L2_MISS", - "PublicDescription": "Execution stalls while L2 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock). Available PDIST counters: 0", + "PublicDescription": "Execution stalls while L2 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock).", "SampleAfterValue": "1000003", "UMask": "0x5" }, @@ -54,7 +51,7 @@ "CounterMask": "9", "EventCode": "0x47", "EventName": "MEMORY_ACTIVITY.STALLS_L3_MISS", - "PublicDescription": "Execution stalls while L3 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock). Available PDIST counters: 0", + "PublicDescription": "Execution stalls while L3 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock).", "SampleAfterValue": "1000003", "UMask": "0x9" }, @@ -478,7 +475,6 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", - "PublicDescription": "Counts demand data read requests that miss the L3 cache. Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x10" }, @@ -487,7 +483,7 @@ "Counter": "0,1,2,3", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD", - "PublicDescription": "For every cycle, increments by the number of demand data read requests pending that are known to have missed the L3 cache. Note that this does not capture all elapsed cycles while requests are outstanding - only cycles from when the requests were known by the requesting core to have missed the L3 cache. Available PDIST counters: 0", + "PublicDescription": "For every cycle, increments by the number of demand data read requests pending that are known to have missed the L3 cache. Note that this does not capture all elapsed cycles while requests are outstanding - only cycles from when the requests were known by the requesting core to have missed the L3 cache.", "SampleAfterValue": "2000003", "UMask": "0x10" }, @@ -505,7 +501,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc9", "EventName": "RTM_RETIRED.ABORTED_EVENTS", - "PublicDescription": "Counts the number of times an RTM execution aborted due to none of the previous 3 categories (e.g. interrupt). Available PDIST counters: 0", + "PublicDescription": "Counts the number of times an RTM execution aborted due to none of the previous 3 categories (e.g. interrupt).", "SampleAfterValue": "100003", "UMask": "0x80" }, @@ -514,7 +510,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc9", "EventName": "RTM_RETIRED.ABORTED_MEM", - "PublicDescription": "Counts the number of times an RTM execution aborted due to various memory events (e.g. read/write capacity and conflicts). Available PDIST counters: 0", + "PublicDescription": "Counts the number of times an RTM execution aborted due to various memory events (e.g. read/write capacity and conflicts).", "SampleAfterValue": "100003", "UMask": "0x8" }, @@ -523,7 +519,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc9", "EventName": "RTM_RETIRED.ABORTED_MEMTYPE", - "PublicDescription": "Counts the number of times an RTM execution aborted due to incompatible memory type. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times an RTM execution aborted due to incompatible memory type.", "SampleAfterValue": "100003", "UMask": "0x40" }, @@ -532,7 +528,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc9", "EventName": "RTM_RETIRED.ABORTED_UNFRIENDLY", - "PublicDescription": "Counts the number of times an RTM execution aborted due to HLE-unfriendly instructions. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times an RTM execution aborted due to HLE-unfriendly instructions.", "SampleAfterValue": "100003", "UMask": "0x20" }, @@ -541,7 +537,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc9", "EventName": "RTM_RETIRED.COMMIT", - "PublicDescription": "Counts the number of times RTM commit succeeded. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times RTM commit succeeded.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -550,7 +546,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc9", "EventName": "RTM_RETIRED.START", - "PublicDescription": "Counts the number of times we entered an RTM region. Does not count nested transactions. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times we entered an RTM region. Does not count nested transactions.", "SampleAfterValue": "100003", "UMask": "0x1" }, @@ -559,7 +555,7 @@ "Counter": "0,1,2,3", "EventCode": "0x54", "EventName": "TX_MEM.ABORT_CAPACITY_READ", - "PublicDescription": "Speculatively counts the number of Transactional Synchronization Extensions (TSX) aborts due to a data capacity limitation for transactional reads Available PDIST counters: 0", + "PublicDescription": "Speculatively counts the number of Transactional Synchronization Extensions (TSX) aborts due to a data capacity limitation for transactional reads", "SampleAfterValue": "100003", "UMask": "0x80" }, @@ -568,7 +564,7 @@ "Counter": "0,1,2,3", "EventCode": "0x54", "EventName": "TX_MEM.ABORT_CAPACITY_WRITE", - "PublicDescription": "Speculatively counts the number of Transactional Synchronization Extensions (TSX) aborts due to a data capacity limitation for transactional writes. Available PDIST counters: 0", + "PublicDescription": "Speculatively counts the number of Transactional Synchronization Extensions (TSX) aborts due to a data capacity limitation for transactional writes.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -577,7 +573,7 @@ "Counter": "0,1,2,3", "EventCode": "0x54", "EventName": "TX_MEM.ABORT_CONFLICT", - "PublicDescription": "Counts the number of times a TSX line had a cache conflict. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times a TSX line had a cache conflict.", "SampleAfterValue": "100003", "UMask": "0x1" } diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/other.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/other.json index a58d65556609..21f49f609ed4 100644 --- a/tools/perf/pmu-events/arch/x86/emeraldrapids/other.json +++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/other.json @@ -4,11 +4,35 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc1", "EventName": "ASSISTS.PAGE_FAULT", - "PublicDescription": "ASSISTS.PAGE_FAULT Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8" }, { + "BriefDescription": "HW_INTERRUPTS.MASKED", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xcb", + "EventName": "HW_INTERRUPTS.MASKED", + "SampleAfterValue": "100003", + "UMask": "0x2" + }, + { + "BriefDescription": "HW_INTERRUPTS.PENDING_AND_MASKED", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xcb", + "EventName": "HW_INTERRUPTS.PENDING_AND_MASKED", + "SampleAfterValue": "100003", + "UMask": "0x4" + }, + { + "BriefDescription": "Number of hardware interrupts received by the processor.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xcb", + "EventName": "HW_INTERRUPTS.RECEIVED", + "PublicDescription": "Counts the number of hardware interruptions received by the processor.", + "SampleAfterValue": "203", + "UMask": "0x1" + }, + { "BriefDescription": "Counts streaming stores that have any type of response.", "Counter": "0,1,2,3", "EventCode": "0x2A,0x2B", @@ -25,7 +49,7 @@ "CounterMask": "1", "EventCode": "0x2d", "EventName": "XQ.FULL_CYCLES", - "PublicDescription": "number of cycles when the thread is active and the uncore cannot take any further requests (for example prefetches, loads or stores initiated by the Core that miss the L2 cache). Available PDIST counters: 0", + "PublicDescription": "number of cycles when the thread is active and the uncore cannot take any further requests (for example prefetches, loads or stores initiated by the Core that miss the L2 cache).", "SampleAfterValue": "1000003", "UMask": "0x1" } diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/pipeline.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/pipeline.json index 48bec483b49a..1fa7957956df 100644 --- a/tools/perf/pmu-events/arch/x86/emeraldrapids/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/pipeline.json @@ -6,7 +6,6 @@ "Deprecated": "1", "EventCode": "0xb0", "EventName": "ARITH.DIVIDER_ACTIVE", - "PublicDescription": "This event is deprecated. Refer to new event ARITH.DIV_ACTIVE Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x9" }, @@ -16,7 +15,7 @@ "CounterMask": "1", "EventCode": "0xb0", "EventName": "ARITH.DIV_ACTIVE", - "PublicDescription": "Counts cycles when divide unit is busy executing divide or square root operations. Accounts for integer and floating-point operations. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when divide unit is busy executing divide or square root operations. Accounts for integer and floating-point operations.", "SampleAfterValue": "1000003", "UMask": "0x9" }, @@ -27,7 +26,6 @@ "Deprecated": "1", "EventCode": "0xb0", "EventName": "ARITH.FP_DIVIDER_ACTIVE", - "PublicDescription": "This event is deprecated. Refer to new event ARITH.FPDIV_ACTIVE Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -37,7 +35,6 @@ "CounterMask": "1", "EventCode": "0xb0", "EventName": "ARITH.IDIV_ACTIVE", - "PublicDescription": "This event counts the cycles the integer divider is busy. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8" }, @@ -48,7 +45,6 @@ "Deprecated": "1", "EventCode": "0xb0", "EventName": "ARITH.INT_DIVIDER_ACTIVE", - "PublicDescription": "This event is deprecated. Refer to new event ARITH.IDIV_ACTIVE Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8" }, @@ -57,7 +53,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc1", "EventName": "ASSISTS.ANY", - "PublicDescription": "Counts the number of occurrences where a microcode assist is invoked by hardware. Examples include AD (page Access Dirty), FP and AVX related assists. Available PDIST counters: 0", + "PublicDescription": "Counts the number of occurrences where a microcode assist is invoked by hardware. Examples include AD (page Access Dirty), FP and AVX related assists.", "SampleAfterValue": "100003", "UMask": "0x1b" }, @@ -217,7 +213,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.C01", - "PublicDescription": "Counts core clocks when the thread is in the C0.1 light-weight slower wakeup time but more power saving optimized state. This state can be entered via the TPAUSE or UMWAIT instructions. Available PDIST counters: 0", + "PublicDescription": "Counts core clocks when the thread is in the C0.1 light-weight slower wakeup time but more power saving optimized state. This state can be entered via the TPAUSE or UMWAIT instructions.", "SampleAfterValue": "2000003", "UMask": "0x10" }, @@ -226,7 +222,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.C02", - "PublicDescription": "Counts core clocks when the thread is in the C0.2 light-weight faster wakeup time but less power saving optimized state. This state can be entered via the TPAUSE or UMWAIT instructions. Available PDIST counters: 0", + "PublicDescription": "Counts core clocks when the thread is in the C0.2 light-weight faster wakeup time but less power saving optimized state. This state can be entered via the TPAUSE or UMWAIT instructions.", "SampleAfterValue": "2000003", "UMask": "0x20" }, @@ -235,7 +231,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.C0_WAIT", - "PublicDescription": "Counts core clocks when the thread is in the C0.1 or C0.2 power saving optimized states (TPAUSE or UMWAIT instructions) or running the PAUSE instruction. Available PDIST counters: 0", + "PublicDescription": "Counts core clocks when the thread is in the C0.1 or C0.2 power saving optimized states (TPAUSE or UMWAIT instructions) or running the PAUSE instruction.", "SampleAfterValue": "2000003", "UMask": "0x70" }, @@ -244,7 +240,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.DISTRIBUTED", - "PublicDescription": "This event distributes cycle counts between active hyperthreads, i.e., those in C0. A hyperthread becomes inactive when it executes the HLT or MWAIT instructions. If all other hyperthreads are inactive (or disabled or do not exist), all counts are attributed to this hyperthread. To obtain the full count when the Core is active, sum the counts from each hyperthread. Available PDIST counters: 0", + "PublicDescription": "This event distributes cycle counts between active hyperthreads, i.e., those in C0. A hyperthread becomes inactive when it executes the HLT or MWAIT instructions. If all other hyperthreads are inactive (or disabled or do not exist), all counts are attributed to this hyperthread. To obtain the full count when the Core is active, sum the counts from each hyperthread.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -253,7 +249,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x3c", "EventName": "CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE", - "PublicDescription": "Counts Core crystal clock cycles when current thread is unhalted and the other thread is halted. Available PDIST counters: 0", + "PublicDescription": "Counts Core crystal clock cycles when current thread is unhalted and the other thread is halted.", "SampleAfterValue": "25003", "UMask": "0x2" }, @@ -262,7 +258,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.PAUSE", - "PublicDescription": "CPU_CLK_UNHALTED.PAUSE Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x40" }, @@ -273,7 +268,6 @@ "EdgeDetect": "1", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.PAUSE_INST", - "PublicDescription": "CPU_CLK_UNHALTED.PAUSE_INST Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x40" }, @@ -282,7 +276,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x3c", "EventName": "CPU_CLK_UNHALTED.REF_DISTRIBUTED", - "PublicDescription": "This event distributes Core crystal clock cycle counts between active hyperthreads, i.e., those in C0 sleep-state. A hyperthread becomes inactive when it executes the HLT or MWAIT instructions. If one thread is active in a core, all counts are attributed to this hyperthread. To obtain the full count when the Core is active, sum the counts from each hyperthread. Available PDIST counters: 0", + "PublicDescription": "This event distributes Core crystal clock cycle counts between active hyperthreads, i.e., those in C0 sleep-state. A hyperthread becomes inactive when it executes the HLT or MWAIT instructions. If one thread is active in a core, all counts are attributed to this hyperthread. To obtain the full count when the Core is active, sum the counts from each hyperthread.", "SampleAfterValue": "2000003", "UMask": "0x8" }, @@ -299,7 +293,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x3c", "EventName": "CPU_CLK_UNHALTED.REF_TSC_P", - "PublicDescription": "Counts the number of reference cycles when the core is not in a halt state. The core enters the halt state when it is running the HLT instruction or the MWAIT instruction. This event is not affected by core frequency changes (for example, P states, TM2 transitions) but has the same incrementing frequency as the time stamp counter. This event can approximate elapsed time while the core was not in a halt state. It is counted on a dedicated fixed counter, leaving the four (eight when Hyperthreading is disabled) programmable counters available for other events. Note: On all current platforms this event stops counting during 'throttling (TM)' states duty off periods the processor is 'halted'. The counter update is done at a lower clock rate then the core clock the overflow status bit for this counter may appear 'sticky'. After the counter has overflowed and software clears the overflow status bit and resets the counter to less than MAX. The reset value to the counter is not clocked immediately so the overflow status bit will flip 'high (1)' and generate another PMI (if enabled) after which the reset value gets clocked into the counter. Therefore, software will get the interrupt, read the overflow status bit '1 for bit 34 while the counter value is less than MAX. Software should ignore this case. Available PDIST counters: 0", + "PublicDescription": "Counts the number of reference cycles when the core is not in a halt state. The core enters the halt state when it is running the HLT instruction or the MWAIT instruction. This event is not affected by core frequency changes (for example, P states, TM2 transitions) but has the same incrementing frequency as the time stamp counter. This event can approximate elapsed time while the core was not in a halt state. It is counted on a dedicated fixed counter, leaving the four (eight when Hyperthreading is disabled) programmable counters available for other events. Note: On all current platforms this event stops counting during 'throttling (TM)' states duty off periods the processor is 'halted'. The counter update is done at a lower clock rate then the core clock the overflow status bit for this counter may appear 'sticky'. After the counter has overflowed and software clears the overflow status bit and resets the counter to less than MAX. The reset value to the counter is not clocked immediately so the overflow status bit will flip 'high (1)' and generate another PMI (if enabled) after which the reset value gets clocked into the counter. Therefore, software will get the interrupt, read the overflow status bit '1 for bit 34 while the counter value is less than MAX. Software should ignore this case.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -316,7 +310,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x3c", "EventName": "CPU_CLK_UNHALTED.THREAD_P", - "PublicDescription": "This is an architectural event that counts the number of thread cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. The core frequency may change from time to time due to power or thermal throttling. For this reason, this event may have a changing ratio with regards to wall clock time. Available PDIST counters: 0", + "PublicDescription": "This is an architectural event that counts the number of thread cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. The core frequency may change from time to time due to power or thermal throttling. For this reason, this event may have a changing ratio with regards to wall clock time.", "SampleAfterValue": "2000003" }, { @@ -325,7 +319,6 @@ "CounterMask": "8", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.CYCLES_L1D_MISS", - "PublicDescription": "Cycles while L1 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8" }, @@ -335,7 +328,6 @@ "CounterMask": "1", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.CYCLES_L2_MISS", - "PublicDescription": "Cycles while L2 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -345,7 +337,6 @@ "CounterMask": "16", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.CYCLES_MEM_ANY", - "PublicDescription": "Cycles while memory subsystem has an outstanding load. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x10" }, @@ -355,7 +346,6 @@ "CounterMask": "12", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.STALLS_L1D_MISS", - "PublicDescription": "Execution stalls while L1 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0xc" }, @@ -365,7 +355,6 @@ "CounterMask": "5", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.STALLS_L2_MISS", - "PublicDescription": "Execution stalls while L2 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x5" }, @@ -375,7 +364,6 @@ "CounterMask": "4", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.STALLS_TOTAL", - "PublicDescription": "Total execution stalls. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x4" }, @@ -384,7 +372,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb7", "EventName": "EXE.AMX_BUSY", - "PublicDescription": "Counts the cycles where the AMX (Advance Matrix Extension) unit is busy performing an operation. Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -393,7 +380,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.1_PORTS_UTIL", - "PublicDescription": "Counts cycles during which a total of 1 uop was executed on all ports and Reservation Station (RS) was not empty. Available PDIST counters: 0", + "PublicDescription": "Counts cycles during which a total of 1 uop was executed on all ports and Reservation Station (RS) was not empty.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -402,7 +389,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.2_3_PORTS_UTIL", - "PublicDescription": "Cycles total of 2 or 3 uops are executed on all ports and Reservation Station (RS) was not empty. Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0xc" }, @@ -411,7 +397,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.2_PORTS_UTIL", - "PublicDescription": "Counts cycles during which a total of 2 uops were executed on all ports and Reservation Station (RS) was not empty. Available PDIST counters: 0", + "PublicDescription": "Counts cycles during which a total of 2 uops were executed on all ports and Reservation Station (RS) was not empty.", "SampleAfterValue": "2000003", "UMask": "0x4" }, @@ -420,7 +406,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.3_PORTS_UTIL", - "PublicDescription": "Cycles total of 3 uops are executed on all ports and Reservation Station (RS) was not empty. Available PDIST counters: 0", + "PublicDescription": "Cycles total of 3 uops are executed on all ports and Reservation Station (RS) was not empty.", "SampleAfterValue": "2000003", "UMask": "0x8" }, @@ -429,7 +415,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.4_PORTS_UTIL", - "PublicDescription": "Cycles total of 4 uops are executed on all ports and Reservation Station (RS) was not empty. Available PDIST counters: 0", + "PublicDescription": "Cycles total of 4 uops are executed on all ports and Reservation Station (RS) was not empty.", "SampleAfterValue": "2000003", "UMask": "0x10" }, @@ -439,7 +425,6 @@ "CounterMask": "5", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.BOUND_ON_LOADS", - "PublicDescription": "Execution stalls while memory subsystem has an outstanding load. Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x21" }, @@ -449,7 +434,7 @@ "CounterMask": "2", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.BOUND_ON_STORES", - "PublicDescription": "Counts cycles where the Store Buffer was full and no loads caused an execution stall. Available PDIST counters: 0", + "PublicDescription": "Counts cycles where the Store Buffer was full and no loads caused an execution stall.", "SampleAfterValue": "1000003", "UMask": "0x40" }, @@ -458,7 +443,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.EXE_BOUND_0_PORTS", - "PublicDescription": "Number of cycles total of 0 uops executed on all ports, Reservation Station (RS) was not empty, the Store Buffer (SB) was not full and there was no outstanding load. Available PDIST counters: 0", + "PublicDescription": "Number of cycles total of 0 uops executed on all ports, Reservation Station (RS) was not empty, the Store Buffer (SB) was not full and there was no outstanding load.", "SampleAfterValue": "1000003", "UMask": "0x80" }, @@ -467,7 +452,7 @@ "Counter": "0,1,2,3", "EventCode": "0x75", "EventName": "INST_DECODED.DECODERS", - "PublicDescription": "Number of decoders utilized in a cycle when the MITE (legacy decode pipeline) fetches instructions. Available PDIST counters: 0", + "PublicDescription": "Number of decoders utilized in a cycle when the MITE (legacy decode pipeline) fetches instructions.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -492,7 +477,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc0", "EventName": "INST_RETIRED.MACRO_FUSED", - "PublicDescription": "INST_RETIRED.MACRO_FUSED Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x10" }, @@ -501,7 +485,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc0", "EventName": "INST_RETIRED.NOP", - "PublicDescription": "Counts all retired NOP or ENDBR32/64 instructions Available PDIST counters: 0", + "PublicDescription": "Counts all retired NOP or ENDBR32/64 instructions", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -518,7 +502,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc0", "EventName": "INST_RETIRED.REP_ITERATION", - "PublicDescription": "Number of iterations of Repeat (REP) string retired instructions such as MOVS, CMPS, and SCAS. Each has a byte, word, and doubleword version and string instructions can be repeated using a repetition prefix, REP, that allows their architectural execution to be repeated a number of times as specified by the RCX register. Note the number of iterations is implementation-dependent. Available PDIST counters: 0", + "PublicDescription": "Number of iterations of Repeat (REP) string retired instructions such as MOVS, CMPS, and SCAS. Each has a byte, word, and doubleword version and string instructions can be repeated using a repetition prefix, REP, that allows their architectural execution to be repeated a number of times as specified by the RCX register. Note the number of iterations is implementation-dependent.", "SampleAfterValue": "2000003", "UMask": "0x8" }, @@ -529,7 +513,7 @@ "EdgeDetect": "1", "EventCode": "0xad", "EventName": "INT_MISC.CLEARS_COUNT", - "PublicDescription": "Counts the number of speculative clears due to any type of branch misprediction or machine clears Available PDIST counters: 0", + "PublicDescription": "Counts the number of speculative clears due to any type of branch misprediction or machine clears", "SampleAfterValue": "500009", "UMask": "0x1" }, @@ -538,7 +522,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xad", "EventName": "INT_MISC.CLEAR_RESTEER_CYCLES", - "PublicDescription": "Cycles after recovery from a branch misprediction or machine clear till the first uop is issued from the resteered path. Available PDIST counters: 0", + "PublicDescription": "Cycles after recovery from a branch misprediction or machine clear till the first uop is issued from the resteered path.", "SampleAfterValue": "500009", "UMask": "0x80" }, @@ -547,7 +531,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xad", "EventName": "INT_MISC.MBA_STALLS", - "PublicDescription": "INT_MISC.MBA_STALLS Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x20" }, @@ -556,7 +539,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xad", "EventName": "INT_MISC.RECOVERY_CYCLES", - "PublicDescription": "Counts core cycles when the Resource allocator was stalled due to recovery from an earlier branch misprediction or machine clear event. Available PDIST counters: 0", + "PublicDescription": "Counts core cycles when the Resource allocator was stalled due to recovery from an earlier branch misprediction or machine clear event.", "SampleAfterValue": "500009", "UMask": "0x1" }, @@ -567,7 +550,6 @@ "EventName": "INT_MISC.UNKNOWN_BRANCH_CYCLES", "MSRIndex": "0x3F7", "MSRValue": "0x7", - "PublicDescription": "Bubble cycles of BAClear (Unknown Branch). Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x40" }, @@ -576,7 +558,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xad", "EventName": "INT_MISC.UOP_DROPPING", - "PublicDescription": "Estimated number of Top-down Microarchitecture Analysis slots that got dropped due to non front-end reasons Available PDIST counters: 0", + "PublicDescription": "Estimated number of Top-down Microarchitecture Analysis slots that got dropped due to non front-end reasons", "SampleAfterValue": "1000003", "UMask": "0x10" }, @@ -585,7 +567,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.128BIT", - "PublicDescription": "INT_VEC_RETIRED.128BIT Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x13" }, @@ -594,7 +575,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.256BIT", - "PublicDescription": "INT_VEC_RETIRED.256BIT Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0xac" }, @@ -603,7 +583,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.ADD_128", - "PublicDescription": "Number of retired integer ADD/SUB (regular or horizontal), SAD 128-bit vector instructions. Available PDIST counters: 0", + "PublicDescription": "Number of retired integer ADD/SUB (regular or horizontal), SAD 128-bit vector instructions.", "SampleAfterValue": "1000003", "UMask": "0x3" }, @@ -612,7 +592,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.ADD_256", - "PublicDescription": "Number of retired integer ADD/SUB (regular or horizontal), SAD 256-bit vector instructions. Available PDIST counters: 0", + "PublicDescription": "Number of retired integer ADD/SUB (regular or horizontal), SAD 256-bit vector instructions.", "SampleAfterValue": "1000003", "UMask": "0xc" }, @@ -621,7 +601,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.MUL_256", - "PublicDescription": "INT_VEC_RETIRED.MUL_256 Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x80" }, @@ -630,7 +609,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.SHUFFLES", - "PublicDescription": "INT_VEC_RETIRED.SHUFFLES Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x40" }, @@ -639,7 +617,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.VNNI_128", - "PublicDescription": "INT_VEC_RETIRED.VNNI_128 Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x10" }, @@ -648,7 +625,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.VNNI_256", - "PublicDescription": "INT_VEC_RETIRED.VNNI_256 Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x20" }, @@ -657,7 +633,7 @@ "Counter": "0,1,2,3", "EventCode": "0x03", "EventName": "LD_BLOCKS.ADDRESS_ALIAS", - "PublicDescription": "Counts the number of times a load got blocked due to false dependencies in MOB due to partial compare on address. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times a load got blocked due to false dependencies in MOB due to partial compare on address.", "SampleAfterValue": "100003", "UMask": "0x4" }, @@ -666,7 +642,7 @@ "Counter": "0,1,2,3", "EventCode": "0x03", "EventName": "LD_BLOCKS.NO_SR", - "PublicDescription": "Counts the number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use.", "SampleAfterValue": "100003", "UMask": "0x88" }, @@ -675,7 +651,7 @@ "Counter": "0,1,2,3", "EventCode": "0x03", "EventName": "LD_BLOCKS.STORE_FORWARD", - "PublicDescription": "Counts the number of times where store forwarding was prevented for a load operation. The most common case is a load blocked due to the address of memory access (partially) overlapping with a preceding uncompleted store. Note: See the table of not supported store forwards in the Optimization Guide. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times where store forwarding was prevented for a load operation. The most common case is a load blocked due to the address of memory access (partially) overlapping with a preceding uncompleted store. Note: See the table of not supported store forwards in the Optimization Guide.", "SampleAfterValue": "100003", "UMask": "0x82" }, @@ -684,7 +660,7 @@ "Counter": "0,1,2,3", "EventCode": "0x4c", "EventName": "LOAD_HIT_PREFETCH.SWPF", - "PublicDescription": "Counts all software-prefetch load dispatches that hit the fill buffer (FB) allocated for the software prefetch. It can also be incremented by some lock instructions. So it should only be used with profiling so that the locks can be excluded by ASM (Assembly File) inspection of the nearby instructions. Available PDIST counters: 0", + "PublicDescription": "Counts all software-prefetch load dispatches that hit the fill buffer (FB) allocated for the software prefetch. It can also be incremented by some lock instructions. So it should only be used with profiling so that the locks can be excluded by ASM (Assembly File) inspection of the nearby instructions.", "SampleAfterValue": "100003", "UMask": "0x1" }, @@ -694,7 +670,7 @@ "CounterMask": "1", "EventCode": "0xa8", "EventName": "LSD.CYCLES_ACTIVE", - "PublicDescription": "Counts the cycles when at least one uop is delivered by the LSD (Loop-stream detector). Available PDIST counters: 0", + "PublicDescription": "Counts the cycles when at least one uop is delivered by the LSD (Loop-stream detector).", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -704,7 +680,7 @@ "CounterMask": "6", "EventCode": "0xa8", "EventName": "LSD.CYCLES_OK", - "PublicDescription": "Counts the cycles when optimal number of uops is delivered by the LSD (Loop-stream detector). Available PDIST counters: 0", + "PublicDescription": "Counts the cycles when optimal number of uops is delivered by the LSD (Loop-stream detector).", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -713,7 +689,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa8", "EventName": "LSD.UOPS", - "PublicDescription": "Counts the number of uops delivered to the back-end by the LSD(Loop Stream Detector). Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops delivered to the back-end by the LSD(Loop Stream Detector).", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -724,7 +700,7 @@ "EdgeDetect": "1", "EventCode": "0xc3", "EventName": "MACHINE_CLEARS.COUNT", - "PublicDescription": "Counts the number of machine clears (nukes) of any type. Available PDIST counters: 0", + "PublicDescription": "Counts the number of machine clears (nukes) of any type.", "SampleAfterValue": "100003", "UMask": "0x1" }, @@ -733,7 +709,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc3", "EventName": "MACHINE_CLEARS.SMC", - "PublicDescription": "Counts self-modifying code (SMC) detected, which causes a machine clear. Available PDIST counters: 0", + "PublicDescription": "Counts self-modifying code (SMC) detected, which causes a machine clear.", "SampleAfterValue": "100003", "UMask": "0x4" }, @@ -742,7 +718,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe0", "EventName": "MISC2_RETIRED.LFENCE", - "PublicDescription": "number of LFENCE retired instructions Available PDIST counters: 0", + "PublicDescription": "number of LFENCE retired instructions", "SampleAfterValue": "400009", "UMask": "0x20" }, @@ -751,7 +727,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xcc", "EventName": "MISC_RETIRED.LBR_INSERTS", - "PublicDescription": "Increments when an entry is added to the Last Branch Record (LBR) array (or removed from the array in case of RETURNs in call stack mode). The event requires LBR enable via IA32_DEBUGCTL MSR and branch type selection via MSR_LBR_SELECT. Available PDIST counters: 0", + "PublicDescription": "Increments when an entry is added to the Last Branch Record (LBR) array (or removed from the array in case of RETURNs in call stack mode). The event requires LBR enable via IA32_DEBUGCTL MSR and branch type selection via MSR_LBR_SELECT.", "SampleAfterValue": "100003", "UMask": "0x20" }, @@ -760,7 +736,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa2", "EventName": "RESOURCE_STALLS.SB", - "PublicDescription": "Counts allocation stall cycles caused by the store buffer (SB) being full. This counts cycles that the pipeline back-end blocked uop delivery from the front-end. Available PDIST counters: 0", + "PublicDescription": "Counts allocation stall cycles caused by the store buffer (SB) being full. This counts cycles that the pipeline back-end blocked uop delivery from the front-end.", "SampleAfterValue": "100003", "UMask": "0x8" }, @@ -769,7 +745,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa2", "EventName": "RESOURCE_STALLS.SCOREBOARD", - "PublicDescription": "Counts cycles where the pipeline is stalled due to serializing operations. Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -778,7 +753,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa5", "EventName": "RS.EMPTY", - "PublicDescription": "Counts cycles during which the reservation station (RS) is empty for this logical processor. This is usually caused when the front-end pipeline runs into starvation periods (e.g. branch mispredictions or i-cache misses) Available PDIST counters: 0", + "PublicDescription": "Counts cycles during which the reservation station (RS) is empty for this logical processor. This is usually caused when the front-end pipeline runs into starvation periods (e.g. branch mispredictions or i-cache misses)", "SampleAfterValue": "1000003", "UMask": "0x7" }, @@ -790,7 +765,7 @@ "EventCode": "0xa5", "EventName": "RS.EMPTY_COUNT", "Invert": "1", - "PublicDescription": "Counts end of periods where the Reservation Station (RS) was empty. Could be useful to closely sample on front-end latency issues (see the FRONTEND_RETIRED event of designated precise events) Available PDIST counters: 0", + "PublicDescription": "Counts end of periods where the Reservation Station (RS) was empty. Could be useful to closely sample on front-end latency issues (see the FRONTEND_RETIRED event of designated precise events)", "SampleAfterValue": "100003", "UMask": "0x7" }, @@ -799,7 +774,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa5", "EventName": "RS.EMPTY_RESOURCE", - "PublicDescription": "Cycles when Reservation Station (RS) is empty due to a resource in the back-end Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -812,7 +786,6 @@ "EventCode": "0xa5", "EventName": "RS_EMPTY.COUNT", "Invert": "1", - "PublicDescription": "This event is deprecated. Refer to new event RS.EMPTY_COUNT Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x7" }, @@ -822,7 +795,6 @@ "Deprecated": "1", "EventCode": "0xa5", "EventName": "RS_EMPTY.CYCLES", - "PublicDescription": "This event is deprecated. Refer to new event RS.EMPTY Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x7" }, @@ -831,7 +803,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa4", "EventName": "TOPDOWN.BACKEND_BOUND_SLOTS", - "PublicDescription": "Number of slots in TMA method where no micro-operations were being issued from front-end to back-end of the machine due to lack of back-end resources. Available PDIST counters: 0", + "PublicDescription": "Number of slots in TMA method where no micro-operations were being issued from front-end to back-end of the machine due to lack of back-end resources.", "SampleAfterValue": "10000003", "UMask": "0x2" }, @@ -840,7 +812,7 @@ "Counter": "0", "EventCode": "0xa4", "EventName": "TOPDOWN.BAD_SPEC_SLOTS", - "PublicDescription": "Number of slots of TMA method that were wasted due to incorrect speculation. It covers all types of control-flow or data-related mis-speculations. Available PDIST counters: 0", + "PublicDescription": "Number of slots of TMA method that were wasted due to incorrect speculation. It covers all types of control-flow or data-related mis-speculations.", "SampleAfterValue": "10000003", "UMask": "0x4" }, @@ -849,7 +821,7 @@ "Counter": "0", "EventCode": "0xa4", "EventName": "TOPDOWN.BR_MISPREDICT_SLOTS", - "PublicDescription": "Number of TMA slots that were wasted due to incorrect speculation by (any type of) branch mispredictions. This event estimates number of speculative operations that were issued but not retired as well as the out-of-order engine recovery past a branch misprediction. Available PDIST counters: 0", + "PublicDescription": "Number of TMA slots that were wasted due to incorrect speculation by (any type of) branch mispredictions. This event estimates number of speculative operations that were issued but not retired as well as the out-of-order engine recovery past a branch misprediction.", "SampleAfterValue": "10000003", "UMask": "0x8" }, @@ -858,7 +830,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa4", "EventName": "TOPDOWN.MEMORY_BOUND_SLOTS", - "PublicDescription": "TOPDOWN.MEMORY_BOUND_SLOTS Available PDIST counters: 0", "SampleAfterValue": "10000003", "UMask": "0x10" }, @@ -875,7 +846,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa4", "EventName": "TOPDOWN.SLOTS_P", - "PublicDescription": "Counts the number of available slots for an unhalted logical processor. The event increments by machine-width of the narrowest pipeline as employed by the Top-down Microarchitecture Analysis method. The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core. Available PDIST counters: 0", + "PublicDescription": "Counts the number of available slots for an unhalted logical processor. The event increments by machine-width of the narrowest pipeline as employed by the Top-down Microarchitecture Analysis method. The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core.", "SampleAfterValue": "10000003", "UMask": "0x1" }, @@ -884,7 +855,6 @@ "Counter": "0,1,2,3", "EventCode": "0x76", "EventName": "UOPS_DECODED.DEC0_UOPS", - "PublicDescription": "UOPS_DECODED.DEC0_UOPS Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -893,7 +863,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_0", - "PublicDescription": "Number of uops dispatch to execution port 0. Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution port 0.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -902,7 +872,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_1", - "PublicDescription": "Number of uops dispatch to execution port 1. Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution port 1.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -911,7 +881,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_2_3_10", - "PublicDescription": "Number of uops dispatch to execution ports 2, 3 and 10 Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution ports 2, 3 and 10", "SampleAfterValue": "2000003", "UMask": "0x4" }, @@ -920,7 +890,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_4_9", - "PublicDescription": "Number of uops dispatch to execution ports 4 and 9 Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution ports 4 and 9", "SampleAfterValue": "2000003", "UMask": "0x10" }, @@ -929,7 +899,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_5_11", - "PublicDescription": "Number of uops dispatch to execution ports 5 and 11 Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution ports 5 and 11", "SampleAfterValue": "2000003", "UMask": "0x20" }, @@ -938,7 +908,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_6", - "PublicDescription": "Number of uops dispatch to execution port 6. Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution port 6.", "SampleAfterValue": "2000003", "UMask": "0x40" }, @@ -947,7 +917,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_7_8", - "PublicDescription": "Number of uops dispatch to execution ports 7 and 8. Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution ports 7 and 8.", "SampleAfterValue": "2000003", "UMask": "0x80" }, @@ -956,7 +926,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CORE", - "PublicDescription": "Counts the number of uops executed from any thread. Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops executed from any thread.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -966,7 +936,7 @@ "CounterMask": "1", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_1", - "PublicDescription": "Counts cycles when at least 1 micro-op is executed from any thread on physical core. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least 1 micro-op is executed from any thread on physical core.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -976,7 +946,7 @@ "CounterMask": "2", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_2", - "PublicDescription": "Counts cycles when at least 2 micro-ops are executed from any thread on physical core. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least 2 micro-ops are executed from any thread on physical core.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -986,7 +956,7 @@ "CounterMask": "3", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_3", - "PublicDescription": "Counts cycles when at least 3 micro-ops are executed from any thread on physical core. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least 3 micro-ops are executed from any thread on physical core.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -996,7 +966,7 @@ "CounterMask": "4", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_4", - "PublicDescription": "Counts cycles when at least 4 micro-ops are executed from any thread on physical core. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least 4 micro-ops are executed from any thread on physical core.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -1006,7 +976,7 @@ "CounterMask": "1", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CYCLES_GE_1", - "PublicDescription": "Cycles where at least 1 uop was executed per-thread. Available PDIST counters: 0", + "PublicDescription": "Cycles where at least 1 uop was executed per-thread.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1016,7 +986,7 @@ "CounterMask": "2", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CYCLES_GE_2", - "PublicDescription": "Cycles where at least 2 uops were executed per-thread. Available PDIST counters: 0", + "PublicDescription": "Cycles where at least 2 uops were executed per-thread.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1026,7 +996,7 @@ "CounterMask": "3", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CYCLES_GE_3", - "PublicDescription": "Cycles where at least 3 uops were executed per-thread. Available PDIST counters: 0", + "PublicDescription": "Cycles where at least 3 uops were executed per-thread.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1036,7 +1006,7 @@ "CounterMask": "4", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CYCLES_GE_4", - "PublicDescription": "Cycles where at least 4 uops were executed per-thread. Available PDIST counters: 0", + "PublicDescription": "Cycles where at least 4 uops were executed per-thread.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1047,7 +1017,7 @@ "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.STALLS", "Invert": "1", - "PublicDescription": "Counts cycles during which no uops were dispatched from the Reservation Station (RS) per thread. Available PDIST counters: 0", + "PublicDescription": "Counts cycles during which no uops were dispatched from the Reservation Station (RS) per thread.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1059,7 +1029,6 @@ "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.STALL_CYCLES", "Invert": "1", - "PublicDescription": "This event is deprecated. Refer to new event UOPS_EXECUTED.STALLS Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1068,7 +1037,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.THREAD", - "PublicDescription": "Counts the number of uops to be executed per-thread each cycle. Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1077,7 +1045,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.X87", - "PublicDescription": "Counts the number of x87 uops executed. Available PDIST counters: 0", + "PublicDescription": "Counts the number of x87 uops executed.", "SampleAfterValue": "2000003", "UMask": "0x10" }, @@ -1086,7 +1054,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xae", "EventName": "UOPS_ISSUED.ANY", - "PublicDescription": "Counts the number of uops that the Resource Allocation Table (RAT) issues to the Reservation Station (RS). Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops that the Resource Allocation Table (RAT) issues to the Reservation Station (RS).", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1096,7 +1064,6 @@ "CounterMask": "1", "EventCode": "0xae", "EventName": "UOPS_ISSUED.CYCLES", - "PublicDescription": "UOPS_ISSUED.CYCLES Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1106,7 +1073,7 @@ "CounterMask": "1", "EventCode": "0xc2", "EventName": "UOPS_RETIRED.CYCLES", - "PublicDescription": "Counts cycles where at least one uop has retired. Available PDIST counters: 0", + "PublicDescription": "Counts cycles where at least one uop has retired.", "SampleAfterValue": "1000003", "UMask": "0x2" }, @@ -1115,7 +1082,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc2", "EventName": "UOPS_RETIRED.HEAVY", - "PublicDescription": "Counts the number of retired micro-operations (uops) except the last uop of each instruction. An instruction that is decoded into less than two uops does not contribute to the count. Available PDIST counters: 0", + "PublicDescription": "Counts the number of retired micro-operations (uops) except the last uop of each instruction. An instruction that is decoded into less than two uops does not contribute to the count.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1126,7 +1093,6 @@ "EventName": "UOPS_RETIRED.MS", "MSRIndex": "0x3F7", "MSRValue": "0x8", - "PublicDescription": "UOPS_RETIRED.MS Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x4" }, @@ -1135,7 +1101,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc2", "EventName": "UOPS_RETIRED.SLOTS", - "PublicDescription": "Counts the retirement slots used each cycle. Available PDIST counters: 0", + "PublicDescription": "Counts the retirement slots used each cycle.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -1146,7 +1112,7 @@ "EventCode": "0xc2", "EventName": "UOPS_RETIRED.STALLS", "Invert": "1", - "PublicDescription": "This event counts cycles without actually retired uops. Available PDIST counters: 0", + "PublicDescription": "This event counts cycles without actually retired uops.", "SampleAfterValue": "1000003", "UMask": "0x2" }, @@ -1158,7 +1124,6 @@ "EventCode": "0xc2", "EventName": "UOPS_RETIRED.STALL_CYCLES", "Invert": "1", - "PublicDescription": "This event is deprecated. Refer to new event UOPS_RETIRED.STALLS Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x2" } diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-cache.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-cache.json index f453202d80c2..92cf47967f0b 100644 --- a/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-cache.json +++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-cache.json @@ -312,6 +312,17 @@ "Unit": "CHA" }, { + "BriefDescription": "Distress signal asserted : DPT Remote", + "Counter": "0,1,2,3", + "EventCode": "0xaf", + "EventName": "UNC_CHA_DISTRESS_ASSERTED.DPT_NONLOCAL", + "Experimental": "1", + "PerPkg": "1", + "PublicDescription": "Distress signal asserted : DPT Remote : Counts the number of cycles either the local or incoming distress signals are asserted. : Dynamic Prefetch Throttle received by this tile", + "UMask": "0x8", + "Unit": "CHA" + }, + { "BriefDescription": "Egress Blocking due to Ordering requirements : Down", "Counter": "0,1,2,3", "EventCode": "0xba", diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-memory.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-memory.json index 68be01dad7c9..30044177ccf8 100644 --- a/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-memory.json +++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-memory.json @@ -2770,6 +2770,88 @@ "Unit": "iMC" }, { + "BriefDescription": "Number of DRAM Refreshes Issued", + "Counter": "0,1,2,3", + "EventCode": "0x45", + "EventName": "UNC_M_DRAM_REFRESH.HIGH", + "Experimental": "1", + "PerPkg": "1", + "PublicDescription": "Number of DRAM Refreshes Issued : Counts the number of refreshes issued.", + "UMask": "0x24", + "Unit": "iMC" + }, + { + "BriefDescription": "Number of DRAM Refreshes Issued", + "Counter": "0,1,2,3", + "EventCode": "0x45", + "EventName": "UNC_M_DRAM_REFRESH.HIGH_ALL", + "Experimental": "1", + "PerPkg": "1", + "UMask": "0x24", + "Unit": "iMC" + }, + { + "BriefDescription": "Number of DRAM Refreshes Issued", + "Counter": "0,1,2,3", + "EventCode": "0x45", + "EventName": "UNC_M_DRAM_REFRESH.HIGH_PCH0", + "Experimental": "1", + "PerPkg": "1", + "UMask": "0x4", + "Unit": "iMC" + }, + { + "BriefDescription": "Number of DRAM Refreshes Issued", + "Counter": "0,1,2,3", + "EventCode": "0x45", + "EventName": "UNC_M_DRAM_REFRESH.HIGH_PCH1", + "Experimental": "1", + "PerPkg": "1", + "UMask": "0x20", + "Unit": "iMC" + }, + { + "BriefDescription": "Number of DRAM Refreshes Issued", + "Counter": "0,1,2,3", + "EventCode": "0x45", + "EventName": "UNC_M_DRAM_REFRESH.PANIC", + "Experimental": "1", + "PerPkg": "1", + "PublicDescription": "Number of DRAM Refreshes Issued : Counts the number of refreshes issued.", + "UMask": "0x12", + "Unit": "iMC" + }, + { + "BriefDescription": "Number of DRAM Refreshes Issued", + "Counter": "0,1,2,3", + "EventCode": "0x45", + "EventName": "UNC_M_DRAM_REFRESH.PANIC_ALL", + "Experimental": "1", + "PerPkg": "1", + "UMask": "0x12", + "Unit": "iMC" + }, + { + "BriefDescription": "Number of DRAM Refreshes Issued", + "Counter": "0,1,2,3", + "EventCode": "0x45", + "EventName": "UNC_M_DRAM_REFRESH.PANIC_PCH0", + "Experimental": "1", + "PerPkg": "1", + "UMask": "0x2", + "Unit": "iMC" + }, + { + "BriefDescription": "Number of DRAM Refreshes Issued", + "Counter": "0,1,2,3", + "EventCode": "0x45", + "EventName": "UNC_M_DRAM_REFRESH.PANIC_PCH1", + "Experimental": "1", + "PerPkg": "1", + "UMask": "0x10", + "Unit": "iMC" + }, + { "BriefDescription": "ECC Correctable Errors", "Counter": "0,1,2,3", "EventCode": "0x09", @@ -3048,6 +3130,28 @@ "Unit": "iMC" }, { + "BriefDescription": "Throttle Cycles for Rank 0", + "Counter": "0,1,2,3", + "EventCode": "0x46", + "EventName": "UNC_M_POWER_THROTTLE_CYCLES.SLOT0", + "Experimental": "1", + "PerPkg": "1", + "PublicDescription": "Throttle Cycles for Rank 0 : Counts the number of cycles while the iMC is being throttled by either thermal constraints or by the PCU throttling. It is not possible to distinguish between the two. This can be filtered by rank. If multiple ranks are selected and are being throttled at the same time, the counter will only increment by 1. : Thermal throttling is performed per DIMM. We support 3 DIMMs per channel. This ID allows us to filter by ID.", + "UMask": "0x1", + "Unit": "iMC" + }, + { + "BriefDescription": "Throttle Cycles for Rank 0", + "Counter": "0,1,2,3", + "EventCode": "0x46", + "EventName": "UNC_M_POWER_THROTTLE_CYCLES.SLOT1", + "Experimental": "1", + "PerPkg": "1", + "PublicDescription": "Throttle Cycles for Rank 0 : Counts the number of cycles while the iMC is being throttled by either thermal constraints or by the PCU throttling. It is not possible to distinguish between the two. This can be filtered by rank. If multiple ranks are selected and are being throttled at the same time, the counter will only increment by 1.", + "UMask": "0x2", + "Unit": "iMC" + }, + { "BriefDescription": "Precharge due to read, write, underfill, or PGT.", "Counter": "0,1,2,3", "EventCode": "0x03", diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-power.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-power.json index 9482ddaea4d1..71c35b165a3e 100644 --- a/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-power.json +++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/uncore-power.json @@ -178,7 +178,6 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_P_POWER_STATE_OCCUPANCY_CORES_C0", - "Experimental": "1", "PerPkg": "1", "PublicDescription": "Number of cores in C0 : This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", "Unit": "PCU" @@ -198,7 +197,6 @@ "Counter": "0,1,2,3", "EventCode": "0x37", "EventName": "UNC_P_POWER_STATE_OCCUPANCY_CORES_C6", - "Experimental": "1", "PerPkg": "1", "PublicDescription": "Number of cores in C6 : This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", "Unit": "PCU" diff --git a/tools/perf/pmu-events/arch/x86/emeraldrapids/virtual-memory.json b/tools/perf/pmu-events/arch/x86/emeraldrapids/virtual-memory.json index 3d3f88600e26..609a9549cbf3 100644 --- a/tools/perf/pmu-events/arch/x86/emeraldrapids/virtual-memory.json +++ b/tools/perf/pmu-events/arch/x86/emeraldrapids/virtual-memory.json @@ -4,7 +4,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.STLB_HIT", - "PublicDescription": "Counts loads that miss the DTLB (Data TLB) and hit the STLB (Second level TLB). Available PDIST counters: 0", + "PublicDescription": "Counts loads that miss the DTLB (Data TLB) and hit the STLB (Second level TLB).", "SampleAfterValue": "100003", "UMask": "0x20" }, @@ -14,7 +14,7 @@ "CounterMask": "1", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_ACTIVE", - "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a demand load. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a demand load.", "SampleAfterValue": "100003", "UMask": "0x10" }, @@ -23,7 +23,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED", - "PublicDescription": "Counts completed page walks (all page sizes) caused by demand data loads. This implies it missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (all page sizes) caused by demand data loads. This implies it missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0xe" }, @@ -32,7 +32,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_1G", - "PublicDescription": "Counts completed page walks (1G sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (1G sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x8" }, @@ -41,7 +41,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M", - "PublicDescription": "Counts completed page walks (2M/4M sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (2M/4M sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x4" }, @@ -50,7 +50,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_4K", - "PublicDescription": "Counts completed page walks (4K sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (4K sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -59,7 +59,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_PENDING", - "PublicDescription": "Counts the number of page walks outstanding for a demand load in the PMH (Page Miss Handler) each cycle. Available PDIST counters: 0", + "PublicDescription": "Counts the number of page walks outstanding for a demand load in the PMH (Page Miss Handler) each cycle.", "SampleAfterValue": "100003", "UMask": "0x10" }, @@ -68,7 +68,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.STLB_HIT", - "PublicDescription": "Counts stores that miss the DTLB (Data TLB) and hit the STLB (2nd Level TLB). Available PDIST counters: 0", + "PublicDescription": "Counts stores that miss the DTLB (Data TLB) and hit the STLB (2nd Level TLB).", "SampleAfterValue": "100003", "UMask": "0x20" }, @@ -78,7 +78,7 @@ "CounterMask": "1", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_ACTIVE", - "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a store. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a store.", "SampleAfterValue": "100003", "UMask": "0x10" }, @@ -87,7 +87,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED", - "PublicDescription": "Counts completed page walks (all page sizes) caused by demand data stores. This implies it missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (all page sizes) caused by demand data stores. This implies it missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0xe" }, @@ -96,7 +96,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_1G", - "PublicDescription": "Counts completed page walks (1G sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (1G sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x8" }, @@ -105,7 +105,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M", - "PublicDescription": "Counts completed page walks (2M/4M sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (2M/4M sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x4" }, @@ -114,7 +114,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_4K", - "PublicDescription": "Counts completed page walks (4K sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (4K sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -123,7 +123,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_PENDING", - "PublicDescription": "Counts the number of page walks outstanding for a store in the PMH (Page Miss Handler) each cycle. Available PDIST counters: 0", + "PublicDescription": "Counts the number of page walks outstanding for a store in the PMH (Page Miss Handler) each cycle.", "SampleAfterValue": "100003", "UMask": "0x10" }, @@ -132,7 +132,7 @@ "Counter": "0,1,2,3", "EventCode": "0x11", "EventName": "ITLB_MISSES.STLB_HIT", - "PublicDescription": "Counts instruction fetch requests that miss the ITLB (Instruction TLB) and hit the STLB (Second-level TLB). Available PDIST counters: 0", + "PublicDescription": "Counts instruction fetch requests that miss the ITLB (Instruction TLB) and hit the STLB (Second-level TLB).", "SampleAfterValue": "100003", "UMask": "0x20" }, @@ -142,7 +142,7 @@ "CounterMask": "1", "EventCode": "0x11", "EventName": "ITLB_MISSES.WALK_ACTIVE", - "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a code (instruction fetch) request. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a code (instruction fetch) request.", "SampleAfterValue": "100003", "UMask": "0x10" }, @@ -151,7 +151,7 @@ "Counter": "0,1,2,3", "EventCode": "0x11", "EventName": "ITLB_MISSES.WALK_COMPLETED", - "PublicDescription": "Counts completed page walks (all page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (all page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0xe" }, @@ -160,7 +160,7 @@ "Counter": "0,1,2,3", "EventCode": "0x11", "EventName": "ITLB_MISSES.WALK_COMPLETED_2M_4M", - "PublicDescription": "Counts completed page walks (2M/4M page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (2M/4M page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x4" }, @@ -169,7 +169,7 @@ "Counter": "0,1,2,3", "EventCode": "0x11", "EventName": "ITLB_MISSES.WALK_COMPLETED_4K", - "PublicDescription": "Counts completed page walks (4K page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (4K page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -178,7 +178,7 @@ "Counter": "0,1,2,3", "EventCode": "0x11", "EventName": "ITLB_MISSES.WALK_PENDING", - "PublicDescription": "Counts the number of page walks outstanding for an outstanding code (instruction fetch) request in the PMH (Page Miss Handler) each cycle. Available PDIST counters: 0", + "PublicDescription": "Counts the number of page walks outstanding for an outstanding code (instruction fetch) request in the PMH (Page Miss Handler) each cycle.", "SampleAfterValue": "100003", "UMask": "0x10" } diff --git a/tools/perf/pmu-events/arch/x86/grandridge/cache.json b/tools/perf/pmu-events/arch/x86/grandridge/cache.json index 877052db1490..9abddb06a837 100644 --- a/tools/perf/pmu-events/arch/x86/grandridge/cache.json +++ b/tools/perf/pmu-events/arch/x86/grandridge/cache.json @@ -286,7 +286,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_1024", @@ -297,7 +297,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_128", @@ -308,7 +308,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_16", @@ -319,7 +319,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_2048", @@ -330,7 +330,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_256", @@ -341,7 +341,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_32", @@ -352,7 +352,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_4", @@ -363,7 +363,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_512", @@ -374,7 +374,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_64", @@ -385,7 +385,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_8", diff --git a/tools/perf/pmu-events/arch/x86/grandridge/grr-metrics.json b/tools/perf/pmu-events/arch/x86/grandridge/grr-metrics.json index 878b1caf12de..a0d637a24c1b 100644 --- a/tools/perf/pmu-events/arch/x86/grandridge/grr-metrics.json +++ b/tools/perf/pmu-events/arch/x86/grandridge/grr-metrics.json @@ -1,56 +1,56 @@ [ { "BriefDescription": "C10 residency percent per package", - "MetricExpr": "cstate_pkg@c10\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c10\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C10_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C1 residency percent per core", - "MetricExpr": "cstate_core@c1\\-residency@ / TSC", + "MetricExpr": "cstate_core@c1\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C1_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C2 residency percent per package", - "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c2\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C2_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per package", - "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per core", - "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricExpr": "cstate_core@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per package", - "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per core", - "MetricExpr": "cstate_core@c7\\-residency@ / TSC", + "MetricExpr": "cstate_core@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C8 residency percent per package", - "MetricExpr": "cstate_pkg@c8\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c8\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C8_Pkg_Residency", "ScaleUnit": "100%" @@ -633,7 +633,7 @@ }, { "BriefDescription": "Average CPU Utilization", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", "MetricName": "tma_info_system_cpu_utilization" }, { @@ -645,7 +645,7 @@ }, { "BriefDescription": "Fraction of cycles spent in Kernel mode", - "MetricExpr": "cpu@CPU_CLK_UNHALTED.CORE_P@k / CPU_CLK_UNHALTED.CORE", + "MetricExpr": "CPU_CLK_UNHALTED.CORE_P:k / CPU_CLK_UNHALTED.CORE", "MetricGroup": "Summary", "MetricName": "tma_info_system_kernel_utilization" }, diff --git a/tools/perf/pmu-events/arch/x86/graniterapids/cache.json b/tools/perf/pmu-events/arch/x86/graniterapids/cache.json index dbdeade6fe6f..7edb73583b07 100644 --- a/tools/perf/pmu-events/arch/x86/graniterapids/cache.json +++ b/tools/perf/pmu-events/arch/x86/graniterapids/cache.json @@ -4,7 +4,6 @@ "Counter": "0,1,2,3", "EventCode": "0x51", "EventName": "L1D.HWPF_MISS", - "PublicDescription": "L1D.HWPF_MISS Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x20" }, @@ -13,7 +12,7 @@ "Counter": "0,1,2,3", "EventCode": "0x51", "EventName": "L1D.REPLACEMENT", - "PublicDescription": "Counts L1D data line replacements including opportunistic replacements, and replacements that require stall-for-replace or block-for-replace. Available PDIST counters: 0", + "PublicDescription": "Counts L1D data line replacements including opportunistic replacements, and replacements that require stall-for-replace or block-for-replace.", "SampleAfterValue": "100003", "UMask": "0x1" }, @@ -22,7 +21,7 @@ "Counter": "0,1,2,3", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.FB_FULL", - "PublicDescription": "Counts number of cycles a demand request has waited due to L1D Fill Buffer (FB) unavailability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses. Available PDIST counters: 0", + "PublicDescription": "Counts number of cycles a demand request has waited due to L1D Fill Buffer (FB) unavailability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.", "SampleAfterValue": "1000003", "UMask": "0x2" }, @@ -33,7 +32,7 @@ "EdgeDetect": "1", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.FB_FULL_PERIODS", - "PublicDescription": "Counts number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses. Available PDIST counters: 0", + "PublicDescription": "Counts number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.", "SampleAfterValue": "1000003", "UMask": "0x2" }, @@ -42,7 +41,7 @@ "Counter": "0,1,2,3", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.L2_STALLS", - "PublicDescription": "Counts number of cycles a demand request has waited due to L1D due to lack of L2 resources. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses. Available PDIST counters: 0", + "PublicDescription": "Counts number of cycles a demand request has waited due to L1D due to lack of L2 resources. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.", "SampleAfterValue": "1000003", "UMask": "0x4" }, @@ -51,7 +50,7 @@ "Counter": "0,1,2,3", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.PENDING", - "PublicDescription": "Counts number of L1D misses that are outstanding in each cycle, that is each cycle the number of Fill Buffers (FB) outstanding required by Demand Reads. FB either is held by demand loads, or it is held by non-demand loads and gets hit at least once by demand. The valid outstanding interval is defined until the FB deallocation by one of the following ways: from FB allocation, if FB is allocated by demand from the demand Hit FB, if it is allocated by hardware or software prefetch. Note: In the L1D, a Demand Read contains cacheable or noncacheable demand loads, including ones causing cache-line splits and reads due to page walks resulted from any request type. Available PDIST counters: 0", + "PublicDescription": "Counts number of L1D misses that are outstanding in each cycle, that is each cycle the number of Fill Buffers (FB) outstanding required by Demand Reads. FB either is held by demand loads, or it is held by non-demand loads and gets hit at least once by demand. The valid outstanding interval is defined until the FB deallocation by one of the following ways: from FB allocation, if FB is allocated by demand from the demand Hit FB, if it is allocated by hardware or software prefetch. Note: In the L1D, a Demand Read contains cacheable or noncacheable demand loads, including ones causing cache-line splits and reads due to page walks resulted from any request type.", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -61,7 +60,7 @@ "CounterMask": "1", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.PENDING_CYCLES", - "PublicDescription": "Counts duration of L1D miss outstanding in cycles. Available PDIST counters: 0", + "PublicDescription": "Counts duration of L1D miss outstanding in cycles.", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -70,7 +69,7 @@ "Counter": "0,1,2,3", "EventCode": "0x25", "EventName": "L2_LINES_IN.ALL", - "PublicDescription": "Counts the number of L2 cache lines filling the L2. Counting does not cover rejects. Available PDIST counters: 0", + "PublicDescription": "Counts the number of L2 cache lines filling the L2. Counting does not cover rejects.", "SampleAfterValue": "100003", "UMask": "0x1f" }, @@ -79,7 +78,7 @@ "Counter": "0,1,2,3", "EventCode": "0x26", "EventName": "L2_LINES_OUT.NON_SILENT", - "PublicDescription": "Counts the number of lines that are evicted by L2 cache when triggered by an L2 cache fill. Those lines are in Modified state. Modified lines are written back to L3 Available PDIST counters: 0", + "PublicDescription": "Counts the number of lines that are evicted by L2 cache when triggered by an L2 cache fill. Those lines are in Modified state. Modified lines are written back to L3", "SampleAfterValue": "200003", "UMask": "0x2" }, @@ -88,7 +87,7 @@ "Counter": "0,1,2,3", "EventCode": "0x26", "EventName": "L2_LINES_OUT.SILENT", - "PublicDescription": "Counts the number of lines that are silently dropped by L2 cache. These lines are typically in Shared or Exclusive state. A non-threaded event. Available PDIST counters: 0", + "PublicDescription": "Counts the number of lines that are silently dropped by L2 cache. These lines are typically in Shared or Exclusive state. A non-threaded event.", "SampleAfterValue": "200003", "UMask": "0x1" }, @@ -97,7 +96,7 @@ "Counter": "0,1,2,3", "EventCode": "0x26", "EventName": "L2_LINES_OUT.USELESS_HWPF", - "PublicDescription": "Counts the number of cache lines that have been prefetched by the L2 hardware prefetcher but not used by demand access when evicted from the L2 cache Available PDIST counters: 0", + "PublicDescription": "Counts the number of cache lines that have been prefetched by the L2 hardware prefetcher but not used by demand access when evicted from the L2 cache", "SampleAfterValue": "200003", "UMask": "0x4" }, @@ -106,7 +105,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_REQUEST.ALL", - "PublicDescription": "Counts all requests that were hit or true misses in L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. [This event is alias to L2_RQSTS.REFERENCES] Available PDIST counters: 0", + "PublicDescription": "Counts all requests that were hit or true misses in L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. [This event is alias to L2_RQSTS.REFERENCES]", "SampleAfterValue": "200003", "UMask": "0xff" }, @@ -115,7 +114,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_REQUEST.HIT", - "PublicDescription": "Counts all requests that hit L2 cache. [This event is alias to L2_RQSTS.HIT] Available PDIST counters: 0", + "PublicDescription": "Counts all requests that hit L2 cache. [This event is alias to L2_RQSTS.HIT]", "SampleAfterValue": "200003", "UMask": "0xdf" }, @@ -124,7 +123,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_REQUEST.MISS", - "PublicDescription": "Counts read requests of any type with true-miss in the L2 cache. True-miss excludes L2 misses that were merged with ongoing L2 misses. [This event is alias to L2_RQSTS.MISS] Available PDIST counters: 0", + "PublicDescription": "Counts read requests of any type with true-miss in the L2 cache. True-miss excludes L2 misses that were merged with ongoing L2 misses. [This event is alias to L2_RQSTS.MISS]", "SampleAfterValue": "200003", "UMask": "0x3f" }, @@ -133,7 +132,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_CODE_RD", - "PublicDescription": "Counts the total number of L2 code requests. Available PDIST counters: 0", + "PublicDescription": "Counts the total number of L2 code requests.", "SampleAfterValue": "200003", "UMask": "0xe4" }, @@ -142,7 +141,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_DEMAND_DATA_RD", - "PublicDescription": "Counts Demand Data Read requests accessing the L2 cache. These requests may hit or miss L2 cache. True-miss exclude misses that were merged with ongoing L2 misses. An access is counted once. Available PDIST counters: 0", + "PublicDescription": "Counts Demand Data Read requests accessing the L2 cache. These requests may hit or miss L2 cache. True-miss exclude misses that were merged with ongoing L2 misses. An access is counted once.", "SampleAfterValue": "200003", "UMask": "0xe1" }, @@ -151,7 +150,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_DEMAND_MISS", - "PublicDescription": "Counts demand requests that miss L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts demand requests that miss L2 cache.", "SampleAfterValue": "200003", "UMask": "0x27" }, @@ -160,7 +159,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_DEMAND_REFERENCES", - "PublicDescription": "Counts demand requests to L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts demand requests to L2 cache.", "SampleAfterValue": "200003", "UMask": "0xe7" }, @@ -169,7 +168,6 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_HWPF", - "PublicDescription": "L2_RQSTS.ALL_HWPF Available PDIST counters: 0", "SampleAfterValue": "200003", "UMask": "0xf0" }, @@ -178,7 +176,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_RFO", - "PublicDescription": "Counts the total number of RFO (read for ownership) requests to L2 cache. L2 RFO requests include both L1D demand RFO misses as well as L1D RFO prefetches. Available PDIST counters: 0", + "PublicDescription": "Counts the total number of RFO (read for ownership) requests to L2 cache. L2 RFO requests include both L1D demand RFO misses as well as L1D RFO prefetches.", "SampleAfterValue": "200003", "UMask": "0xe2" }, @@ -187,7 +185,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.CODE_RD_HIT", - "PublicDescription": "Counts L2 cache hits when fetching instructions, code reads. Available PDIST counters: 0", + "PublicDescription": "Counts L2 cache hits when fetching instructions, code reads.", "SampleAfterValue": "200003", "UMask": "0xc4" }, @@ -196,7 +194,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.CODE_RD_MISS", - "PublicDescription": "Counts L2 cache misses when fetching instructions. Available PDIST counters: 0", + "PublicDescription": "Counts L2 cache misses when fetching instructions.", "SampleAfterValue": "200003", "UMask": "0x24" }, @@ -205,7 +203,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.DEMAND_DATA_RD_HIT", - "PublicDescription": "Counts the number of demand Data Read requests initiated by load instructions that hit L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts the number of demand Data Read requests initiated by load instructions that hit L2 cache.", "SampleAfterValue": "200003", "UMask": "0xc1" }, @@ -214,7 +212,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.DEMAND_DATA_RD_MISS", - "PublicDescription": "Counts demand Data Read requests with true-miss in the L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. An access is counted once. Available PDIST counters: 0", + "PublicDescription": "Counts demand Data Read requests with true-miss in the L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. An access is counted once.", "SampleAfterValue": "200003", "UMask": "0x21" }, @@ -223,7 +221,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.HIT", - "PublicDescription": "Counts all requests that hit L2 cache. [This event is alias to L2_REQUEST.HIT] Available PDIST counters: 0", + "PublicDescription": "Counts all requests that hit L2 cache. [This event is alias to L2_REQUEST.HIT]", "SampleAfterValue": "200003", "UMask": "0xdf" }, @@ -232,7 +230,6 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.HWPF_MISS", - "PublicDescription": "L2_RQSTS.HWPF_MISS Available PDIST counters: 0", "SampleAfterValue": "200003", "UMask": "0x30" }, @@ -241,7 +238,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.MISS", - "PublicDescription": "Counts read requests of any type with true-miss in the L2 cache. True-miss excludes L2 misses that were merged with ongoing L2 misses. [This event is alias to L2_REQUEST.MISS] Available PDIST counters: 0", + "PublicDescription": "Counts read requests of any type with true-miss in the L2 cache. True-miss excludes L2 misses that were merged with ongoing L2 misses. [This event is alias to L2_REQUEST.MISS]", "SampleAfterValue": "200003", "UMask": "0x3f" }, @@ -250,7 +247,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.REFERENCES", - "PublicDescription": "Counts all requests that were hit or true misses in L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. [This event is alias to L2_REQUEST.ALL] Available PDIST counters: 0", + "PublicDescription": "Counts all requests that were hit or true misses in L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. [This event is alias to L2_REQUEST.ALL]", "SampleAfterValue": "200003", "UMask": "0xff" }, @@ -259,7 +256,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.RFO_HIT", - "PublicDescription": "Counts the RFO (Read-for-Ownership) requests that hit L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts the RFO (Read-for-Ownership) requests that hit L2 cache.", "SampleAfterValue": "200003", "UMask": "0xc2" }, @@ -268,7 +265,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.RFO_MISS", - "PublicDescription": "Counts the RFO (Read-for-Ownership) requests that miss L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts the RFO (Read-for-Ownership) requests that miss L2 cache.", "SampleAfterValue": "200003", "UMask": "0x22" }, @@ -277,7 +274,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.SWPF_HIT", - "PublicDescription": "Counts Software prefetch requests that hit the L2 cache. Accounts for PREFETCHNTA and PREFETCHT0/1/2 instructions when FB is not full. Available PDIST counters: 0", + "PublicDescription": "Counts Software prefetch requests that hit the L2 cache. Accounts for PREFETCHNTA and PREFETCHT0/1/2 instructions when FB is not full.", "SampleAfterValue": "200003", "UMask": "0xc8" }, @@ -286,7 +283,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.SWPF_MISS", - "PublicDescription": "Counts Software prefetch requests that miss the L2 cache. Accounts for PREFETCHNTA and PREFETCHT0/1/2 instructions when FB is not full. Available PDIST counters: 0", + "PublicDescription": "Counts Software prefetch requests that miss the L2 cache. Accounts for PREFETCHNTA and PREFETCHT0/1/2 instructions when FB is not full.", "SampleAfterValue": "200003", "UMask": "0x28" }, @@ -295,7 +292,7 @@ "Counter": "0,1,2,3", "EventCode": "0x23", "EventName": "L2_TRANS.L2_WB", - "PublicDescription": "Counts L2 writebacks that access L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts L2 writebacks that access L2 cache.", "SampleAfterValue": "200003", "UMask": "0x40" }, @@ -304,7 +301,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x2e", "EventName": "LONGEST_LAT_CACHE.MISS", - "PublicDescription": "Counts core-originated cacheable requests that miss the L3 cache (Longest Latency cache). Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches to the L1 and L2. It does not include hardware prefetches to the L3, and may not count other types of requests to the L3. Available PDIST counters: 0", + "PublicDescription": "Counts core-originated cacheable requests that miss the L3 cache (Longest Latency cache). Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches to the L1 and L2. It does not include hardware prefetches to the L3, and may not count other types of requests to the L3.", "SampleAfterValue": "100003", "UMask": "0x41" }, @@ -313,7 +310,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x2e", "EventName": "LONGEST_LAT_CACHE.REFERENCE", - "PublicDescription": "Counts core-originated cacheable requests to the L3 cache (Longest Latency cache). Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches to the L1 and L2. It does not include hardware prefetches to the L3, and may not count other types of requests to the L3. Available PDIST counters: 0", + "PublicDescription": "Counts core-originated cacheable requests to the L3 cache (Longest Latency cache). Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches to the L1 and L2. It does not include hardware prefetches to the L3, and may not count other types of requests to the L3.", "SampleAfterValue": "100003", "UMask": "0x4f" }, @@ -437,7 +434,7 @@ "Counter": "0,1,2,3", "EventCode": "0x43", "EventName": "MEM_LOAD_COMPLETED.L1_MISS_ANY", - "PublicDescription": "Number of completed demand load requests that missed the L1 data cache including shadow misses (FB hits, merge to an ongoing L1D miss) Available PDIST counters: 0", + "PublicDescription": "Number of completed demand load requests that missed the L1 data cache including shadow misses (FB hits, merge to an ongoing L1D miss)", "SampleAfterValue": "1000003", "UMask": "0xfd" }, @@ -504,6 +501,15 @@ "UMask": "0x1" }, { + "BriefDescription": "Retired load instructions with remote cxl mem as the data source where the data request missed all caches.", + "Counter": "0,1,2,3", + "EventCode": "0xd3", + "EventName": "MEM_LOAD_L3_MISS_RETIRED.REMOTE_CXL_MEM", + "PublicDescription": "Counts retired load instructions with remote cxl mem as the data source and the data request missed L3. Available PDIST counters: 0", + "SampleAfterValue": "100007", + "UMask": "0x10" + }, + { "BriefDescription": "MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM", "Counter": "0,1,2,3", "Data_LA": "1", @@ -629,11 +635,20 @@ "UMask": "0x20" }, { + "BriefDescription": "Retired load instructions with local cxl mem as the data source where the data request missed all caches.", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd1", + "EventName": "MEM_LOAD_RETIRED.LOCAL_CXL_MEM", + "PublicDescription": "Counts retired load instructions with local cxl mem as the data source and the data request missed L3. Available PDIST counters: 0", + "SampleAfterValue": "1000003", + "UMask": "0x80" + }, + { "BriefDescription": "MEM_STORE_RETIRED.L2_HIT", "Counter": "0,1,2,3", "EventCode": "0x44", "EventName": "MEM_STORE_RETIRED.L2_HIT", - "PublicDescription": "MEM_STORE_RETIRED.L2_HIT Available PDIST counters: 0", "SampleAfterValue": "200003", "UMask": "0x1" }, @@ -642,7 +657,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe5", "EventName": "MEM_UOP_RETIRED.ANY", - "PublicDescription": "Number of retired micro-operations (uops) for load or store memory accesses Available PDIST counters: 0", + "PublicDescription": "Number of retired micro-operations (uops) for load or store memory accesses", "SampleAfterValue": "1000003", "UMask": "0x3" }, @@ -691,6 +706,17 @@ "UMask": "0x1" }, { + "BriefDescription": "Counts demand data reads that were supplied by CXL MEM (Type 2 or Type 3).", + "Counter": "0,1,2,3", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.DEMAND_DATA_RD.CXL_MEM", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x703C00001", + "PublicDescription": "Counts demand data reads that were supplied by CXL MEM (Type 2 or Type 3). Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1" + }, + { "BriefDescription": "Counts demand data reads that hit in the L3 or were snooped from another core's caches on the same socket.", "Counter": "0,1,2,3", "EventCode": "0x2A,0x2B", @@ -735,6 +761,17 @@ "UMask": "0x1" }, { + "BriefDescription": "Counts demand data reads that were supplied by CXL MEM (Type 2 and Type 3) attached to local socket.", + "Counter": "0,1,2,3", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.DEMAND_DATA_RD.LOCAL_CXL_MEM", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x700C00001", + "PublicDescription": "Counts demand data reads that were supplied by CXL MEM (Type 2 and Type 3) attached to local socket. Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1" + }, + { "BriefDescription": "Counts demand data reads that were supplied by a cache on a remote socket where a snoop hit a modified line in another core's caches which forwarded the data.", "Counter": "0,1,2,3", "EventCode": "0x2A,0x2B", @@ -757,6 +794,17 @@ "UMask": "0x1" }, { + "BriefDescription": "Counts demand data reads that were supplied by CXL MEM (Type 2 or Type 3) attached to another socket.", + "Counter": "0,1,2,3", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.DEMAND_DATA_RD.REMOTE_CXL_MEM", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x703000001", + "PublicDescription": "Counts demand data reads that were supplied by CXL MEM (Type 2 or Type 3) attached to another socket. Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1" + }, + { "BriefDescription": "Counts demand data reads that hit a modified line in a distant L3 Cache or were snooped from a distant core's L1/L2 caches on this socket when the system is in SNC (sub-NUMA cluster) mode.", "Counter": "0,1,2,3", "EventCode": "0x2A,0x2B", @@ -790,6 +838,17 @@ "UMask": "0x1" }, { + "BriefDescription": "Counts demand reads for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that were supplied by CXL MEM (Type 2 or Type 3).", + "Counter": "0,1,2,3", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.DEMAND_RFO.CXL_MEM", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x703C00002", + "PublicDescription": "Counts demand reads for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that were supplied by CXL MEM (Type 2 or Type 3). Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1" + }, + { "BriefDescription": "Counts demand reads for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that hit in the L3 or were snooped from another core's caches on the same socket.", "Counter": "0,1,2,3", "EventCode": "0x2A,0x2B", @@ -812,6 +871,28 @@ "UMask": "0x1" }, { + "BriefDescription": "Counts demand reads for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that were supplied by CXL MEM (Type 2 and Type 3) attached to local socket.", + "Counter": "0,1,2,3", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.DEMAND_RFO.LOCAL_CXL_MEM", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x700C00002", + "PublicDescription": "Counts demand reads for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that were supplied by CXL MEM (Type 2 and Type 3) attached to local socket. Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1" + }, + { + "BriefDescription": "Counts demand reads for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that were supplied by CXL MEM (Type 2 or Type 3) attached to another socket.", + "Counter": "0,1,2,3", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.DEMAND_RFO.REMOTE_CXL_MEM", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x703000002", + "PublicDescription": "Counts demand reads for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that were supplied by CXL MEM (Type 2 or Type 3) attached to another socket. Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1" + }, + { "BriefDescription": "Counts writebacks of modified cachelines and streaming stores that have any type of response.", "Counter": "0,1,2,3", "EventCode": "0x2A,0x2B", @@ -834,6 +915,17 @@ "UMask": "0x1" }, { + "BriefDescription": "Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that were supplied by CXL MEM (Type 2 or Type 3).", + "Counter": "0,1,2,3", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.READS_TO_CORE.CXL_MEM", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x703C04477", + "PublicDescription": "Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that were supplied by CXL MEM (Type 2 or Type 3). Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1" + }, + { "BriefDescription": "Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that hit in the L3 or were snooped from another core's caches on the same socket.", "Counter": "0,1,2,3", "EventCode": "0x2A,0x2B", @@ -856,6 +948,17 @@ "UMask": "0x1" }, { + "BriefDescription": "Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that were supplied by CXL MEM (Type 2 and Type 3) attached to local socket.", + "Counter": "0,1,2,3", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.READS_TO_CORE.LOCAL_CXL_MEM", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x700C04477", + "PublicDescription": "Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that were supplied by CXL MEM (Type 2 and Type 3) attached to local socket. Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1" + }, + { "BriefDescription": "Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that were not supplied by the local socket's L1, L2, or L3 caches and were supplied by a remote socket.", "Counter": "0,1,2,3", "EventCode": "0x2A,0x2B", @@ -900,6 +1003,17 @@ "UMask": "0x1" }, { + "BriefDescription": "Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that were supplied by CXL MEM (Type 2 or Type 3) attached to another socket.", + "Counter": "0,1,2,3", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.READS_TO_CORE.REMOTE_CXL_MEM", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x703004477", + "PublicDescription": "Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that were supplied by CXL MEM (Type 2 or Type 3) attached to another socket. Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1" + }, + { "BriefDescription": "Counts all (cacheable) data read, code read and RFO requests including demands and prefetches to the core caches (L1 or L2) that hit a modified line in a distant L3 Cache or were snooped from a distant core's L1/L2 caches on this socket when the system is in SNC (sub-NUMA cluster) mode.", "Counter": "0,1,2,3", "EventCode": "0x2A,0x2B", @@ -937,7 +1051,7 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.ALL_REQUESTS", - "PublicDescription": "Counts memory transactions reached the super queue including requests initiated by the core, all L3 prefetches, page walks, etc.. Available PDIST counters: 0", + "PublicDescription": "Counts memory transactions reached the super queue including requests initiated by the core, all L3 prefetches, page walks, etc..", "SampleAfterValue": "100003", "UMask": "0x80" }, @@ -946,7 +1060,7 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.DATA_RD", - "PublicDescription": "Counts the demand and prefetch data reads. All Core Data Reads include cacheable 'Demands' and L2 prefetchers (not L3 prefetchers). Counting also covers reads due to page walks resulted from any request type. Available PDIST counters: 0", + "PublicDescription": "Counts the demand and prefetch data reads. All Core Data Reads include cacheable 'Demands' and L2 prefetchers (not L3 prefetchers). Counting also covers reads due to page walks resulted from any request type.", "SampleAfterValue": "100003", "UMask": "0x8" }, @@ -955,7 +1069,7 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.DEMAND_CODE_RD", - "PublicDescription": "Counts both cacheable and Non-Cacheable code read requests. Available PDIST counters: 0", + "PublicDescription": "Counts both cacheable and Non-Cacheable code read requests.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -964,7 +1078,7 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.DEMAND_DATA_RD", - "PublicDescription": "Counts the Demand Data Read requests sent to uncore. Use it in conjunction with OFFCORE_REQUESTS_OUTSTANDING to determine average latency in the uncore. Available PDIST counters: 0", + "PublicDescription": "Counts the Demand Data Read requests sent to uncore. Use it in conjunction with OFFCORE_REQUESTS_OUTSTANDING to determine average latency in the uncore.", "SampleAfterValue": "100003", "UMask": "0x1" }, @@ -973,7 +1087,7 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.DEMAND_RFO", - "PublicDescription": "Counts the demand RFO (read for ownership) requests including regular RFOs, locks, ItoM. Available PDIST counters: 0", + "PublicDescription": "Counts the demand RFO (read for ownership) requests including regular RFOs, locks, ItoM.", "SampleAfterValue": "100003", "UMask": "0x4" }, @@ -982,7 +1096,7 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.MEM_UC", - "PublicDescription": "This event counts noncacheable memory data read transactions. Available PDIST counters: 0", + "PublicDescription": "This event counts noncacheable memory data read transactions.", "SampleAfterValue": "100003", "UMask": "0x20" }, @@ -992,7 +1106,7 @@ "CounterMask": "1", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", - "PublicDescription": "Counts cycles when offcore outstanding cacheable Core Data Read transactions are present in the super queue. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). See corresponding Umask under OFFCORE_REQUESTS. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when offcore outstanding cacheable Core Data Read transactions are present in the super queue. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). See corresponding Umask under OFFCORE_REQUESTS.", "SampleAfterValue": "1000003", "UMask": "0x8" }, @@ -1002,7 +1116,7 @@ "CounterMask": "1", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_CODE_RD", - "PublicDescription": "Counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS. Available PDIST counters: 0", + "PublicDescription": "Counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS.", "SampleAfterValue": "1000003", "UMask": "0x2" }, @@ -1012,7 +1126,6 @@ "CounterMask": "1", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", - "PublicDescription": "Cycles where at least 1 outstanding demand data read request is pending. Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1022,7 +1135,7 @@ "CounterMask": "1", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO", - "PublicDescription": "Counts the number of offcore outstanding demand rfo Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS. Available PDIST counters: 0", + "PublicDescription": "Counts the number of offcore outstanding demand rfo Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS.", "SampleAfterValue": "1000003", "UMask": "0x4" }, @@ -1031,7 +1144,6 @@ "Counter": "0,1,2,3", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DATA_RD", - "PublicDescription": "OFFCORE_REQUESTS_OUTSTANDING.DATA_RD Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8" }, @@ -1040,7 +1152,7 @@ "Counter": "0,1,2,3", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_CODE_RD", - "PublicDescription": "Counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS. Available PDIST counters: 0", + "PublicDescription": "Counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS.", "SampleAfterValue": "1000003", "UMask": "0x2" }, @@ -1049,7 +1161,7 @@ "Counter": "0,1,2,3", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD", - "PublicDescription": "For every cycle, increments by the number of outstanding demand data read requests pending. Requests are considered outstanding from the time they miss the core's L2 cache until the transaction completion message is sent to the requestor. Available PDIST counters: 0", + "PublicDescription": "For every cycle, increments by the number of outstanding demand data read requests pending. Requests are considered outstanding from the time they miss the core's L2 cache until the transaction completion message is sent to the requestor.", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -1058,7 +1170,7 @@ "Counter": "0,1,2,3", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_RFO", - "PublicDescription": "Counts the number of off-core outstanding read-for-ownership (RFO) store transactions every cycle. An RFO transaction is considered to be in the Off-core outstanding state between L2 cache miss and transaction completion. Available PDIST counters: 0", + "PublicDescription": "Counts the number of off-core outstanding read-for-ownership (RFO) store transactions every cycle. An RFO transaction is considered to be in the Off-core outstanding state between L2 cache miss and transaction completion.", "SampleAfterValue": "1000003", "UMask": "0x4" }, @@ -1067,7 +1179,7 @@ "Counter": "0,1,2,3", "EventCode": "0x2c", "EventName": "SQ_MISC.BUS_LOCK", - "PublicDescription": "Counts the more expensive bus lock needed to enforce cache coherency for certain memory accesses that need to be done atomically. Can be created by issuing an atomic instruction (via the LOCK prefix) which causes a cache line split or accesses uncacheable memory. Available PDIST counters: 0", + "PublicDescription": "Counts the more expensive bus lock needed to enforce cache coherency for certain memory accesses that need to be done atomically. Can be created by issuing an atomic instruction (via the LOCK prefix) which causes a cache line split or accesses uncacheable memory.", "SampleAfterValue": "100003", "UMask": "0x10" }, @@ -1076,7 +1188,6 @@ "Counter": "0,1,2,3", "EventCode": "0x40", "EventName": "SW_PREFETCH_ACCESS.ANY", - "PublicDescription": "Counts the number of PREFETCHNTA, PREFETCHW, PREFETCHT0, PREFETCHT1 or PREFETCHT2 instructions executed. Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0xf" }, @@ -1085,7 +1196,7 @@ "Counter": "0,1,2,3", "EventCode": "0x40", "EventName": "SW_PREFETCH_ACCESS.NTA", - "PublicDescription": "Counts the number of PREFETCHNTA instructions executed. Available PDIST counters: 0", + "PublicDescription": "Counts the number of PREFETCHNTA instructions executed.", "SampleAfterValue": "100003", "UMask": "0x1" }, @@ -1094,7 +1205,7 @@ "Counter": "0,1,2,3", "EventCode": "0x40", "EventName": "SW_PREFETCH_ACCESS.PREFETCHW", - "PublicDescription": "Counts the number of PREFETCHW instructions executed. Available PDIST counters: 0", + "PublicDescription": "Counts the number of PREFETCHW instructions executed.", "SampleAfterValue": "100003", "UMask": "0x8" }, @@ -1103,7 +1214,7 @@ "Counter": "0,1,2,3", "EventCode": "0x40", "EventName": "SW_PREFETCH_ACCESS.T0", - "PublicDescription": "Counts the number of PREFETCHT0 instructions executed. Available PDIST counters: 0", + "PublicDescription": "Counts the number of PREFETCHT0 instructions executed.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -1112,7 +1223,7 @@ "Counter": "0,1,2,3", "EventCode": "0x40", "EventName": "SW_PREFETCH_ACCESS.T1_T2", - "PublicDescription": "Counts the number of PREFETCHT1 or PREFETCHT2 instructions executed. Available PDIST counters: 0", + "PublicDescription": "Counts the number of PREFETCHT1 or PREFETCHT2 instructions executed.", "SampleAfterValue": "100003", "UMask": "0x4" } diff --git a/tools/perf/pmu-events/arch/x86/graniterapids/floating-point.json b/tools/perf/pmu-events/arch/x86/graniterapids/floating-point.json index 1832dd952f66..59789eee060c 100644 --- a/tools/perf/pmu-events/arch/x86/graniterapids/floating-point.json +++ b/tools/perf/pmu-events/arch/x86/graniterapids/floating-point.json @@ -5,7 +5,6 @@ "CounterMask": "1", "EventCode": "0xb0", "EventName": "ARITH.FPDIV_ACTIVE", - "PublicDescription": "This event counts the cycles the floating point divider is busy. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -14,7 +13,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc1", "EventName": "ASSISTS.FP", - "PublicDescription": "Counts all microcode Floating Point assists. Available PDIST counters: 0", + "PublicDescription": "Counts all microcode Floating Point assists.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -23,7 +22,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc1", "EventName": "ASSISTS.SSE_AVX_MIX", - "PublicDescription": "ASSISTS.SSE_AVX_MIX Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x10" }, @@ -32,7 +30,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.PORT_0", - "PublicDescription": "FP_ARITH_DISPATCHED.PORT_0 [This event is alias to FP_ARITH_DISPATCHED.V0] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -41,7 +38,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.PORT_1", - "PublicDescription": "FP_ARITH_DISPATCHED.PORT_1 [This event is alias to FP_ARITH_DISPATCHED.V1] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -50,7 +46,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.PORT_5", - "PublicDescription": "FP_ARITH_DISPATCHED.PORT_5 [This event is alias to FP_ARITH_DISPATCHED.V2] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x4" }, @@ -59,7 +54,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.V0", - "PublicDescription": "FP_ARITH_DISPATCHED.V0 [This event is alias to FP_ARITH_DISPATCHED.PORT_0] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -68,7 +62,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.V1", - "PublicDescription": "FP_ARITH_DISPATCHED.V1 [This event is alias to FP_ARITH_DISPATCHED.PORT_1] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -77,7 +70,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.V2", - "PublicDescription": "FP_ARITH_DISPATCHED.V2 [This event is alias to FP_ARITH_DISPATCHED.PORT_5] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x4" }, @@ -86,7 +78,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", - "PublicDescription": "Number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x4" }, @@ -95,7 +87,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", - "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x8" }, @@ -104,7 +96,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", - "PublicDescription": "Number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x10" }, @@ -113,7 +105,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", - "PublicDescription": "Number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x20" }, @@ -122,7 +114,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.4_FLOPS", - "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision and 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point and packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision and 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point and packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x18" }, @@ -131,7 +123,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE", - "PublicDescription": "Number of SSE/AVX computational 512-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT14 RCP14 FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 512-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT14 RCP14 FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x40" }, @@ -140,7 +132,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", - "PublicDescription": "Number of SSE/AVX computational 512-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 16 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT14 RCP14 FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 512-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 16 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT14 RCP14 FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x80" }, @@ -149,7 +141,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.8_FLOPS", - "PublicDescription": "Number of SSE/AVX computational 256-bit packed single precision and 512-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed single precision and double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RSQRT14 RCP RCP14 DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 256-bit packed single precision and 512-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed single precision and double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RSQRT14 RCP RCP14 DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x60" }, @@ -158,7 +150,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.SCALAR", - "PublicDescription": "Number of SSE/AVX computational scalar single precision and double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational scalar single precision and double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "1000003", "UMask": "0x3" }, @@ -167,7 +159,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", - "PublicDescription": "Number of SSE/AVX computational scalar double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational scalar double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x1" }, @@ -176,7 +168,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.SCALAR_SINGLE", - "PublicDescription": "Number of SSE/AVX computational scalar single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational scalar single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -185,7 +177,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.VECTOR", - "PublicDescription": "Number of any Vector retired FP arithmetic instructions. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of any Vector retired FP arithmetic instructions. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "1000003", "UMask": "0xfc" }, @@ -194,7 +186,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xcf", "EventName": "FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", - "PublicDescription": "FP_ARITH_INST_RETIRED2.128B_PACKED_HALF Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x4" }, @@ -203,7 +194,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xcf", "EventName": "FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", - "PublicDescription": "FP_ARITH_INST_RETIRED2.256B_PACKED_HALF Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x8" }, @@ -212,7 +202,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xcf", "EventName": "FP_ARITH_INST_RETIRED2.512B_PACKED_HALF", - "PublicDescription": "FP_ARITH_INST_RETIRED2.512B_PACKED_HALF Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x10" }, @@ -221,7 +210,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xcf", "EventName": "FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF", - "PublicDescription": "FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -230,7 +218,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xcf", "EventName": "FP_ARITH_INST_RETIRED2.SCALAR", - "PublicDescription": "FP_ARITH_INST_RETIRED2.SCALAR Available PDIST counters: 0", + "PublicDescription": "FP_ARITH_INST_RETIRED2.SCALAR", "SampleAfterValue": "100003", "UMask": "0x3" }, @@ -239,7 +227,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xcf", "EventName": "FP_ARITH_INST_RETIRED2.SCALAR_HALF", - "PublicDescription": "FP_ARITH_INST_RETIRED2.SCALAR_HALF Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x1" }, @@ -248,7 +235,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xcf", "EventName": "FP_ARITH_INST_RETIRED2.VECTOR", - "PublicDescription": "FP_ARITH_INST_RETIRED2.VECTOR Available PDIST counters: 0", + "PublicDescription": "FP_ARITH_INST_RETIRED2.VECTOR", "SampleAfterValue": "100003", "UMask": "0x1c" } diff --git a/tools/perf/pmu-events/arch/x86/graniterapids/frontend.json b/tools/perf/pmu-events/arch/x86/graniterapids/frontend.json index b7cd92fbecd5..d580d305c926 100644 --- a/tools/perf/pmu-events/arch/x86/graniterapids/frontend.json +++ b/tools/perf/pmu-events/arch/x86/graniterapids/frontend.json @@ -4,7 +4,7 @@ "Counter": "0,1,2,3", "EventCode": "0x60", "EventName": "BACLEARS.ANY", - "PublicDescription": "Number of times the front-end is resteered when it finds a branch instruction in a fetch line. This is called Unknown Branch which occurs for the first time a branch instruction is fetched or when the branch is not tracked by the BPU (Branch Prediction Unit) anymore. Available PDIST counters: 0", + "PublicDescription": "Number of times the front-end is resteered when it finds a branch instruction in a fetch line. This is called Unknown Branch which occurs for the first time a branch instruction is fetched or when the branch is not tracked by the BPU (Branch Prediction Unit) anymore.", "SampleAfterValue": "100003", "UMask": "0x1" }, @@ -13,7 +13,7 @@ "Counter": "0,1,2,3", "EventCode": "0x87", "EventName": "DECODE.LCP", - "PublicDescription": "Counts cycles that the Instruction Length decoder (ILD) stalls occurred due to dynamically changing prefix length of the decoded instruction (by operand size prefix instruction 0x66, address size prefix instruction 0x67 or REX.W for Intel64). Count is proportional to the number of prefixes in a 16B-line. This may result in a three-cycle penalty for each LCP (Length changing prefix) in a 16-byte chunk. Available PDIST counters: 0", + "PublicDescription": "Counts cycles that the Instruction Length decoder (ILD) stalls occurred due to dynamically changing prefix length of the decoded instruction (by operand size prefix instruction 0x66, address size prefix instruction 0x67 or REX.W for Intel64). Count is proportional to the number of prefixes in a 16B-line. This may result in a three-cycle penalty for each LCP (Length changing prefix) in a 16-byte chunk.", "SampleAfterValue": "500009", "UMask": "0x1" }, @@ -22,7 +22,6 @@ "Counter": "0,1,2,3", "EventCode": "0x87", "EventName": "DECODE.MS_BUSY", - "PublicDescription": "Cycles the Microcode Sequencer is busy. Available PDIST counters: 0", "SampleAfterValue": "500009", "UMask": "0x2" }, @@ -31,7 +30,7 @@ "Counter": "0,1,2,3", "EventCode": "0x61", "EventName": "DSB2MITE_SWITCHES.PENALTY_CYCLES", - "PublicDescription": "Decode Stream Buffer (DSB) is a Uop-cache that holds translations of previously fetched instructions that were decoded by the legacy x86 decode pipeline (MITE). This event counts fetch penalty cycles when a transition occurs from DSB to MITE. Available PDIST counters: 0", + "PublicDescription": "Decode Stream Buffer (DSB) is a Uop-cache that holds translations of previously fetched instructions that were decoded by the legacy x86 decode pipeline (MITE). This event counts fetch penalty cycles when a transition occurs from DSB to MITE.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -303,7 +302,7 @@ "Counter": "0,1,2,3", "EventCode": "0x80", "EventName": "ICACHE_DATA.STALLS", - "PublicDescription": "Counts cycles where a code line fetch is stalled due to an L1 instruction cache miss. The decode pipeline works at a 32 Byte granularity. Available PDIST counters: 0", + "PublicDescription": "Counts cycles where a code line fetch is stalled due to an L1 instruction cache miss. The decode pipeline works at a 32 Byte granularity.", "SampleAfterValue": "500009", "UMask": "0x4" }, @@ -314,7 +313,6 @@ "EdgeDetect": "1", "EventCode": "0x80", "EventName": "ICACHE_DATA.STALL_PERIODS", - "PublicDescription": "ICACHE_DATA.STALL_PERIODS Available PDIST counters: 0", "SampleAfterValue": "500009", "UMask": "0x4" }, @@ -323,7 +321,7 @@ "Counter": "0,1,2,3", "EventCode": "0x83", "EventName": "ICACHE_TAG.STALLS", - "PublicDescription": "Counts cycles where a code fetch is stalled due to L1 instruction cache tag miss. Available PDIST counters: 0", + "PublicDescription": "Counts cycles where a code fetch is stalled due to L1 instruction cache tag miss.", "SampleAfterValue": "200003", "UMask": "0x4" }, @@ -333,7 +331,7 @@ "CounterMask": "1", "EventCode": "0x79", "EventName": "IDQ.DSB_CYCLES_ANY", - "PublicDescription": "Counts the number of cycles uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path.", "SampleAfterValue": "2000003", "UMask": "0x8" }, @@ -343,7 +341,7 @@ "CounterMask": "6", "EventCode": "0x79", "EventName": "IDQ.DSB_CYCLES_OK", - "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the DSB (Decode Stream Buffer) path. Count includes uops that may 'bypass' the IDQ. Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the DSB (Decode Stream Buffer) path. Count includes uops that may 'bypass' the IDQ.", "SampleAfterValue": "2000003", "UMask": "0x8" }, @@ -352,7 +350,7 @@ "Counter": "0,1,2,3", "EventCode": "0x79", "EventName": "IDQ.DSB_UOPS", - "PublicDescription": "Counts the number of uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path.", "SampleAfterValue": "2000003", "UMask": "0x8" }, @@ -362,7 +360,7 @@ "CounterMask": "1", "EventCode": "0x79", "EventName": "IDQ.MITE_CYCLES_ANY", - "PublicDescription": "Counts the number of cycles uops were delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB). Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles uops were delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB).", "SampleAfterValue": "2000003", "UMask": "0x4" }, @@ -372,7 +370,7 @@ "CounterMask": "6", "EventCode": "0x79", "EventName": "IDQ.MITE_CYCLES_OK", - "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB). Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB).", "SampleAfterValue": "2000003", "UMask": "0x4" }, @@ -381,7 +379,7 @@ "Counter": "0,1,2,3", "EventCode": "0x79", "EventName": "IDQ.MITE_UOPS", - "PublicDescription": "Counts the number of uops delivered to Instruction Decode Queue (IDQ) from the MITE path. This also means that uops are not being delivered from the Decode Stream Buffer (DSB). Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops delivered to Instruction Decode Queue (IDQ) from the MITE path. This also means that uops are not being delivered from the Decode Stream Buffer (DSB).", "SampleAfterValue": "2000003", "UMask": "0x4" }, @@ -391,7 +389,7 @@ "CounterMask": "1", "EventCode": "0x79", "EventName": "IDQ.MS_CYCLES_ANY", - "PublicDescription": "Counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Uops maybe initiated by Decode Stream Buffer (DSB) or MITE. Available PDIST counters: 0", + "PublicDescription": "Counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Uops maybe initiated by Decode Stream Buffer (DSB) or MITE.", "SampleAfterValue": "2000003", "UMask": "0x20" }, @@ -402,7 +400,7 @@ "EdgeDetect": "1", "EventCode": "0x79", "EventName": "IDQ.MS_SWITCHES", - "PublicDescription": "Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to the Microcode Sequencer. Available PDIST counters: 0", + "PublicDescription": "Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to the Microcode Sequencer.", "SampleAfterValue": "100003", "UMask": "0x20" }, @@ -411,7 +409,7 @@ "Counter": "0,1,2,3", "EventCode": "0x79", "EventName": "IDQ.MS_UOPS", - "PublicDescription": "Counts the number of uops initiated by MITE or Decode Stream Buffer (DSB) and delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Counting includes uops that may 'bypass' the IDQ. Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops initiated by MITE or Decode Stream Buffer (DSB) and delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Counting includes uops that may 'bypass' the IDQ.", "SampleAfterValue": "1000003", "UMask": "0x20" }, @@ -420,7 +418,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x9c", "EventName": "IDQ_BUBBLES.CORE", - "PublicDescription": "This event counts a subset of the Topdown Slots event that when no operation was delivered to the back-end pipeline due to instruction fetch limitations when the back-end could have accepted more operations. Common examples include instruction cache misses or x86 instruction decode limitations. The count may be distributed among unhalted logical processors (hyper-threads) who share the same physical core, in processors that support Intel Hyper-Threading Technology. Software can use this event as the numerator for the Frontend Bound metric (or top-level category) of the Top-down Microarchitecture Analysis method. Available PDIST counters: 0", + "PublicDescription": "This event counts a subset of the Topdown Slots event that when no operation was delivered to the back-end pipeline due to instruction fetch limitations when the back-end could have accepted more operations. Common examples include instruction cache misses or x86 instruction decode limitations. The count may be distributed among unhalted logical processors (hyper-threads) who share the same physical core, in processors that support Intel Hyper-Threading Technology. Software can use this event as the numerator for the Frontend Bound metric (or top-level category) of the Top-down Microarchitecture Analysis method.", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -430,7 +428,7 @@ "CounterMask": "6", "EventCode": "0x9c", "EventName": "IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE", - "PublicDescription": "Counts the number of cycles when no uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE] Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles when no uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE]", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -441,7 +439,7 @@ "EventCode": "0x9c", "EventName": "IDQ_BUBBLES.CYCLES_FE_WAS_OK", "Invert": "1", - "PublicDescription": "Counts the number of cycles when the optimal number of uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK] Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles when the optimal number of uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK]", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -450,7 +448,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x9c", "EventName": "IDQ_UOPS_NOT_DELIVERED.CORE", - "PublicDescription": "Counts the number of uops not delivered to by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops not delivered to by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle.", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -460,7 +458,7 @@ "CounterMask": "6", "EventCode": "0x9c", "EventName": "IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", - "PublicDescription": "Counts the number of cycles when no uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE] Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles when no uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE]", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -471,7 +469,7 @@ "EventCode": "0x9c", "EventName": "IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK", "Invert": "1", - "PublicDescription": "Counts the number of cycles when the optimal number of uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_BUBBLES.CYCLES_FE_WAS_OK] Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles when the optimal number of uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_BUBBLES.CYCLES_FE_WAS_OK]", "SampleAfterValue": "1000003", "UMask": "0x1" } diff --git a/tools/perf/pmu-events/arch/x86/graniterapids/gnr-metrics.json b/tools/perf/pmu-events/arch/x86/graniterapids/gnr-metrics.json index 9a620e1b8de8..cc3c834ca286 100644 --- a/tools/perf/pmu-events/arch/x86/graniterapids/gnr-metrics.json +++ b/tools/perf/pmu-events/arch/x86/graniterapids/gnr-metrics.json @@ -1,28 +1,28 @@ [ { "BriefDescription": "C1 residency percent per core", - "MetricExpr": "cstate_core@c1\\-residency@ / TSC", + "MetricExpr": "cstate_core@c1\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C1_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C2 residency percent per package", - "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c2\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C2_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per core", - "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricExpr": "cstate_core@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per package", - "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Pkg_Residency", "ScaleUnit": "100%" @@ -381,7 +381,7 @@ { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", "DefaultMetricgroupName": "TopdownL1", - "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", + "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound)", "MetricGroup": "BvOB;Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.2", @@ -416,39 +416,39 @@ "PublicDescription": "Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound)" }, { + "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", + "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * tma_amx_busy / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", + "MetricGroup": "BvCB;Cor;tma_issueComp", + "MetricName": "tma_bottleneck_compute_bound_est", + "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", + "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " + }, + { "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks", - "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", "MetricGroup": "BvMB;Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_bottleneck_cache_memory_bandwidth", - "MetricThreshold": "tma_bottleneck_cache_memory_bandwidth > 20", + "MetricName": "tma_bottleneck_data_cache_memory_bandwidth", + "MetricThreshold": "tma_bottleneck_data_cache_memory_bandwidth > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks", - "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_dependency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_lock_latency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_loads / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_stores / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + 0 / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency) + tma_memory_bound * (tma_l3_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_l1_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_dependency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_lock_latency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_loads / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_stores / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)) + tma_memory_bound * (tma_store_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "BvML;Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_bottleneck_cache_memory_latency", - "MetricThreshold": "tma_bottleneck_cache_memory_latency > 20", + "MetricName": "tma_bottleneck_data_cache_memory_latency", + "MetricThreshold": "tma_bottleneck_data_cache_memory_latency > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency" }, { - "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", - "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * tma_amx_busy / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", - "MetricGroup": "BvCB;Cor;tma_issueComp", - "MetricName": "tma_bottleneck_compute_bound_est", - "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", - "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " - }, - { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end)", - "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - (1 - INST_RETIRED.REP_ITERATION / cpu@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_ms / (tma_dsb + tma_mite + tma_ms))) - tma_bottleneck_big_code", + "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - (1 - INST_RETIRED.REP_ITERATION / cpu@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_ms)) - tma_bottleneck_big_code", "MetricGroup": "BvFB;Fed;FetchBW;Frontend", "MetricName": "tma_bottleneck_instruction_fetch_bw", "MetricThreshold": "tma_bottleneck_instruction_fetch_bw > 20" }, { "BriefDescription": "Total pipeline cost of irregular execution (e.g", - "MetricExpr": "100 * ((1 - INST_RETIRED.REP_ITERATION / cpu@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_ms / (tma_dsb + tma_mite + tma_ms)) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + RS.EMPTY_RESOURCE / tma_info_thread_clks * tma_ports_utilized_0) / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricExpr": "100 * ((1 - INST_RETIRED.REP_ITERATION / cpu@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_ms) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + RS.EMPTY_RESOURCE / tma_info_thread_clks * tma_ports_utilized_0) / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", "MetricGroup": "Bad;BvIO;Cor;Ret;tma_issueMS", "MetricName": "tma_bottleneck_irregular_overhead", "MetricThreshold": "tma_bottleneck_irregular_overhead > 10", @@ -456,7 +456,7 @@ }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", - "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "BvMT;Mem;MemoryTLB;Offcore;tma_issueTLB", "MetricName": "tma_bottleneck_memory_data_tlbs", "MetricThreshold": "tma_bottleneck_memory_data_tlbs > 20", @@ -464,7 +464,7 @@ }, { "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)", - "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) * tma_remote_cache / (tma_local_mem + tma_remote_cache + tma_remote_mem) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) * tma_remote_cache / (tma_local_mem + tma_remote_cache + tma_remote_mem) + tma_l3_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))", "MetricGroup": "BvMS;LockCont;Mem;Offcore;tma_issueSyncxn", "MetricName": "tma_bottleneck_memory_synchronization", "MetricThreshold": "tma_bottleneck_memory_synchronization > 10", @@ -480,7 +480,7 @@ }, { "BriefDescription": "Total pipeline cost of remaining bottlenecks in the back-end", - "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_cache_memory_bandwidth + tma_bottleneck_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", + "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_data_cache_memory_bandwidth + tma_bottleneck_data_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", "MetricGroup": "BvOB;Cor;Offcore", "MetricName": "tma_bottleneck_other_bottlenecks", "MetricThreshold": "tma_bottleneck_other_bottlenecks > 20", @@ -496,7 +496,7 @@ { "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", "DefaultMetricgroupName": "TopdownL2", - "MetricExpr": "topdown\\-br\\-mispredict / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", + "MetricExpr": "topdown\\-br\\-mispredict / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound)", "MetricGroup": "BadSpec;BrMispredicts;BvMP;Default;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", @@ -613,7 +613,6 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS * min(MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS:R, 74.6 * tma_info_system_core_frequency) + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * min(MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD:R, 76.6 * tma_info_system_core_frequency) * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "BvMS;DataSharing;LockCont;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", @@ -633,6 +632,15 @@ "ScaleUnit": "100%" }, { + "BriefDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external CXL Memory by loads (e.g", + "MetricExpr": "(((1 - ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + (25 * (MEM_LOAD_RETIRED.LOCAL_CXL_MEM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_CXL_MEM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0))) if #has_pmem > 0 else 1)) * (MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks) if 1e6 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_CXL_MEM + MEM_LOAD_RETIRED.LOCAL_CXL_MEM) > MEM_LOAD_RETIRED.L1_MISS else 0) if #has_pmem > 0 else 0)", + "MetricGroup": "MemoryBound;Server;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricName": "tma_cxl_mem_bound", + "MetricThreshold": "tma_cxl_mem_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", + "PublicDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external CXL Memory by loads (e.g. 3D-Xpoint (Crystal Ridge, a.k.a. IXP) memory, PMM - Persistent Memory Module [from CLX to SPR] or any other CXL Type3 Memory [EMR onwards]).", + "ScaleUnit": "100%" + }, + { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD * min(MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD:R, 74.6 * tma_info_system_core_frequency) + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * min(MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD:R, 74.6 * tma_info_system_core_frequency) * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", @@ -662,7 +670,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", - "MetricExpr": "MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks", + "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks - tma_cxl_mem_bound if #has_pmem > 0 else MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks)", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_dram_bound", "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -720,7 +728,7 @@ "MetricGroup": "BvMB;MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -848,7 +856,7 @@ { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences", "DefaultMetricgroupName": "TopdownL2", - "MetricExpr": "topdown\\-heavy\\-ops / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", + "MetricExpr": "topdown\\-heavy\\-ops / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound)", "MetricGroup": "Default;Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", @@ -1395,19 +1403,19 @@ { "BriefDescription": "Off-core accesses per kilo instruction for modified write requests", "MetricExpr": "1e3 * OCR.MODIFIED_WRITE.ANY_RESPONSE / tma_info_inst_mix_instructions", - "MetricGroup": "Offcore", + "MetricGroup": "Offcore;Server", "MetricName": "tma_info_memory_mix_offcore_mwrite_any_pki" }, { "BriefDescription": "Off-core accesses per kilo instruction for reads-to-core requests (speculative; including in-core HW prefetches)", "MetricExpr": "1e3 * OCR.READS_TO_CORE.ANY_RESPONSE / tma_info_inst_mix_instructions", - "MetricGroup": "CacheHits;Offcore", + "MetricGroup": "CacheHits;Offcore;Server", "MetricName": "tma_info_memory_mix_offcore_read_any_pki" }, { "BriefDescription": "L3 cache misses per kilo instruction for reads-to-core requests (speculative; including in-core HW prefetches)", "MetricExpr": "1e3 * OCR.READS_TO_CORE.L3_MISS / tma_info_inst_mix_instructions", - "MetricGroup": "Offcore", + "MetricGroup": "Offcore;Server", "MetricName": "tma_info_memory_mix_offcore_read_l3m_pki" }, { @@ -1433,21 +1441,21 @@ { "BriefDescription": "Average DRAM BW for Reads-to-Core (R2C) covering for memory attached to local- and remote-socket", "MetricExpr": "64 * OCR.READS_TO_CORE.DRAM / 1e9 / tma_info_system_time", - "MetricGroup": "HPC;Mem;MemoryBW;SoC", + "MetricGroup": "HPC;Mem;MemoryBW;Offcore;Server", "MetricName": "tma_info_memory_soc_r2c_dram_bw", "PublicDescription": "Average DRAM BW for Reads-to-Core (R2C) covering for memory attached to local- and remote-socket. See R2C_Offcore_BW." }, { "BriefDescription": "Average L3-cache miss BW for Reads-to-Core (R2C)", "MetricExpr": "64 * OCR.READS_TO_CORE.L3_MISS / 1e9 / tma_info_system_time", - "MetricGroup": "HPC;Mem;MemoryBW;SoC", + "MetricGroup": "HPC;Mem;MemoryBW;Offcore;Server", "MetricName": "tma_info_memory_soc_r2c_l3m_bw", "PublicDescription": "Average L3-cache miss BW for Reads-to-Core (R2C). This covering going to DRAM or other memory off-chip memory tears. See R2C_Offcore_BW." }, { "BriefDescription": "Average Off-core access BW for Reads-to-Core (R2C)", "MetricExpr": "64 * OCR.READS_TO_CORE.ANY_RESPONSE / 1e9 / tma_info_system_time", - "MetricGroup": "HPC;Mem;MemoryBW;SoC", + "MetricGroup": "HPC;Mem;MemoryBW;Offcore;Server", "MetricName": "tma_info_memory_soc_r2c_offcore_bw", "PublicDescription": "Average Off-core access BW for Reads-to-Core (R2C). R2C account for demand or prefetch load/RFO/code access that fill data into the Core caches." }, @@ -1491,7 +1499,7 @@ "MetricName": "tma_info_memory_tlb_store_stlb_mpki" }, { - "BriefDescription": "", + "BriefDescription": "Mem;Backend;CacheHits", "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" @@ -1538,7 +1546,7 @@ }, { "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / tma_info_system_time", + "MetricExpr": "tma_info_system_turbo_utilization * msr@tsc@ / 1e9 / tma_info_system_time", "MetricGroup": "Power;Summary", "MetricName": "tma_info_system_core_frequency" }, @@ -1550,16 +1558,28 @@ }, { "BriefDescription": "Average number of utilized CPUs", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", "MetricGroup": "Summary", "MetricName": "tma_info_system_cpus_utilized" }, { + "BriefDescription": "Average 3DXP Memory Bandwidth Use for reads [GB / sec]", + "MetricExpr": "(64 * UNC_CXLCM_RxC_PACK_BUF_INSERTS.MEM_DATA / 1e9 / tma_info_system_time if #has_pmem > 0 else 0)", + "MetricGroup": "MemOffcore;MemoryBW;Server;SoC", + "MetricName": "tma_info_system_cxl_mem_read_bw" + }, + { + "BriefDescription": "Average 3DXP Memory Bandwidth Use for Writes [GB / sec]", + "MetricExpr": "(64 * UNC_CXLDP_TxC_AGF_INSERTS.M2S_DATA / 1e9 / tma_info_system_time if #has_pmem > 0 else 0)", + "MetricGroup": "MemOffcore;MemoryBW;Server;SoC", + "MetricName": "tma_info_system_cxl_mem_write_bw" + }, + { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", "MetricExpr": "64 * (UNC_M_CAS_COUNT_SCH0.RD + UNC_M_CAS_COUNT_SCH1.RD + UNC_M_CAS_COUNT_SCH0.WR + UNC_M_CAS_COUNT_SCH1.WR) / 1e9 / tma_info_system_time", "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_mem_bandwidth, tma_sq_full" + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Giga Floating Point Operations Per Second", @@ -1771,12 +1791,12 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", + "BriefDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", "MetricExpr": "min(2 * (MEM_INST_RETIRED.ALL_LOADS - MEM_LOAD_RETIRED.FB_HIT - MEM_LOAD_RETIRED.L1_MISS) * 20 / 100, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - MEMORY_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_thread_clks", "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_l1_latency_dependency", "MetricThreshold": "tma_l1_latency_dependency > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", + "PublicDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", "ScaleUnit": "100%" }, { @@ -1790,7 +1810,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L2 cache under unloaded scenarios (possibly L2 latency limited)", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "MEM_LOAD_RETIRED.L2_HIT * min(MEM_LOAD_RETIRED.L2_HIT:R, 4.4 * tma_info_system_core_frequency) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_l2_bound_group", "MetricName": "tma_l2_hit_latency", @@ -1809,12 +1828,11 @@ }, { "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "MEM_LOAD_RETIRED.L3_HIT * min(MEM_LOAD_RETIRED.L3_HIT:R, 32.6 * tma_info_system_core_frequency) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_cache_memory_latency, tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_data_cache_memory_latency, tma_mem_latency", "ScaleUnit": "100%" }, { @@ -1897,6 +1915,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "MEM_INST_RETIRED.LOCK_LOADS * MEM_INST_RETIRED.LOCK_LOADS:R / tma_info_thread_clks", "MetricGroup": "LockCont;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", "MetricName": "tma_lock_latency", @@ -1929,7 +1948,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { @@ -1938,13 +1957,13 @@ "MetricGroup": "BvML;MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", "DefaultMetricgroupName": "TopdownL2", - "MetricExpr": "topdown\\-mem\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", + "MetricExpr": "topdown\\-mem\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound)", "MetricGroup": "Backend;Default;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricName": "tma_memory_bound", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", @@ -1954,7 +1973,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to LFENCE Instructions.", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "13 * MISC2_RETIRED.LFENCE / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group", "MetricName": "tma_memory_fence", @@ -2007,7 +2025,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the Microcode Sequencer (MS) unit - see Microcode_Sequencer node for details.", - "MetricExpr": "max(IDQ.MS_CYCLES_ANY, cpu@UOPS_RETIRED.MS\\,cmask\\=1@ / (UOPS_RETIRED.SLOTS / UOPS_ISSUED.ANY)) / tma_info_core_core_clks / 2", + "MetricExpr": "max(IDQ.MS_CYCLES_ANY, cpu@UOPS_RETIRED.MS\\,cmask\\=1@ / (UOPS_RETIRED.SLOTS / UOPS_ISSUED.ANY)) / tma_info_core_core_clks / 2.4", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_ms", "MetricThreshold": "tma_ms > 0.05 & tma_fetch_bandwidth > 0.2", @@ -2042,6 +2060,7 @@ }, { "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_int_operations + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches))", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_other_light_ops", @@ -2103,6 +2122,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_3_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIV_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_3_PORTS_UTIL) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", @@ -2112,6 +2132,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricConstraint": "NO_THRESHOLD_AND_NMI", "MetricExpr": "max(EXE_ACTIVITY.EXE_BOUND_0_PORTS - RESOURCE_STALLS.SCOREBOARD, 0) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", @@ -2121,6 +2142,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricConstraint": "NO_THRESHOLD_AND_NMI", "MetricExpr": "EXE_ACTIVITY.1_PORTS_UTIL / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", @@ -2130,7 +2152,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "EXE_ACTIVITY.2_PORTS_UTIL / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", @@ -2140,7 +2161,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_thread_clks", "MetricGroup": "BvCB;PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", @@ -2150,7 +2170,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote cache in other sockets including synchronizations issues", - "MetricExpr": "(MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM:R + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD:R) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "(MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * PEBS + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * PEBS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Offcore;Server;Snoop;TopdownL5;tma_L5_group;tma_issueSyncxn;tma_mem_latency_group", "MetricName": "tma_remote_cache", "MetricThreshold": "tma_remote_cache > 0.05 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -2159,7 +2179,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory", - "MetricExpr": "MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM:R * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", + "MetricExpr": "MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * PEBS * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group", "MetricName": "tma_remote_mem", "MetricThreshold": "tma_remote_mem > 0.1 & (tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)))", @@ -2177,7 +2197,7 @@ { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", "DefaultMetricgroupName": "TopdownL1", - "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", + "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound)", "MetricGroup": "BvUW;Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -2205,7 +2225,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "CPU_CLK_UNHALTED.PAUSE / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group", "MetricName": "tma_slow_pause", @@ -2237,7 +2256,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { diff --git a/tools/perf/pmu-events/arch/x86/graniterapids/memory.json b/tools/perf/pmu-events/arch/x86/graniterapids/memory.json index 4db39f304c2c..96f40390becf 100644 --- a/tools/perf/pmu-events/arch/x86/graniterapids/memory.json +++ b/tools/perf/pmu-events/arch/x86/graniterapids/memory.json @@ -5,7 +5,6 @@ "CounterMask": "2", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.CYCLES_L3_MISS", - "PublicDescription": "Cycles while L3 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x2" }, @@ -15,7 +14,6 @@ "CounterMask": "6", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.STALLS_L3_MISS", - "PublicDescription": "Execution stalls while L3 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x6" }, @@ -24,7 +22,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc3", "EventName": "MACHINE_CLEARS.MEMORY_ORDERING", - "PublicDescription": "Counts the number of Machine Clears detected dye to memory ordering. Memory Ordering Machine Clears may apply when a memory read may not conform to the memory ordering rules of the x86 architecture Available PDIST counters: 0", + "PublicDescription": "Counts the number of Machine Clears detected dye to memory ordering. Memory Ordering Machine Clears may apply when a memory read may not conform to the memory ordering rules of the x86 architecture", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -34,7 +32,6 @@ "CounterMask": "2", "EventCode": "0x47", "EventName": "MEMORY_ACTIVITY.CYCLES_L1D_MISS", - "PublicDescription": "Cycles while L1 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x2" }, @@ -44,7 +41,6 @@ "CounterMask": "3", "EventCode": "0x47", "EventName": "MEMORY_ACTIVITY.STALLS_L1D_MISS", - "PublicDescription": "Execution stalls while L1 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x3" }, @@ -54,7 +50,7 @@ "CounterMask": "5", "EventCode": "0x47", "EventName": "MEMORY_ACTIVITY.STALLS_L2_MISS", - "PublicDescription": "Execution stalls while L2 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock). Available PDIST counters: 0", + "PublicDescription": "Execution stalls while L2 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock).", "SampleAfterValue": "1000003", "UMask": "0x5" }, @@ -64,7 +60,7 @@ "CounterMask": "9", "EventCode": "0x47", "EventName": "MEMORY_ACTIVITY.STALLS_L3_MISS", - "PublicDescription": "Execution stalls while L3 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock). Available PDIST counters: 0", + "PublicDescription": "Execution stalls while L3 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock).", "SampleAfterValue": "1000003", "UMask": "0x9" }, @@ -412,7 +408,6 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", - "PublicDescription": "Counts demand data read requests that miss the L3 cache. Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x10" }, @@ -422,7 +417,7 @@ "CounterMask": "1", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_L3_MISS_DEMAND_DATA_RD", - "PublicDescription": "Cycles with at least 1 Demand Data Read requests who miss L3 cache in the superQ. Available PDIST counters: 0", + "PublicDescription": "Cycles with at least 1 Demand Data Read requests who miss L3 cache in the superQ.", "SampleAfterValue": "1000003", "UMask": "0x10" }, @@ -431,7 +426,7 @@ "Counter": "0,1,2,3", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD", - "PublicDescription": "For every cycle, increments by the number of demand data read requests pending that are known to have missed the L3 cache. Note that this does not capture all elapsed cycles while requests are outstanding - only cycles from when the requests were known by the requesting core to have missed the L3 cache. Available PDIST counters: 0", + "PublicDescription": "For every cycle, increments by the number of demand data read requests pending that are known to have missed the L3 cache. Note that this does not capture all elapsed cycles while requests are outstanding - only cycles from when the requests were known by the requesting core to have missed the L3 cache.", "SampleAfterValue": "2000003", "UMask": "0x10" }, @@ -449,7 +444,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc9", "EventName": "RTM_RETIRED.ABORTED_EVENTS", - "PublicDescription": "Counts the number of times an RTM execution aborted due to none of the previous 3 categories (e.g. interrupt). Available PDIST counters: 0", + "PublicDescription": "Counts the number of times an RTM execution aborted due to none of the previous 3 categories (e.g. interrupt).", "SampleAfterValue": "100003", "UMask": "0x80" }, @@ -458,7 +453,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc9", "EventName": "RTM_RETIRED.ABORTED_MEM", - "PublicDescription": "Counts the number of times an RTM execution aborted due to various memory events (e.g. read/write capacity and conflicts). Available PDIST counters: 0", + "PublicDescription": "Counts the number of times an RTM execution aborted due to various memory events (e.g. read/write capacity and conflicts).", "SampleAfterValue": "100003", "UMask": "0x8" }, @@ -467,7 +462,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc9", "EventName": "RTM_RETIRED.ABORTED_MEMTYPE", - "PublicDescription": "Counts the number of times an RTM execution aborted due to incompatible memory type. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times an RTM execution aborted due to incompatible memory type.", "SampleAfterValue": "100003", "UMask": "0x40" }, @@ -476,7 +471,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc9", "EventName": "RTM_RETIRED.ABORTED_UNFRIENDLY", - "PublicDescription": "Counts the number of times an RTM execution aborted due to HLE-unfriendly instructions. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times an RTM execution aborted due to HLE-unfriendly instructions.", "SampleAfterValue": "100003", "UMask": "0x20" }, @@ -485,7 +480,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc9", "EventName": "RTM_RETIRED.COMMIT", - "PublicDescription": "Counts the number of times RTM commit succeeded. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times RTM commit succeeded.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -494,7 +489,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc9", "EventName": "RTM_RETIRED.START", - "PublicDescription": "Counts the number of times we entered an RTM region. Does not count nested transactions. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times we entered an RTM region. Does not count nested transactions.", "SampleAfterValue": "100003", "UMask": "0x1" }, @@ -503,7 +498,7 @@ "Counter": "0,1,2,3", "EventCode": "0x54", "EventName": "TX_MEM.ABORT_CAPACITY_READ", - "PublicDescription": "Speculatively counts the number of Transactional Synchronization Extensions (TSX) aborts due to a data capacity limitation for transactional reads Available PDIST counters: 0", + "PublicDescription": "Speculatively counts the number of Transactional Synchronization Extensions (TSX) aborts due to a data capacity limitation for transactional reads", "SampleAfterValue": "100003", "UMask": "0x80" }, @@ -512,7 +507,7 @@ "Counter": "0,1,2,3", "EventCode": "0x54", "EventName": "TX_MEM.ABORT_CAPACITY_WRITE", - "PublicDescription": "Speculatively counts the number of Transactional Synchronization Extensions (TSX) aborts due to a data capacity limitation for transactional writes. Available PDIST counters: 0", + "PublicDescription": "Speculatively counts the number of Transactional Synchronization Extensions (TSX) aborts due to a data capacity limitation for transactional writes.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -521,7 +516,7 @@ "Counter": "0,1,2,3", "EventCode": "0x54", "EventName": "TX_MEM.ABORT_CONFLICT", - "PublicDescription": "Counts the number of times a TSX line had a cache conflict. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times a TSX line had a cache conflict.", "SampleAfterValue": "100003", "UMask": "0x1" } diff --git a/tools/perf/pmu-events/arch/x86/graniterapids/other.json b/tools/perf/pmu-events/arch/x86/graniterapids/other.json index 8b7aa4caec46..c0747750b1a8 100644 --- a/tools/perf/pmu-events/arch/x86/graniterapids/other.json +++ b/tools/perf/pmu-events/arch/x86/graniterapids/other.json @@ -4,7 +4,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc1", "EventName": "ASSISTS.HARDWARE", - "PublicDescription": "Count all other hardware assists or traps that are not necessarily architecturally exposed (through a software handler) beyond FP; SSE-AVX mix and A/D assists who are counted by dedicated sub-events. This includes, but not limited to, assists at EXE or MEM uop writeback like AVX* load/store/gather/scatter (non-FP GSSE-assist ) , assists generated by ROB like PEBS and RTIT, Uncore trap, RAR (Remote Action Request) and CET (Control flow Enforcement Technology) assists. the event also counts for Machine Ordering count. Available PDIST counters: 0", + "PublicDescription": "Count all other hardware assists or traps that are not necessarily architecturally exposed (through a software handler) beyond FP; SSE-AVX mix and A/D assists who are counted by dedicated sub-events. This includes, but not limited to, assists at EXE or MEM uop writeback like AVX* load/store/gather/scatter (non-FP GSSE-assist ) , assists generated by ROB like PEBS and RTIT, Uncore trap, RAR (Remote Action Request) and CET (Control flow Enforcement Technology) assists. the event also counts for Machine Ordering count.", "SampleAfterValue": "100003", "UMask": "0x4" }, @@ -13,11 +13,35 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc1", "EventName": "ASSISTS.PAGE_FAULT", - "PublicDescription": "ASSISTS.PAGE_FAULT Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8" }, { + "BriefDescription": "HW_INTERRUPTS.MASKED", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xcb", + "EventName": "HW_INTERRUPTS.MASKED", + "SampleAfterValue": "100003", + "UMask": "0x2" + }, + { + "BriefDescription": "HW_INTERRUPTS.PENDING_AND_MASKED", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xcb", + "EventName": "HW_INTERRUPTS.PENDING_AND_MASKED", + "SampleAfterValue": "100003", + "UMask": "0x4" + }, + { + "BriefDescription": "Number of hardware interrupts received by the processor.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xcb", + "EventName": "HW_INTERRUPTS.RECEIVED", + "PublicDescription": "Counts the number of hardware interruptions received by the processor.", + "SampleAfterValue": "203", + "UMask": "0x1" + }, + { "BriefDescription": "Counts streaming stores that have any type of response.", "Counter": "0,1,2,3", "EventCode": "0x2A,0x2B", @@ -34,7 +58,7 @@ "CounterMask": "1", "EventCode": "0x2d", "EventName": "XQ.FULL_CYCLES", - "PublicDescription": "number of cycles when the thread is active and the uncore cannot take any further requests (for example prefetches, loads or stores initiated by the Core that miss the L2 cache). Available PDIST counters: 0", + "PublicDescription": "number of cycles when the thread is active and the uncore cannot take any further requests (for example prefetches, loads or stores initiated by the Core that miss the L2 cache).", "SampleAfterValue": "1000003", "UMask": "0x1" } diff --git a/tools/perf/pmu-events/arch/x86/graniterapids/pipeline.json b/tools/perf/pmu-events/arch/x86/graniterapids/pipeline.json index 27af3bd6bacf..0fef8fd61974 100644 --- a/tools/perf/pmu-events/arch/x86/graniterapids/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/graniterapids/pipeline.json @@ -5,7 +5,7 @@ "CounterMask": "1", "EventCode": "0xb0", "EventName": "ARITH.DIV_ACTIVE", - "PublicDescription": "Counts cycles when divide unit is busy executing divide or square root operations. Accounts for integer and floating-point operations. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when divide unit is busy executing divide or square root operations. Accounts for integer and floating-point operations.", "SampleAfterValue": "1000003", "UMask": "0x9" }, @@ -15,7 +15,6 @@ "CounterMask": "1", "EventCode": "0xb0", "EventName": "ARITH.IDIV_ACTIVE", - "PublicDescription": "This event counts the cycles the integer divider is busy. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8" }, @@ -24,7 +23,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc1", "EventName": "ASSISTS.ANY", - "PublicDescription": "Counts the number of occurrences where a microcode assist is invoked by hardware. Examples include AD (page Access Dirty), FP and AVX related assists. Available PDIST counters: 0", + "PublicDescription": "Counts the number of occurrences where a microcode assist is invoked by hardware. Examples include AD (page Access Dirty), FP and AVX related assists.", "SampleAfterValue": "100003", "UMask": "0x1b" }, @@ -271,7 +270,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.C01", - "PublicDescription": "Counts core clocks when the thread is in the C0.1 light-weight slower wakeup time but more power saving optimized state. This state can be entered via the TPAUSE or UMWAIT instructions. Available PDIST counters: 0", + "PublicDescription": "Counts core clocks when the thread is in the C0.1 light-weight slower wakeup time but more power saving optimized state. This state can be entered via the TPAUSE or UMWAIT instructions.", "SampleAfterValue": "2000003", "UMask": "0x10" }, @@ -280,7 +279,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.C02", - "PublicDescription": "Counts core clocks when the thread is in the C0.2 light-weight faster wakeup time but less power saving optimized state. This state can be entered via the TPAUSE or UMWAIT instructions. Available PDIST counters: 0", + "PublicDescription": "Counts core clocks when the thread is in the C0.2 light-weight faster wakeup time but less power saving optimized state. This state can be entered via the TPAUSE or UMWAIT instructions.", "SampleAfterValue": "2000003", "UMask": "0x20" }, @@ -289,7 +288,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.C0_WAIT", - "PublicDescription": "Counts core clocks when the thread is in the C0.1 or C0.2 power saving optimized states (TPAUSE or UMWAIT instructions) or running the PAUSE instruction. Available PDIST counters: 0", + "PublicDescription": "Counts core clocks when the thread is in the C0.1 or C0.2 power saving optimized states (TPAUSE or UMWAIT instructions) or running the PAUSE instruction.", "SampleAfterValue": "2000003", "UMask": "0x70" }, @@ -298,7 +297,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.DISTRIBUTED", - "PublicDescription": "This event distributes cycle counts between active hyperthreads, i.e., those in C0. A hyperthread becomes inactive when it executes the HLT or MWAIT instructions. If all other hyperthreads are inactive (or disabled or do not exist), all counts are attributed to this hyperthread. To obtain the full count when the Core is active, sum the counts from each hyperthread. Available PDIST counters: 0", + "PublicDescription": "This event distributes cycle counts between active hyperthreads, i.e., those in C0. A hyperthread becomes inactive when it executes the HLT or MWAIT instructions. If all other hyperthreads are inactive (or disabled or do not exist), all counts are attributed to this hyperthread. To obtain the full count when the Core is active, sum the counts from each hyperthread.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -307,7 +306,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x3c", "EventName": "CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE", - "PublicDescription": "Counts Core crystal clock cycles when current thread is unhalted and the other thread is halted. Available PDIST counters: 0", + "PublicDescription": "Counts Core crystal clock cycles when current thread is unhalted and the other thread is halted.", "SampleAfterValue": "25003", "UMask": "0x2" }, @@ -316,7 +315,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.PAUSE", - "PublicDescription": "CPU_CLK_UNHALTED.PAUSE Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x40" }, @@ -327,7 +325,6 @@ "EdgeDetect": "1", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.PAUSE_INST", - "PublicDescription": "CPU_CLK_UNHALTED.PAUSE_INST Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x40" }, @@ -336,7 +333,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x3c", "EventName": "CPU_CLK_UNHALTED.REF_DISTRIBUTED", - "PublicDescription": "This event distributes Core crystal clock cycle counts between active hyperthreads, i.e., those in C0 sleep-state. A hyperthread becomes inactive when it executes the HLT or MWAIT instructions. If one thread is active in a core, all counts are attributed to this hyperthread. To obtain the full count when the Core is active, sum the counts from each hyperthread. Available PDIST counters: 0", + "PublicDescription": "This event distributes Core crystal clock cycle counts between active hyperthreads, i.e., those in C0 sleep-state. A hyperthread becomes inactive when it executes the HLT or MWAIT instructions. If one thread is active in a core, all counts are attributed to this hyperthread. To obtain the full count when the Core is active, sum the counts from each hyperthread.", "SampleAfterValue": "2000003", "UMask": "0x8" }, @@ -344,7 +341,7 @@ "BriefDescription": "Reference cycles when the core is not in halt state.", "Counter": "Fixed counter 2", "EventName": "CPU_CLK_UNHALTED.REF_TSC", - "PublicDescription": "Counts the number of reference cycles when the core is not in a halt state. The core enters the halt state when it is running the HLT instruction or the MWAIT instruction. This event is not affected by core frequency changes (for example, P states, TM2 transitions) but has the same incrementing frequency as the time stamp counter. This event can approximate elapsed time while the core was not in a halt state. It is counted on a dedicated fixed counter, leaving the eight programmable counters available for other events. Note: On all current platforms this event stops counting during 'throttling (TM)' states duty off periods the processor is 'halted'. The counter update is done at a lower clock rate then the core clock the overflow status bit for this counter may appear 'sticky'. After the counter has overflowed and software clears the overflow status bit and resets the counter to less than MAX. The reset value to the counter is not clocked immediately so the overflow status bit will flip 'high (1)' and generate another PMI (if enabled) after which the reset value gets clocked into the counter. Therefore, software will get the interrupt, read the overflow status bit '1 for bit 34 while the counter value is less than MAX. Software should ignore this case. Available PDIST counters: 0", + "PublicDescription": "Counts the number of reference cycles when the core is not in a halt state. The core enters the halt state when it is running the HLT instruction or the MWAIT instruction. This event is not affected by core frequency changes (for example, P states, TM2 transitions) but has the same incrementing frequency as the time stamp counter. This event can approximate elapsed time while the core was not in a halt state. It is counted on a dedicated fixed counter, leaving the eight programmable counters available for other events. Note: On all current platforms this event stops counting during 'throttling (TM)' states duty off periods the processor is 'halted'. The counter update is done at a lower clock rate then the core clock the overflow status bit for this counter may appear 'sticky'. After the counter has overflowed and software clears the overflow status bit and resets the counter to less than MAX. The reset value to the counter is not clocked immediately so the overflow status bit will flip 'high (1)' and generate another PMI (if enabled) after which the reset value gets clocked into the counter. Therefore, software will get the interrupt, read the overflow status bit '1 for bit 34 while the counter value is less than MAX. Software should ignore this case.", "SampleAfterValue": "2000003", "UMask": "0x3" }, @@ -353,7 +350,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x3c", "EventName": "CPU_CLK_UNHALTED.REF_TSC_P", - "PublicDescription": "Counts the number of reference cycles when the core is not in a halt state. The core enters the halt state when it is running the HLT instruction or the MWAIT instruction. This event is not affected by core frequency changes (for example, P states, TM2 transitions) but has the same incrementing frequency as the time stamp counter. This event can approximate elapsed time while the core was not in a halt state. It is counted on a dedicated fixed counter, leaving the four (eight when Hyperthreading is disabled) programmable counters available for other events. Note: On all current platforms this event stops counting during 'throttling (TM)' states duty off periods the processor is 'halted'. The counter update is done at a lower clock rate then the core clock the overflow status bit for this counter may appear 'sticky'. After the counter has overflowed and software clears the overflow status bit and resets the counter to less than MAX. The reset value to the counter is not clocked immediately so the overflow status bit will flip 'high (1)' and generate another PMI (if enabled) after which the reset value gets clocked into the counter. Therefore, software will get the interrupt, read the overflow status bit '1 for bit 34 while the counter value is less than MAX. Software should ignore this case. Available PDIST counters: 0", + "PublicDescription": "Counts the number of reference cycles when the core is not in a halt state. The core enters the halt state when it is running the HLT instruction or the MWAIT instruction. This event is not affected by core frequency changes (for example, P states, TM2 transitions) but has the same incrementing frequency as the time stamp counter. This event can approximate elapsed time while the core was not in a halt state. It is counted on a dedicated fixed counter, leaving the four (eight when Hyperthreading is disabled) programmable counters available for other events. Note: On all current platforms this event stops counting during 'throttling (TM)' states duty off periods the processor is 'halted'. The counter update is done at a lower clock rate then the core clock the overflow status bit for this counter may appear 'sticky'. After the counter has overflowed and software clears the overflow status bit and resets the counter to less than MAX. The reset value to the counter is not clocked immediately so the overflow status bit will flip 'high (1)' and generate another PMI (if enabled) after which the reset value gets clocked into the counter. Therefore, software will get the interrupt, read the overflow status bit '1 for bit 34 while the counter value is less than MAX. Software should ignore this case.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -361,7 +358,7 @@ "BriefDescription": "Core cycles when the thread is not in halt state", "Counter": "Fixed counter 1", "EventName": "CPU_CLK_UNHALTED.THREAD", - "PublicDescription": "Counts the number of core cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. This event is a component in many key event ratios. The core frequency may change from time to time due to transitions associated with Enhanced Intel SpeedStep Technology or TM2. For this reason this event may have a changing ratio with regards to time. When the core frequency is constant, this event can approximate elapsed time while the core was not in the halt state. It is counted on a dedicated fixed counter, leaving the eight programmable counters available for other events. Available PDIST counters: 0", + "PublicDescription": "Counts the number of core cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. This event is a component in many key event ratios. The core frequency may change from time to time due to transitions associated with Enhanced Intel SpeedStep Technology or TM2. For this reason this event may have a changing ratio with regards to time. When the core frequency is constant, this event can approximate elapsed time while the core was not in the halt state. It is counted on a dedicated fixed counter, leaving the eight programmable counters available for other events.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -370,7 +367,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x3c", "EventName": "CPU_CLK_UNHALTED.THREAD_P", - "PublicDescription": "This is an architectural event that counts the number of thread cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. The core frequency may change from time to time due to power or thermal throttling. For this reason, this event may have a changing ratio with regards to wall clock time. Available PDIST counters: 0", + "PublicDescription": "This is an architectural event that counts the number of thread cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. The core frequency may change from time to time due to power or thermal throttling. For this reason, this event may have a changing ratio with regards to wall clock time.", "SampleAfterValue": "2000003" }, { @@ -379,7 +376,6 @@ "CounterMask": "8", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.CYCLES_L1D_MISS", - "PublicDescription": "Cycles while L1 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8" }, @@ -389,7 +385,6 @@ "CounterMask": "1", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.CYCLES_L2_MISS", - "PublicDescription": "Cycles while L2 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -399,7 +394,6 @@ "CounterMask": "16", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.CYCLES_MEM_ANY", - "PublicDescription": "Cycles while memory subsystem has an outstanding load. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x10" }, @@ -409,7 +403,6 @@ "CounterMask": "12", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.STALLS_L1D_MISS", - "PublicDescription": "Execution stalls while L1 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0xc" }, @@ -419,7 +412,6 @@ "CounterMask": "5", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.STALLS_L2_MISS", - "PublicDescription": "Execution stalls while L2 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x5" }, @@ -429,7 +421,6 @@ "CounterMask": "4", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.STALLS_TOTAL", - "PublicDescription": "Total execution stalls. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x4" }, @@ -438,7 +429,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb7", "EventName": "EXE.AMX_BUSY", - "PublicDescription": "Counts the cycles where the AMX (Advance Matrix Extension) unit is busy performing an operation. Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -447,7 +437,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.1_PORTS_UTIL", - "PublicDescription": "Counts cycles during which a total of 1 uop was executed on all ports and Reservation Station (RS) was not empty. Available PDIST counters: 0", + "PublicDescription": "Counts cycles during which a total of 1 uop was executed on all ports and Reservation Station (RS) was not empty.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -456,7 +446,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.2_3_PORTS_UTIL", - "PublicDescription": "Cycles total of 2 or 3 uops are executed on all ports and Reservation Station (RS) was not empty. Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0xc" }, @@ -465,7 +454,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.2_PORTS_UTIL", - "PublicDescription": "Counts cycles during which a total of 2 uops were executed on all ports and Reservation Station (RS) was not empty. Available PDIST counters: 0", + "PublicDescription": "Counts cycles during which a total of 2 uops were executed on all ports and Reservation Station (RS) was not empty.", "SampleAfterValue": "2000003", "UMask": "0x4" }, @@ -474,7 +463,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.3_PORTS_UTIL", - "PublicDescription": "Cycles total of 3 uops are executed on all ports and Reservation Station (RS) was not empty. Available PDIST counters: 0", + "PublicDescription": "Cycles total of 3 uops are executed on all ports and Reservation Station (RS) was not empty.", "SampleAfterValue": "2000003", "UMask": "0x8" }, @@ -483,7 +472,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.4_PORTS_UTIL", - "PublicDescription": "Cycles total of 4 uops are executed on all ports and Reservation Station (RS) was not empty. Available PDIST counters: 0", + "PublicDescription": "Cycles total of 4 uops are executed on all ports and Reservation Station (RS) was not empty.", "SampleAfterValue": "2000003", "UMask": "0x10" }, @@ -493,7 +482,6 @@ "CounterMask": "5", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.BOUND_ON_LOADS", - "PublicDescription": "Execution stalls while memory subsystem has an outstanding load. Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x21" }, @@ -503,7 +491,7 @@ "CounterMask": "2", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.BOUND_ON_STORES", - "PublicDescription": "Counts cycles where the Store Buffer was full and no loads caused an execution stall. Available PDIST counters: 0", + "PublicDescription": "Counts cycles where the Store Buffer was full and no loads caused an execution stall.", "SampleAfterValue": "1000003", "UMask": "0x40" }, @@ -512,7 +500,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.EXE_BOUND_0_PORTS", - "PublicDescription": "Number of cycles total of 0 uops executed on all ports, Reservation Station (RS) was not empty, the Store Buffer (SB) was not full and there was no outstanding load. Available PDIST counters: 0", + "PublicDescription": "Number of cycles total of 0 uops executed on all ports, Reservation Station (RS) was not empty, the Store Buffer (SB) was not full and there was no outstanding load.", "SampleAfterValue": "1000003", "UMask": "0x80" }, @@ -521,7 +509,7 @@ "Counter": "0,1,2,3", "EventCode": "0x75", "EventName": "INST_DECODED.DECODERS", - "PublicDescription": "Number of decoders utilized in a cycle when the MITE (legacy decode pipeline) fetches instructions. Available PDIST counters: 0", + "PublicDescription": "Number of decoders utilized in a cycle when the MITE (legacy decode pipeline) fetches instructions.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -546,7 +534,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc0", "EventName": "INST_RETIRED.MACRO_FUSED", - "PublicDescription": "INST_RETIRED.MACRO_FUSED Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x10" }, @@ -555,7 +542,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc0", "EventName": "INST_RETIRED.NOP", - "PublicDescription": "Counts all retired NOP or ENDBR32/64 or PREFETCHIT0/1 instructions Available PDIST counters: 0", + "PublicDescription": "Counts all retired NOP or ENDBR32/64 or PREFETCHIT0/1 instructions", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -572,7 +559,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc0", "EventName": "INST_RETIRED.REP_ITERATION", - "PublicDescription": "Number of iterations of Repeat (REP) string retired instructions such as MOVS, CMPS, and SCAS. Each has a byte, word, and doubleword version and string instructions can be repeated using a repetition prefix, REP, that allows their architectural execution to be repeated a number of times as specified by the RCX register. Note the number of iterations is implementation-dependent. Available PDIST counters: 0", + "PublicDescription": "Number of iterations of Repeat (REP) string retired instructions such as MOVS, CMPS, and SCAS. Each has a byte, word, and doubleword version and string instructions can be repeated using a repetition prefix, REP, that allows their architectural execution to be repeated a number of times as specified by the RCX register. Note the number of iterations is implementation-dependent.", "SampleAfterValue": "2000003", "UMask": "0x8" }, @@ -583,7 +570,7 @@ "EdgeDetect": "1", "EventCode": "0xad", "EventName": "INT_MISC.CLEARS_COUNT", - "PublicDescription": "Counts the number of speculative clears due to any type of branch misprediction or machine clears Available PDIST counters: 0", + "PublicDescription": "Counts the number of speculative clears due to any type of branch misprediction or machine clears", "SampleAfterValue": "500009", "UMask": "0x1" }, @@ -592,7 +579,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xad", "EventName": "INT_MISC.CLEAR_RESTEER_CYCLES", - "PublicDescription": "Cycles after recovery from a branch misprediction or machine clear till the first uop is issued from the resteered path. Available PDIST counters: 0", + "PublicDescription": "Cycles after recovery from a branch misprediction or machine clear till the first uop is issued from the resteered path.", "SampleAfterValue": "500009", "UMask": "0x80" }, @@ -601,7 +588,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xad", "EventName": "INT_MISC.MBA_STALLS", - "PublicDescription": "INT_MISC.MBA_STALLS Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x20" }, @@ -610,7 +596,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xad", "EventName": "INT_MISC.RECOVERY_CYCLES", - "PublicDescription": "Counts core cycles when the Resource allocator was stalled due to recovery from an earlier branch misprediction or machine clear event. Available PDIST counters: 0", + "PublicDescription": "Counts core cycles when the Resource allocator was stalled due to recovery from an earlier branch misprediction or machine clear event.", "SampleAfterValue": "500009", "UMask": "0x1" }, @@ -621,7 +607,6 @@ "EventName": "INT_MISC.UNKNOWN_BRANCH_CYCLES", "MSRIndex": "0x3F7", "MSRValue": "0x7", - "PublicDescription": "Bubble cycles of BAClear (Unknown Branch). Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x40" }, @@ -630,7 +615,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xad", "EventName": "INT_MISC.UOP_DROPPING", - "PublicDescription": "Estimated number of Top-down Microarchitecture Analysis slots that got dropped due to non front-end reasons Available PDIST counters: 0", + "PublicDescription": "Estimated number of Top-down Microarchitecture Analysis slots that got dropped due to non front-end reasons", "SampleAfterValue": "1000003", "UMask": "0x10" }, @@ -639,7 +624,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.128BIT", - "PublicDescription": "INT_VEC_RETIRED.128BIT Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x13" }, @@ -648,7 +632,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.256BIT", - "PublicDescription": "INT_VEC_RETIRED.256BIT Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0xac" }, @@ -657,7 +640,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.ADD_128", - "PublicDescription": "Number of retired integer ADD/SUB (regular or horizontal), SAD 128-bit vector instructions. Available PDIST counters: 0", + "PublicDescription": "Number of retired integer ADD/SUB (regular or horizontal), SAD 128-bit vector instructions.", "SampleAfterValue": "1000003", "UMask": "0x3" }, @@ -666,7 +649,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.ADD_256", - "PublicDescription": "Number of retired integer ADD/SUB (regular or horizontal), SAD 256-bit vector instructions. Available PDIST counters: 0", + "PublicDescription": "Number of retired integer ADD/SUB (regular or horizontal), SAD 256-bit vector instructions.", "SampleAfterValue": "1000003", "UMask": "0xc" }, @@ -675,7 +658,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.MUL_256", - "PublicDescription": "INT_VEC_RETIRED.MUL_256 Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x80" }, @@ -684,7 +666,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.SHUFFLES", - "PublicDescription": "INT_VEC_RETIRED.SHUFFLES Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x40" }, @@ -693,7 +674,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.VNNI_128", - "PublicDescription": "INT_VEC_RETIRED.VNNI_128 Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x10" }, @@ -702,7 +682,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.VNNI_256", - "PublicDescription": "INT_VEC_RETIRED.VNNI_256 Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x20" }, @@ -711,7 +690,7 @@ "Counter": "0,1,2,3", "EventCode": "0x03", "EventName": "LD_BLOCKS.ADDRESS_ALIAS", - "PublicDescription": "Counts the number of times a load got blocked due to false dependencies in MOB due to partial compare on address. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times a load got blocked due to false dependencies in MOB due to partial compare on address.", "SampleAfterValue": "100003", "UMask": "0x4" }, @@ -720,7 +699,7 @@ "Counter": "0,1,2,3", "EventCode": "0x03", "EventName": "LD_BLOCKS.NO_SR", - "PublicDescription": "Counts the number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use.", "SampleAfterValue": "100003", "UMask": "0x88" }, @@ -729,7 +708,7 @@ "Counter": "0,1,2,3", "EventCode": "0x03", "EventName": "LD_BLOCKS.STORE_FORWARD", - "PublicDescription": "Counts the number of times where store forwarding was prevented for a load operation. The most common case is a load blocked due to the address of memory access (partially) overlapping with a preceding uncompleted store. Note: See the table of not supported store forwards in the Optimization Guide. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times where store forwarding was prevented for a load operation. The most common case is a load blocked due to the address of memory access (partially) overlapping with a preceding uncompleted store. Note: See the table of not supported store forwards in the Optimization Guide.", "SampleAfterValue": "100003", "UMask": "0x82" }, @@ -738,7 +717,7 @@ "Counter": "0,1,2,3", "EventCode": "0x4c", "EventName": "LOAD_HIT_PREFETCH.SWPF", - "PublicDescription": "Counts all software-prefetch load dispatches that hit the fill buffer (FB) allocated for the software prefetch. It can also be incremented by some lock instructions. So it should only be used with profiling so that the locks can be excluded by ASM (Assembly File) inspection of the nearby instructions. Available PDIST counters: 0", + "PublicDescription": "Counts all software-prefetch load dispatches that hit the fill buffer (FB) allocated for the software prefetch. It can also be incremented by some lock instructions. So it should only be used with profiling so that the locks can be excluded by ASM (Assembly File) inspection of the nearby instructions.", "SampleAfterValue": "100003", "UMask": "0x1" }, @@ -748,7 +727,7 @@ "CounterMask": "1", "EventCode": "0xa8", "EventName": "LSD.CYCLES_ACTIVE", - "PublicDescription": "Counts the cycles when at least one uop is delivered by the LSD (Loop-stream detector). Available PDIST counters: 0", + "PublicDescription": "Counts the cycles when at least one uop is delivered by the LSD (Loop-stream detector).", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -758,7 +737,7 @@ "CounterMask": "6", "EventCode": "0xa8", "EventName": "LSD.CYCLES_OK", - "PublicDescription": "Counts the cycles when optimal number of uops is delivered by the LSD (Loop-stream detector). Available PDIST counters: 0", + "PublicDescription": "Counts the cycles when optimal number of uops is delivered by the LSD (Loop-stream detector).", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -767,7 +746,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa8", "EventName": "LSD.UOPS", - "PublicDescription": "Counts the number of uops delivered to the back-end by the LSD(Loop Stream Detector). Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops delivered to the back-end by the LSD(Loop Stream Detector).", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -778,7 +757,7 @@ "EdgeDetect": "1", "EventCode": "0xc3", "EventName": "MACHINE_CLEARS.COUNT", - "PublicDescription": "Counts the number of machine clears (nukes) of any type. Available PDIST counters: 0", + "PublicDescription": "Counts the number of machine clears (nukes) of any type.", "SampleAfterValue": "100003", "UMask": "0x1" }, @@ -787,7 +766,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc3", "EventName": "MACHINE_CLEARS.SMC", - "PublicDescription": "Counts self-modifying code (SMC) detected, which causes a machine clear. Available PDIST counters: 0", + "PublicDescription": "Counts self-modifying code (SMC) detected, which causes a machine clear.", "SampleAfterValue": "100003", "UMask": "0x4" }, @@ -796,7 +775,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe0", "EventName": "MISC2_RETIRED.LFENCE", - "PublicDescription": "number of LFENCE retired instructions Available PDIST counters: 0", + "PublicDescription": "number of LFENCE retired instructions", "SampleAfterValue": "400009", "UMask": "0x20" }, @@ -805,7 +784,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xcc", "EventName": "MISC_RETIRED.LBR_INSERTS", - "PublicDescription": "Increments when an entry is added to the Last Branch Record (LBR) array (or removed from the array in case of RETURNs in call stack mode). The event requires LBR enable via IA32_DEBUGCTL MSR and branch type selection via MSR_LBR_SELECT. Available PDIST counters: 0", + "PublicDescription": "Increments when an entry is added to the Last Branch Record (LBR) array (or removed from the array in case of RETURNs in call stack mode). The event requires LBR enable via IA32_DEBUGCTL MSR and branch type selection via MSR_LBR_SELECT.", "SampleAfterValue": "100003", "UMask": "0x20" }, @@ -814,7 +793,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa2", "EventName": "RESOURCE_STALLS.SB", - "PublicDescription": "Counts allocation stall cycles caused by the store buffer (SB) being full. This counts cycles that the pipeline back-end blocked uop delivery from the front-end. Available PDIST counters: 0", + "PublicDescription": "Counts allocation stall cycles caused by the store buffer (SB) being full. This counts cycles that the pipeline back-end blocked uop delivery from the front-end.", "SampleAfterValue": "100003", "UMask": "0x8" }, @@ -823,7 +802,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa2", "EventName": "RESOURCE_STALLS.SCOREBOARD", - "PublicDescription": "Counts cycles where the pipeline is stalled due to serializing operations. Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -832,7 +810,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa5", "EventName": "RS.EMPTY", - "PublicDescription": "Counts cycles during which the reservation station (RS) is empty for this logical processor. This is usually caused when the front-end pipeline runs into starvation periods (e.g. branch mispredictions or i-cache misses) Available PDIST counters: 0", + "PublicDescription": "Counts cycles during which the reservation station (RS) is empty for this logical processor. This is usually caused when the front-end pipeline runs into starvation periods (e.g. branch mispredictions or i-cache misses)", "SampleAfterValue": "1000003", "UMask": "0x7" }, @@ -844,7 +822,7 @@ "EventCode": "0xa5", "EventName": "RS.EMPTY_COUNT", "Invert": "1", - "PublicDescription": "Counts end of periods where the Reservation Station (RS) was empty. Could be useful to closely sample on front-end latency issues (see the FRONTEND_RETIRED event of designated precise events) Available PDIST counters: 0", + "PublicDescription": "Counts end of periods where the Reservation Station (RS) was empty. Could be useful to closely sample on front-end latency issues (see the FRONTEND_RETIRED event of designated precise events)", "SampleAfterValue": "100003", "UMask": "0x7" }, @@ -853,7 +831,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa5", "EventName": "RS.EMPTY_RESOURCE", - "PublicDescription": "Cycles when RS was empty and a resource allocation stall is asserted Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -862,7 +839,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa4", "EventName": "TOPDOWN.BACKEND_BOUND_SLOTS", - "PublicDescription": "This event counts a subset of the Topdown Slots event that were not consumed by the back-end pipeline due to lack of back-end resources, as a result of memory subsystem delays, execution units limitations, or other conditions. The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core, in processors that support Intel Hyper-Threading Technology. Software can use this event as the numerator for the Backend Bound metric (or top-level category) of the Top-down Microarchitecture Analysis method. Available PDIST counters: 0", + "PublicDescription": "This event counts a subset of the Topdown Slots event that were not consumed by the back-end pipeline due to lack of back-end resources, as a result of memory subsystem delays, execution units limitations, or other conditions. The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core, in processors that support Intel Hyper-Threading Technology. Software can use this event as the numerator for the Backend Bound metric (or top-level category) of the Top-down Microarchitecture Analysis method.", "SampleAfterValue": "10000003", "UMask": "0x2" }, @@ -871,7 +848,7 @@ "Counter": "0", "EventCode": "0xa4", "EventName": "TOPDOWN.BAD_SPEC_SLOTS", - "PublicDescription": "Number of slots of TMA method that were wasted due to incorrect speculation. It covers all types of control-flow or data-related mis-speculations. Available PDIST counters: 0", + "PublicDescription": "Number of slots of TMA method that were wasted due to incorrect speculation. It covers all types of control-flow or data-related mis-speculations.", "SampleAfterValue": "10000003", "UMask": "0x4" }, @@ -880,7 +857,7 @@ "Counter": "0", "EventCode": "0xa4", "EventName": "TOPDOWN.BR_MISPREDICT_SLOTS", - "PublicDescription": "Number of TMA slots that were wasted due to incorrect speculation by (any type of) branch mispredictions. This event estimates number of speculative operations that were issued but not retired as well as the out-of-order engine recovery past a branch misprediction. Available PDIST counters: 0", + "PublicDescription": "Number of TMA slots that were wasted due to incorrect speculation by (any type of) branch mispredictions. This event estimates number of speculative operations that were issued but not retired as well as the out-of-order engine recovery past a branch misprediction.", "SampleAfterValue": "10000003", "UMask": "0x8" }, @@ -889,7 +866,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa4", "EventName": "TOPDOWN.MEMORY_BOUND_SLOTS", - "PublicDescription": "TOPDOWN.MEMORY_BOUND_SLOTS Available PDIST counters: 0", "SampleAfterValue": "10000003", "UMask": "0x10" }, @@ -897,7 +873,7 @@ "BriefDescription": "TMA slots available for an unhalted logical processor. Fixed counter - architectural event", "Counter": "Fixed counter 3", "EventName": "TOPDOWN.SLOTS", - "PublicDescription": "Number of available slots for an unhalted logical processor. The event increments by machine-width of the narrowest pipeline as employed by the Top-down Microarchitecture Analysis method (TMA). The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core. Software can use this event as the denominator for the top-level metrics of the TMA method. This architectural event is counted on a designated fixed counter (Fixed Counter 3). Available PDIST counters: 0", + "PublicDescription": "Number of available slots for an unhalted logical processor. The event increments by machine-width of the narrowest pipeline as employed by the Top-down Microarchitecture Analysis method (TMA). The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core. Software can use this event as the denominator for the top-level metrics of the TMA method. This architectural event is counted on a designated fixed counter (Fixed Counter 3).", "SampleAfterValue": "10000003", "UMask": "0x4" }, @@ -906,7 +882,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa4", "EventName": "TOPDOWN.SLOTS_P", - "PublicDescription": "Counts the number of available slots for an unhalted logical processor. The event increments by machine-width of the narrowest pipeline as employed by the Top-down Microarchitecture Analysis method. The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core. Available PDIST counters: 0", + "PublicDescription": "Counts the number of available slots for an unhalted logical processor. The event increments by machine-width of the narrowest pipeline as employed by the Top-down Microarchitecture Analysis method. The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core.", "SampleAfterValue": "10000003", "UMask": "0x1" }, @@ -915,7 +891,7 @@ "Counter": "0,1,2,3", "EventCode": "0x76", "EventName": "UOPS_DECODED.DEC0_UOPS", - "PublicDescription": "This event counts the number of not dec-by-all uops decoded by decoder 0. Available PDIST counters: 0", + "PublicDescription": "This event counts the number of not dec-by-all uops decoded by decoder 0.", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -924,7 +900,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_0", - "PublicDescription": "Number of uops dispatch to execution port 0. Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution port 0.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -933,7 +909,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_1", - "PublicDescription": "Number of uops dispatch to execution port 1. Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution port 1.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -942,7 +918,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_2_3_10", - "PublicDescription": "Number of uops dispatch to execution ports 2, 3 and 10 Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution ports 2, 3 and 10", "SampleAfterValue": "2000003", "UMask": "0x4" }, @@ -951,7 +927,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_4_9", - "PublicDescription": "Number of uops dispatch to execution ports 4 and 9 Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution ports 4 and 9", "SampleAfterValue": "2000003", "UMask": "0x10" }, @@ -960,7 +936,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_5_11", - "PublicDescription": "Number of uops dispatch to execution ports 5 and 11 Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution ports 5 and 11", "SampleAfterValue": "2000003", "UMask": "0x20" }, @@ -969,7 +945,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_6", - "PublicDescription": "Number of uops dispatch to execution port 6. Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution port 6.", "SampleAfterValue": "2000003", "UMask": "0x40" }, @@ -978,7 +954,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_7_8", - "PublicDescription": "Number of uops dispatch to execution ports 7 and 8. Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution ports 7 and 8.", "SampleAfterValue": "2000003", "UMask": "0x80" }, @@ -987,7 +963,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CORE", - "PublicDescription": "Counts the number of uops executed from any thread. Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops executed from any thread.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -997,7 +973,7 @@ "CounterMask": "1", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_1", - "PublicDescription": "Counts cycles when at least 1 micro-op is executed from any thread on physical core. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least 1 micro-op is executed from any thread on physical core.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -1007,7 +983,7 @@ "CounterMask": "2", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_2", - "PublicDescription": "Counts cycles when at least 2 micro-ops are executed from any thread on physical core. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least 2 micro-ops are executed from any thread on physical core.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -1017,7 +993,7 @@ "CounterMask": "3", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_3", - "PublicDescription": "Counts cycles when at least 3 micro-ops are executed from any thread on physical core. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least 3 micro-ops are executed from any thread on physical core.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -1027,7 +1003,7 @@ "CounterMask": "4", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_4", - "PublicDescription": "Counts cycles when at least 4 micro-ops are executed from any thread on physical core. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least 4 micro-ops are executed from any thread on physical core.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -1037,7 +1013,7 @@ "CounterMask": "1", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CYCLES_GE_1", - "PublicDescription": "Cycles where at least 1 uop was executed per-thread. Available PDIST counters: 0", + "PublicDescription": "Cycles where at least 1 uop was executed per-thread.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1047,7 +1023,7 @@ "CounterMask": "2", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CYCLES_GE_2", - "PublicDescription": "Cycles where at least 2 uops were executed per-thread. Available PDIST counters: 0", + "PublicDescription": "Cycles where at least 2 uops were executed per-thread.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1057,7 +1033,7 @@ "CounterMask": "3", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CYCLES_GE_3", - "PublicDescription": "Cycles where at least 3 uops were executed per-thread. Available PDIST counters: 0", + "PublicDescription": "Cycles where at least 3 uops were executed per-thread.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1067,7 +1043,7 @@ "CounterMask": "4", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CYCLES_GE_4", - "PublicDescription": "Cycles where at least 4 uops were executed per-thread. Available PDIST counters: 0", + "PublicDescription": "Cycles where at least 4 uops were executed per-thread.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1078,7 +1054,7 @@ "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.STALLS", "Invert": "1", - "PublicDescription": "Counts cycles during which no uops were dispatched from the Reservation Station (RS) per thread. Available PDIST counters: 0", + "PublicDescription": "Counts cycles during which no uops were dispatched from the Reservation Station (RS) per thread.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1087,7 +1063,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.THREAD", - "PublicDescription": "Counts the number of uops to be executed per-thread each cycle. Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1096,7 +1071,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.X87", - "PublicDescription": "Counts the number of x87 uops executed. Available PDIST counters: 0", + "PublicDescription": "Counts the number of x87 uops executed.", "SampleAfterValue": "2000003", "UMask": "0x10" }, @@ -1105,7 +1080,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xae", "EventName": "UOPS_ISSUED.ANY", - "PublicDescription": "Counts the number of uops that the Resource Allocation Table (RAT) issues to the Reservation Station (RS). Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops that the Resource Allocation Table (RAT) issues to the Reservation Station (RS).", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1115,7 +1090,6 @@ "CounterMask": "1", "EventCode": "0xae", "EventName": "UOPS_ISSUED.CYCLES", - "PublicDescription": "UOPS_ISSUED.CYCLES Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1125,7 +1099,7 @@ "CounterMask": "1", "EventCode": "0xc2", "EventName": "UOPS_RETIRED.CYCLES", - "PublicDescription": "Counts cycles where at least one uop has retired. Available PDIST counters: 0", + "PublicDescription": "Counts cycles where at least one uop has retired.", "SampleAfterValue": "1000003", "UMask": "0x2" }, @@ -1134,7 +1108,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc2", "EventName": "UOPS_RETIRED.HEAVY", - "PublicDescription": "Counts the number of retired micro-operations (uops) except the last uop of each instruction. An instruction that is decoded into less than two uops does not contribute to the count. Available PDIST counters: 0", + "PublicDescription": "Counts the number of retired micro-operations (uops) except the last uop of each instruction. An instruction that is decoded into less than two uops does not contribute to the count.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1145,7 +1119,6 @@ "EventName": "UOPS_RETIRED.MS", "MSRIndex": "0x3F7", "MSRValue": "0x8", - "PublicDescription": "UOPS_RETIRED.MS Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x4" }, @@ -1154,7 +1127,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc2", "EventName": "UOPS_RETIRED.SLOTS", - "PublicDescription": "This event counts a subset of the Topdown Slots event that are utilized by operations that eventually get retired (committed) by the processor pipeline. Usually, this event positively correlates with higher performance for example, as measured by the instructions-per-cycle metric. Software can use this event as the numerator for the Retiring metric (or top-level category) of the Top-down Microarchitecture Analysis method. Available PDIST counters: 0", + "PublicDescription": "This event counts a subset of the Topdown Slots event that are utilized by operations that eventually get retired (committed) by the processor pipeline. Usually, this event positively correlates with higher performance for example, as measured by the instructions-per-cycle metric. Software can use this event as the numerator for the Retiring metric (or top-level category) of the Top-down Microarchitecture Analysis method.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -1165,7 +1138,7 @@ "EventCode": "0xc2", "EventName": "UOPS_RETIRED.STALLS", "Invert": "1", - "PublicDescription": "This event counts cycles without actually retired uops. Available PDIST counters: 0", + "PublicDescription": "This event counts cycles without actually retired uops.", "SampleAfterValue": "1000003", "UMask": "0x2" } diff --git a/tools/perf/pmu-events/arch/x86/graniterapids/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/graniterapids/uncore-interconnect.json index 6667fbc50452..5eb1145f204f 100644 --- a/tools/perf/pmu-events/arch/x86/graniterapids/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/graniterapids/uncore-interconnect.json @@ -833,12 +833,20 @@ "Counter": "0,1,2,3", "EventCode": "0x1F", "EventName": "UNC_I_MISC1.LOST_FWD", - "Experimental": "1", "PerPkg": "1", "UMask": "0x10", "Unit": "IRP" }, { + "BriefDescription": "Misc Events - Set 1 : Received Invalid : Secondary received a transfer that did not have sufficient MESI state", + "Counter": "0,1,2,3", + "EventCode": "0x1F", + "EventName": "UNC_I_MISC1.SEC_RCVD_INVLD", + "PerPkg": "1", + "UMask": "0x20", + "Unit": "IRP" + }, + { "BriefDescription": "Snoop Hit E/S responses", "Counter": "0,1,2,3", "EventCode": "0x12", diff --git a/tools/perf/pmu-events/arch/x86/graniterapids/uncore-io.json b/tools/perf/pmu-events/arch/x86/graniterapids/uncore-io.json index f4f956966e16..2ea2637df3fb 100644 --- a/tools/perf/pmu-events/arch/x86/graniterapids/uncore-io.json +++ b/tools/perf/pmu-events/arch/x86/graniterapids/uncore-io.json @@ -1321,7 +1321,6 @@ "FCMask": "0x01", "PerPkg": "1", "PortMask": "0x0FF", - "PublicDescription": "-", "UMask": "0x4", "Unit": "IIO" }, diff --git a/tools/perf/pmu-events/arch/x86/graniterapids/uncore-memory.json b/tools/perf/pmu-events/arch/x86/graniterapids/uncore-memory.json index b991f6e1afbe..f559e27e2815 100644 --- a/tools/perf/pmu-events/arch/x86/graniterapids/uncore-memory.json +++ b/tools/perf/pmu-events/arch/x86/graniterapids/uncore-memory.json @@ -60,6 +60,33 @@ "BriefDescription": "CAS count for SubChannel 0 regular reads", "Counter": "0,1,2,3", "EventCode": "0x05", + "EventName": "UNC_M_CAS_COUNT_SCH0.RD_NON_UNDERFILL", + "PerPkg": "1", + "UMask": "0xc3", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 0 auto-precharge reads", + "Counter": "0,1,2,3", + "EventCode": "0x05", + "EventName": "UNC_M_CAS_COUNT_SCH0.RD_PRE_REG", + "PerPkg": "1", + "UMask": "0xc2", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 0 auto-precharge underfill reads", + "Counter": "0,1,2,3", + "EventCode": "0x05", + "EventName": "UNC_M_CAS_COUNT_SCH0.RD_PRE_UNDERFILL", + "PerPkg": "1", + "UMask": "0xc8", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 0 regular reads", + "Counter": "0,1,2,3", + "EventCode": "0x05", "EventName": "UNC_M_CAS_COUNT_SCH0.RD_REG", "PerPkg": "1", "UMask": "0xc1", @@ -75,6 +102,15 @@ "Unit": "IMC" }, { + "BriefDescription": "CAS count for SubChannel 0 underfill reads", + "Counter": "0,1,2,3", + "EventCode": "0x05", + "EventName": "UNC_M_CAS_COUNT_SCH0.RD_UNDERFILL_ALL", + "PerPkg": "1", + "UMask": "0xcc", + "Unit": "IMC" + }, + { "BriefDescription": "CAS count for SubChannel 0, all writes", "Counter": "0,1,2,3", "EventCode": "0x05", @@ -125,6 +161,33 @@ "BriefDescription": "CAS count for SubChannel 1 regular reads", "Counter": "0,1,2,3", "EventCode": "0x06", + "EventName": "UNC_M_CAS_COUNT_SCH1.RD_NON_UNDERFILL", + "PerPkg": "1", + "UMask": "0xc3", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 1 auto-precharge reads", + "Counter": "0,1,2,3", + "EventCode": "0x06", + "EventName": "UNC_M_CAS_COUNT_SCH1.RD_PRE_REG", + "PerPkg": "1", + "UMask": "0xc2", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 1 auto-precharge underfill reads", + "Counter": "0,1,2,3", + "EventCode": "0x06", + "EventName": "UNC_M_CAS_COUNT_SCH1.RD_PRE_UNDERFILL", + "PerPkg": "1", + "UMask": "0xc8", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 1 regular reads", + "Counter": "0,1,2,3", + "EventCode": "0x06", "EventName": "UNC_M_CAS_COUNT_SCH1.RD_REG", "PerPkg": "1", "UMask": "0xc1", @@ -140,6 +203,15 @@ "Unit": "IMC" }, { + "BriefDescription": "CAS count for SubChannel 1 underfill reads", + "Counter": "0,1,2,3", + "EventCode": "0x06", + "EventName": "UNC_M_CAS_COUNT_SCH1.RD_UNDERFILL_ALL", + "PerPkg": "1", + "UMask": "0xcc", + "Unit": "IMC" + }, + { "BriefDescription": "CAS count for SubChannel 1, all writes", "Counter": "0,1,2,3", "EventCode": "0x06", @@ -189,13 +261,52 @@ "Unit": "IMC" }, { + "BriefDescription": "PMMNT is sending REF* commands while being in specified Refresh rate", + "Counter": "0,1,2,3", + "EventCode": "0x72", + "EventName": "UNC_M_MNTCMD_REFRATE.REFAB1X", + "Experimental": "1", + "PerPkg": "1", + "UMask": "0x1", + "Unit": "IMC" + }, + { + "BriefDescription": "PMMNT is sending REF* commands while being in specified Refresh rate", + "Counter": "0,1,2,3", + "EventCode": "0x72", + "EventName": "UNC_M_MNTCMD_REFRATE.REFAB2X", + "Experimental": "1", + "PerPkg": "1", + "UMask": "0x4", + "Unit": "IMC" + }, + { + "BriefDescription": "PMMNT is sending REF* commands while being in specified Refresh rate", + "Counter": "0,1,2,3", + "EventCode": "0x72", + "EventName": "UNC_M_MNTCMD_REFRATE.REFSB1X", + "Experimental": "1", + "PerPkg": "1", + "UMask": "0x2", + "Unit": "IMC" + }, + { + "BriefDescription": "PMMNT is sending REF* commands while being in specified Refresh rate", + "Counter": "0,1,2,3", + "EventCode": "0x72", + "EventName": "UNC_M_MNTCMD_REFRATE.REFSB2X", + "Experimental": "1", + "PerPkg": "1", + "UMask": "0x8", + "Unit": "IMC" + }, + { "BriefDescription": "# of cycles MR4 temp readings forced 2x refresh", "Counter": "0,1,2,3", "EventCode": "0xA7", "EventName": "UNC_M_MR4_2XREF_CYCLES.SCH0_DIMM0", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x1", "Unit": "IMC" }, @@ -206,7 +317,6 @@ "EventName": "UNC_M_MR4_2XREF_CYCLES.SCH0_DIMM1", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x2", "Unit": "IMC" }, @@ -217,7 +327,6 @@ "EventName": "UNC_M_MR4_2XREF_CYCLES.SCH1_DIMM0", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x4", "Unit": "IMC" }, @@ -228,7 +337,6 @@ "EventName": "UNC_M_MR4_2XREF_CYCLES.SCH1_DIMM1", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x8", "Unit": "IMC" }, @@ -239,7 +347,6 @@ "EventName": "UNC_M_PDC_MR4ACTIVE_CYCLES.SCH0_DIMM0", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x1", "Unit": "IMC" }, @@ -250,7 +357,6 @@ "EventName": "UNC_M_PDC_MR4ACTIVE_CYCLES.SCH0_DIMM1", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x2", "Unit": "IMC" }, @@ -261,7 +367,6 @@ "EventName": "UNC_M_PDC_MR4ACTIVE_CYCLES.SCH1_DIMM0", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x4", "Unit": "IMC" }, @@ -272,7 +377,6 @@ "EventName": "UNC_M_PDC_MR4ACTIVE_CYCLES.SCH1_DIMM1", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x8", "Unit": "IMC" }, @@ -283,7 +387,6 @@ "EventName": "UNC_M_POWERDOWN_CYCLES.SCH0_RANK0", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x1", "Unit": "IMC" }, @@ -294,7 +397,6 @@ "EventName": "UNC_M_POWERDOWN_CYCLES.SCH0_RANK1", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x2", "Unit": "IMC" }, @@ -305,7 +407,6 @@ "EventName": "UNC_M_POWERDOWN_CYCLES.SCH0_RANK2", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x4", "Unit": "IMC" }, @@ -316,7 +417,6 @@ "EventName": "UNC_M_POWERDOWN_CYCLES.SCH0_RANK3", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x8", "Unit": "IMC" }, @@ -327,7 +427,6 @@ "EventName": "UNC_M_POWERDOWN_CYCLES.SCH1_RANK0", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x10", "Unit": "IMC" }, @@ -338,7 +437,6 @@ "EventName": "UNC_M_POWERDOWN_CYCLES.SCH1_RANK1", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x20", "Unit": "IMC" }, @@ -349,7 +447,6 @@ "EventName": "UNC_M_POWERDOWN_CYCLES.SCH1_RANK2", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x40", "Unit": "IMC" }, @@ -360,7 +457,6 @@ "EventName": "UNC_M_POWERDOWN_CYCLES.SCH1_RANK3", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x80", "Unit": "IMC" }, @@ -371,7 +467,6 @@ "EventName": "UNC_M_POWER_CHANNEL_PPD_CYCLES", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "Unit": "IMC" }, { @@ -381,7 +476,6 @@ "EventName": "UNC_M_POWER_CRITICAL_THROTTLE_CYCLES.SLOT0", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x1", "Unit": "IMC" }, @@ -392,7 +486,6 @@ "EventName": "UNC_M_POWER_CRITICAL_THROTTLE_CYCLES.SLOT1", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x2", "Unit": "IMC" }, @@ -423,7 +516,6 @@ "EventName": "UNC_M_POWER_THROTTLE_CYCLES.MR4BLKEN", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x8", "Unit": "IMC" }, @@ -434,7 +526,6 @@ "EventName": "UNC_M_POWER_THROTTLE_CYCLES.RAPLBLK", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x4", "Unit": "IMC" }, @@ -615,7 +706,6 @@ "EventName": "UNC_M_SELF_REFRESH.ENTER_SUCCESS", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "UNC_M_SELF_REFRESH.ENTER_SUCCESS", "UMask": "0x2", "Unit": "IMC" }, @@ -626,7 +716,6 @@ "EventName": "UNC_M_SELF_REFRESH.ENTER_SUCCESS_CYCLES", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x1", "Unit": "IMC" }, @@ -637,7 +726,6 @@ "EventName": "UNC_M_THROTTLE_CRIT_CYCLES.SLOT0", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x1", "Unit": "IMC" }, @@ -648,7 +736,6 @@ "EventName": "UNC_M_THROTTLE_CRIT_CYCLES.SLOT1", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x2", "Unit": "IMC" }, @@ -659,7 +746,6 @@ "EventName": "UNC_M_THROTTLE_HIGH_CYCLES.SLOT0", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x1", "Unit": "IMC" }, @@ -670,7 +756,6 @@ "EventName": "UNC_M_THROTTLE_HIGH_CYCLES.SLOT1", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x2", "Unit": "IMC" }, @@ -681,7 +766,6 @@ "EventName": "UNC_M_THROTTLE_LOW_CYCLES.SLOT0", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x1", "Unit": "IMC" }, @@ -692,7 +776,6 @@ "EventName": "UNC_M_THROTTLE_LOW_CYCLES.SLOT1", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x2", "Unit": "IMC" }, @@ -703,7 +786,6 @@ "EventName": "UNC_M_THROTTLE_MID_CYCLES.SLOT0", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x1", "Unit": "IMC" }, @@ -714,7 +796,6 @@ "EventName": "UNC_M_THROTTLE_MID_CYCLES.SLOT1", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x2", "Unit": "IMC" }, diff --git a/tools/perf/pmu-events/arch/x86/graniterapids/virtual-memory.json b/tools/perf/pmu-events/arch/x86/graniterapids/virtual-memory.json index 3d3f88600e26..609a9549cbf3 100644 --- a/tools/perf/pmu-events/arch/x86/graniterapids/virtual-memory.json +++ b/tools/perf/pmu-events/arch/x86/graniterapids/virtual-memory.json @@ -4,7 +4,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.STLB_HIT", - "PublicDescription": "Counts loads that miss the DTLB (Data TLB) and hit the STLB (Second level TLB). Available PDIST counters: 0", + "PublicDescription": "Counts loads that miss the DTLB (Data TLB) and hit the STLB (Second level TLB).", "SampleAfterValue": "100003", "UMask": "0x20" }, @@ -14,7 +14,7 @@ "CounterMask": "1", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_ACTIVE", - "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a demand load. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a demand load.", "SampleAfterValue": "100003", "UMask": "0x10" }, @@ -23,7 +23,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED", - "PublicDescription": "Counts completed page walks (all page sizes) caused by demand data loads. This implies it missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (all page sizes) caused by demand data loads. This implies it missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0xe" }, @@ -32,7 +32,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_1G", - "PublicDescription": "Counts completed page walks (1G sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (1G sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x8" }, @@ -41,7 +41,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M", - "PublicDescription": "Counts completed page walks (2M/4M sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (2M/4M sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x4" }, @@ -50,7 +50,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_4K", - "PublicDescription": "Counts completed page walks (4K sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (4K sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -59,7 +59,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_PENDING", - "PublicDescription": "Counts the number of page walks outstanding for a demand load in the PMH (Page Miss Handler) each cycle. Available PDIST counters: 0", + "PublicDescription": "Counts the number of page walks outstanding for a demand load in the PMH (Page Miss Handler) each cycle.", "SampleAfterValue": "100003", "UMask": "0x10" }, @@ -68,7 +68,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.STLB_HIT", - "PublicDescription": "Counts stores that miss the DTLB (Data TLB) and hit the STLB (2nd Level TLB). Available PDIST counters: 0", + "PublicDescription": "Counts stores that miss the DTLB (Data TLB) and hit the STLB (2nd Level TLB).", "SampleAfterValue": "100003", "UMask": "0x20" }, @@ -78,7 +78,7 @@ "CounterMask": "1", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_ACTIVE", - "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a store. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a store.", "SampleAfterValue": "100003", "UMask": "0x10" }, @@ -87,7 +87,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED", - "PublicDescription": "Counts completed page walks (all page sizes) caused by demand data stores. This implies it missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (all page sizes) caused by demand data stores. This implies it missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0xe" }, @@ -96,7 +96,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_1G", - "PublicDescription": "Counts completed page walks (1G sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (1G sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x8" }, @@ -105,7 +105,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M", - "PublicDescription": "Counts completed page walks (2M/4M sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (2M/4M sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x4" }, @@ -114,7 +114,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_4K", - "PublicDescription": "Counts completed page walks (4K sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (4K sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -123,7 +123,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_PENDING", - "PublicDescription": "Counts the number of page walks outstanding for a store in the PMH (Page Miss Handler) each cycle. Available PDIST counters: 0", + "PublicDescription": "Counts the number of page walks outstanding for a store in the PMH (Page Miss Handler) each cycle.", "SampleAfterValue": "100003", "UMask": "0x10" }, @@ -132,7 +132,7 @@ "Counter": "0,1,2,3", "EventCode": "0x11", "EventName": "ITLB_MISSES.STLB_HIT", - "PublicDescription": "Counts instruction fetch requests that miss the ITLB (Instruction TLB) and hit the STLB (Second-level TLB). Available PDIST counters: 0", + "PublicDescription": "Counts instruction fetch requests that miss the ITLB (Instruction TLB) and hit the STLB (Second-level TLB).", "SampleAfterValue": "100003", "UMask": "0x20" }, @@ -142,7 +142,7 @@ "CounterMask": "1", "EventCode": "0x11", "EventName": "ITLB_MISSES.WALK_ACTIVE", - "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a code (instruction fetch) request. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a code (instruction fetch) request.", "SampleAfterValue": "100003", "UMask": "0x10" }, @@ -151,7 +151,7 @@ "Counter": "0,1,2,3", "EventCode": "0x11", "EventName": "ITLB_MISSES.WALK_COMPLETED", - "PublicDescription": "Counts completed page walks (all page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (all page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0xe" }, @@ -160,7 +160,7 @@ "Counter": "0,1,2,3", "EventCode": "0x11", "EventName": "ITLB_MISSES.WALK_COMPLETED_2M_4M", - "PublicDescription": "Counts completed page walks (2M/4M page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (2M/4M page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x4" }, @@ -169,7 +169,7 @@ "Counter": "0,1,2,3", "EventCode": "0x11", "EventName": "ITLB_MISSES.WALK_COMPLETED_4K", - "PublicDescription": "Counts completed page walks (4K page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (4K page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -178,7 +178,7 @@ "Counter": "0,1,2,3", "EventCode": "0x11", "EventName": "ITLB_MISSES.WALK_PENDING", - "PublicDescription": "Counts the number of page walks outstanding for an outstanding code (instruction fetch) request in the PMH (Page Miss Handler) each cycle. Available PDIST counters: 0", + "PublicDescription": "Counts the number of page walks outstanding for an outstanding code (instruction fetch) request in the PMH (Page Miss Handler) each cycle.", "SampleAfterValue": "100003", "UMask": "0x10" } diff --git a/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json b/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json index b26ea70a3628..aebd82ced1cf 100644 --- a/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json +++ b/tools/perf/pmu-events/arch/x86/haswell/hsw-metrics.json @@ -1,49 +1,49 @@ [ { "BriefDescription": "C2 residency percent per package", - "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c2\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C2_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per core", - "MetricExpr": "cstate_core@c3\\-residency@ / TSC", + "MetricExpr": "cstate_core@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per package", - "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per core", - "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricExpr": "cstate_core@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per package", - "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per core", - "MetricExpr": "cstate_core@c7\\-residency@ / TSC", + "MetricExpr": "cstate_core@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per package", - "MetricExpr": "cstate_pkg@c7\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Pkg_Residency", "ScaleUnit": "100%" @@ -80,7 +80,6 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", @@ -98,7 +97,6 @@ }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "1 - (tma_frontend_bound + tma_bad_speculation + tma_retiring)", "MetricGroup": "BvOB;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", @@ -139,7 +137,6 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_cisc", @@ -509,7 +506,7 @@ }, { "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / tma_info_system_time", + "MetricExpr": "tma_info_system_turbo_utilization * msr@tsc@ / 1e9 / tma_info_system_time", "MetricGroup": "Power;Summary", "MetricName": "tma_info_system_core_frequency" }, @@ -521,7 +518,7 @@ }, { "BriefDescription": "Average number of utilized CPUs", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", "MetricGroup": "Summary", "MetricName": "tma_info_system_cpus_utilized" }, @@ -696,7 +693,6 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_load_op_utilization", @@ -746,7 +742,7 @@ { "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) + RESOURCE_STALLS.SB) / (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@)) / 2 - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) * tma_backend_bound", + "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) + RESOURCE_STALLS.SB) / (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@)) / 2 - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) * tma_backend_bound", "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricName": "tma_memory_bound", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", @@ -856,7 +852,7 @@ { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@)) / 2 - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB - RESOURCE_STALLS.SB - min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING)) / tma_info_thread_clks", + "MetricExpr": "((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@)) / 2 - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) - RESOURCE_STALLS.SB - min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING)) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -865,7 +861,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_core_clks)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -874,7 +870,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / tma_info_core_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -883,7 +879,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / tma_info_core_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", diff --git a/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json b/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json index 8245a98ad4b9..b8845f8a28b9 100644 --- a/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/haswellx/hsx-metrics.json @@ -1,49 +1,49 @@ [ { "BriefDescription": "C2 residency percent per package", - "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c2\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C2_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per core", - "MetricExpr": "cstate_core@c3\\-residency@ / TSC", + "MetricExpr": "cstate_core@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per package", - "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per core", - "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricExpr": "cstate_core@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per package", - "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per core", - "MetricExpr": "cstate_core@c7\\-residency@ / TSC", + "MetricExpr": "cstate_core@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per package", - "MetricExpr": "cstate_pkg@c7\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Pkg_Residency", "ScaleUnit": "100%" @@ -282,7 +282,6 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", @@ -300,7 +299,6 @@ }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "1 - (tma_frontend_bound + tma_bad_speculation + tma_retiring)", "MetricGroup": "BvOB;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", @@ -341,7 +339,6 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_cisc", @@ -711,7 +708,7 @@ }, { "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / tma_info_system_time", + "MetricExpr": "tma_info_system_turbo_utilization * msr@tsc@ / 1e9 / tma_info_system_time", "MetricGroup": "Power;Summary", "MetricName": "tma_info_system_core_frequency" }, @@ -723,7 +720,7 @@ }, { "BriefDescription": "Average number of utilized CPUs", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", "MetricGroup": "Summary", "MetricName": "tma_info_system_cpus_utilized" }, @@ -756,6 +753,7 @@ }, { "BriefDescription": "Average number of parallel data read requests to external memory", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182@ / UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182\\,thresh\\=1@", "MetricGroup": "Mem;MemoryBW;SoC", "MetricName": "tma_info_system_mem_parallel_reads", @@ -918,7 +916,6 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_load_op_utilization", @@ -928,6 +925,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "200 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks", "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group", "MetricName": "tma_local_mem", @@ -977,7 +975,7 @@ { "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) + RESOURCE_STALLS.SB) / (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@)) / 2 - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) * tma_backend_bound", + "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING) + RESOURCE_STALLS.SB) / (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@)) / 2 - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) * tma_backend_bound", "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricName": "tma_memory_bound", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", @@ -1087,7 +1085,7 @@ { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "(min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@)) / 2 - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB - RESOURCE_STALLS.SB - min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING)) / tma_info_thread_clks", + "MetricExpr": "((min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@)) / 2 - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) + cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - (cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@ if tma_info_thread_ipc > 1.8 else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0) + RESOURCE_STALLS.SB) - RESOURCE_STALLS.SB - min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.STALLS_LDM_PENDING)) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", "MetricThreshold": "tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2)", @@ -1096,7 +1094,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_core_clks)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1105,7 +1103,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / tma_info_core_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1114,7 +1112,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / tma_info_core_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1141,6 +1139,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "310 * (MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.L3_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_L3_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks", "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group", "MetricName": "tma_remote_mem", diff --git a/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json b/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json index c5bfdb2f288b..cf9ed3edb694 100644 --- a/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/icelake/icl-metrics.json @@ -1,63 +1,63 @@ [ { "BriefDescription": "C10 residency percent per package", - "MetricExpr": "cstate_pkg@c10\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c10\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C10_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C2 residency percent per package", - "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c2\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C2_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per package", - "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per core", - "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricExpr": "cstate_core@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per package", - "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per core", - "MetricExpr": "cstate_core@c7\\-residency@ / TSC", + "MetricExpr": "cstate_core@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per package", - "MetricExpr": "cstate_pkg@c7\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C8 residency percent per package", - "MetricExpr": "cstate_pkg@c8\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c8\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C8_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C9 residency percent per package", - "MetricExpr": "cstate_pkg@c9\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c9\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C9_Pkg_Residency", "ScaleUnit": "100%" @@ -85,7 +85,6 @@ }, { "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_4k_aliasing", @@ -134,6 +133,7 @@ }, { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", "MetricGroup": "BigFootprint;BvBC;Fed;Frontend;IcMiss;MemoryTLB", "MetricName": "tma_bottleneck_big_code", @@ -148,39 +148,44 @@ "PublicDescription": "Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound)" }, { + "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", + "MetricGroup": "BvCB;Cor;tma_issueComp", + "MetricName": "tma_bottleneck_compute_bound_est", + "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", + "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " + }, + { "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", "MetricGroup": "BvMB;Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_bottleneck_cache_memory_bandwidth", - "MetricThreshold": "tma_bottleneck_cache_memory_bandwidth > 20", + "MetricName": "tma_bottleneck_data_cache_memory_bandwidth", + "MetricThreshold": "tma_bottleneck_data_cache_memory_bandwidth > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_dependency / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_lock_latency / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_loads / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_stores / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "BvML;Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_bottleneck_cache_memory_latency", - "MetricThreshold": "tma_bottleneck_cache_memory_latency > 20", + "MetricName": "tma_bottleneck_data_cache_memory_latency", + "MetricThreshold": "tma_bottleneck_data_cache_memory_latency > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency" }, { - "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", - "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", - "MetricGroup": "BvCB;Cor;tma_issueComp", - "MetricName": "tma_bottleneck_compute_bound_est", - "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", - "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " - }, - { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end)", - "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_mispredicts_resteers) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_ms / (tma_dsb + tma_lsd + tma_mite + tma_ms))) - tma_bottleneck_big_code", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_mispredicts_resteers) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_ms)) - tma_bottleneck_big_code", "MetricGroup": "BvFB;Fed;FetchBW;Frontend", "MetricName": "tma_bottleneck_instruction_fetch_bw", "MetricThreshold": "tma_bottleneck_instruction_fetch_bw > 20" }, { "BriefDescription": "Total pipeline cost of irregular execution (e.g", - "MetricExpr": "100 * (tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_mispredicts_resteers) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_ms / (tma_dsb + tma_lsd + tma_mite + tma_ms)) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + tma_core_bound * RS_EVENTS.EMPTY_CYCLES / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_mispredicts_resteers) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_ms) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + tma_core_bound * RS_EVENTS.EMPTY_CYCLES / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", "MetricGroup": "Bad;BvIO;Cor;Ret;tma_issueMS", "MetricName": "tma_bottleneck_irregular_overhead", "MetricThreshold": "tma_bottleneck_irregular_overhead > 10", @@ -188,6 +193,7 @@ }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "BvMT;Mem;MemoryTLB;Offcore;tma_issueTLB", "MetricName": "tma_bottleneck_memory_data_tlbs", @@ -196,6 +202,7 @@ }, { "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))", "MetricGroup": "BvMS;LockCont;Mem;Offcore;tma_issueSyncxn", "MetricName": "tma_bottleneck_memory_synchronization", @@ -204,6 +211,7 @@ }, { "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", "MetricGroup": "Bad;BadSpec;BrMispredicts;BvMP;tma_issueBM", "MetricName": "tma_bottleneck_mispredictions", @@ -212,7 +220,8 @@ }, { "BriefDescription": "Total pipeline cost of remaining bottlenecks in the back-end", - "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_cache_memory_bandwidth + tma_bottleneck_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_data_cache_memory_bandwidth + tma_bottleneck_data_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", "MetricGroup": "BvOB;Cor;Offcore", "MetricName": "tma_bottleneck_other_bottlenecks", "MetricThreshold": "tma_bottleneck_other_bottlenecks > 20", @@ -220,6 +229,7 @@ }, { "BriefDescription": "Total pipeline cost of \"useful operations\" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_retiring - (BR_INST_RETIRED.ALL_BRANCHES + 2 * BR_INST_RETIRED.NEAR_CALL + INST_RETIRED.NOP) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", "MetricGroup": "BvUW;Ret", "MetricName": "tma_bottleneck_useful_work", @@ -427,7 +437,7 @@ "MetricGroup": "BvMB;MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -619,6 +629,7 @@ }, { "BriefDescription": "Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_frontend_bound * (tma_fetch_bandwidth / (tma_fetch_bandwidth + tma_fetch_latency)) * (tma_dsb / (tma_dsb + tma_lsd + tma_mite + tma_ms)))", "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", "MetricName": "tma_info_botlnk_l2_dsb_bandwidth", @@ -1068,7 +1079,7 @@ "MetricName": "tma_info_memory_tlb_store_stlb_mpki" }, { - "BriefDescription": "", + "BriefDescription": "Mem;Backend;CacheHits", "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" @@ -1092,6 +1103,12 @@ "MetricName": "tma_info_pipeline_fetch_mite" }, { + "BriefDescription": "Average number of uops fetched from MS per cycle", + "MetricExpr": "IDQ.MS_UOPS / cpu@IDQ.MS_UOPS\\,cmask\\=1@", + "MetricGroup": "Fed;FetchLat;MicroSeq", + "MetricName": "tma_info_pipeline_fetch_ms" + }, + { "BriefDescription": "Instructions per a microcode Assist invocation", "MetricExpr": "INST_RETIRED.ANY / ASSISTS.ANY", "MetricGroup": "MicroSeq;Pipeline;Ret;Retire", @@ -1107,7 +1124,7 @@ }, { "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / tma_info_system_time", + "MetricExpr": "tma_info_system_turbo_utilization * msr@tsc@ / 1e9 / tma_info_system_time", "MetricGroup": "Power;Summary", "MetricName": "tma_info_system_core_frequency" }, @@ -1119,7 +1136,7 @@ }, { "BriefDescription": "Average number of utilized CPUs", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", "MetricGroup": "Summary", "MetricName": "tma_info_system_cpus_utilized" }, @@ -1128,7 +1145,7 @@ "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / tma_info_system_time / 1e3", "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_mem_bandwidth, tma_sq_full" + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Giga Floating Point Operations Per Second", @@ -1296,12 +1313,12 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", + "BriefDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", "MetricExpr": "min(2 * (MEM_INST_RETIRED.ALL_LOADS - MEM_LOAD_RETIRED.FB_HIT - MEM_LOAD_RETIRED.L1_MISS) * 20 / 100, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_thread_clks", "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_l1_latency_dependency", "MetricThreshold": "tma_l1_latency_dependency > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", + "PublicDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", "ScaleUnit": "100%" }, { @@ -1325,7 +1342,6 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks", "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", @@ -1339,7 +1355,7 @@ "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_cache_memory_latency, tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_data_cache_memory_latency, tma_mem_latency", "ScaleUnit": "100%" }, { @@ -1445,7 +1461,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { @@ -1454,7 +1470,7 @@ "MetricGroup": "BvML;MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -1522,7 +1538,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the Microcode Sequencer (MS) unit - see Microcode_Sequencer node for details.", - "MetricExpr": "cpu@IDQ.MS_UOPS\\,cmask\\=1@ / tma_info_core_core_clks / 2", + "MetricExpr": "cpu@IDQ.MS_UOPS\\,cmask\\=1@ / tma_info_core_core_clks / 3.3", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_ms", "MetricThreshold": "tma_ms > 0.05 & tma_fetch_bandwidth > 0.2", @@ -1656,7 +1672,7 @@ { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", "DefaultMetricgroupName": "TopdownL1", - "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", + "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound)", "MetricGroup": "BvUW;Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -1693,7 +1709,6 @@ }, { "BriefDescription": "This metric represents rate of split store accesses", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group", "MetricName": "tma_split_stores", @@ -1707,7 +1722,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { @@ -1721,7 +1736,6 @@ }, { "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_store_fwd_blk", diff --git a/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json b/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json index a886a0cfee07..f58eec2a1788 100644 --- a/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/icelakex/icx-metrics.json @@ -1,28 +1,28 @@ [ { "BriefDescription": "C1 residency percent per core", - "MetricExpr": "cstate_core@c1\\-residency@ / TSC", + "MetricExpr": "cstate_core@c1\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C1_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C2 residency percent per package", - "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c2\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C2_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per core", - "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricExpr": "cstate_core@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per package", - "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Pkg_Residency", "ScaleUnit": "100%" @@ -79,6 +79,12 @@ "ScaleUnit": "1MB/s" }, { + "BriefDescription": "Bandwidth of inbound IO reads that are initiated by end device controllers that are requesting memory from the CPU and miss the L3 cache", + "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_MISS_PCIRDCUR * 64 / 1e6 / duration_time", + "MetricName": "io_bandwidth_read_l3_miss", + "ScaleUnit": "1MB/s" + }, + { "BriefDescription": "Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the local CPU socket", "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR_LOCAL * 64 / 1e6 / duration_time", "MetricName": "io_bandwidth_read_local", @@ -97,6 +103,12 @@ "ScaleUnit": "1MB/s" }, { + "BriefDescription": "Bandwidth of inbound IO writes that are initiated by end device controllers that are writing memory to the CPU", + "MetricExpr": "(UNC_CHA_TOR_INSERTS.IO_MISS_ITOM + UNC_CHA_TOR_INSERTS.IO_MISS_ITOMCACHENEAR) * 64 / 1e6 / duration_time", + "MetricName": "io_bandwidth_write_l3_miss", + "ScaleUnit": "1MB/s" + }, + { "BriefDescription": "Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the local CPU socket", "MetricExpr": "(UNC_CHA_TOR_INSERTS.IO_ITOM_LOCAL + UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR_LOCAL) * 64 / 1e6 / duration_time", "MetricName": "io_bandwidth_write_local", @@ -109,6 +121,24 @@ "ScaleUnit": "1MB/s" }, { + "BriefDescription": "The percent of inbound full cache line writes initiated by IO that miss the L3 cache", + "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOM / UNC_CHA_TOR_INSERTS.IO_ITOM", + "MetricName": "io_full_write_l3_miss", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "The percent of inbound partial writes initiated by IO that miss the L3 cache", + "MetricExpr": "(UNC_CHA_TOR_INSERTS.IO_MISS_ITOMCACHENEAR + UNC_CHA_TOR_INSERTS.IO_MISS_RFO) / (UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR + UNC_CHA_TOR_INSERTS.IO_RFO)", + "MetricName": "io_partial_write_l3_miss", + "ScaleUnit": "100%" + }, + { + "BriefDescription": "The percent of inbound reads initiated by IO that miss the L3 cache", + "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_MISS_PCIRDCUR / UNC_CHA_TOR_INSERTS.IO_PCIRDCUR", + "MetricName": "io_read_l3_miss", + "ScaleUnit": "100%" + }, + { "BriefDescription": "Ratio of number of completed page walks (for 2 megabyte and 4 megabyte page sizes) caused by a code fetch to the total number of completed instructions", "MetricExpr": "ITLB_MISSES.WALK_COMPLETED_2M_4M / INST_RETIRED.ANY", "MetricName": "itlb_2nd_level_large_page_mpi", @@ -331,7 +361,6 @@ }, { "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_4k_aliasing", @@ -380,6 +409,7 @@ }, { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", "MetricGroup": "BigFootprint;BvBC;Fed;Frontend;IcMiss;MemoryTLB", "MetricName": "tma_bottleneck_big_code", @@ -394,39 +424,44 @@ "PublicDescription": "Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound)" }, { + "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", + "MetricGroup": "BvCB;Cor;tma_issueComp", + "MetricName": "tma_bottleneck_compute_bound_est", + "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", + "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " + }, + { "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks", - "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", "MetricGroup": "BvMB;Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_bottleneck_cache_memory_bandwidth", - "MetricThreshold": "tma_bottleneck_cache_memory_bandwidth > 20", + "MetricName": "tma_bottleneck_data_cache_memory_bandwidth", + "MetricThreshold": "tma_bottleneck_data_cache_memory_bandwidth > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks", - "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_dependency / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_lock_latency / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_loads / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_stores / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_l1_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_dependency / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_lock_latency / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_loads / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_stores / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)) + tma_memory_bound * (tma_store_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "BvML;Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_bottleneck_cache_memory_latency", - "MetricThreshold": "tma_bottleneck_cache_memory_latency > 20", + "MetricName": "tma_bottleneck_data_cache_memory_latency", + "MetricThreshold": "tma_bottleneck_data_cache_memory_latency > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency" }, { - "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", - "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", - "MetricGroup": "BvCB;Cor;tma_issueComp", - "MetricName": "tma_bottleneck_compute_bound_est", - "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", - "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " - }, - { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end)", - "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_mispredicts_resteers) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_ms / (tma_dsb + tma_mite + tma_ms))) - tma_bottleneck_big_code", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_mispredicts_resteers) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_ms)) - tma_bottleneck_big_code", "MetricGroup": "BvFB;Fed;FetchBW;Frontend", "MetricName": "tma_bottleneck_instruction_fetch_bw", "MetricThreshold": "tma_bottleneck_instruction_fetch_bw > 20" }, { "BriefDescription": "Total pipeline cost of irregular execution (e.g", - "MetricExpr": "100 * (tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_mispredicts_resteers) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_ms / (tma_dsb + tma_mite + tma_ms)) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + tma_core_bound * RS_EVENTS.EMPTY_CYCLES / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_mispredicts_resteers) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_ms) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + tma_core_bound * RS_EVENTS.EMPTY_CYCLES / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", "MetricGroup": "Bad;BvIO;Cor;Ret;tma_issueMS", "MetricName": "tma_bottleneck_irregular_overhead", "MetricThreshold": "tma_bottleneck_irregular_overhead > 10", @@ -434,7 +469,8 @@ }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", - "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "BvMT;Mem;MemoryTLB;Offcore;tma_issueTLB", "MetricName": "tma_bottleneck_memory_data_tlbs", "MetricThreshold": "tma_bottleneck_memory_data_tlbs > 20", @@ -442,7 +478,8 @@ }, { "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)", - "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) * tma_remote_cache / (tma_local_mem + tma_remote_cache + tma_remote_mem) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) * tma_remote_cache / (tma_local_mem + tma_remote_cache + tma_remote_mem) + tma_l3_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))", "MetricGroup": "BvMS;LockCont;Mem;Offcore;tma_issueSyncxn", "MetricName": "tma_bottleneck_memory_synchronization", "MetricThreshold": "tma_bottleneck_memory_synchronization > 10", @@ -450,6 +487,7 @@ }, { "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", "MetricGroup": "Bad;BadSpec;BrMispredicts;BvMP;tma_issueBM", "MetricName": "tma_bottleneck_mispredictions", @@ -458,7 +496,8 @@ }, { "BriefDescription": "Total pipeline cost of remaining bottlenecks in the back-end", - "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_cache_memory_bandwidth + tma_bottleneck_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_data_cache_memory_bandwidth + tma_bottleneck_data_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", "MetricGroup": "BvOB;Cor;Offcore", "MetricName": "tma_bottleneck_other_bottlenecks", "MetricThreshold": "tma_bottleneck_other_bottlenecks > 20", @@ -466,6 +505,7 @@ }, { "BriefDescription": "Total pipeline cost of \"useful operations\" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_retiring - (BR_INST_RETIRED.ALL_BRANCHES + 2 * BR_INST_RETIRED.NEAR_CALL + INST_RETIRED.NOP) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", "MetricGroup": "BvUW;Ret", "MetricName": "tma_bottleneck_useful_work", @@ -585,6 +625,15 @@ "ScaleUnit": "100%" }, { + "BriefDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external CXL Memory by loads (e.g", + "MetricExpr": "(((1 - ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0))) if #has_pmem > 0 else 1)) * (CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound) if 1e6 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS else 0) if #has_pmem > 0 else 0)", + "MetricGroup": "MemoryBound;Server;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricName": "tma_cxl_mem_bound", + "MetricThreshold": "tma_cxl_mem_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", + "PublicDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external CXL Memory by loads (e.g. 3D-Xpoint (Crystal Ridge, a.k.a. IXP) memory, PMM - Persistent Memory Module [from CLX to SPR] or any other CXL Type3 Memory [EMR onwards]).", + "ScaleUnit": "100%" + }, + { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "43.5 * tma_info_system_core_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_HIT + MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", @@ -615,7 +664,7 @@ { "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", "MetricConstraint": "NO_GROUP_EVENTS", - "MetricExpr": "CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound", + "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound - tma_cxl_mem_bound if #has_pmem > 0 else CYCLE_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks + (CYCLE_ACTIVITY.STALLS_L1D_MISS - CYCLE_ACTIVITY.STALLS_L2_MISS) / tma_info_thread_clks - tma_l2_bound)", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_dram_bound", "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -673,7 +722,7 @@ "MetricGroup": "BvMB;MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -865,6 +914,7 @@ }, { "BriefDescription": "Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_frontend_bound * (tma_fetch_bandwidth / (tma_fetch_bandwidth + tma_fetch_latency)) * (tma_dsb / (tma_dsb + tma_mite + tma_ms)))", "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", "MetricName": "tma_info_botlnk_l2_dsb_bandwidth", @@ -1320,7 +1370,7 @@ "MetricName": "tma_info_memory_tlb_store_stlb_mpki" }, { - "BriefDescription": "", + "BriefDescription": "Mem;Backend;CacheHits", "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" @@ -1338,6 +1388,12 @@ "MetricName": "tma_info_pipeline_fetch_mite" }, { + "BriefDescription": "Average number of uops fetched from MS per cycle", + "MetricExpr": "IDQ.MS_UOPS / cpu@IDQ.MS_UOPS\\,cmask\\=1@", + "MetricGroup": "Fed;FetchLat;MicroSeq", + "MetricName": "tma_info_pipeline_fetch_ms" + }, + { "BriefDescription": "Instructions per a microcode Assist invocation", "MetricExpr": "INST_RETIRED.ANY / ASSISTS.ANY", "MetricGroup": "MicroSeq;Pipeline;Ret;Retire", @@ -1353,7 +1409,7 @@ }, { "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / tma_info_system_time", + "MetricExpr": "tma_info_system_turbo_utilization * msr@tsc@ / 1e9 / tma_info_system_time", "MetricGroup": "Power;Summary", "MetricName": "tma_info_system_core_frequency" }, @@ -1365,16 +1421,28 @@ }, { "BriefDescription": "Average number of utilized CPUs", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", "MetricGroup": "Summary", "MetricName": "tma_info_system_cpus_utilized" }, { + "BriefDescription": "Average 3DXP Memory Bandwidth Use for reads [GB / sec]", + "MetricExpr": "(64 * UNC_M_PMM_RPQ_INSERTS / 1e9 / tma_info_system_time if #has_pmem > 0 else 0)", + "MetricGroup": "MemOffcore;MemoryBW;Server;SoC", + "MetricName": "tma_info_system_cxl_mem_read_bw" + }, + { + "BriefDescription": "Average 3DXP Memory Bandwidth Use for Writes [GB / sec]", + "MetricExpr": "(64 * UNC_M_PMM_WPQ_INSERTS / 1e9 / tma_info_system_time if #has_pmem > 0 else 0)", + "MetricGroup": "MemOffcore;MemoryBW;Server;SoC", + "MetricName": "tma_info_system_cxl_mem_write_bw" + }, + { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / tma_info_system_time", "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_mem_bandwidth, tma_sq_full" + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Giga Floating Point Operations Per Second", @@ -1433,12 +1501,20 @@ }, { "BriefDescription": "Average number of parallel data read requests to external memory", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD@thresh\\=1@", "MetricGroup": "Mem;MemoryBW;SoC", "MetricName": "tma_info_system_mem_parallel_reads", "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches" }, { + "BriefDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]", + "MetricExpr": "(1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM) / cha_0@event\\=0x0@ if #has_pmem > 0 else 0)", + "MetricGroup": "MemOffcore;MemoryLat;Server;SoC", + "MetricName": "tma_info_system_mem_pmm_read_latency", + "PublicDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" + }, + { "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD) / (tma_info_system_socket_clks / tma_info_system_time)", "MetricGroup": "Mem;MemoryLat;SoC", @@ -1590,12 +1666,12 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", + "BriefDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", "MetricExpr": "min(2 * (MEM_INST_RETIRED.ALL_LOADS - MEM_LOAD_RETIRED.FB_HIT - MEM_LOAD_RETIRED.L1_MISS) * 20 / 100, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_thread_clks", "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_l1_latency_dependency", "MetricThreshold": "tma_l1_latency_dependency > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", + "PublicDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", "ScaleUnit": "100%" }, { @@ -1619,7 +1695,6 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks", "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", @@ -1633,7 +1708,7 @@ "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_cache_memory_latency, tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_data_cache_memory_latency, tma_mem_latency", "ScaleUnit": "100%" }, { @@ -1739,7 +1814,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { @@ -1748,7 +1823,7 @@ "MetricGroup": "BvML;MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -1816,7 +1891,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the Microcode Sequencer (MS) unit - see Microcode_Sequencer node for details.", - "MetricExpr": "cpu@IDQ.MS_UOPS\\,cmask\\=1@ / tma_info_core_core_clks / 2", + "MetricExpr": "cpu@IDQ.MS_UOPS\\,cmask\\=1@ / tma_info_core_core_clks / 3.3", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_ms", "MetricThreshold": "tma_ms > 0.05 & tma_fetch_bandwidth > 0.2", @@ -1968,7 +2043,7 @@ { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", "DefaultMetricgroupName": "TopdownL1", - "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", + "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound)", "MetricGroup": "BvUW;Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -2005,7 +2080,6 @@ }, { "BriefDescription": "This metric represents rate of split store accesses", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group", "MetricName": "tma_split_stores", @@ -2019,7 +2093,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { @@ -2033,7 +2107,6 @@ }, { "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_store_fwd_blk", diff --git a/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json b/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json index de651ff9f846..969cb519eec1 100644 --- a/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json +++ b/tools/perf/pmu-events/arch/x86/ivybridge/ivb-metrics.json @@ -1,49 +1,49 @@ [ { "BriefDescription": "C2 residency percent per package", - "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c2\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C2_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per core", - "MetricExpr": "cstate_core@c3\\-residency@ / TSC", + "MetricExpr": "cstate_core@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per package", - "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per core", - "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricExpr": "cstate_core@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per package", - "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per core", - "MetricExpr": "cstate_core@c7\\-residency@ / TSC", + "MetricExpr": "cstate_core@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per package", - "MetricExpr": "cstate_pkg@c7\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Pkg_Residency", "ScaleUnit": "100%" @@ -80,7 +80,6 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5) / (3 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", @@ -98,7 +97,6 @@ }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "1 - (tma_frontend_bound + tma_bad_speculation + tma_retiring)", "MetricGroup": "BvOB;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", @@ -139,7 +137,6 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_cisc", @@ -561,7 +558,7 @@ "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" }, { - "BriefDescription": "", + "BriefDescription": "Mem;Backend;CacheHits", "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" @@ -574,7 +571,7 @@ }, { "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / tma_info_system_time", + "MetricExpr": "tma_info_system_turbo_utilization * msr@tsc@ / 1e9 / tma_info_system_time", "MetricGroup": "Power;Summary", "MetricName": "tma_info_system_core_frequency" }, @@ -586,7 +583,7 @@ }, { "BriefDescription": "Average number of utilized CPUs", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", "MetricGroup": "Summary", "MetricName": "tma_info_system_cpus_utilized" }, @@ -775,7 +772,6 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_load_op_utilization", @@ -926,7 +922,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_core_clks)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -935,7 +931,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / tma_info_core_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -944,7 +940,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", diff --git a/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json b/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json index 714d5e6d21e7..1cdd197ac883 100644 --- a/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json +++ b/tools/perf/pmu-events/arch/x86/ivytown/ivt-metrics.json @@ -1,49 +1,49 @@ [ { "BriefDescription": "C2 residency percent per package", - "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c2\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C2_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per core", - "MetricExpr": "cstate_core@c3\\-residency@ / TSC", + "MetricExpr": "cstate_core@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per package", - "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per core", - "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricExpr": "cstate_core@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per package", - "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per core", - "MetricExpr": "cstate_core@c7\\-residency@ / TSC", + "MetricExpr": "cstate_core@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per package", - "MetricExpr": "cstate_pkg@c7\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Pkg_Residency", "ScaleUnit": "100%" @@ -80,7 +80,6 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5) / (3 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", @@ -98,7 +97,6 @@ }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "1 - (tma_frontend_bound + tma_bad_speculation + tma_retiring)", "MetricGroup": "BvOB;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", @@ -139,7 +137,6 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_cisc", @@ -561,7 +558,7 @@ "MetricThreshold": "tma_info_memory_tlb_page_walks_utilization > 0.5" }, { - "BriefDescription": "", + "BriefDescription": "Mem;Backend;CacheHits", "MetricExpr": "UOPS_EXECUTED.THREAD / (cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" @@ -574,7 +571,7 @@ }, { "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / tma_info_system_time", + "MetricExpr": "tma_info_system_turbo_utilization * msr@tsc@ / 1e9 / tma_info_system_time", "MetricGroup": "Power;Summary", "MetricName": "tma_info_system_core_frequency" }, @@ -586,7 +583,7 @@ }, { "BriefDescription": "Average number of utilized CPUs", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", "MetricGroup": "Summary", "MetricName": "tma_info_system_cpus_utilized" }, @@ -626,6 +623,7 @@ }, { "BriefDescription": "Average number of parallel data read requests to external memory", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182@ / UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182\\,thresh\\=1@", "MetricGroup": "Mem;MemoryBW;SoC", "MetricName": "tma_info_system_mem_parallel_reads", @@ -795,7 +793,6 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_load_op_utilization", @@ -805,6 +802,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from local memory", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "200 * (MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks", "MetricGroup": "Server;TopdownL5;tma_L5_group;tma_mem_latency_group", "MetricName": "tma_local_mem", @@ -955,7 +953,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else (min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_core_clks)", + "MetricExpr": "(cpu@UOPS_EXECUTED.CORE\\,inv\\,cmask\\=1@ / 2 if #SMT_on else min(CPU_CLK_UNHALTED.THREAD, CYCLE_ACTIVITY.CYCLES_NO_EXECUTE) - (RS_EVENTS.EMPTY_CYCLES if tma_fetch_latency > 0.1 else 0)) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", "MetricThreshold": "tma_ports_utilized_0 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -964,7 +962,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / tma_info_core_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=1@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@) / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_1_UOP_EXEC - UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", "MetricThreshold": "tma_ports_utilized_1 > 0.2 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -973,7 +971,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else (UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks)", + "MetricExpr": "((cpu@UOPS_EXECUTED.CORE\\,cmask\\=2@ - cpu@UOPS_EXECUTED.CORE\\,cmask\\=3@) / 2 if #SMT_on else UOPS_EXECUTED.CYCLES_GE_2_UOPS_EXEC - UOPS_EXECUTED.CYCLES_GE_3_UOPS_EXEC) / tma_info_core_core_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", "MetricThreshold": "tma_ports_utilized_2 > 0.15 & (tma_ports_utilization > 0.15 & (tma_core_bound > 0.1 & tma_backend_bound > 0.2))", @@ -1000,6 +998,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling loads from remote memory", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "310 * (MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_UOPS_RETIRED.HIT_LFB / (MEM_LOAD_UOPS_RETIRED.L2_HIT + MEM_LOAD_UOPS_RETIRED.LLC_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HIT + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_HITM + MEM_LOAD_UOPS_LLC_HIT_RETIRED.XSNP_MISS + MEM_LOAD_UOPS_LLC_MISS_RETIRED.LOCAL_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_DRAM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_HITM + MEM_LOAD_UOPS_LLC_MISS_RETIRED.REMOTE_FWD))) / tma_info_thread_clks", "MetricGroup": "Server;Snoop;TopdownL5;tma_L5_group;tma_mem_latency_group", "MetricName": "tma_remote_mem", diff --git a/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json b/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json index 6f636ea0f216..250c73b21385 100644 --- a/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json +++ b/tools/perf/pmu-events/arch/x86/jaketown/jkt-metrics.json @@ -1,49 +1,49 @@ [ { "BriefDescription": "C2 residency percent per package", - "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c2\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C2_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per core", - "MetricExpr": "cstate_core@c3\\-residency@ / TSC", + "MetricExpr": "cstate_core@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per package", - "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per core", - "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricExpr": "cstate_core@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per package", - "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per core", - "MetricExpr": "cstate_core@c7\\-residency@ / TSC", + "MetricExpr": "cstate_core@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per package", - "MetricExpr": "cstate_pkg@c7\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Pkg_Residency", "ScaleUnit": "100%" @@ -71,7 +71,6 @@ }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "1 - (tma_frontend_bound + tma_bad_speculation + tma_retiring)", "MetricGroup": "BvOB;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", @@ -296,7 +295,7 @@ }, { "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / tma_info_system_time", + "MetricExpr": "tma_info_system_turbo_utilization * msr@tsc@ / 1e9 / tma_info_system_time", "MetricGroup": "Power;Summary", "MetricName": "tma_info_system_core_frequency" }, @@ -308,7 +307,7 @@ }, { "BriefDescription": "Average number of utilized CPUs", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", "MetricGroup": "Summary", "MetricName": "tma_info_system_cpus_utilized" }, @@ -348,6 +347,7 @@ }, { "BriefDescription": "Average number of parallel data read requests to external memory", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182@ / UNC_C_TOR_OCCUPANCY.MISS_OPCODE@filter_opc\\=0x182\\,thresh\\=1@", "MetricGroup": "Mem;MemoryBW;SoC", "MetricName": "tma_info_system_mem_parallel_reads", diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/cache.json b/tools/perf/pmu-events/arch/x86/lunarlake/cache.json index ff37d49611c3..402ca8fc50b6 100644 --- a/tools/perf/pmu-events/arch/x86/lunarlake/cache.json +++ b/tools/perf/pmu-events/arch/x86/lunarlake/cache.json @@ -29,6 +29,16 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Cachelines replaced into the L1 d-cache. Successful replacements only (not blocked) and exclude WB-miss case", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x51", + "EventName": "L1D.L1_REPLACEMENT", + "PublicDescription": "Counts cachelines replaced into the L1 d-cache.", + "SampleAfterValue": "1000003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { "BriefDescription": "Cachelines replaced into the L0 and L1 d-cache. Successful replacements only (not blocked) and exclude WB-miss case", "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x51", @@ -592,7 +602,7 @@ "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_INST_RETIRED.ALL_LOADS", - "PublicDescription": "Counts Instructions with at least one architecturally visible load retired. Available PDIST counters: 0", + "PublicDescription": "Counts Instructions with at least one architecturally visible load retired. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x81", "Unit": "cpu_core" @@ -603,7 +613,7 @@ "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_INST_RETIRED.ALL_STORES", - "PublicDescription": "Counts all retired store instructions. Available PDIST counters: 0", + "PublicDescription": "Counts all retired store instructions. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x82", "Unit": "cpu_core" @@ -613,7 +623,7 @@ "Counter": "0,1,2,3", "EventCode": "0xd0", "EventName": "MEM_INST_RETIRED.ALL_SWPF", - "PublicDescription": "Counts all retired software prefetch instructions. Available PDIST counters: 0", + "PublicDescription": "Counts all retired software prefetch instructions. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x84", "Unit": "cpu_core" @@ -624,7 +634,7 @@ "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_INST_RETIRED.ANY", - "PublicDescription": "Counts all retired memory instructions - loads and stores. Available PDIST counters: 0", + "PublicDescription": "Counts all retired memory instructions - loads and stores. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x87", "Unit": "cpu_core" @@ -635,7 +645,7 @@ "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_INST_RETIRED.LOCK_LOADS", - "PublicDescription": "Counts retired load instructions with locked access. Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions with locked access. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x21", "Unit": "cpu_core" @@ -646,7 +656,7 @@ "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_INST_RETIRED.SPLIT_LOADS", - "PublicDescription": "Counts retired load instructions that split across a cacheline boundary. Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions that split across a cacheline boundary. Available PDIST counters: 0,1", "SampleAfterValue": "100003", "UMask": "0x41", "Unit": "cpu_core" @@ -657,18 +667,29 @@ "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_INST_RETIRED.SPLIT_STORES", - "PublicDescription": "Counts retired store instructions that split across a cacheline boundary. Available PDIST counters: 0", + "PublicDescription": "Counts retired store instructions that split across a cacheline boundary. Available PDIST counters: 0,1", "SampleAfterValue": "100003", "UMask": "0x42", "Unit": "cpu_core" }, { + "BriefDescription": "Retired instructions that hit the STLB.", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd0", + "EventName": "MEM_INST_RETIRED.STLB_HIT_ANY", + "PublicDescription": "Number of retired instructions with a clean hit in the 2nd-level TLB (STLB). Available PDIST counters: 0,1", + "SampleAfterValue": "100003", + "UMask": "0xf", + "Unit": "cpu_core" + }, + { "BriefDescription": "Retired load instructions that hit the STLB.", "Counter": "0,1,2,3", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_INST_RETIRED.STLB_HIT_LOADS", - "PublicDescription": "Number of retired load instructions with a clean hit in the 2nd-level TLB (STLB). Available PDIST counters: 0", + "PublicDescription": "Number of retired load instructions with a clean hit in the 2nd-level TLB (STLB). Available PDIST counters: 0,1", "SampleAfterValue": "100003", "UMask": "0x9", "Unit": "cpu_core" @@ -679,18 +700,39 @@ "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_INST_RETIRED.STLB_HIT_STORES", - "PublicDescription": "Number of retired store instructions that hit in the 2nd-level TLB (STLB). Available PDIST counters: 0", + "PublicDescription": "Number of retired store instructions that hit in the 2nd-level TLB (STLB). Available PDIST counters: 0,1", "SampleAfterValue": "100003", "UMask": "0xa", "Unit": "cpu_core" }, { + "BriefDescription": "Retired SWPF instructions that hit the STLB.", + "Counter": "0,1,2,3", + "EventCode": "0xd0", + "EventName": "MEM_INST_RETIRED.STLB_HIT_SWPF", + "PublicDescription": "Number of retired SWPF instructions that hit in the 2nd-level TLB (STLB). Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0xc", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired instructions that miss the STLB.", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd0", + "EventName": "MEM_INST_RETIRED.STLB_MISS_ANY", + "PublicDescription": "Retired instructions that miss the STLB. Available PDIST counters: 0,1", + "SampleAfterValue": "100003", + "UMask": "0x17", + "Unit": "cpu_core" + }, + { "BriefDescription": "Retired load instructions that miss the STLB.", "Counter": "0,1,2,3", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_INST_RETIRED.STLB_MISS_LOADS", - "PublicDescription": "Number of retired load instructions that (start a) miss in the 2nd-level TLB (STLB). Available PDIST counters: 0", + "PublicDescription": "Number of retired load instructions that (start a) miss in the 2nd-level TLB (STLB). Available PDIST counters: 0,1", "SampleAfterValue": "100003", "UMask": "0x11", "Unit": "cpu_core" @@ -701,18 +743,28 @@ "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_INST_RETIRED.STLB_MISS_STORES", - "PublicDescription": "Number of retired store instructions that (start a) miss in the 2nd-level TLB (STLB). Available PDIST counters: 0", + "PublicDescription": "Number of retired store instructions that (start a) miss in the 2nd-level TLB (STLB). Available PDIST counters: 0,1", "SampleAfterValue": "100003", "UMask": "0x12", "Unit": "cpu_core" }, { + "BriefDescription": "Retired SWPF instructions that miss the STLB.", + "Counter": "0,1,2,3", + "EventCode": "0xd0", + "EventName": "MEM_INST_RETIRED.STLB_MISS_SWPF", + "PublicDescription": "Number of retired SWPF instructions that (start a) miss in the 2nd-level TLB (STLB). Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x14", + "Unit": "cpu_core" + }, + { "BriefDescription": "Retired load instructions whose data sources were a cross-core Snoop hits and forwards data from an in on-package core cache (induced by NI$)", "Counter": "0,1,2,3", "Data_LA": "1", "EventCode": "0xd2", "EventName": "MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD", - "PublicDescription": "Counts retired load instructions whose data sources were a cross-core Snoop hits and forwards data from an in on-package core cache (induced by NI$) Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions whose data sources were a cross-core Snoop hits and forwards data from an in on-package core cache (induced by NI$) Available PDIST counters: 0,1", "SampleAfterValue": "20011", "UMask": "0x10", "Unit": "cpu_core" @@ -723,7 +775,7 @@ "Data_LA": "1", "EventCode": "0xd2", "EventName": "MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM", - "PublicDescription": "Counts retired load instructions whose data sources were HitM responses from shared L3, Hit-with-FWD is normally excluded. Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions whose data sources were HitM responses from shared L3, Hit-with-FWD is normally excluded. Available PDIST counters: 0,1", "SampleAfterValue": "20011", "UMask": "0x4", "Unit": "cpu_core" @@ -734,7 +786,7 @@ "Data_LA": "1", "EventCode": "0xd2", "EventName": "MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS", - "PublicDescription": "Counts the retired load instructions whose data sources were L3 hit and cross-core snoop missed in on-pkg core cache. Available PDIST counters: 0", + "PublicDescription": "Counts the retired load instructions whose data sources were L3 hit and cross-core snoop missed in on-pkg core cache. Available PDIST counters: 0,1", "SampleAfterValue": "20011", "UMask": "0x1", "Unit": "cpu_core" @@ -745,7 +797,7 @@ "Data_LA": "1", "EventCode": "0xd2", "EventName": "MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD", - "PublicDescription": "Counts retired load instructions whose data sources were L3 and cross-core snoop hits in on-pkg core cache. Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions whose data sources were L3 and cross-core snoop hits in on-pkg core cache. Available PDIST counters: 0,1", "SampleAfterValue": "20011", "UMask": "0x2", "Unit": "cpu_core" @@ -756,7 +808,7 @@ "Data_LA": "1", "EventCode": "0xd3", "EventName": "MEM_LOAD_L3_MISS_RETIRED.MEMSIDE_CACHE", - "PublicDescription": "Retired load instructions which data source is memory side cache. Available PDIST counters: 0", + "PublicDescription": "Retired load instructions which data source is memory side cache. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "Unit": "cpu_core" }, @@ -766,7 +818,7 @@ "Data_LA": "1", "EventCode": "0xd4", "EventName": "MEM_LOAD_MISC_RETIRED.UC", - "PublicDescription": "Retired instructions with at least one load to uncacheable memory-type, or at least one cache-line split locked access (Bus Lock). Available PDIST counters: 0", + "PublicDescription": "Retired instructions with at least one load to uncacheable memory-type, or at least one cache-line split locked access (Bus Lock). Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x4", "Unit": "cpu_core" @@ -777,7 +829,7 @@ "Data_LA": "1", "EventCode": "0xd1", "EventName": "MEM_LOAD_RETIRED.FB_HIT", - "PublicDescription": "Counts retired load instructions with at least one uop was load missed in L1 but hit FB (Fill Buffers) due to preceding miss to the same cache line with data not ready. Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions with at least one uop was load missed in L1 but hit FB (Fill Buffers) due to preceding miss to the same cache line with data not ready. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x40", "Unit": "cpu_core" @@ -788,7 +840,7 @@ "Data_LA": "1", "EventCode": "0xd1", "EventName": "MEM_LOAD_RETIRED.L1_HIT", - "PublicDescription": "Counts retired load instructions with at least one uop that hit in the L1 data cache. This event includes all SW prefetches and lock instructions regardless of the data source. Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions with at least one uop that hit in the L1 data cache. This event includes all SW prefetches and lock instructions regardless of the data source. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x101", "Unit": "cpu_core" @@ -799,7 +851,7 @@ "Data_LA": "1", "EventCode": "0xd1", "EventName": "MEM_LOAD_RETIRED.L1_HIT_L0", - "PublicDescription": "Counts retired load instructions with at least one uop that hit in the Level 0 of the L1 data cache. This event includes all SW prefetches and lock instructions regardless of the data source. Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions with at least one uop that hit in the Level 0 of the L1 data cache. This event includes all SW prefetches and lock instructions regardless of the data source. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -809,7 +861,7 @@ "Counter": "0,1,2,3", "EventCode": "0xd1", "EventName": "MEM_LOAD_RETIRED.L1_HIT_L1", - "PublicDescription": "Counts retired load instructions with at least one uop that hit in the Level 1 of the L1 data cache. Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions with at least one uop that hit in the Level 1 of the L1 data cache. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "Unit": "cpu_core" }, @@ -819,7 +871,7 @@ "Data_LA": "1", "EventCode": "0xd1", "EventName": "MEM_LOAD_RETIRED.L1_MISS", - "PublicDescription": "Counts retired load instructions with at least one uop that missed in the L1 cache. Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions with at least one uop that missed in the L1 cache. Available PDIST counters: 0,1", "SampleAfterValue": "200003", "UMask": "0x8", "Unit": "cpu_core" @@ -830,7 +882,7 @@ "Data_LA": "1", "EventCode": "0xd1", "EventName": "MEM_LOAD_RETIRED.L2_HIT", - "PublicDescription": "Counts retired load instructions with L2 cache hits as data sources. Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions with L2 cache hits as data sources. Available PDIST counters: 0,1", "SampleAfterValue": "200003", "UMask": "0x2", "Unit": "cpu_core" @@ -841,7 +893,7 @@ "Data_LA": "1", "EventCode": "0xd1", "EventName": "MEM_LOAD_RETIRED.L2_MISS", - "PublicDescription": "Counts retired load instructions missed L2 cache as data sources. Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions missed L2 cache as data sources. Available PDIST counters: 0,1", "SampleAfterValue": "100021", "UMask": "0x10", "Unit": "cpu_core" @@ -852,7 +904,7 @@ "Data_LA": "1", "EventCode": "0xd1", "EventName": "MEM_LOAD_RETIRED.L3_HIT", - "PublicDescription": "Counts retired load instructions with at least one uop that hit in the L3 cache. Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions with at least one uop that hit in the L3 cache. Available PDIST counters: 0,1", "SampleAfterValue": "100021", "UMask": "0x4", "Unit": "cpu_core" @@ -863,7 +915,7 @@ "Data_LA": "1", "EventCode": "0xd1", "EventName": "MEM_LOAD_RETIRED.L3_MISS", - "PublicDescription": "Counts retired load instructions with at least one uop that missed in the L3 cache. Available PDIST counters: 0", + "PublicDescription": "Counts retired load instructions with at least one uop that missed in the L3 cache. Available PDIST counters: 0,1", "SampleAfterValue": "50021", "UMask": "0x20", "Unit": "cpu_core" @@ -1254,6 +1306,18 @@ "Unit": "cpu_atom" }, { + "BriefDescription": "Counts writebacks of modified cachelines that were supplied by the L3 cache.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.COREWB_M.L3_HIT", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x7E001E00008", + "PublicDescription": "Counts writebacks of modified cachelines that were supplied by the L3 cache. Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { "BriefDescription": "Counts writebacks of non-modified cachelines that have any type of response.", "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xB7", @@ -1266,6 +1330,18 @@ "Unit": "cpu_atom" }, { + "BriefDescription": "Counts writebacks of non-modified cachelines that were supplied by the L3 cache.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.COREWB_NONM.L3_HIT", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x7E001E01000", + "PublicDescription": "Counts writebacks of non-modified cachelines that were supplied by the L3 cache. Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { "BriefDescription": "Counts demand instruction fetches and L1 instruction cache prefetches that have any type of response.", "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xB7", @@ -1303,7 +1379,7 @@ }, { "BriefDescription": "Counts demand data reads that have any type of response.", - "Counter": "0,1,2,3", + "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_DATA_RD.ANY_RESPONSE", "MSRIndex": "0x1a6,0x1a7", @@ -1315,7 +1391,7 @@ }, { "BriefDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop hit in another cores caches, data forwarding is required as the data is modified.", - "Counter": "0,1,2,3", + "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM", "MSRIndex": "0x1a6,0x1a7", @@ -1327,7 +1403,7 @@ }, { "BriefDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop hit in another cores caches which forwarded the unmodified data to the requesting core.", - "Counter": "0,1,2,3", + "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD", "MSRIndex": "0x1a6,0x1a7", @@ -1363,7 +1439,7 @@ }, { "BriefDescription": "Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that have any type of response.", - "Counter": "0,1,2,3", + "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_RFO.ANY_RESPONSE", "MSRIndex": "0x1a6,0x1a7", @@ -1375,7 +1451,7 @@ }, { "BriefDescription": "Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that were supplied by the L3 cache where a snoop hit in another cores caches, data forwarding is required as the data is modified.", - "Counter": "0,1,2,3", + "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM", "MSRIndex": "0x1a6,0x1a7", @@ -1386,6 +1462,18 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Counts all data read, code read, RFO and ITOM requests including demands and prefetches to the core caches (L1 or L2) that were supplied by the L3 cache.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.READS_TO_CORE.L3_HIT", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x7E001E04477", + "PublicDescription": "Counts all data read, code read, RFO and ITOM requests including demands and prefetches to the core caches (L1 or L2) that were supplied by the L3 cache. Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { "BriefDescription": "Any memory transaction that reached the SQ.", "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x21", diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/frontend.json b/tools/perf/pmu-events/arch/x86/lunarlake/frontend.json index e2facc4086e9..b21d602e9f1a 100644 --- a/tools/perf/pmu-events/arch/x86/lunarlake/frontend.json +++ b/tools/perf/pmu-events/arch/x86/lunarlake/frontend.json @@ -108,7 +108,7 @@ "EventName": "FRONTEND_RETIRED.ANY_ANT", "MSRIndex": "0x3F7", "MSRValue": "0x9", - "PublicDescription": "Always Not Taken (ANT) conditional retired branches (no BTB entry and not mispredicted) Available PDIST counters: 0", + "PublicDescription": "Always Not Taken (ANT) conditional retired branches (no BTB entry and not mispredicted) Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -120,7 +120,7 @@ "EventName": "FRONTEND_RETIRED.ANY_DSB_MISS", "MSRIndex": "0x3F7", "MSRValue": "0x1", - "PublicDescription": "Counts retired Instructions that experienced DSB (Decode stream buffer i.e. the decoded instruction-cache) miss. Available PDIST counters: 0", + "PublicDescription": "Counts retired Instructions that experienced DSB (Decode stream buffer i.e. the decoded instruction-cache) miss. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -169,7 +169,7 @@ "EventName": "FRONTEND_RETIRED.DSB_MISS", "MSRIndex": "0x3F7", "MSRValue": "0x11", - "PublicDescription": "Number of retired Instructions that experienced a critical DSB (Decode stream buffer i.e. the decoded instruction-cache) miss. Critical means stalls were exposed to the back-end as a result of the DSB miss. Available PDIST counters: 0", + "PublicDescription": "Number of retired Instructions that experienced a critical DSB (Decode stream buffer i.e. the decoded instruction-cache) miss. Critical means stalls were exposed to the back-end as a result of the DSB miss. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -199,7 +199,7 @@ "EventName": "FRONTEND_RETIRED.ITLB_MISS", "MSRIndex": "0x3F7", "MSRValue": "0x14", - "PublicDescription": "Counts retired Instructions that experienced iTLB (Instruction TLB) true miss. Available PDIST counters: 0", + "PublicDescription": "Counts retired Instructions that experienced iTLB (Instruction TLB) true miss. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -211,7 +211,7 @@ "EventName": "FRONTEND_RETIRED.L1I_MISS", "MSRIndex": "0x3F7", "MSRValue": "0x12", - "PublicDescription": "Counts retired Instructions who experienced Instruction L1 Cache true miss. Available PDIST counters: 0", + "PublicDescription": "Counts retired Instructions who experienced Instruction L1 Cache true miss. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -223,7 +223,7 @@ "EventName": "FRONTEND_RETIRED.L2_MISS", "MSRIndex": "0x3F7", "MSRValue": "0x13", - "PublicDescription": "Counts retired Instructions who experienced Instruction L2 Cache true miss. Available PDIST counters: 0", + "PublicDescription": "Counts retired Instructions who experienced Instruction L2 Cache true miss. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -235,7 +235,7 @@ "EventName": "FRONTEND_RETIRED.LATENCY_GE_128", "MSRIndex": "0x3F7", "MSRValue": "0x608006", - "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 128 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0", + "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 128 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -247,7 +247,7 @@ "EventName": "FRONTEND_RETIRED.LATENCY_GE_16", "MSRIndex": "0x3F7", "MSRValue": "0x601006", - "PublicDescription": "Counts retired instructions that are delivered to the back-end after a front-end stall of at least 16 cycles. During this period the front-end delivered no uops. Available PDIST counters: 0", + "PublicDescription": "Counts retired instructions that are delivered to the back-end after a front-end stall of at least 16 cycles. During this period the front-end delivered no uops. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -259,7 +259,7 @@ "EventName": "FRONTEND_RETIRED.LATENCY_GE_2", "MSRIndex": "0x3F7", "MSRValue": "0x600206", - "PublicDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of at least 2 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0", + "PublicDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of at least 2 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -271,7 +271,7 @@ "EventName": "FRONTEND_RETIRED.LATENCY_GE_256", "MSRIndex": "0x3F7", "MSRValue": "0x610006", - "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 256 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0", + "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 256 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -283,7 +283,7 @@ "EventName": "FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1", "MSRIndex": "0x3F7", "MSRValue": "0x100206", - "PublicDescription": "Counts retired instructions that are delivered to the back-end after the front-end had at least 1 bubble-slot for a period of 2 cycles. A bubble-slot is an empty issue-pipeline slot while there was no RAT stall. Available PDIST counters: 0", + "PublicDescription": "Counts retired instructions that are delivered to the back-end after the front-end had at least 1 bubble-slot for a period of 2 cycles. A bubble-slot is an empty issue-pipeline slot while there was no RAT stall. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -295,7 +295,7 @@ "EventName": "FRONTEND_RETIRED.LATENCY_GE_32", "MSRIndex": "0x3F7", "MSRValue": "0x602006", - "PublicDescription": "Counts retired instructions that are delivered to the back-end after a front-end stall of at least 32 cycles. During this period the front-end delivered no uops. Available PDIST counters: 0", + "PublicDescription": "Counts retired instructions that are delivered to the back-end after a front-end stall of at least 32 cycles. During this period the front-end delivered no uops. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -307,7 +307,7 @@ "EventName": "FRONTEND_RETIRED.LATENCY_GE_4", "MSRIndex": "0x3F7", "MSRValue": "0x600406", - "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 4 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0", + "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 4 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -319,7 +319,7 @@ "EventName": "FRONTEND_RETIRED.LATENCY_GE_512", "MSRIndex": "0x3F7", "MSRValue": "0x620006", - "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 512 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0", + "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 512 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -331,7 +331,7 @@ "EventName": "FRONTEND_RETIRED.LATENCY_GE_64", "MSRIndex": "0x3F7", "MSRValue": "0x604006", - "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 64 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0", + "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 64 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -343,7 +343,7 @@ "EventName": "FRONTEND_RETIRED.LATENCY_GE_8", "MSRIndex": "0x3F7", "MSRValue": "0x600806", - "PublicDescription": "Counts retired instructions that are delivered to the back-end after a front-end stall of at least 8 cycles. During this period the front-end delivered no uops. Available PDIST counters: 0", + "PublicDescription": "Counts retired instructions that are delivered to the back-end after a front-end stall of at least 8 cycles. During this period the front-end delivered no uops. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -355,7 +355,7 @@ "EventName": "FRONTEND_RETIRED.MISP_ANT", "MSRIndex": "0x3F7", "MSRValue": "0x9", - "PublicDescription": "ANT retired branches that got just mispredicted Available PDIST counters: 0", + "PublicDescription": "ANT retired branches that got just mispredicted Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x2", "Unit": "cpu_core" @@ -367,7 +367,7 @@ "EventName": "FRONTEND_RETIRED.MS_FLOWS", "MSRIndex": "0x3F7", "MSRValue": "0x8", - "PublicDescription": "Counts flows delivered by the Microcode Sequencer Available PDIST counters: 0", + "PublicDescription": "Counts flows delivered by the Microcode Sequencer Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -397,7 +397,7 @@ "EventName": "FRONTEND_RETIRED.STLB_MISS", "MSRIndex": "0x3F7", "MSRValue": "0x15", - "PublicDescription": "Counts retired Instructions that experienced STLB (2nd level TLB) true miss. Available PDIST counters: 0", + "PublicDescription": "Counts retired Instructions that experienced STLB (2nd level TLB) true miss. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" @@ -409,7 +409,7 @@ "EventName": "FRONTEND_RETIRED.UNKNOWN_BRANCH", "MSRIndex": "0x3F7", "MSRValue": "0x17", - "PublicDescription": "Number retired branch instructions that caused the front-end to be resteered when it finds the instruction in a fetch line. This is called Unknown Branch which occurs for the first time a branch instruction is fetched or when the branch is not tracked by the BPU (Branch Prediction Unit) anymore. Available PDIST counters: 0", + "PublicDescription": "Number retired branch instructions that caused the front-end to be resteered when it finds the instruction in a fetch line. This is called Unknown Branch which occurs for the first time a branch instruction is fetched or when the branch is not tracked by the BPU (Branch Prediction Unit) anymore. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x3", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/lnl-metrics.json b/tools/perf/pmu-events/arch/x86/lunarlake/lnl-metrics.json index 3c740962e63e..06390a72110d 100644 --- a/tools/perf/pmu-events/arch/x86/lunarlake/lnl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/lunarlake/lnl-metrics.json @@ -1,75 +1,47 @@ [ { "BriefDescription": "C10 residency percent per package", - "MetricExpr": "cstate_pkg@c10\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c10\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C10_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C1 residency percent per core", - "MetricExpr": "cstate_core@c1\\-residency@ / TSC", + "MetricExpr": "cstate_core@c1\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C1_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C2 residency percent per package", - "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c2\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C2_Pkg_Residency", "ScaleUnit": "100%" }, { - "BriefDescription": "C3 residency percent per package", - "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC", - "MetricGroup": "Power", - "MetricName": "C3_Pkg_Residency", - "ScaleUnit": "100%" - }, - { "BriefDescription": "C6 residency percent per core", - "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricExpr": "cstate_core@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per package", - "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per core", - "MetricExpr": "cstate_core@c7\\-residency@ / TSC", + "MetricExpr": "cstate_core@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Core_Residency", "ScaleUnit": "100%" }, { - "BriefDescription": "C7 residency percent per package", - "MetricExpr": "cstate_pkg@c7\\-residency@ / TSC", - "MetricGroup": "Power", - "MetricName": "C7_Pkg_Residency", - "ScaleUnit": "100%" - }, - { - "BriefDescription": "C8 residency percent per package", - "MetricExpr": "cstate_pkg@c8\\-residency@ / TSC", - "MetricGroup": "Power", - "MetricName": "C8_Pkg_Residency", - "ScaleUnit": "100%" - }, - { - "BriefDescription": "C9 residency percent per package", - "MetricExpr": "cstate_pkg@c9\\-residency@ / TSC", - "MetricGroup": "Power", - "MetricName": "C9_Pkg_Residency", - "ScaleUnit": "100%" - }, - { "BriefDescription": "Percentage of cycles spent in System Management Interrupts.", "MetricExpr": "((msr@aperf@ - cycles) / msr@aperf@ if msr@smi@ > 0 else 0)", "MetricGroup": "smi", @@ -555,7 +527,7 @@ }, { "BriefDescription": "Average CPU Utilization", - "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.REF_TSC@ / TSC", + "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.REF_TSC@ / msr@tsc\\,cpu=cpu_atom@", "MetricName": "tma_info_system_cpu_utilization", "Unit": "cpu_atom" }, @@ -725,6 +697,13 @@ "Unit": "cpu_atom" }, { + "BriefDescription": "Uncore frequency per die [GHZ]", + "MetricExpr": "tma_info_system_socket_clks / #num_dies / duration_time / 1e9", + "MetricGroup": "SoC", + "MetricName": "UNCORE_FREQ", + "Unit": "cpu_core" + }, + { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", "MetricExpr": "cpu_core@UOPS_DISPATCHED.ALU@ / (6 * tma_info_thread_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", @@ -755,7 +734,7 @@ { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", "DefaultMetricgroupName": "TopdownL1", - "MetricExpr": "cpu_core@topdown\\-be\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricExpr": "cpu_core@topdown\\-be\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@)", "MetricGroup": "BvOB;Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.2", @@ -767,7 +746,7 @@ { "BriefDescription": "This category represents fraction of slots wasted due to incorrect speculations", "DefaultMetricgroupName": "TopdownL1", - "MetricExpr": "cpu_core@topdown\\-bad\\-spec@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricExpr": "cpu_core@topdown\\-bad\\-spec@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@)", "MetricGroup": "Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_bad_speculation", "MetricThreshold": "tma_bad_speculation > 0.15", @@ -794,35 +773,35 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", + "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", + "MetricGroup": "BvCB;Cor;tma_issueComp", + "MetricName": "tma_bottleneck_compute_bound_est", + "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", + "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: ", + "Unit": "cpu_core" + }, + { "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks", - "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_early_blk + tma_store_fwd_blk)))", "MetricGroup": "BvMB;Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_bottleneck_cache_memory_bandwidth", - "MetricThreshold": "tma_bottleneck_cache_memory_bandwidth > 20", + "MetricName": "tma_bottleneck_data_cache_memory_bandwidth", + "MetricThreshold": "tma_bottleneck_data_cache_memory_bandwidth > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full", "Unit": "cpu_core" }, { "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks", - "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_dependency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_capacity / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_lock_latency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_loads / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_stores / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_dependency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_early_blk + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_capacity / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_early_blk + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_lock_latency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_early_blk + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_loads / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_early_blk + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_stores / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "BvML;Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_bottleneck_cache_memory_latency", - "MetricThreshold": "tma_bottleneck_cache_memory_latency > 20", + "MetricName": "tma_bottleneck_data_cache_memory_latency", + "MetricThreshold": "tma_bottleneck_data_cache_memory_latency > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency", "Unit": "cpu_core" }, { - "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", - "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", - "MetricGroup": "BvCB;Cor;tma_issueComp", - "MetricName": "tma_bottleneck_compute_bound_est", - "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", - "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: ", - "Unit": "cpu_core" - }, - { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end)", - "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - (1 - cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_ms / (tma_dsb + tma_lsd + tma_mite + tma_ms))) - tma_bottleneck_big_code", + "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - (1 - cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_ms)) - tma_bottleneck_big_code", "MetricGroup": "BvFB;Fed;FetchBW;Frontend", "MetricName": "tma_bottleneck_instruction_fetch_bw", "MetricThreshold": "tma_bottleneck_instruction_fetch_bw > 20", @@ -830,7 +809,7 @@ }, { "BriefDescription": "Total pipeline cost of irregular execution (e.g", - "MetricExpr": "100 * ((1 - cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_ms / (tma_dsb + tma_lsd + tma_mite + tma_ms)) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + cpu_core@RS.EMPTY_RESOURCE@ / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_microcode_sequencer + tma_few_uops_instructions) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricExpr": "100 * ((1 - cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_ms) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + cpu_core@RS.EMPTY_RESOURCE@ / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_microcode_sequencer + tma_few_uops_instructions) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", "MetricGroup": "Bad;BvIO;Cor;Ret;tma_issueMS", "MetricName": "tma_bottleneck_irregular_overhead", "MetricThreshold": "tma_bottleneck_irregular_overhead > 10", @@ -839,7 +818,7 @@ }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", - "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / (tma_dtlb_load + tma_fb_full + tma_l1_latency_capacity + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_early_blk + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "BvMT;Mem;MemoryTLB;Offcore;tma_issueTLB", "MetricName": "tma_bottleneck_memory_data_tlbs", "MetricThreshold": "tma_bottleneck_memory_data_tlbs > 20", @@ -866,7 +845,7 @@ }, { "BriefDescription": "Total pipeline cost of remaining bottlenecks in the back-end", - "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_cache_memory_bandwidth + tma_bottleneck_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", + "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_data_cache_memory_bandwidth + tma_bottleneck_data_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", "MetricGroup": "BvOB;Cor;Offcore", "MetricName": "tma_bottleneck_other_bottlenecks", "MetricThreshold": "tma_bottleneck_other_bottlenecks > 20", @@ -883,7 +862,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", - "MetricExpr": "cpu_core@topdown\\-br\\-mispredict@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricExpr": "cpu_core@topdown\\-br\\-mispredict@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@)", "MetricGroup": "BadSpec;BrMispredicts;BvMP;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", @@ -1023,7 +1002,6 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS@ * min(cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS@R, 24 * tma_info_system_core_frequency) + cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM@ * min(cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM@R, 25 * tma_info_system_core_frequency)) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks", "MetricGroup": "BvMS;DataSharing;LockCont;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", @@ -1076,7 +1054,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to DSB (decoded uop cache) fetch pipeline", - "MetricExpr": "(cpu_core@IDQ.DSB_UOPS\\,cmask\\=0x8\\,inv\\=0x1@ + cpu_core@IDQ.DSB_UOPS@ / (cpu_core@IDQ.DSB_UOPS@ + cpu_core@IDQ.MITE_UOPS@) * (cpu_core@IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE@ - cpu_core@IDQ_BUBBLES.FETCH_LATENCY@)) / tma_info_thread_clks", + "MetricExpr": "(cpu_core@IDQ.DSB_UOPS\\,cmask\\=0x8\\,inv\\=0x1@ / 2 + cpu_core@IDQ.DSB_UOPS@ / (cpu_core@IDQ.DSB_UOPS@ + cpu_core@IDQ.MITE_UOPS@) * (cpu_core@IDQ_BUBBLES.STARVATION_CYCLES@ - cpu_core@IDQ_BUBBLES.FETCH_LATENCY@)) / tma_info_thread_clks", "MetricGroup": "DSB;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_dsb", "MetricThreshold": "tma_dsb > 0.15 & tma_fetch_bandwidth > 0.2", @@ -1130,7 +1108,7 @@ "MetricGroup": "BvMB;MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -1147,7 +1125,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was stalled due to Frontend latency issues", - "MetricExpr": "cpu_core@topdown\\-fetch\\-lat@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricExpr": "cpu_core@topdown\\-fetch\\-lat@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@)", "MetricGroup": "Frontend;TmaL2;TopdownL2;tma_L2_group;tma_frontend_bound_group", "MetricName": "tma_fetch_latency", "MetricThreshold": "tma_fetch_latency > 0.1 & tma_frontend_bound > 0.15", @@ -1197,7 +1175,7 @@ }, { "BriefDescription": "This metric approximates arithmetic floating-point (FP) scalar uops fraction the CPU has retired", - "MetricExpr": "cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ / (tma_retiring * tma_info_thread_slots)", + "MetricExpr": "cpu_core@FP_ARITH_OPS_RETIRED.SCALAR@ / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_scalar", "MetricThreshold": "tma_fp_scalar > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", @@ -1207,7 +1185,7 @@ }, { "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", - "MetricExpr": "cpu_core@FP_ARITH_INST_RETIRED.VECTOR@ / (tma_retiring * tma_info_thread_slots)", + "MetricExpr": "cpu_core@FP_ARITH_OPS_RETIRED.VECTOR@ / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_vector", "MetricThreshold": "tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6)", @@ -1217,7 +1195,7 @@ }, { "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 128-bit wide vectors", - "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@) / (tma_retiring * tma_info_thread_slots)", + "MetricExpr": "(cpu_core@FP_ARITH_OPS_RETIRED.128B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_OPS_RETIRED.128B_PACKED_SINGLE@) / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", "MetricName": "tma_fp_vector_128b", "MetricThreshold": "tma_fp_vector_128b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", @@ -1227,7 +1205,7 @@ }, { "BriefDescription": "This metric approximates arithmetic FP vector uops fraction the CPU has retired for 256-bit wide vectors", - "MetricExpr": "cpu_core@FP_ARITH_INST_RETIRED.VECTOR\\,umask\\=0x30@ / (tma_retiring * tma_info_thread_slots)", + "MetricExpr": "cpu_core@FP_ARITH_OPS_RETIRED.VECTOR\\,umask\\=0x30@ / (tma_retiring * tma_info_thread_slots)", "MetricGroup": "Compute;Flops;TopdownL5;tma_L5_group;tma_fp_vector_group;tma_issue2P", "MetricName": "tma_fp_vector_256b", "MetricThreshold": "tma_fp_vector_256b > 0.1 & (tma_fp_vector > 0.1 & (tma_fp_arith > 0.2 & tma_light_operations > 0.6))", @@ -1238,7 +1216,7 @@ { "BriefDescription": "This category represents fraction of slots where the processor's Frontend undersupplies its Backend", "DefaultMetricgroupName": "TopdownL1", - "MetricExpr": "cpu_core@topdown\\-fe\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricExpr": "cpu_core@topdown\\-fe\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@)", "MetricGroup": "BvFB;BvIO;Default;PGO;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_frontend_bound", "MetricThreshold": "tma_frontend_bound > 0.15", @@ -1259,7 +1237,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences", - "MetricExpr": "cpu_core@topdown\\-heavy\\-ops@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricExpr": "cpu_core@topdown\\-heavy\\-ops@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@)", "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", @@ -1437,7 +1415,7 @@ }, { "BriefDescription": "Floating Point Operations Per Cycle", - "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@FP_ARITH_INST_RETIRED.4_FLOPS@ + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / tma_info_thread_clks", + "MetricExpr": "(cpu_core@FP_ARITH_OPS_RETIRED.SCALAR@ + 2 * cpu_core@FP_ARITH_OPS_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@FP_ARITH_OPS_RETIRED.4_FLOPS@ + 8 * cpu_core@FP_ARITH_OPS_RETIRED.256B_PACKED_SINGLE@) / tma_info_thread_clks", "MetricGroup": "Flops;Ret", "MetricName": "tma_info_core_flopc", "Unit": "cpu_core" @@ -1578,7 +1556,7 @@ }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", - "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ + cpu_core@FP_ARITH_INST_RETIRED.VECTOR@)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_OPS_RETIRED.SCALAR@ + cpu_core@FP_ARITH_OPS_RETIRED.VECTOR@)", "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_iparith", "MetricThreshold": "tma_info_inst_mix_iparith < 10", @@ -1587,7 +1565,7 @@ }, { "BriefDescription": "Instructions per FP Arithmetic AVX/SSE 128-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE@)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_OPS_RETIRED.128B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_OPS_RETIRED.128B_PACKED_SINGLE@)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx128", "MetricThreshold": "tma_info_inst_mix_iparith_avx128 < 10", @@ -1596,7 +1574,7 @@ }, { "BriefDescription": "Instructions per FP Arithmetic AVX* 256-bit instruction (lower number means higher occurrence rate)", - "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_OPS_RETIRED.256B_PACKED_DOUBLE@ + cpu_core@FP_ARITH_OPS_RETIRED.256B_PACKED_SINGLE@)", "MetricGroup": "Flops;FpVector;InsType", "MetricName": "tma_info_inst_mix_iparith_avx256", "MetricThreshold": "tma_info_inst_mix_iparith_avx256 < 10", @@ -1605,7 +1583,7 @@ }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Double-Precision instruction (lower number means higher occurrence rate)", - "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@FP_ARITH_INST_RETIRED.SCALAR_DOUBLE@", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@FP_ARITH_OPS_RETIRED.SCALAR_DOUBLE@", "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_dp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_dp < 10", @@ -1614,7 +1592,7 @@ }, { "BriefDescription": "Instructions per FP Arithmetic Scalar Single-Precision instruction (lower number means higher occurrence rate)", - "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@FP_ARITH_INST_RETIRED.SCALAR_SINGLE@", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@FP_ARITH_OPS_RETIRED.SCALAR_SINGLE@", "MetricGroup": "Flops;FpScalar;InsType", "MetricName": "tma_info_inst_mix_iparith_scalar_sp", "MetricThreshold": "tma_info_inst_mix_iparith_scalar_sp < 10", @@ -1639,7 +1617,7 @@ }, { "BriefDescription": "Instructions per Floating Point (FP) Operation (lower number means higher occurrence rate)", - "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@FP_ARITH_INST_RETIRED.4_FLOPS@ + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@)", + "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / (cpu_core@FP_ARITH_OPS_RETIRED.SCALAR@ + 2 * cpu_core@FP_ARITH_OPS_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@FP_ARITH_OPS_RETIRED.4_FLOPS@ + 8 * cpu_core@FP_ARITH_OPS_RETIRED.256B_PACKED_SINGLE@)", "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_ipflop", "MetricThreshold": "tma_info_inst_mix_ipflop < 10", @@ -1694,7 +1672,7 @@ }, { "BriefDescription": "Average per-thread data fill bandwidth to the L1 data cache [GB / sec]", - "MetricExpr": "64 * cpu_core@L1D.REPLACEMENT@ / 1e9 / tma_info_system_time", + "MetricExpr": "64 * cpu_core@L1D.L1_REPLACEMENT@ / 1e9 / tma_info_system_time", "MetricGroup": "Mem;MemoryBW", "MetricName": "tma_info_memory_l1d_cache_fill_bw", "Unit": "cpu_core" @@ -1707,6 +1685,13 @@ "Unit": "cpu_core" }, { + "BriefDescription": "L0 cache true misses per kilo instruction for retired demand loads", + "MetricExpr": "1e3 * (cpu_core@MEM_LOAD_RETIRED.L1_MISS@ + cpu_core@MEM_LOAD_RETIRED.L1_HIT_L1@) / cpu_core@INST_RETIRED.ANY@", + "MetricGroup": "CacheHits;Mem", + "MetricName": "tma_info_memory_l1dl0_mpki", + "Unit": "cpu_core" + }, + { "BriefDescription": "L1 cache true misses per kilo instruction for retired demand loads", "MetricExpr": "1e3 * cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / cpu_core@INST_RETIRED.ANY@", "MetricGroup": "CacheHits;Mem", @@ -1922,6 +1907,13 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Average number of uops fetched from MS per cycle", + "MetricExpr": "cpu_core@IDQ.MS_UOPS@ / cpu_core@IDQ.MS_UOPS\\,cmask\\=1@", + "MetricGroup": "Fed;FetchLat;MicroSeq", + "MetricName": "tma_info_pipeline_fetch_ms", + "Unit": "cpu_core" + }, + { "BriefDescription": "Instructions per a microcode Assist invocation", "MetricExpr": "cpu_core@INST_RETIRED.ANY@ / cpu_core@ASSISTS.ANY@", "MetricGroup": "MicroSeq;Pipeline;Ret;Retire", @@ -1955,7 +1947,7 @@ }, { "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / tma_info_system_time", + "MetricExpr": "tma_info_system_turbo_utilization * msr@tsc\\,cpu=cpu_core@ / 1e9 / tma_info_system_time", "MetricGroup": "Power;Summary", "MetricName": "tma_info_system_core_frequency", "Unit": "cpu_core" @@ -1969,14 +1961,22 @@ }, { "BriefDescription": "Average number of utilized CPUs", - "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.REF_TSC@ / TSC", + "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.REF_TSC@ / msr@tsc\\,cpu=cpu_core@", "MetricGroup": "Summary", "MetricName": "tma_info_system_cpus_utilized", "Unit": "cpu_core" }, { + "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", + "MetricExpr": "32 * UNC_M_TOTAL_DATA / 1e9 / tma_info_system_time", + "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", + "MetricName": "tma_info_system_dram_bw_use", + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_mem_bandwidth, tma_sq_full", + "Unit": "cpu_core" + }, + { "BriefDescription": "Giga Floating Point Operations Per Second", - "MetricExpr": "(cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ + 2 * cpu_core@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@FP_ARITH_INST_RETIRED.4_FLOPS@ + 8 * cpu_core@FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE@) / 1e9 / tma_info_system_time", + "MetricExpr": "(cpu_core@FP_ARITH_OPS_RETIRED.SCALAR@ + 2 * cpu_core@FP_ARITH_OPS_RETIRED.128B_PACKED_DOUBLE@ + 4 * cpu_core@FP_ARITH_OPS_RETIRED.4_FLOPS@ + 8 * cpu_core@FP_ARITH_OPS_RETIRED.256B_PACKED_SINGLE@) / 1e9 / tma_info_system_time", "MetricGroup": "Cor;Flops;HPC", "MetricName": "tma_info_system_gflops", "PublicDescription": "Giga Floating Point Operations Per Second. Aggregate across all supported options of: FP precisions, scalar and vector instructions, vector-width", @@ -2021,6 +2021,13 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Socket actual clocks when any core is active on that socket", + "MetricExpr": "UNC_CLOCK.SOCKET", + "MetricGroup": "SoC", + "MetricName": "tma_info_system_socket_clks", + "Unit": "cpu_core" + }, + { "BriefDescription": "Run duration time in seconds", "MetricExpr": "duration_time", "MetricGroup": "Summary", @@ -2036,6 +2043,13 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Measured Average Uncore Frequency for the SoC [GHz]", + "MetricExpr": "tma_info_system_socket_clks / 1e9 / tma_info_system_time", + "MetricGroup": "SoC", + "MetricName": "tma_info_system_uncore_frequency", + "Unit": "cpu_core" + }, + { "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.THREAD@", "MetricGroup": "Pipeline", @@ -2156,12 +2170,12 @@ "Unit": "cpu_core" }, { - "BriefDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", - "MetricExpr": "4 * cpu_core@DEPENDENT_LOADS.ANY@ / tma_info_thread_clks", + "BriefDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", + "MetricExpr": "4 * cpu_core@DEPENDENT_LOADS.ANY\\,cmask\\=1@ / tma_info_thread_clks", "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_l1_latency_dependency", "MetricThreshold": "tma_l1_latency_dependency > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_UOPS_RETIRED.L1_HIT_PS", + "PublicDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_UOPS_RETIRED.L1_HIT_PS", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -2177,7 +2191,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L2 cache under unloaded scenarios (possibly L2 latency limited)", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "cpu_core@MEM_LOAD_RETIRED.L2_HIT@ * min(cpu_core@MEM_LOAD_RETIRED.L2_HIT@R, 3 * tma_info_system_core_frequency) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_l2_bound_group", "MetricName": "tma_l2_hit_latency", @@ -2198,12 +2211,11 @@ }, { "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "cpu_core@MEM_LOAD_RETIRED.L3_HIT@ * min(cpu_core@MEM_LOAD_RETIRED.L3_HIT@R, 9 * tma_info_system_core_frequency) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks", "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_cache_memory_latency, tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_data_cache_memory_latency, tma_mem_latency", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -2285,6 +2297,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "cpu_core@MEM_INST_RETIRED.LOCK_LOADS@ * cpu_core@MEM_INST_RETIRED.LOCK_LOADS@R / tma_info_thread_clks", "MetricGroup": "LockCont;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", "MetricName": "tma_lock_latency", @@ -2295,7 +2308,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to LSD (Loop Stream Detector) unit", - "MetricExpr": "cpu_core@LSD.UOPS\\,cmask\\=0x8\\,inv\\=0x1@ / tma_info_thread_clks", + "MetricExpr": "cpu_core@LSD.UOPS\\,cmask\\=0x8\\,inv\\=0x1@ / tma_info_thread_clks / 2", "MetricGroup": "FetchBW;LSD;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_lsd", "MetricThreshold": "tma_lsd > 0.15 & tma_fetch_bandwidth > 0.2", @@ -2320,7 +2333,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -2330,13 +2343,13 @@ "MetricGroup": "BvML;MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%", "Unit": "cpu_core" }, { "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", - "MetricExpr": "cpu_core@topdown\\-mem\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricExpr": "cpu_core@topdown\\-mem\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@)", "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricName": "tma_memory_bound", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", @@ -2347,7 +2360,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to LFENCE Instructions.", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "13 * cpu_core@MISC2_RETIRED.LFENCE@ / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group", "MetricName": "tma_memory_fence", @@ -2386,7 +2398,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the MITE pipeline (the legacy decode pipeline)", - "MetricExpr": "(cpu_core@IDQ.MITE_UOPS\\,cmask\\=0x8\\,inv\\=0x1@ / 2 + cpu_core@IDQ.MITE_UOPS@ / (cpu_core@IDQ.DSB_UOPS@ + cpu_core@IDQ.MITE_UOPS@) * (cpu_core@IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE@ - cpu_core@IDQ_BUBBLES.FETCH_LATENCY@)) / tma_info_thread_clks", + "MetricExpr": "(cpu_core@IDQ.MITE_UOPS\\,cmask\\=0x8\\,inv\\=0x1@ / 2 + cpu_core@IDQ.MITE_UOPS@ / (cpu_core@IDQ.DSB_UOPS@ + cpu_core@IDQ.MITE_UOPS@) * (cpu_core@IDQ_BUBBLES.STARVATION_CYCLES@ - cpu_core@IDQ_BUBBLES.FETCH_LATENCY@)) / tma_info_thread_clks", "MetricGroup": "DSBmiss;FetchBW;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_mite", "MetricThreshold": "tma_mite > 0.1 & tma_fetch_bandwidth > 0.2", @@ -2406,7 +2418,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the Microcode Sequencer (MS) unit - see Microcode_Sequencer node for details.", - "MetricExpr": "cpu_core@IDQ.MS_CYCLES_ANY@ / tma_info_thread_clks", + "MetricExpr": "cpu_core@IDQ.MS_CYCLES_ANY@ / tma_info_thread_clks / 1.8", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_ms", "MetricThreshold": "tma_ms > 0.05 & tma_fetch_bandwidth > 0.2", @@ -2445,7 +2457,8 @@ }, { "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes", - "MetricExpr": "max(0, tma_light_operations - (tma_x87_use + (cpu_core@FP_ARITH_INST_RETIRED.SCALAR@ + cpu_core@FP_ARITH_INST_RETIRED.VECTOR@) / (tma_retiring * tma_info_thread_slots) + (cpu_core@INT_VEC_RETIRED.ADD_128@ + cpu_core@INT_VEC_RETIRED.VNNI_128@ + cpu_core@INT_VEC_RETIRED.ADD_256@ + cpu_core@INT_VEC_RETIRED.MUL_256@ + cpu_core@INT_VEC_RETIRED.VNNI_256@) / (tma_retiring * tma_info_thread_slots) + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches))", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "max(0, tma_light_operations - (tma_x87_use + (cpu_core@FP_ARITH_OPS_RETIRED.SCALAR@ + cpu_core@FP_ARITH_OPS_RETIRED.VECTOR@) / (tma_retiring * tma_info_thread_slots) + (cpu_core@INT_VEC_RETIRED.ADD_128@ + cpu_core@INT_VEC_RETIRED.VNNI_128@ + cpu_core@INT_VEC_RETIRED.ADD_256@ + cpu_core@INT_VEC_RETIRED.MUL_256@ + cpu_core@INT_VEC_RETIRED.VNNI_256@) / (tma_retiring * tma_info_thread_slots) + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches))", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_other_light_ops", "MetricThreshold": "tma_other_light_ops > 0.3 & tma_light_operations > 0.6", @@ -2483,6 +2496,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "((cpu_core@EXE_ACTIVITY.EXE_BOUND_0_PORTS@ + (cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@EXE_ACTIVITY.2_3_PORTS_UTIL@)) / tma_info_thread_clks if cpu_core@ARITH.DIV_ACTIVE@ < cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@ else (cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@EXE_ACTIVITY.2_3_PORTS_UTIL@) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", @@ -2493,6 +2507,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricConstraint": "NO_THRESHOLD_AND_NMI", "MetricExpr": "cpu_core@EXE_ACTIVITY.EXE_BOUND_0_PORTS@ / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", @@ -2503,6 +2518,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricConstraint": "NO_THRESHOLD_AND_NMI", "MetricExpr": "cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", @@ -2513,7 +2529,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "cpu_core@EXE_ACTIVITY.2_PORTS_UTIL@ / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", @@ -2524,7 +2539,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "cpu_core@UOPS_EXECUTED.CYCLES_GE_3@ / tma_info_thread_clks", "MetricGroup": "BvCB;PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", @@ -2545,7 +2559,7 @@ { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", "DefaultMetricgroupName": "TopdownL1", - "MetricExpr": "cpu_core@topdown\\-retiring@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricExpr": "cpu_core@topdown\\-retiring@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@)", "MetricGroup": "BvUW;Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -2576,7 +2590,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.PAUSE@ / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group", "MetricName": "tma_slow_pause", @@ -2611,7 +2624,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -2626,6 +2639,15 @@ "Unit": "cpu_core" }, { + "BriefDescription": "This metric estimates clocks wasted due to loads blocked due to unknown store address (did not do memory disambiguation) or due to unknown store data", + "MetricExpr": "7 * cpu_core@LD_BLOCKS.STORE_EARLY\\,cmask\\=1@ / tma_info_thread_clks", + "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", + "MetricName": "tma_store_early_blk", + "MetricThreshold": "tma_store_early_blk > 0.2", + "ScaleUnit": "100%", + "Unit": "cpu_core" + }, + { "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", "MetricExpr": "13 * cpu_core@LD_BLOCKS.STORE_FORWARD@ / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/memory.json b/tools/perf/pmu-events/arch/x86/lunarlake/memory.json index 8021a1c7dd3b..caa387e10259 100644 --- a/tools/perf/pmu-events/arch/x86/lunarlake/memory.json +++ b/tools/perf/pmu-events/arch/x86/lunarlake/memory.json @@ -163,7 +163,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_1024", "MSRIndex": "0x3F6", "MSRValue": "0x400", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 1024 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 1024 cycles. Reported latency may be longer than just the memory latency.", "SampleAfterValue": "53", "UMask": "0x1", "Unit": "cpu_core" @@ -176,7 +176,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_128", "MSRIndex": "0x3F6", "MSRValue": "0x80", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 128 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 128 cycles. Reported latency may be longer than just the memory latency.", "SampleAfterValue": "1009", "UMask": "0x1", "Unit": "cpu_core" @@ -189,7 +189,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_16", "MSRIndex": "0x3F6", "MSRValue": "0x10", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 16 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 16 cycles. Reported latency may be longer than just the memory latency.", "SampleAfterValue": "20011", "UMask": "0x1", "Unit": "cpu_core" @@ -202,7 +202,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_2048", "MSRIndex": "0x3F6", "MSRValue": "0x800", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 2048 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 2048 cycles. Reported latency may be longer than just the memory latency.", "SampleAfterValue": "23", "UMask": "0x1", "Unit": "cpu_core" @@ -215,7 +215,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_256", "MSRIndex": "0x3F6", "MSRValue": "0x100", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 256 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 256 cycles. Reported latency may be longer than just the memory latency.", "SampleAfterValue": "503", "UMask": "0x1", "Unit": "cpu_core" @@ -228,7 +228,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_32", "MSRIndex": "0x3F6", "MSRValue": "0x20", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 32 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 32 cycles. Reported latency may be longer than just the memory latency.", "SampleAfterValue": "100007", "UMask": "0x1", "Unit": "cpu_core" @@ -241,7 +241,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_4", "MSRIndex": "0x3F6", "MSRValue": "0x4", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 4 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 4 cycles. Reported latency may be longer than just the memory latency.", "SampleAfterValue": "100003", "UMask": "0x1", "Unit": "cpu_core" @@ -254,7 +254,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_512", "MSRIndex": "0x3F6", "MSRValue": "0x200", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 512 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 512 cycles. Reported latency may be longer than just the memory latency.", "SampleAfterValue": "101", "UMask": "0x1", "Unit": "cpu_core" @@ -267,7 +267,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_64", "MSRIndex": "0x3F6", "MSRValue": "0x40", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 64 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 64 cycles. Reported latency may be longer than just the memory latency.", "SampleAfterValue": "2003", "UMask": "0x1", "Unit": "cpu_core" @@ -280,7 +280,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_8", "MSRIndex": "0x3F6", "MSRValue": "0x8", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 8 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 8 cycles. Reported latency may be longer than just the memory latency.", "SampleAfterValue": "50021", "UMask": "0x1", "Unit": "cpu_core" @@ -291,7 +291,7 @@ "Data_LA": "1", "EventCode": "0xcd", "EventName": "MEM_TRANS_RETIRED.STORE_SAMPLE", - "PublicDescription": "Counts Retired memory accesses with at least 1 store operation. This PEBS event is the precisely-distributed (PDist) trigger covering all stores uops for sampling by the PEBS Store Latency Facility. The facility is described in Intel SDM Volume 3 section 19.9.8 Available PDIST counters: 0", + "PublicDescription": "Counts Retired memory accesses with at least 1 store operation. This PEBS event is the precisely-distributed (PDist) trigger covering all stores uops for sampling by the PEBS Store Latency Facility. The facility is described in Intel SDM Volume 3 section 19.9.8 Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x2", "Unit": "cpu_core" @@ -352,7 +352,7 @@ }, { "BriefDescription": "Counts demand data reads that were supplied by DRAM.", - "Counter": "0,1,2,3", + "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_DATA_RD.DRAM", "MSRIndex": "0x1a6,0x1a7", @@ -376,7 +376,7 @@ }, { "BriefDescription": "Counts demand data reads that were not supplied by the L3 cache and were supplied by the system memory (DRAM, MSC, or MMIO).", - "Counter": "0,1,2,3", + "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_DATA_RD.L3_MISS", "MSRIndex": "0x1a6,0x1a7", @@ -412,7 +412,7 @@ }, { "BriefDescription": "Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that were not supplied by the L3 cache and were supplied by the system memory (DRAM, MSC, or MMIO).", - "Counter": "0,1,2,3", + "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_RFO.L3_MISS", "MSRIndex": "0x1a6,0x1a7", diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/other.json b/tools/perf/pmu-events/arch/x86/lunarlake/other.json index 59949f9541d8..1df716442549 100644 --- a/tools/perf/pmu-events/arch/x86/lunarlake/other.json +++ b/tools/perf/pmu-events/arch/x86/lunarlake/other.json @@ -151,7 +151,7 @@ }, { "BriefDescription": "Counts streaming stores that have any type of response.", - "Counter": "0,1,2,3", + "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x2A,0x2B", "EventName": "OCR.STREAMING_WR.ANY_RESPONSE", "MSRIndex": "0x1a6,0x1a7", diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/pipeline.json b/tools/perf/pmu-events/arch/x86/lunarlake/pipeline.json index 6ac410510628..cdaa01e9a57d 100644 --- a/tools/perf/pmu-events/arch/x86/lunarlake/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/lunarlake/pipeline.json @@ -110,7 +110,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.ALL_BRANCHES", - "PublicDescription": "Counts all branch instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts all branch instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "Unit": "cpu_core" }, @@ -128,7 +128,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.COND", - "PublicDescription": "Counts conditional branch instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts conditional branch instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x111", "Unit": "cpu_core" @@ -147,7 +147,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.COND_NTAKEN", - "PublicDescription": "Counts not taken branch instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts not taken branch instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x10", "Unit": "cpu_core" @@ -166,7 +166,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.COND_TAKEN", - "PublicDescription": "Counts taken conditional branch instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts taken conditional branch instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x101", "Unit": "cpu_core" @@ -176,7 +176,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.COND_TAKEN_BWD", - "PublicDescription": "Counts taken backward conditional branch instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts taken backward conditional branch instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x1", "Unit": "cpu_core" @@ -186,7 +186,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.COND_TAKEN_FWD", - "PublicDescription": "Counts taken forward conditional branch instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts taken forward conditional branch instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x102", "Unit": "cpu_core" @@ -205,7 +205,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.FAR_BRANCH", - "PublicDescription": "Counts far branch instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts far branch instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x40", "Unit": "cpu_core" @@ -224,7 +224,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.INDIRECT", - "PublicDescription": "Counts near indirect branch instructions retired excluding returns. TSX abort is an indirect branch. Available PDIST counters: 0", + "PublicDescription": "Counts near indirect branch instructions retired excluding returns. TSX abort is an indirect branch. Available PDIST counters: 0,1", "SampleAfterValue": "100003", "UMask": "0x80", "Unit": "cpu_core" @@ -261,7 +261,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.NEAR_CALL", - "PublicDescription": "Counts both direct and indirect near call instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts both direct and indirect near call instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x2", "Unit": "cpu_core" @@ -280,13 +280,13 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.NEAR_RETURN", - "PublicDescription": "Counts return instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts return instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x8", "Unit": "cpu_core" }, { - "BriefDescription": "Counts the number of taken branch instructions retired", + "BriefDescription": "Counts the number of near taken branch instructions retired", "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.NEAR_TAKEN", @@ -299,7 +299,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.NEAR_TAKEN", - "PublicDescription": "Counts taken branch instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts taken branch instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x20", "Unit": "cpu_core" @@ -327,7 +327,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.ALL_BRANCHES", - "PublicDescription": "Counts all the retired branch instructions that were mispredicted by the processor. A branch misprediction occurs when the processor incorrectly predicts the destination of the branch. When the misprediction is discovered at execution, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. Available PDIST counters: 0", + "PublicDescription": "Counts all the retired branch instructions that were mispredicted by the processor. A branch misprediction occurs when the processor incorrectly predicts the destination of the branch. When the misprediction is discovered at execution, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "Unit": "cpu_core" }, @@ -336,7 +336,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.ALL_BRANCHES_COST", - "PublicDescription": "All mispredicted branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0", + "PublicDescription": "All mispredicted branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x44", "Unit": "cpu_core" @@ -355,7 +355,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.COND", - "PublicDescription": "Counts mispredicted conditional branch instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts mispredicted conditional branch instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x111", "Unit": "cpu_core" @@ -365,7 +365,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.COND_COST", - "PublicDescription": "Mispredicted conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0", + "PublicDescription": "Mispredicted conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x151", "Unit": "cpu_core" @@ -384,7 +384,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.COND_NTAKEN", - "PublicDescription": "Counts the number of conditional branch instructions retired that were mispredicted and the branch direction was not taken. Available PDIST counters: 0", + "PublicDescription": "Counts the number of conditional branch instructions retired that were mispredicted and the branch direction was not taken. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x10", "Unit": "cpu_core" @@ -394,7 +394,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.COND_NTAKEN_COST", - "PublicDescription": "Mispredicted non-taken conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0", + "PublicDescription": "Mispredicted non-taken conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x50", "Unit": "cpu_core" @@ -413,7 +413,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.COND_TAKEN", - "PublicDescription": "Counts taken conditional mispredicted branch instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts taken conditional mispredicted branch instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x101", "Unit": "cpu_core" @@ -423,7 +423,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.COND_TAKEN_BWD", - "PublicDescription": "Counts taken backward conditional mispredicted branch instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts taken backward conditional mispredicted branch instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x1", "Unit": "cpu_core" @@ -433,7 +433,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.COND_TAKEN_BWD_COST", - "PublicDescription": "number of branch instructions retired that were mispredicted and taken backward. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0", + "PublicDescription": "number of branch instructions retired that were mispredicted and taken backward. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x8001", "Unit": "cpu_core" @@ -443,7 +443,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.COND_TAKEN_COST", - "PublicDescription": "Mispredicted taken conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0", + "PublicDescription": "Mispredicted taken conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x141", "Unit": "cpu_core" @@ -453,7 +453,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.COND_TAKEN_FWD", - "PublicDescription": "Counts taken forward conditional mispredicted branch instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts taken forward conditional mispredicted branch instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "Unit": "cpu_core" }, @@ -462,7 +462,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.COND_TAKEN_FWD_COST", - "PublicDescription": "number of branch instructions retired that were mispredicted and taken forward. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0", + "PublicDescription": "number of branch instructions retired that were mispredicted and taken forward. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x8002", "Unit": "cpu_core" @@ -481,7 +481,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.INDIRECT", - "PublicDescription": "Counts miss-predicted near indirect branch instructions retired excluding returns. TSX abort is an indirect branch. Available PDIST counters: 0", + "PublicDescription": "Counts miss-predicted near indirect branch instructions retired excluding returns. TSX abort is an indirect branch. Available PDIST counters: 0,1", "SampleAfterValue": "100003", "UMask": "0x80", "Unit": "cpu_core" @@ -500,7 +500,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.INDIRECT_CALL", - "PublicDescription": "Counts retired mispredicted indirect (near taken) CALL instructions, including both register and memory indirect. Available PDIST counters: 0", + "PublicDescription": "Counts retired mispredicted indirect (near taken) CALL instructions, including both register and memory indirect. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x2", "Unit": "cpu_core" @@ -510,7 +510,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.INDIRECT_CALL_COST", - "PublicDescription": "Mispredicted indirect CALL retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0", + "PublicDescription": "Mispredicted indirect CALL retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x42", "Unit": "cpu_core" @@ -520,7 +520,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.INDIRECT_COST", - "PublicDescription": "Mispredicted near indirect branch instructions retired (excluding returns). This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0", + "PublicDescription": "Mispredicted near indirect branch instructions retired (excluding returns). This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1", "SampleAfterValue": "100003", "UMask": "0xc0", "Unit": "cpu_core" @@ -548,7 +548,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.NEAR_TAKEN", - "PublicDescription": "Counts number of near branch instructions retired that were mispredicted and taken. Available PDIST counters: 0", + "PublicDescription": "Counts number of near branch instructions retired that were mispredicted and taken. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x20", "Unit": "cpu_core" @@ -558,7 +558,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.NEAR_TAKEN_COST", - "PublicDescription": "Mispredicted taken near branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0", + "PublicDescription": "Mispredicted taken near branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1", "SampleAfterValue": "400009", "UMask": "0x60", "Unit": "cpu_core" @@ -568,7 +568,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.RET", - "PublicDescription": "This is a non-precise version (that is, does not use PEBS) of the event that counts mispredicted return instructions retired. Available PDIST counters: 0", + "PublicDescription": "This is a non-precise version (that is, does not use PEBS) of the event that counts mispredicted return instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x8", "Unit": "cpu_core" @@ -587,7 +587,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.RET_COST", - "PublicDescription": "Mispredicted ret instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0", + "PublicDescription": "Mispredicted ret instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x48", "Unit": "cpu_core" @@ -906,7 +906,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc0", "EventName": "INST_RETIRED.ANY_P", - "PublicDescription": "Counts the number of X86 instructions retired - an Architectural PerfMon event. Counting continues during hardware interrupts, traps, and inside interrupt handlers. Notes: INST_RETIRED.ANY is counted by a designated fixed counter freeing up programmable counters to count other events. INST_RETIRED.ANY_P is counted by a programmable counter. Available PDIST counters: 0", + "PublicDescription": "Counts the number of X86 instructions retired - an Architectural PerfMon event. Counting continues during hardware interrupts, traps, and inside interrupt handlers. Notes: INST_RETIRED.ANY is counted by a designated fixed counter freeing up programmable counters to count other events. INST_RETIRED.ANY_P is counted by a programmable counter. Available PDIST counters: 0,1", "SampleAfterValue": "2000003", "Unit": "cpu_core" }, @@ -915,7 +915,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc0", "EventName": "INST_RETIRED.BR_FUSED", - "PublicDescription": "retired macro-fused uops when there is a branch in the macro-fused pair (the two instructions that got macro-fused count once in this pmon) Available PDIST counters: 0", + "PublicDescription": "retired macro-fused uops when there is a branch in the macro-fused pair (the two instructions that got macro-fused count once in this pmon) Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x10", "Unit": "cpu_core" @@ -925,7 +925,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc0", "EventName": "INST_RETIRED.MACRO_FUSED", - "PublicDescription": "INST_RETIRED.MACRO_FUSED Available PDIST counters: 0", + "PublicDescription": "INST_RETIRED.MACRO_FUSED Available PDIST counters: 0,1", "SampleAfterValue": "2000003", "UMask": "0x30", "Unit": "cpu_core" @@ -935,7 +935,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc0", "EventName": "INST_RETIRED.NOP", - "PublicDescription": "Counts all retired NOP or ENDBR32/64 or PREFETCHIT0/1 instructions Available PDIST counters: 0", + "PublicDescription": "Counts all retired NOP or ENDBR32/64 or PREFETCHIT0/1 instructions Available PDIST counters: 0,1", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -954,7 +954,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc0", "EventName": "INST_RETIRED.REP_ITERATION", - "PublicDescription": "Number of iterations of Repeat (REP) string retired instructions such as MOVS, CMPS, and SCAS. Each has a byte, word, and doubleword version and string instructions can be repeated using a repetition prefix, REP, that allows their architectural execution to be repeated a number of times as specified by the RCX register. Note the number of iterations is implementation-dependent. Available PDIST counters: 0", + "PublicDescription": "Number of iterations of Repeat (REP) string retired instructions such as MOVS, CMPS, and SCAS. Each has a byte, word, and doubleword version and string instructions can be repeated using a repetition prefix, REP, that allows their architectural execution to be repeated a number of times as specified by the RCX register. Note the number of iterations is implementation-dependent. Available PDIST counters: 0,1", "SampleAfterValue": "2000003", "UMask": "0x8", "Unit": "cpu_core" @@ -1228,6 +1228,15 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Counts the number of times a load got early blocked due to preceding store operation with unknown address or unknown data. Excluding in-line (immediate) wakeups", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x03", + "EventName": "LD_BLOCKS.STORE_EARLY", + "SampleAfterValue": "100003", + "UMask": "0xa1", + "Unit": "cpu_core" + }, + { "BriefDescription": "Counts the number of occurrences a retired load gets blocked because its address partially overlaps with an older store (size mismatch) - unknown_sta/bad_forward", "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x03", @@ -1451,7 +1460,7 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xe4", "EventName": "MISC_RETIRED.LBR_INSERTS", - "PublicDescription": "LBR record is inserted Available PDIST counters: 0", + "PublicDescription": "LBR record is inserted Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/lunarlake/uncore-interconnect.json new file mode 100644 index 000000000000..69ef928d57f6 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/lunarlake/uncore-interconnect.json @@ -0,0 +1,10 @@ +[ + { + "BriefDescription": "This 48-bit fixed counter counts the UCLK cycles.", + "Counter": "FIXED", + "EventCode": "0xff", + "EventName": "UNC_CLOCK.SOCKET", + "PerPkg": "1", + "Unit": "SANTA" + } +] diff --git a/tools/perf/pmu-events/arch/x86/lunarlake/uncore-memory.json b/tools/perf/pmu-events/arch/x86/lunarlake/uncore-memory.json index 7d63580302de..63c4aa2791e4 100644 --- a/tools/perf/pmu-events/arch/x86/lunarlake/uncore-memory.json +++ b/tools/perf/pmu-events/arch/x86/lunarlake/uncore-memory.json @@ -32,5 +32,13 @@ "Experimental": "1", "PerPkg": "1", "Unit": "iMC" + }, + { + "BriefDescription": "Total number of read and write byte transfers to/from DRAM, in 32B chunk, per DDR channel. Counter increments by 1 after sending or receiving 32B chunk data.", + "Counter": "0,1,2,3,4", + "EventCode": "0x3C", + "EventName": "UNC_M_TOTAL_DATA", + "PerPkg": "1", + "Unit": "iMC" } ] diff --git a/tools/perf/pmu-events/arch/x86/mapfile.csv b/tools/perf/pmu-events/arch/x86/mapfile.csv index 354ce241500b..32093bded949 100644 --- a/tools/perf/pmu-events/arch/x86/mapfile.csv +++ b/tools/perf/pmu-events/arch/x86/mapfile.csv @@ -1,7 +1,7 @@ Family-model,Version,Filename,EventType -GenuineIntel-6-(97|9A|B7|BA|BF),v1.31,alderlake,core -GenuineIntel-6-BE,v1.31,alderlaken,core -GenuineIntel-6-C[56],v1.09,arrowlake,core +GenuineIntel-6-(97|9A|B7|BA|BF),v1.34,alderlake,core +GenuineIntel-6-BE,v1.34,alderlaken,core +GenuineIntel-6-C[56],v1.13,arrowlake,core GenuineIntel-6-(1C|26|27|35|36),v5,bonnell,core GenuineIntel-6-(3D|47),v30,broadwell,core GenuineIntel-6-56,v12,broadwellde,core @@ -9,11 +9,11 @@ GenuineIntel-6-4F,v23,broadwellx,core GenuineIntel-6-55-[56789ABCDEF],v1.25,cascadelakex,core GenuineIntel-6-DD,v1.00,clearwaterforest,core GenuineIntel-6-9[6C],v1.05,elkhartlake,core -GenuineIntel-6-CF,v1.14,emeraldrapids,core +GenuineIntel-6-CF,v1.20,emeraldrapids,core GenuineIntel-6-5[CF],v13,goldmont,core GenuineIntel-6-7A,v1.01,goldmontplus,core -GenuineIntel-6-B6,v1.09,grandridge,core -GenuineIntel-6-A[DE],v1.10,graniterapids,core +GenuineIntel-6-B6,v1.10,grandridge,core +GenuineIntel-6-A[DE],v1.15,graniterapids,core GenuineIntel-6-(3C|45|46),v36,haswell,core GenuineIntel-6-3F,v29,haswellx,core GenuineIntel-6-7[DE],v1.24,icelake,core @@ -22,15 +22,15 @@ GenuineIntel-6-3A,v24,ivybridge,core GenuineIntel-6-3E,v24,ivytown,core GenuineIntel-6-2D,v24,jaketown,core GenuineIntel-6-(57|85),v16,knightslanding,core -GenuineIntel-6-BD,v1.14,lunarlake,core -GenuineIntel-6-(AA|AC|B5),v1.14,meteorlake,core +GenuineIntel-6-BD,v1.18,lunarlake,core +GenuineIntel-6-(AA|AC|B5),v1.17,meteorlake,core GenuineIntel-6-1[AEF],v4,nehalemep,core GenuineIntel-6-2E,v4,nehalemex,core GenuineIntel-6-CC,v1.00,pantherlake,core GenuineIntel-6-A7,v1.04,rocketlake,core GenuineIntel-6-2A,v19,sandybridge,core -GenuineIntel-6-8F,v1.28,sapphirerapids,core -GenuineIntel-6-AF,v1.11,sierraforest,core +GenuineIntel-6-8F,v1.35,sapphirerapids,core +GenuineIntel-6-AF,v1.12,sierraforest,core GenuineIntel-6-(37|4A|4C|4D|5A),v15,silvermont,core GenuineIntel-6-(4E|5E|8E|9E|A5|A6),v59,skylake,core GenuineIntel-6-55-[01234],v1.37,skylakex,core diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/cache.json b/tools/perf/pmu-events/arch/x86/meteorlake/cache.json index 82b115183924..d4731e300d6d 100644 --- a/tools/perf/pmu-events/arch/x86/meteorlake/cache.json +++ b/tools/perf/pmu-events/arch/x86/meteorlake/cache.json @@ -14,7 +14,6 @@ "Counter": "0,1,2,3", "EventCode": "0x51", "EventName": "L1D.HWPF_MISS", - "PublicDescription": "L1D.HWPF_MISS Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x20", "Unit": "cpu_core" @@ -24,7 +23,7 @@ "Counter": "0,1,2,3", "EventCode": "0x51", "EventName": "L1D.REPLACEMENT", - "PublicDescription": "Counts L1D data line replacements including opportunistic replacements, and replacements that require stall-for-replace or block-for-replace. Available PDIST counters: 0", + "PublicDescription": "Counts L1D data line replacements including opportunistic replacements, and replacements that require stall-for-replace or block-for-replace.", "SampleAfterValue": "100003", "UMask": "0x1", "Unit": "cpu_core" @@ -34,7 +33,7 @@ "Counter": "0,1,2,3", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.FB_FULL", - "PublicDescription": "Counts number of cycles a demand request has waited due to L1D Fill Buffer (FB) unavailability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses. Available PDIST counters: 0", + "PublicDescription": "Counts number of cycles a demand request has waited due to L1D Fill Buffer (FB) unavailability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.", "SampleAfterValue": "1000003", "UMask": "0x2", "Unit": "cpu_core" @@ -46,7 +45,7 @@ "EdgeDetect": "1", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.FB_FULL_PERIODS", - "PublicDescription": "Counts number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses. Available PDIST counters: 0", + "PublicDescription": "Counts number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.", "SampleAfterValue": "1000003", "UMask": "0x2", "Unit": "cpu_core" @@ -56,7 +55,7 @@ "Counter": "0,1,2,3", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.L2_STALLS", - "PublicDescription": "Counts number of cycles a demand request has waited due to L1D due to lack of L2 resources. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses. Available PDIST counters: 0", + "PublicDescription": "Counts number of cycles a demand request has waited due to L1D due to lack of L2 resources. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.", "SampleAfterValue": "1000003", "UMask": "0x4", "Unit": "cpu_core" @@ -66,7 +65,7 @@ "Counter": "0,1,2,3", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.PENDING", - "PublicDescription": "Counts number of L1D misses that are outstanding in each cycle, that is each cycle the number of Fill Buffers (FB) outstanding required by Demand Reads. FB either is held by demand loads, or it is held by non-demand loads and gets hit at least once by demand. The valid outstanding interval is defined until the FB deallocation by one of the following ways: from FB allocation, if FB is allocated by demand from the demand Hit FB, if it is allocated by hardware or software prefetch. Note: In the L1D, a Demand Read contains cacheable or noncacheable demand loads, including ones causing cache-line splits and reads due to page walks resulted from any request type. Available PDIST counters: 0", + "PublicDescription": "Counts number of L1D misses that are outstanding in each cycle, that is each cycle the number of Fill Buffers (FB) outstanding required by Demand Reads. FB either is held by demand loads, or it is held by non-demand loads and gets hit at least once by demand. The valid outstanding interval is defined until the FB deallocation by one of the following ways: from FB allocation, if FB is allocated by demand from the demand Hit FB, if it is allocated by hardware or software prefetch. Note: In the L1D, a Demand Read contains cacheable or noncacheable demand loads, including ones causing cache-line splits and reads due to page walks resulted from any request type.", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -77,7 +76,7 @@ "CounterMask": "1", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.PENDING_CYCLES", - "PublicDescription": "Counts duration of L1D miss outstanding in cycles. Available PDIST counters: 0", + "PublicDescription": "Counts duration of L1D miss outstanding in cycles.", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -87,7 +86,7 @@ "Counter": "0,1,2,3", "EventCode": "0x25", "EventName": "L2_LINES_IN.ALL", - "PublicDescription": "Counts the number of L2 cache lines filling the L2. Counting does not cover rejects. Available PDIST counters: 0", + "PublicDescription": "Counts the number of L2 cache lines filling the L2. Counting does not cover rejects.", "SampleAfterValue": "100003", "UMask": "0x1f", "Unit": "cpu_core" @@ -147,7 +146,7 @@ "Counter": "0,1,2,3", "EventCode": "0x26", "EventName": "L2_LINES_OUT.NON_SILENT", - "PublicDescription": "Counts the number of lines that are evicted by L2 cache when triggered by an L2 cache fill. Those lines are in Modified state. Modified lines are written back to L3 Available PDIST counters: 0", + "PublicDescription": "Counts the number of lines that are evicted by L2 cache when triggered by an L2 cache fill. Those lines are in Modified state. Modified lines are written back to L3", "SampleAfterValue": "200003", "UMask": "0x2", "Unit": "cpu_core" @@ -167,7 +166,7 @@ "Counter": "0,1,2,3", "EventCode": "0x26", "EventName": "L2_LINES_OUT.SILENT", - "PublicDescription": "Counts the number of lines that are silently dropped by L2 cache. These lines are typically in Shared or Exclusive state. A non-threaded event. Available PDIST counters: 0", + "PublicDescription": "Counts the number of lines that are silently dropped by L2 cache. These lines are typically in Shared or Exclusive state. A non-threaded event.", "SampleAfterValue": "200003", "UMask": "0x1", "Unit": "cpu_core" @@ -177,7 +176,7 @@ "Counter": "0,1,2,3", "EventCode": "0x26", "EventName": "L2_LINES_OUT.USELESS_HWPF", - "PublicDescription": "Counts the number of cache lines that have been prefetched by the L2 hardware prefetcher but not used by demand access when evicted from the L2 cache Available PDIST counters: 0", + "PublicDescription": "Counts the number of cache lines that have been prefetched by the L2 hardware prefetcher but not used by demand access when evicted from the L2 cache", "SampleAfterValue": "200003", "UMask": "0x4", "Unit": "cpu_core" @@ -187,7 +186,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_REQUEST.ALL", - "PublicDescription": "Counts all requests that were hit or true misses in L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. [This event is alias to L2_RQSTS.REFERENCES] Available PDIST counters: 0", + "PublicDescription": "Counts all requests that were hit or true misses in L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. [This event is alias to L2_RQSTS.REFERENCES]", "SampleAfterValue": "200003", "UMask": "0xff", "Unit": "cpu_core" @@ -206,7 +205,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_REQUEST.HIT", - "PublicDescription": "Counts all requests that hit L2 cache. [This event is alias to L2_RQSTS.HIT] Available PDIST counters: 0", + "PublicDescription": "Counts all requests that hit L2 cache. [This event is alias to L2_RQSTS.HIT]", "SampleAfterValue": "200003", "UMask": "0xdf", "Unit": "cpu_core" @@ -225,7 +224,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_REQUEST.MISS", - "PublicDescription": "Counts read requests of any type with true-miss in the L2 cache. True-miss excludes L2 misses that were merged with ongoing L2 misses. [This event is alias to L2_RQSTS.MISS] Available PDIST counters: 0", + "PublicDescription": "Counts read requests of any type with true-miss in the L2 cache. True-miss excludes L2 misses that were merged with ongoing L2 misses. [This event is alias to L2_RQSTS.MISS]", "SampleAfterValue": "200003", "UMask": "0x3f", "Unit": "cpu_core" @@ -244,7 +243,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_CODE_RD", - "PublicDescription": "Counts the total number of L2 code requests. Available PDIST counters: 0", + "PublicDescription": "Counts the total number of L2 code requests.", "SampleAfterValue": "200003", "UMask": "0xe4", "Unit": "cpu_core" @@ -254,7 +253,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_DEMAND_DATA_RD", - "PublicDescription": "Counts Demand Data Read requests accessing the L2 cache. These requests may hit or miss L2 cache. True-miss exclude misses that were merged with ongoing L2 misses. An access is counted once. Available PDIST counters: 0", + "PublicDescription": "Counts Demand Data Read requests accessing the L2 cache. These requests may hit or miss L2 cache. True-miss exclude misses that were merged with ongoing L2 misses. An access is counted once.", "SampleAfterValue": "200003", "UMask": "0xe1", "Unit": "cpu_core" @@ -264,7 +263,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_DEMAND_MISS", - "PublicDescription": "Counts demand requests that miss L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts demand requests that miss L2 cache.", "SampleAfterValue": "200003", "UMask": "0x27", "Unit": "cpu_core" @@ -274,7 +273,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_DEMAND_REFERENCES", - "PublicDescription": "Counts demand requests to L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts demand requests to L2 cache.", "SampleAfterValue": "200003", "UMask": "0xe7", "Unit": "cpu_core" @@ -284,7 +283,6 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_HWPF", - "PublicDescription": "L2_RQSTS.ALL_HWPF Available PDIST counters: 0", "SampleAfterValue": "200003", "UMask": "0xf0", "Unit": "cpu_core" @@ -294,7 +292,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_RFO", - "PublicDescription": "Counts the total number of RFO (read for ownership) requests to L2 cache. L2 RFO requests include both L1D demand RFO misses as well as L1D RFO prefetches. Available PDIST counters: 0", + "PublicDescription": "Counts the total number of RFO (read for ownership) requests to L2 cache. L2 RFO requests include both L1D demand RFO misses as well as L1D RFO prefetches.", "SampleAfterValue": "200003", "UMask": "0xe2", "Unit": "cpu_core" @@ -304,7 +302,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.CODE_RD_HIT", - "PublicDescription": "Counts L2 cache hits when fetching instructions, code reads. Available PDIST counters: 0", + "PublicDescription": "Counts L2 cache hits when fetching instructions, code reads.", "SampleAfterValue": "200003", "UMask": "0xc4", "Unit": "cpu_core" @@ -314,7 +312,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.CODE_RD_MISS", - "PublicDescription": "Counts L2 cache misses when fetching instructions. Available PDIST counters: 0", + "PublicDescription": "Counts L2 cache misses when fetching instructions.", "SampleAfterValue": "200003", "UMask": "0x24", "Unit": "cpu_core" @@ -324,7 +322,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.DEMAND_DATA_RD_HIT", - "PublicDescription": "Counts the number of demand Data Read requests initiated by load instructions that hit L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts the number of demand Data Read requests initiated by load instructions that hit L2 cache.", "SampleAfterValue": "200003", "UMask": "0xc1", "Unit": "cpu_core" @@ -334,7 +332,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.DEMAND_DATA_RD_MISS", - "PublicDescription": "Counts demand Data Read requests with true-miss in the L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. An access is counted once. Available PDIST counters: 0", + "PublicDescription": "Counts demand Data Read requests with true-miss in the L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. An access is counted once.", "SampleAfterValue": "200003", "UMask": "0x21", "Unit": "cpu_core" @@ -344,7 +342,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.HIT", - "PublicDescription": "Counts all requests that hit L2 cache. [This event is alias to L2_REQUEST.HIT] Available PDIST counters: 0", + "PublicDescription": "Counts all requests that hit L2 cache. [This event is alias to L2_REQUEST.HIT]", "SampleAfterValue": "200003", "UMask": "0xdf", "Unit": "cpu_core" @@ -354,7 +352,6 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.HWPF_MISS", - "PublicDescription": "L2_RQSTS.HWPF_MISS Available PDIST counters: 0", "SampleAfterValue": "200003", "UMask": "0x30", "Unit": "cpu_core" @@ -364,7 +361,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.MISS", - "PublicDescription": "Counts read requests of any type with true-miss in the L2 cache. True-miss excludes L2 misses that were merged with ongoing L2 misses. [This event is alias to L2_REQUEST.MISS] Available PDIST counters: 0", + "PublicDescription": "Counts read requests of any type with true-miss in the L2 cache. True-miss excludes L2 misses that were merged with ongoing L2 misses. [This event is alias to L2_REQUEST.MISS]", "SampleAfterValue": "200003", "UMask": "0x3f", "Unit": "cpu_core" @@ -374,7 +371,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.REFERENCES", - "PublicDescription": "Counts all requests that were hit or true misses in L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. [This event is alias to L2_REQUEST.ALL] Available PDIST counters: 0", + "PublicDescription": "Counts all requests that were hit or true misses in L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. [This event is alias to L2_REQUEST.ALL]", "SampleAfterValue": "200003", "UMask": "0xff", "Unit": "cpu_core" @@ -384,7 +381,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.RFO_HIT", - "PublicDescription": "Counts the RFO (Read-for-Ownership) requests that hit L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts the RFO (Read-for-Ownership) requests that hit L2 cache.", "SampleAfterValue": "200003", "UMask": "0xc2", "Unit": "cpu_core" @@ -394,7 +391,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.RFO_MISS", - "PublicDescription": "Counts the RFO (Read-for-Ownership) requests that miss L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts the RFO (Read-for-Ownership) requests that miss L2 cache.", "SampleAfterValue": "200003", "UMask": "0x22", "Unit": "cpu_core" @@ -404,7 +401,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.SWPF_HIT", - "PublicDescription": "Counts Software prefetch requests that hit the L2 cache. Accounts for PREFETCHNTA and PREFETCHT0/1/2 instructions when FB is not full. Available PDIST counters: 0", + "PublicDescription": "Counts Software prefetch requests that hit the L2 cache. Accounts for PREFETCHNTA and PREFETCHT0/1/2 instructions when FB is not full.", "SampleAfterValue": "200003", "UMask": "0xc8", "Unit": "cpu_core" @@ -414,7 +411,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.SWPF_MISS", - "PublicDescription": "Counts Software prefetch requests that miss the L2 cache. Accounts for PREFETCHNTA and PREFETCHT0/1/2 instructions when FB is not full. Available PDIST counters: 0", + "PublicDescription": "Counts Software prefetch requests that miss the L2 cache. Accounts for PREFETCHNTA and PREFETCHT0/1/2 instructions when FB is not full.", "SampleAfterValue": "200003", "UMask": "0x28", "Unit": "cpu_core" @@ -424,7 +421,7 @@ "Counter": "0,1,2,3", "EventCode": "0x23", "EventName": "L2_TRANS.L2_WB", - "PublicDescription": "Counts L2 writebacks that access L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts L2 writebacks that access L2 cache.", "SampleAfterValue": "200003", "UMask": "0x40", "Unit": "cpu_core" @@ -434,7 +431,7 @@ "Counter": "0,1,2,3", "EventCode": "0x42", "EventName": "LOCK_CYCLES.CACHE_LOCK_DURATION", - "PublicDescription": "This event counts the number of cycles when the L1D is locked. It is a superset of the 0x1 mask (BUS_LOCK_CLOCKS.BUS_LOCK_DURATION). Available PDIST counters: 0", + "PublicDescription": "This event counts the number of cycles when the L1D is locked. It is a superset of the 0x1 mask (BUS_LOCK_CLOCKS.BUS_LOCK_DURATION).", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -454,7 +451,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x2e", "EventName": "LONGEST_LAT_CACHE.MISS", - "PublicDescription": "Counts core-originated cacheable requests that miss the L3 cache (Longest Latency cache). Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches to the L1 and L2. It does not include hardware prefetches to the L3, and may not count other types of requests to the L3. Available PDIST counters: 0", + "PublicDescription": "Counts core-originated cacheable requests that miss the L3 cache (Longest Latency cache). Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches to the L1 and L2. It does not include hardware prefetches to the L3, and may not count other types of requests to the L3.", "SampleAfterValue": "100003", "UMask": "0x41", "Unit": "cpu_core" @@ -474,7 +471,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x2e", "EventName": "LONGEST_LAT_CACHE.REFERENCE", - "PublicDescription": "Counts core-originated cacheable requests to the L3 cache (Longest Latency cache). Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches to the L1 and L2. It does not include hardware prefetches to the L3, and may not count other types of requests to the L3. Available PDIST counters: 0", + "PublicDescription": "Counts core-originated cacheable requests to the L3 cache (Longest Latency cache). Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches to the L1 and L2. It does not include hardware prefetches to the L3, and may not count other types of requests to the L3.", "SampleAfterValue": "100003", "UMask": "0x4f", "Unit": "cpu_core" @@ -695,7 +692,7 @@ "Counter": "0,1,2,3", "EventCode": "0x43", "EventName": "MEM_LOAD_COMPLETED.L1_MISS_ANY", - "PublicDescription": "Number of completed demand load requests that missed the L1 data cache including shadow misses (FB hits, merge to an ongoing L1D miss) Available PDIST counters: 0", + "PublicDescription": "Number of completed demand load requests that missed the L1 data cache including shadow misses (FB hits, merge to an ongoing L1D miss)", "SampleAfterValue": "1000003", "UMask": "0xfd", "Unit": "cpu_core" @@ -947,7 +944,6 @@ "Counter": "0,1,2,3", "EventCode": "0x44", "EventName": "MEM_STORE_RETIRED.L2_HIT", - "PublicDescription": "MEM_STORE_RETIRED.L2_HIT Available PDIST counters: 0", "SampleAfterValue": "200003", "UMask": "0x1", "Unit": "cpu_core" @@ -974,7 +970,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_1024", @@ -986,7 +982,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_128", @@ -998,7 +994,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_16", @@ -1010,7 +1006,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_2048", @@ -1022,7 +1018,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_256", @@ -1034,7 +1030,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_32", @@ -1046,7 +1042,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_4", @@ -1058,7 +1054,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_512", @@ -1070,7 +1066,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_64", @@ -1082,7 +1078,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_8", @@ -1177,12 +1173,36 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe5", "EventName": "MEM_UOP_RETIRED.ANY", - "PublicDescription": "Number of retired micro-operations (uops) for load or store memory accesses Available PDIST counters: 0", + "PublicDescription": "Number of retired micro-operations (uops) for load or store memory accesses", "SampleAfterValue": "1000003", "UMask": "0x3", "Unit": "cpu_core" }, { + "BriefDescription": "Counts writebacks of modified cachelines that hit in the L3 or were snooped from another core's caches.", + "Counter": "0,1,2,3", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.COREWB_M.L3_HIT", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x3F803C0008", + "PublicDescription": "Counts writebacks of modified cachelines that hit in the L3 or were snooped from another core's caches. Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts writebacks of non-modified cachelines that hit in the L3 or were snooped from another core's caches.", + "Counter": "0,1,2,3", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.COREWB_NONM.L3_HIT", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x3F803C1000", + "PublicDescription": "Counts writebacks of non-modified cachelines that hit in the L3 or were snooped from another core's caches. Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { "BriefDescription": "Counts demand instruction fetches and L1 instruction cache prefetches that have any type of response.", "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xB7", @@ -1399,11 +1419,23 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Counts all data read, code read, RFO and ITOM requests including demands and prefetches to the core caches (L1 or L2) that hit in the L3 or were snooped from another core's caches.", + "Counter": "0,1,2,3", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.READS_TO_CORE.L3_HIT", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x3F803C4477", + "PublicDescription": "Counts all data read, code read, RFO and ITOM requests including demands and prefetches to the core caches (L1 or L2) that hit in the L3 or were snooped from another core's caches. Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { "BriefDescription": "Any memory transaction that reached the SQ.", "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.ALL_REQUESTS", - "PublicDescription": "Counts memory transactions reached the super queue including requests initiated by the core, all L3 prefetches, page walks, etc.. Available PDIST counters: 0", + "PublicDescription": "Counts memory transactions reached the super queue including requests initiated by the core, all L3 prefetches, page walks, etc..", "SampleAfterValue": "100003", "UMask": "0x80", "Unit": "cpu_core" @@ -1413,7 +1445,7 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.DATA_RD", - "PublicDescription": "Counts the demand and prefetch data reads. All Core Data Reads include cacheable 'Demands' and L2 prefetchers (not L3 prefetchers). Counting also covers reads due to page walks resulted from any request type. Available PDIST counters: 0", + "PublicDescription": "Counts the demand and prefetch data reads. All Core Data Reads include cacheable 'Demands' and L2 prefetchers (not L3 prefetchers). Counting also covers reads due to page walks resulted from any request type.", "SampleAfterValue": "100003", "UMask": "0x8", "Unit": "cpu_core" @@ -1423,7 +1455,7 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.DEMAND_CODE_RD", - "PublicDescription": "Counts both cacheable and Non-Cacheable code read requests. Available PDIST counters: 0", + "PublicDescription": "Counts both cacheable and Non-Cacheable code read requests.", "SampleAfterValue": "100003", "UMask": "0x2", "Unit": "cpu_core" @@ -1433,7 +1465,7 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.DEMAND_DATA_RD", - "PublicDescription": "Counts the Demand Data Read requests sent to uncore. Use it in conjunction with OFFCORE_REQUESTS_OUTSTANDING to determine average latency in the uncore. Available PDIST counters: 0", + "PublicDescription": "Counts the Demand Data Read requests sent to uncore. Use it in conjunction with OFFCORE_REQUESTS_OUTSTANDING to determine average latency in the uncore.", "SampleAfterValue": "100003", "UMask": "0x1", "Unit": "cpu_core" @@ -1443,7 +1475,7 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.DEMAND_RFO", - "PublicDescription": "Counts the demand RFO (read for ownership) requests including regular RFOs, locks, ItoM. Available PDIST counters: 0", + "PublicDescription": "Counts the demand RFO (read for ownership) requests including regular RFOs, locks, ItoM.", "SampleAfterValue": "100003", "UMask": "0x4", "Unit": "cpu_core" @@ -1454,7 +1486,7 @@ "CounterMask": "1", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", - "PublicDescription": "Counts cycles when offcore outstanding cacheable Core Data Read transactions are present in the super queue. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). See corresponding Umask under OFFCORE_REQUESTS. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when offcore outstanding cacheable Core Data Read transactions are present in the super queue. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). See corresponding Umask under OFFCORE_REQUESTS.", "SampleAfterValue": "1000003", "UMask": "0x8", "Unit": "cpu_core" @@ -1465,7 +1497,7 @@ "CounterMask": "1", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_CODE_RD", - "PublicDescription": "Counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS. Available PDIST counters: 0", + "PublicDescription": "Counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS.", "SampleAfterValue": "1000003", "UMask": "0x2", "Unit": "cpu_core" @@ -1476,7 +1508,6 @@ "CounterMask": "1", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", - "PublicDescription": "Cycles where at least 1 outstanding demand data read request is pending. Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1487,7 +1518,7 @@ "CounterMask": "1", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO", - "PublicDescription": "Counts the number of offcore outstanding demand rfo Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS. Available PDIST counters: 0", + "PublicDescription": "Counts the number of offcore outstanding demand rfo Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS.", "SampleAfterValue": "1000003", "UMask": "0x4", "Unit": "cpu_core" @@ -1497,7 +1528,6 @@ "Counter": "0,1,2,3", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DATA_RD", - "PublicDescription": "OFFCORE_REQUESTS_OUTSTANDING.DATA_RD Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8", "Unit": "cpu_core" @@ -1507,7 +1537,7 @@ "Counter": "0,1,2,3", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_CODE_RD", - "PublicDescription": "Counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS. Available PDIST counters: 0", + "PublicDescription": "Counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS.", "SampleAfterValue": "1000003", "UMask": "0x2", "Unit": "cpu_core" @@ -1517,7 +1547,7 @@ "Counter": "0,1,2,3", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD", - "PublicDescription": "For every cycle, increments by the number of outstanding demand data read requests pending. Requests are considered outstanding from the time they miss the core's L2 cache until the transaction completion message is sent to the requestor. Available PDIST counters: 0", + "PublicDescription": "For every cycle, increments by the number of outstanding demand data read requests pending. Requests are considered outstanding from the time they miss the core's L2 cache until the transaction completion message is sent to the requestor.", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1527,7 +1557,7 @@ "Counter": "0,1,2,3", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_RFO", - "PublicDescription": "Counts the number of off-core outstanding read-for-ownership (RFO) store transactions every cycle. An RFO transaction is considered to be in the Off-core outstanding state between L2 cache miss and transaction completion. Available PDIST counters: 0", + "PublicDescription": "Counts the number of off-core outstanding read-for-ownership (RFO) store transactions every cycle. An RFO transaction is considered to be in the Off-core outstanding state between L2 cache miss and transaction completion.", "SampleAfterValue": "1000003", "UMask": "0x4", "Unit": "cpu_core" @@ -1537,7 +1567,7 @@ "Counter": "0,1,2,3", "EventCode": "0x2c", "EventName": "SQ_MISC.BUS_LOCK", - "PublicDescription": "Counts the more expensive bus lock needed to enforce cache coherency for certain memory accesses that need to be done atomically. Can be created by issuing an atomic instruction (via the LOCK prefix) which causes a cache line split or accesses uncacheable memory. Available PDIST counters: 0", + "PublicDescription": "Counts the more expensive bus lock needed to enforce cache coherency for certain memory accesses that need to be done atomically. Can be created by issuing an atomic instruction (via the LOCK prefix) which causes a cache line split or accesses uncacheable memory.", "SampleAfterValue": "100003", "UMask": "0x10", "Unit": "cpu_core" @@ -1547,7 +1577,6 @@ "Counter": "0,1,2,3", "EventCode": "0x40", "EventName": "SW_PREFETCH_ACCESS.ANY", - "PublicDescription": "Counts the number of PREFETCHNTA, PREFETCHW, PREFETCHT0, PREFETCHT1 or PREFETCHT2 instructions executed. Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0xf", "Unit": "cpu_core" @@ -1557,7 +1586,7 @@ "Counter": "0,1,2,3", "EventCode": "0x40", "EventName": "SW_PREFETCH_ACCESS.NTA", - "PublicDescription": "Counts the number of PREFETCHNTA instructions executed. Available PDIST counters: 0", + "PublicDescription": "Counts the number of PREFETCHNTA instructions executed.", "SampleAfterValue": "100003", "UMask": "0x1", "Unit": "cpu_core" @@ -1567,7 +1596,7 @@ "Counter": "0,1,2,3", "EventCode": "0x40", "EventName": "SW_PREFETCH_ACCESS.PREFETCHW", - "PublicDescription": "Counts the number of PREFETCHW instructions executed. Available PDIST counters: 0", + "PublicDescription": "Counts the number of PREFETCHW instructions executed.", "SampleAfterValue": "100003", "UMask": "0x8", "Unit": "cpu_core" @@ -1577,7 +1606,7 @@ "Counter": "0,1,2,3", "EventCode": "0x40", "EventName": "SW_PREFETCH_ACCESS.T0", - "PublicDescription": "Counts the number of PREFETCHT0 instructions executed. Available PDIST counters: 0", + "PublicDescription": "Counts the number of PREFETCHT0 instructions executed.", "SampleAfterValue": "100003", "UMask": "0x2", "Unit": "cpu_core" @@ -1587,7 +1616,7 @@ "Counter": "0,1,2,3", "EventCode": "0x40", "EventName": "SW_PREFETCH_ACCESS.T1_T2", - "PublicDescription": "Counts the number of PREFETCHT1 or PREFETCHT2 instructions executed. Available PDIST counters: 0", + "PublicDescription": "Counts the number of PREFETCHT1 or PREFETCHT2 instructions executed.", "SampleAfterValue": "100003", "UMask": "0x4", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/floating-point.json b/tools/perf/pmu-events/arch/x86/meteorlake/floating-point.json index ae9778aa755b..28dc5e06ee31 100644 --- a/tools/perf/pmu-events/arch/x86/meteorlake/floating-point.json +++ b/tools/perf/pmu-events/arch/x86/meteorlake/floating-point.json @@ -15,7 +15,6 @@ "CounterMask": "1", "EventCode": "0xb0", "EventName": "ARITH.FPDIV_ACTIVE", - "PublicDescription": "This event counts the cycles the floating point divider is busy. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -25,7 +24,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc1", "EventName": "ASSISTS.FP", - "PublicDescription": "Counts all microcode Floating Point assists. Available PDIST counters: 0", + "PublicDescription": "Counts all microcode Floating Point assists.", "SampleAfterValue": "100003", "UMask": "0x2", "Unit": "cpu_core" @@ -35,7 +34,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc1", "EventName": "ASSISTS.SSE_AVX_MIX", - "PublicDescription": "ASSISTS.SSE_AVX_MIX Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x10", "Unit": "cpu_core" @@ -45,7 +43,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.PORT_0", - "PublicDescription": "FP_ARITH_DISPATCHED.PORT_0 [This event is alias to FP_ARITH_DISPATCHED.V0] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -55,7 +52,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.PORT_1", - "PublicDescription": "FP_ARITH_DISPATCHED.PORT_1 [This event is alias to FP_ARITH_DISPATCHED.V1] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -65,7 +61,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.PORT_5", - "PublicDescription": "FP_ARITH_DISPATCHED.PORT_5 [This event is alias to FP_ARITH_DISPATCHED.V2] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x4", "Unit": "cpu_core" @@ -75,7 +70,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.V0", - "PublicDescription": "FP_ARITH_DISPATCHED.V0 [This event is alias to FP_ARITH_DISPATCHED.PORT_0] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -85,7 +79,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.V1", - "PublicDescription": "FP_ARITH_DISPATCHED.V1 [This event is alias to FP_ARITH_DISPATCHED.PORT_1] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -95,7 +88,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.V2", - "PublicDescription": "FP_ARITH_DISPATCHED.V2 [This event is alias to FP_ARITH_DISPATCHED.PORT_5] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x4", "Unit": "cpu_core" @@ -105,7 +97,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", - "PublicDescription": "Number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x4", "Unit": "cpu_core" @@ -115,7 +107,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", - "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x8", "Unit": "cpu_core" @@ -125,7 +117,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", - "PublicDescription": "Number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x10", "Unit": "cpu_core" @@ -135,7 +127,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", - "PublicDescription": "Number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x20", "Unit": "cpu_core" @@ -145,7 +137,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.4_FLOPS", - "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision and 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point and packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision and 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point and packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x18", "Unit": "cpu_core" @@ -155,7 +147,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.SCALAR", - "PublicDescription": "Number of SSE/AVX computational scalar single precision and double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational scalar single precision and double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "1000003", "UMask": "0x3", "Unit": "cpu_core" @@ -165,7 +157,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", - "PublicDescription": "Number of SSE/AVX computational scalar double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational scalar double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x1", "Unit": "cpu_core" @@ -175,7 +167,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.SCALAR_SINGLE", - "PublicDescription": "Number of SSE/AVX computational scalar single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational scalar single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x2", "Unit": "cpu_core" @@ -185,7 +177,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.VECTOR", - "PublicDescription": "Number of any Vector retired FP arithmetic instructions. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of any Vector retired FP arithmetic instructions. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "1000003", "UMask": "0xfc", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/frontend.json b/tools/perf/pmu-events/arch/x86/meteorlake/frontend.json index 82727022efb6..6484834b1127 100644 --- a/tools/perf/pmu-events/arch/x86/meteorlake/frontend.json +++ b/tools/perf/pmu-events/arch/x86/meteorlake/frontend.json @@ -14,7 +14,7 @@ "Counter": "0,1,2,3", "EventCode": "0x60", "EventName": "BACLEARS.ANY", - "PublicDescription": "Number of times the front-end is resteered when it finds a branch instruction in a fetch line. This is called Unknown Branch which occurs for the first time a branch instruction is fetched or when the branch is not tracked by the BPU (Branch Prediction Unit) anymore. Available PDIST counters: 0", + "PublicDescription": "Number of times the front-end is resteered when it finds a branch instruction in a fetch line. This is called Unknown Branch which occurs for the first time a branch instruction is fetched or when the branch is not tracked by the BPU (Branch Prediction Unit) anymore.", "SampleAfterValue": "100003", "UMask": "0x1", "Unit": "cpu_core" @@ -24,7 +24,7 @@ "Counter": "0,1,2,3", "EventCode": "0x87", "EventName": "DECODE.LCP", - "PublicDescription": "Counts cycles that the Instruction Length decoder (ILD) stalls occurred due to dynamically changing prefix length of the decoded instruction (by operand size prefix instruction 0x66, address size prefix instruction 0x67 or REX.W for Intel64). Count is proportional to the number of prefixes in a 16B-line. This may result in a three-cycle penalty for each LCP (Length changing prefix) in a 16-byte chunk. Available PDIST counters: 0", + "PublicDescription": "Counts cycles that the Instruction Length decoder (ILD) stalls occurred due to dynamically changing prefix length of the decoded instruction (by operand size prefix instruction 0x66, address size prefix instruction 0x67 or REX.W for Intel64). Count is proportional to the number of prefixes in a 16B-line. This may result in a three-cycle penalty for each LCP (Length changing prefix) in a 16-byte chunk.", "SampleAfterValue": "500009", "UMask": "0x1", "Unit": "cpu_core" @@ -34,7 +34,6 @@ "Counter": "0,1,2,3", "EventCode": "0x87", "EventName": "DECODE.MS_BUSY", - "PublicDescription": "Cycles the Microcode Sequencer is busy. Available PDIST counters: 0", "SampleAfterValue": "500009", "UMask": "0x2", "Unit": "cpu_core" @@ -44,7 +43,7 @@ "Counter": "0,1,2,3", "EventCode": "0x61", "EventName": "DSB2MITE_SWITCHES.PENALTY_CYCLES", - "PublicDescription": "Decode Stream Buffer (DSB) is a Uop-cache that holds translations of previously fetched instructions that were decoded by the legacy x86 decode pipeline (MITE). This event counts fetch penalty cycles when a transition occurs from DSB to MITE. Available PDIST counters: 0", + "PublicDescription": "Decode Stream Buffer (DSB) is a Uop-cache that holds translations of previously fetched instructions that were decoded by the legacy x86 decode pipeline (MITE). This event counts fetch penalty cycles when a transition occurs from DSB to MITE.", "SampleAfterValue": "100003", "UMask": "0x2", "Unit": "cpu_core" @@ -405,7 +404,7 @@ "Counter": "0,1,2,3", "EventCode": "0x80", "EventName": "ICACHE_DATA.STALLS", - "PublicDescription": "Counts cycles where a code line fetch is stalled due to an L1 instruction cache miss. The decode pipeline works at a 32 Byte granularity. Available PDIST counters: 0", + "PublicDescription": "Counts cycles where a code line fetch is stalled due to an L1 instruction cache miss. The decode pipeline works at a 32 Byte granularity.", "SampleAfterValue": "500009", "UMask": "0x4", "Unit": "cpu_core" @@ -417,7 +416,6 @@ "EdgeDetect": "1", "EventCode": "0x80", "EventName": "ICACHE_DATA.STALL_PERIODS", - "PublicDescription": "ICACHE_DATA.STALL_PERIODS Available PDIST counters: 0", "SampleAfterValue": "500009", "UMask": "0x4", "Unit": "cpu_core" @@ -427,7 +425,7 @@ "Counter": "0,1,2,3", "EventCode": "0x83", "EventName": "ICACHE_TAG.STALLS", - "PublicDescription": "Counts cycles where a code fetch is stalled due to L1 instruction cache tag miss. Available PDIST counters: 0", + "PublicDescription": "Counts cycles where a code fetch is stalled due to L1 instruction cache tag miss.", "SampleAfterValue": "200003", "UMask": "0x4", "Unit": "cpu_core" @@ -438,7 +436,7 @@ "CounterMask": "1", "EventCode": "0x79", "EventName": "IDQ.DSB_CYCLES_ANY", - "PublicDescription": "Counts the number of cycles uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path.", "SampleAfterValue": "2000003", "UMask": "0x8", "Unit": "cpu_core" @@ -449,7 +447,7 @@ "CounterMask": "6", "EventCode": "0x79", "EventName": "IDQ.DSB_CYCLES_OK", - "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the DSB (Decode Stream Buffer) path. Count includes uops that may 'bypass' the IDQ. Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the DSB (Decode Stream Buffer) path. Count includes uops that may 'bypass' the IDQ.", "SampleAfterValue": "2000003", "UMask": "0x8", "Unit": "cpu_core" @@ -459,7 +457,7 @@ "Counter": "0,1,2,3", "EventCode": "0x79", "EventName": "IDQ.DSB_UOPS", - "PublicDescription": "Counts the number of uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path.", "SampleAfterValue": "2000003", "UMask": "0x8", "Unit": "cpu_core" @@ -470,7 +468,7 @@ "CounterMask": "1", "EventCode": "0x79", "EventName": "IDQ.MITE_CYCLES_ANY", - "PublicDescription": "Counts the number of cycles uops were delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB). Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles uops were delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB).", "SampleAfterValue": "2000003", "UMask": "0x4", "Unit": "cpu_core" @@ -481,7 +479,7 @@ "CounterMask": "6", "EventCode": "0x79", "EventName": "IDQ.MITE_CYCLES_OK", - "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB). Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB).", "SampleAfterValue": "2000003", "UMask": "0x4", "Unit": "cpu_core" @@ -491,7 +489,7 @@ "Counter": "0,1,2,3", "EventCode": "0x79", "EventName": "IDQ.MITE_UOPS", - "PublicDescription": "Counts the number of uops delivered to Instruction Decode Queue (IDQ) from the MITE path. This also means that uops are not being delivered from the Decode Stream Buffer (DSB). Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops delivered to Instruction Decode Queue (IDQ) from the MITE path. This also means that uops are not being delivered from the Decode Stream Buffer (DSB).", "SampleAfterValue": "2000003", "UMask": "0x4", "Unit": "cpu_core" @@ -502,7 +500,7 @@ "CounterMask": "1", "EventCode": "0x79", "EventName": "IDQ.MS_CYCLES_ANY", - "PublicDescription": "Counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Uops maybe initiated by Decode Stream Buffer (DSB) or MITE. Available PDIST counters: 0", + "PublicDescription": "Counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Uops maybe initiated by Decode Stream Buffer (DSB) or MITE.", "SampleAfterValue": "2000003", "UMask": "0x20", "Unit": "cpu_core" @@ -514,7 +512,7 @@ "EdgeDetect": "1", "EventCode": "0x79", "EventName": "IDQ.MS_SWITCHES", - "PublicDescription": "Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to the Microcode Sequencer. Available PDIST counters: 0", + "PublicDescription": "Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to the Microcode Sequencer.", "SampleAfterValue": "100003", "UMask": "0x20", "Unit": "cpu_core" @@ -524,7 +522,7 @@ "Counter": "0,1,2,3", "EventCode": "0x79", "EventName": "IDQ.MS_UOPS", - "PublicDescription": "Counts the number of uops initiated by MITE or Decode Stream Buffer (DSB) and delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Counting includes uops that may 'bypass' the IDQ. Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops initiated by MITE or Decode Stream Buffer (DSB) and delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Counting includes uops that may 'bypass' the IDQ.", "SampleAfterValue": "1000003", "UMask": "0x20", "Unit": "cpu_core" @@ -534,7 +532,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x9c", "EventName": "IDQ_BUBBLES.CORE", - "PublicDescription": "This event counts a subset of the Topdown Slots event that when no operation was delivered to the back-end pipeline due to instruction fetch limitations when the back-end could have accepted more operations. Common examples include instruction cache misses or x86 instruction decode limitations. The count may be distributed among unhalted logical processors (hyper-threads) who share the same physical core, in processors that support Intel Hyper-Threading Technology. Software can use this event as the numerator for the Frontend Bound metric (or top-level category) of the Top-down Microarchitecture Analysis method. Available PDIST counters: 0", + "PublicDescription": "This event counts a subset of the Topdown Slots event that when no operation was delivered to the back-end pipeline due to instruction fetch limitations when the back-end could have accepted more operations. Common examples include instruction cache misses or x86 instruction decode limitations. The count may be distributed among unhalted logical processors (hyper-threads) who share the same physical core, in processors that support Intel Hyper-Threading Technology. Software can use this event as the numerator for the Frontend Bound metric (or top-level category) of the Top-down Microarchitecture Analysis method.", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -545,7 +543,7 @@ "CounterMask": "6", "EventCode": "0x9c", "EventName": "IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE", - "PublicDescription": "Counts the number of cycles when no uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE] Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles when no uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE]", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -557,7 +555,7 @@ "EventCode": "0x9c", "EventName": "IDQ_BUBBLES.CYCLES_FE_WAS_OK", "Invert": "1", - "PublicDescription": "Counts the number of cycles when the optimal number of uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK] Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles when the optimal number of uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK]", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -567,7 +565,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x9c", "EventName": "IDQ_UOPS_NOT_DELIVERED.CORE", - "PublicDescription": "Counts the number of uops not delivered to by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops not delivered to by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle.", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -578,7 +576,7 @@ "CounterMask": "6", "EventCode": "0x9c", "EventName": "IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", - "PublicDescription": "Counts the number of cycles when no uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE] Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles when no uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE]", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -590,7 +588,7 @@ "EventCode": "0x9c", "EventName": "IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK", "Invert": "1", - "PublicDescription": "Counts the number of cycles when the optimal number of uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_BUBBLES.CYCLES_FE_WAS_OK] Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles when the optimal number of uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_BUBBLES.CYCLES_FE_WAS_OK]", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/memory.json b/tools/perf/pmu-events/arch/x86/meteorlake/memory.json index 17b94f810d5a..f0cbeda4d5ca 100644 --- a/tools/perf/pmu-events/arch/x86/meteorlake/memory.json +++ b/tools/perf/pmu-events/arch/x86/meteorlake/memory.json @@ -5,7 +5,6 @@ "CounterMask": "2", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.CYCLES_L3_MISS", - "PublicDescription": "Cycles while L3 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x2", "Unit": "cpu_core" @@ -16,7 +15,6 @@ "CounterMask": "6", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.STALLS_L3_MISS", - "PublicDescription": "Execution stalls while L3 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x6", "Unit": "cpu_core" @@ -90,7 +88,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc3", "EventName": "MACHINE_CLEARS.MEMORY_ORDERING", - "PublicDescription": "Counts the number of Machine Clears detected dye to memory ordering. Memory Ordering Machine Clears may apply when a memory read may not conform to the memory ordering rules of the x86 architecture Available PDIST counters: 0", + "PublicDescription": "Counts the number of Machine Clears detected dye to memory ordering. Memory Ordering Machine Clears may apply when a memory read may not conform to the memory ordering rules of the x86 architecture", "SampleAfterValue": "100003", "UMask": "0x2", "Unit": "cpu_core" @@ -101,7 +99,6 @@ "CounterMask": "2", "EventCode": "0x47", "EventName": "MEMORY_ACTIVITY.CYCLES_L1D_MISS", - "PublicDescription": "Cycles while L1 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x2", "Unit": "cpu_core" @@ -112,7 +109,6 @@ "CounterMask": "3", "EventCode": "0x47", "EventName": "MEMORY_ACTIVITY.STALLS_L1D_MISS", - "PublicDescription": "Execution stalls while L1 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x3", "Unit": "cpu_core" @@ -123,7 +119,7 @@ "CounterMask": "5", "EventCode": "0x47", "EventName": "MEMORY_ACTIVITY.STALLS_L2_MISS", - "PublicDescription": "Execution stalls while L2 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock). Available PDIST counters: 0", + "PublicDescription": "Execution stalls while L2 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock).", "SampleAfterValue": "1000003", "UMask": "0x5", "Unit": "cpu_core" @@ -134,7 +130,7 @@ "CounterMask": "9", "EventCode": "0x47", "EventName": "MEMORY_ACTIVITY.STALLS_L3_MISS", - "PublicDescription": "Execution stalls while L3 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock). Available PDIST counters: 0", + "PublicDescription": "Execution stalls while L3 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock).", "SampleAfterValue": "1000003", "UMask": "0x9", "Unit": "cpu_core" @@ -411,7 +407,6 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", - "PublicDescription": "Counts demand data read requests that miss the L3 cache. Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x10", "Unit": "cpu_core" @@ -422,7 +417,7 @@ "CounterMask": "1", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_L3_MISS_DEMAND_DATA_RD", - "PublicDescription": "Cycles with at least 1 Demand Data Read requests who miss L3 cache in the superQ. Available PDIST counters: 0", + "PublicDescription": "Cycles with at least 1 Demand Data Read requests who miss L3 cache in the superQ.", "SampleAfterValue": "1000003", "UMask": "0x10", "Unit": "cpu_core" @@ -432,7 +427,7 @@ "Counter": "0,1,2,3", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD", - "PublicDescription": "For every cycle, increments by the number of demand data read requests pending that are known to have missed the L3 cache. Note that this does not capture all elapsed cycles while requests are outstanding - only cycles from when the requests were known by the requesting core to have missed the L3 cache. Available PDIST counters: 0", + "PublicDescription": "For every cycle, increments by the number of demand data read requests pending that are known to have missed the L3 cache. Note that this does not capture all elapsed cycles while requests are outstanding - only cycles from when the requests were known by the requesting core to have missed the L3 cache.", "SampleAfterValue": "2000003", "UMask": "0x10", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/mtl-metrics.json b/tools/perf/pmu-events/arch/x86/meteorlake/mtl-metrics.json index 0088be169f9b..948c16a1f95b 100644 --- a/tools/perf/pmu-events/arch/x86/meteorlake/mtl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/meteorlake/mtl-metrics.json @@ -1,56 +1,56 @@ [ { "BriefDescription": "C10 residency percent per package", - "MetricExpr": "cstate_pkg@c10\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c10\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C10_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C1 residency percent per core", - "MetricExpr": "cstate_core@c1\\-residency@ / TSC", + "MetricExpr": "cstate_core@c1\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C1_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C2 residency percent per package", - "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c2\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C2_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per package", - "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per core", - "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricExpr": "cstate_core@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per package", - "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per core", - "MetricExpr": "cstate_core@c7\\-residency@ / TSC", + "MetricExpr": "cstate_core@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C8 residency percent per package", - "MetricExpr": "cstate_pkg@c8\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c8\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C8_Pkg_Residency", "ScaleUnit": "100%" @@ -541,7 +541,7 @@ }, { "BriefDescription": "Average CPU Utilization", - "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.REF_TSC@ / TSC", + "MetricExpr": "cpu_atom@CPU_CLK_UNHALTED.REF_TSC@ / msr@tsc\\,cpu=cpu_atom@", "MetricName": "tma_info_system_cpu_utilization", "Unit": "cpu_atom" }, @@ -748,7 +748,7 @@ { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", "DefaultMetricgroupName": "TopdownL1", - "MetricExpr": "cpu_core@topdown\\-be\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricExpr": "cpu_core@topdown\\-be\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@)", "MetricGroup": "BvOB;Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.2", @@ -787,11 +787,20 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", + "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", + "MetricGroup": "BvCB;Cor;tma_issueComp", + "MetricName": "tma_bottleneck_compute_bound_est", + "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", + "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: ", + "Unit": "cpu_core" + }, + { "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks", "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", "MetricGroup": "BvMB;Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_bottleneck_cache_memory_bandwidth", - "MetricThreshold": "tma_bottleneck_cache_memory_bandwidth > 20", + "MetricName": "tma_bottleneck_data_cache_memory_bandwidth", + "MetricThreshold": "tma_bottleneck_data_cache_memory_bandwidth > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full", "Unit": "cpu_core" }, @@ -799,23 +808,14 @@ "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks", "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_dependency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_lock_latency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_loads / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_stores / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "BvML;Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_bottleneck_cache_memory_latency", - "MetricThreshold": "tma_bottleneck_cache_memory_latency > 20", + "MetricName": "tma_bottleneck_data_cache_memory_latency", + "MetricThreshold": "tma_bottleneck_data_cache_memory_latency > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency", "Unit": "cpu_core" }, { - "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", - "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", - "MetricGroup": "BvCB;Cor;tma_issueComp", - "MetricName": "tma_bottleneck_compute_bound_est", - "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", - "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: ", - "Unit": "cpu_core" - }, - { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end)", - "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - (1 - cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_ms / (tma_dsb + tma_lsd + tma_mite + tma_ms))) - tma_bottleneck_big_code", + "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - (1 - cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_ms)) - tma_bottleneck_big_code", "MetricGroup": "BvFB;Fed;FetchBW;Frontend", "MetricName": "tma_bottleneck_instruction_fetch_bw", "MetricThreshold": "tma_bottleneck_instruction_fetch_bw > 20", @@ -823,7 +823,7 @@ }, { "BriefDescription": "Total pipeline cost of irregular execution (e.g", - "MetricExpr": "100 * ((1 - cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_ms / (tma_dsb + tma_lsd + tma_mite + tma_ms)) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + cpu_core@RS.EMPTY_RESOURCE@ / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricExpr": "100 * ((1 - cpu_core@INST_RETIRED.REP_ITERATION@ / cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_ms) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + cpu_core@RS.EMPTY_RESOURCE@ / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", "MetricGroup": "Bad;BvIO;Cor;Ret;tma_issueMS", "MetricName": "tma_bottleneck_irregular_overhead", "MetricThreshold": "tma_bottleneck_irregular_overhead > 10", @@ -859,7 +859,7 @@ }, { "BriefDescription": "Total pipeline cost of remaining bottlenecks in the back-end", - "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_cache_memory_bandwidth + tma_bottleneck_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", + "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_data_cache_memory_bandwidth + tma_bottleneck_data_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", "MetricGroup": "BvOB;Cor;Offcore", "MetricName": "tma_bottleneck_other_bottlenecks", "MetricThreshold": "tma_bottleneck_other_bottlenecks > 20", @@ -876,7 +876,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", - "MetricExpr": "cpu_core@topdown\\-br\\-mispredict@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricExpr": "cpu_core@topdown\\-br\\-mispredict@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@)", "MetricGroup": "BadSpec;BrMispredicts;BvMP;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", @@ -1007,7 +1007,6 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS@ * min(cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS@R, 24 * tma_info_system_core_frequency) + cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD@ * min(cpu_core@MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD@R, 25 * tma_info_system_core_frequency) * (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ / (cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM@ + cpu_core@OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD@))) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks", "MetricGroup": "BvMS;DataSharing;LockCont;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", @@ -1124,7 +1123,7 @@ "MetricGroup": "BvMB;MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -1253,7 +1252,7 @@ }, { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences", - "MetricExpr": "cpu_core@topdown\\-heavy\\-ops@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricExpr": "cpu_core@topdown\\-heavy\\-ops@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@)", "MetricGroup": "Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", @@ -1923,7 +1922,7 @@ "Unit": "cpu_core" }, { - "BriefDescription": "", + "BriefDescription": "Mem;Backend;CacheHits", "MetricExpr": "cpu_core@UOPS_EXECUTED.THREAD@ / (cpu_core@UOPS_EXECUTED.CORE_CYCLES_GE_1@ / 2 if #SMT_on else cpu_core@UOPS_EXECUTED.THREAD\\,cmask\\=1@)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute", @@ -1984,7 +1983,7 @@ }, { "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / tma_info_system_time", + "MetricExpr": "tma_info_system_turbo_utilization * msr@tsc\\,cpu=cpu_core@ / 1e9 / tma_info_system_time", "MetricGroup": "Power;Summary", "MetricName": "tma_info_system_core_frequency", "Unit": "cpu_core" @@ -1998,7 +1997,7 @@ }, { "BriefDescription": "Average number of utilized CPUs", - "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.REF_TSC@ / TSC", + "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.REF_TSC@ / msr@tsc\\,cpu=cpu_core@", "MetricGroup": "Summary", "MetricName": "tma_info_system_cpus_utilized", "Unit": "cpu_core" @@ -2008,7 +2007,7 @@ "MetricExpr": "64 * (UNC_HAC_ARB_TRK_REQUESTS.ALL + UNC_HAC_ARB_COH_TRK_REQUESTS.ALL) / 1e9 / tma_info_system_time", "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_mem_bandwidth, tma_sq_full", + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_mem_bandwidth, tma_sq_full", "Unit": "cpu_core" }, { @@ -2095,6 +2094,13 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Measured Average Uncore Frequency for the SoC [GHz]", + "MetricExpr": "tma_info_system_socket_clks / 1e9 / tma_info_system_time", + "MetricGroup": "SoC", + "MetricName": "tma_info_system_uncore_frequency", + "Unit": "cpu_core" + }, + { "BriefDescription": "Per-Logical Processor actual clocks when the Logical Processor is active.", "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.THREAD@", "MetricGroup": "Pipeline", @@ -2213,12 +2219,12 @@ "Unit": "cpu_core" }, { - "BriefDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", + "BriefDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", "MetricExpr": "min(2 * (cpu_core@MEM_INST_RETIRED.ALL_LOADS@ - cpu_core@MEM_LOAD_RETIRED.FB_HIT@ - cpu_core@MEM_LOAD_RETIRED.L1_MISS@) * 20 / 100, max(cpu_core@CYCLE_ACTIVITY.CYCLES_MEM_ANY@ - cpu_core@MEMORY_ACTIVITY.CYCLES_L1D_MISS@, 0)) / tma_info_thread_clks", "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_l1_latency_dependency", "MetricThreshold": "tma_l1_latency_dependency > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", + "PublicDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -2234,7 +2240,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L2 cache under unloaded scenarios (possibly L2 latency limited)", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "cpu_core@MEM_LOAD_RETIRED.L2_HIT@ * min(cpu_core@MEM_LOAD_RETIRED.L2_HIT@R, 3 * tma_info_system_core_frequency) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_l2_bound_group", "MetricName": "tma_l2_hit_latency", @@ -2255,12 +2260,11 @@ }, { "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "cpu_core@MEM_LOAD_RETIRED.L3_HIT@ * min(cpu_core@MEM_LOAD_RETIRED.L3_HIT@R, 9 * tma_info_system_core_frequency) * (1 + cpu_core@MEM_LOAD_RETIRED.FB_HIT@ / cpu_core@MEM_LOAD_RETIRED.L1_MISS@ / 2) / tma_info_thread_clks", "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_cache_memory_latency, tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_data_cache_memory_latency, tma_mem_latency", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -2342,6 +2346,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "cpu_core@MEM_INST_RETIRED.LOCK_LOADS@ * cpu_core@MEM_INST_RETIRED.LOCK_LOADS@R / tma_info_thread_clks", "MetricGroup": "LockCont;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", "MetricName": "tma_lock_latency", @@ -2377,7 +2382,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%", "Unit": "cpu_core" }, @@ -2387,13 +2392,13 @@ "MetricGroup": "BvML;MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%", "Unit": "cpu_core" }, { "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", - "MetricExpr": "cpu_core@topdown\\-mem\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricExpr": "cpu_core@topdown\\-mem\\-bound@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@)", "MetricGroup": "Backend;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricName": "tma_memory_bound", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", @@ -2404,7 +2409,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to LFENCE Instructions.", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "13 * cpu_core@MISC2_RETIRED.LFENCE@ / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group", "MetricName": "tma_memory_fence", @@ -2463,7 +2467,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the Microcode Sequencer (MS) unit - see Microcode_Sequencer node for details.", - "MetricExpr": "max(cpu_core@IDQ.MS_CYCLES_ANY@, cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@ / (cpu_core@UOPS_RETIRED.SLOTS@ / cpu_core@UOPS_ISSUED.ANY@)) / tma_info_core_core_clks / 2", + "MetricExpr": "max(cpu_core@IDQ.MS_CYCLES_ANY@, cpu_core@UOPS_RETIRED.MS\\,cmask\\=1@ / (cpu_core@UOPS_RETIRED.SLOTS@ / cpu_core@UOPS_ISSUED.ANY@)) / tma_info_core_core_clks / 2.4", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_ms", "MetricThreshold": "tma_ms > 0.05 & tma_fetch_bandwidth > 0.2", @@ -2502,6 +2506,7 @@ }, { "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_int_operations + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches))", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_other_light_ops", @@ -2570,6 +2575,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@EXE_ACTIVITY.2_3_PORTS_UTIL@)) / tma_info_thread_clks if cpu_core@ARITH.DIV_ACTIVE@ < cpu_core@CYCLE_ACTIVITY.STALLS_TOTAL@ - cpu_core@EXE_ACTIVITY.BOUND_ON_LOADS@ else (cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ + tma_retiring * cpu_core@EXE_ACTIVITY.2_3_PORTS_UTIL@) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", @@ -2580,6 +2586,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricConstraint": "NO_THRESHOLD_AND_NMI", "MetricExpr": "max(cpu_core@EXE_ACTIVITY.EXE_BOUND_0_PORTS@ - cpu_core@RESOURCE_STALLS.SCOREBOARD@, 0) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", @@ -2590,6 +2597,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricConstraint": "NO_THRESHOLD_AND_NMI", "MetricExpr": "cpu_core@EXE_ACTIVITY.1_PORTS_UTIL@ / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", @@ -2600,7 +2608,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "cpu_core@EXE_ACTIVITY.2_PORTS_UTIL@ / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", @@ -2611,7 +2618,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "cpu_core@UOPS_EXECUTED.CYCLES_GE_3@ / tma_info_thread_clks", "MetricGroup": "BvCB;PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", @@ -2632,7 +2638,7 @@ { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", "DefaultMetricgroupName": "TopdownL1", - "MetricExpr": "cpu_core@topdown\\-retiring@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@) + 0 * tma_info_thread_slots", + "MetricExpr": "cpu_core@topdown\\-retiring@ / (cpu_core@topdown\\-fe\\-bound@ + cpu_core@topdown\\-bad\\-spec@ + cpu_core@topdown\\-retiring@ + cpu_core@topdown\\-be\\-bound@)", "MetricGroup": "BvUW;Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -2663,7 +2669,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "cpu_core@CPU_CLK_UNHALTED.PAUSE@ / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group", "MetricName": "tma_slow_pause", @@ -2698,7 +2703,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%", "Unit": "cpu_core" }, diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/other.json b/tools/perf/pmu-events/arch/x86/meteorlake/other.json index cb21bb933617..8320ffd83c51 100644 --- a/tools/perf/pmu-events/arch/x86/meteorlake/other.json +++ b/tools/perf/pmu-events/arch/x86/meteorlake/other.json @@ -4,7 +4,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc1", "EventName": "ASSISTS.HARDWARE", - "PublicDescription": "Count all other hardware assists or traps that are not necessarily architecturally exposed (through a software handler) beyond FP; SSE-AVX mix and A/D assists who are counted by dedicated sub-events. This includes, but not limited to, assists at EXE or MEM uop writeback like AVX* load/store/gather/scatter (non-FP GSSE-assist ) , assists generated by ROB like PEBS and RTIT, Uncore trap, RAR (Remote Action Request) and CET (Control flow Enforcement Technology) assists. the event also counts for Machine Ordering count. Available PDIST counters: 0", + "PublicDescription": "Count all other hardware assists or traps that are not necessarily architecturally exposed (through a software handler) beyond FP; SSE-AVX mix and A/D assists who are counted by dedicated sub-events. This includes, but not limited to, assists at EXE or MEM uop writeback like AVX* load/store/gather/scatter (non-FP GSSE-assist ) , assists generated by ROB like PEBS and RTIT, Uncore trap, RAR (Remote Action Request) and CET (Control flow Enforcement Technology) assists. the event also counts for Machine Ordering count.", "SampleAfterValue": "100003", "UMask": "0x4", "Unit": "cpu_core" @@ -14,7 +14,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc1", "EventName": "ASSISTS.PAGE_FAULT", - "PublicDescription": "ASSISTS.PAGE_FAULT Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8", "Unit": "cpu_core" @@ -83,7 +82,7 @@ "CounterMask": "1", "EventCode": "0x2d", "EventName": "XQ.FULL_CYCLES", - "PublicDescription": "number of cycles when the thread is active and the uncore cannot take any further requests (for example prefetches, loads or stores initiated by the Core that miss the L2 cache). Available PDIST counters: 0", + "PublicDescription": "number of cycles when the thread is active and the uncore cannot take any further requests (for example prefetches, loads or stores initiated by the Core that miss the L2 cache).", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json b/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json index 22b25708e799..bfdaabe9377d 100644 --- a/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/meteorlake/pipeline.json @@ -15,7 +15,7 @@ "CounterMask": "1", "EventCode": "0xb0", "EventName": "ARITH.DIV_ACTIVE", - "PublicDescription": "Counts cycles when divide unit is busy executing divide or square root operations. Accounts for integer and floating-point operations. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when divide unit is busy executing divide or square root operations. Accounts for integer and floating-point operations.", "SampleAfterValue": "1000003", "UMask": "0x9", "Unit": "cpu_core" @@ -26,7 +26,6 @@ "CounterMask": "1", "EventCode": "0xb0", "EventName": "ARITH.IDIV_ACTIVE", - "PublicDescription": "This event counts the cycles the integer divider is busy. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8", "Unit": "cpu_core" @@ -36,7 +35,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc1", "EventName": "ASSISTS.ANY", - "PublicDescription": "Counts the number of occurrences where a microcode assist is invoked by hardware. Examples include AD (page Access Dirty), FP and AVX related assists. Available PDIST counters: 0", + "PublicDescription": "Counts the number of occurrences where a microcode assist is invoked by hardware. Examples include AD (page Access Dirty), FP and AVX related assists.", "SampleAfterValue": "100003", "UMask": "0x1b", "Unit": "cpu_core" @@ -44,6 +43,7 @@ { "BriefDescription": "Counts the total number of branch instructions retired for all branch types.", "Counter": "0,1,2,3,4,5,6,7", + "Errata": "MTL012, MTL013", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.ALL_BRANCHES", "PublicDescription": "Counts the total number of instructions in which the instruction pointer (IP) of the processor is resteered due to a branch instruction and the branch instruction successfully retires. All branch type instructions are accounted for.", @@ -62,6 +62,7 @@ { "BriefDescription": "Counts the number of retired JCC (Jump on Conditional Code) branch instructions retired, includes both taken and not taken branches.", "Counter": "0,1,2,3,4,5,6,7", + "Errata": "MTL013", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.COND", "SampleAfterValue": "200003", @@ -110,6 +111,7 @@ { "BriefDescription": "Counts the number of far branch instructions retired, includes far jump, far call and return, and interrupt call and return.", "Counter": "0,1,2,3,4,5,6,7", + "Errata": "MTL013", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.FAR_BRANCH", "SampleAfterValue": "200003", @@ -129,6 +131,7 @@ { "BriefDescription": "Counts the number of near indirect JMP and near indirect CALL branch instructions retired.", "Counter": "0,1,2,3,4,5,6,7", + "Errata": "MTL013", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.INDIRECT", "SampleAfterValue": "200003", @@ -148,6 +151,7 @@ { "BriefDescription": "Counts the number of near indirect CALL branch instructions retired.", "Counter": "0,1,2,3,4,5,6,7", + "Errata": "MTL013", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.INDIRECT_CALL", "SampleAfterValue": "200003", @@ -167,6 +171,7 @@ "BriefDescription": "This event is deprecated. Refer to new event BR_INST_RETIRED.INDIRECT_CALL", "Counter": "0,1,2,3,4,5,6,7", "Deprecated": "1", + "Errata": "MTL013", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.IND_CALL", "SampleAfterValue": "200003", @@ -176,6 +181,7 @@ { "BriefDescription": "Counts the number of near CALL branch instructions retired.", "Counter": "0,1,2,3,4,5,6,7", + "Errata": "MTL012, MTL013", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.NEAR_CALL", "SampleAfterValue": "200003", @@ -214,6 +220,7 @@ { "BriefDescription": "Counts the number of near taken branch instructions retired.", "Counter": "0,1,2,3,4,5,6,7", + "Errata": "MTL013", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.NEAR_TAKEN", "SampleAfterValue": "200003", @@ -484,7 +491,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.C01", - "PublicDescription": "Counts core clocks when the thread is in the C0.1 light-weight slower wakeup time but more power saving optimized state. This state can be entered via the TPAUSE or UMWAIT instructions. Available PDIST counters: 0", + "PublicDescription": "Counts core clocks when the thread is in the C0.1 light-weight slower wakeup time but more power saving optimized state. This state can be entered via the TPAUSE or UMWAIT instructions.", "SampleAfterValue": "2000003", "UMask": "0x10", "Unit": "cpu_core" @@ -494,7 +501,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.C02", - "PublicDescription": "Counts core clocks when the thread is in the C0.2 light-weight faster wakeup time but less power saving optimized state. This state can be entered via the TPAUSE or UMWAIT instructions. Available PDIST counters: 0", + "PublicDescription": "Counts core clocks when the thread is in the C0.2 light-weight faster wakeup time but less power saving optimized state. This state can be entered via the TPAUSE or UMWAIT instructions.", "SampleAfterValue": "2000003", "UMask": "0x20", "Unit": "cpu_core" @@ -504,7 +511,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.C0_WAIT", - "PublicDescription": "Counts core clocks when the thread is in the C0.1 or C0.2 power saving optimized states (TPAUSE or UMWAIT instructions) or running the PAUSE instruction. Available PDIST counters: 0", + "PublicDescription": "Counts core clocks when the thread is in the C0.1 or C0.2 power saving optimized states (TPAUSE or UMWAIT instructions) or running the PAUSE instruction.", "SampleAfterValue": "2000003", "UMask": "0x70", "Unit": "cpu_core" @@ -530,7 +537,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.DISTRIBUTED", - "PublicDescription": "This event distributes cycle counts between active hyperthreads, i.e., those in C0. A hyperthread becomes inactive when it executes the HLT or MWAIT instructions. If all other hyperthreads are inactive (or disabled or do not exist), all counts are attributed to this hyperthread. To obtain the full count when the Core is active, sum the counts from each hyperthread. Available PDIST counters: 0", + "PublicDescription": "This event distributes cycle counts between active hyperthreads, i.e., those in C0. A hyperthread becomes inactive when it executes the HLT or MWAIT instructions. If all other hyperthreads are inactive (or disabled or do not exist), all counts are attributed to this hyperthread. To obtain the full count when the Core is active, sum the counts from each hyperthread.", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -540,7 +547,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x3c", "EventName": "CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE", - "PublicDescription": "Counts Core crystal clock cycles when current thread is unhalted and the other thread is halted. Available PDIST counters: 0", + "PublicDescription": "Counts Core crystal clock cycles when current thread is unhalted and the other thread is halted.", "SampleAfterValue": "25003", "UMask": "0x2", "Unit": "cpu_core" @@ -550,7 +557,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.PAUSE", - "PublicDescription": "CPU_CLK_UNHALTED.PAUSE Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x40", "Unit": "cpu_core" @@ -562,7 +568,6 @@ "EdgeDetect": "1", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.PAUSE_INST", - "PublicDescription": "CPU_CLK_UNHALTED.PAUSE_INST Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x40", "Unit": "cpu_core" @@ -572,7 +577,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x3c", "EventName": "CPU_CLK_UNHALTED.REF_DISTRIBUTED", - "PublicDescription": "This event distributes Core crystal clock cycle counts between active hyperthreads, i.e., those in C0 sleep-state. A hyperthread becomes inactive when it executes the HLT or MWAIT instructions. If one thread is active in a core, all counts are attributed to this hyperthread. To obtain the full count when the Core is active, sum the counts from each hyperthread. Available PDIST counters: 0", + "PublicDescription": "This event distributes Core crystal clock cycle counts between active hyperthreads, i.e., those in C0 sleep-state. A hyperthread becomes inactive when it executes the HLT or MWAIT instructions. If one thread is active in a core, all counts are attributed to this hyperthread. To obtain the full count when the Core is active, sum the counts from each hyperthread.", "SampleAfterValue": "2000003", "UMask": "0x8", "Unit": "cpu_core" @@ -589,7 +594,7 @@ "BriefDescription": "Reference cycles when the core is not in halt state.", "Counter": "Fixed counter 2", "EventName": "CPU_CLK_UNHALTED.REF_TSC", - "PublicDescription": "Counts the number of reference cycles when the core is not in a halt state. The core enters the halt state when it is running the HLT instruction or the MWAIT instruction. This event is not affected by core frequency changes (for example, P states, TM2 transitions) but has the same incrementing frequency as the time stamp counter. This event can approximate elapsed time while the core was not in a halt state. It is counted on a dedicated fixed counter, leaving the eight programmable counters available for other events. Note: On all current platforms this event stops counting during 'throttling (TM)' states duty off periods the processor is 'halted'. The counter update is done at a lower clock rate then the core clock the overflow status bit for this counter may appear 'sticky'. After the counter has overflowed and software clears the overflow status bit and resets the counter to less than MAX. The reset value to the counter is not clocked immediately so the overflow status bit will flip 'high (1)' and generate another PMI (if enabled) after which the reset value gets clocked into the counter. Therefore, software will get the interrupt, read the overflow status bit '1 for bit 34 while the counter value is less than MAX. Software should ignore this case. Available PDIST counters: 0", + "PublicDescription": "Counts the number of reference cycles when the core is not in a halt state. The core enters the halt state when it is running the HLT instruction or the MWAIT instruction. This event is not affected by core frequency changes (for example, P states, TM2 transitions) but has the same incrementing frequency as the time stamp counter. This event can approximate elapsed time while the core was not in a halt state. It is counted on a dedicated fixed counter, leaving the eight programmable counters available for other events. Note: On all current platforms this event stops counting during 'throttling (TM)' states duty off periods the processor is 'halted'. The counter update is done at a lower clock rate then the core clock the overflow status bit for this counter may appear 'sticky'. After the counter has overflowed and software clears the overflow status bit and resets the counter to less than MAX. The reset value to the counter is not clocked immediately so the overflow status bit will flip 'high (1)' and generate another PMI (if enabled) after which the reset value gets clocked into the counter. Therefore, software will get the interrupt, read the overflow status bit '1 for bit 34 while the counter value is less than MAX. Software should ignore this case.", "SampleAfterValue": "2000003", "UMask": "0x3", "Unit": "cpu_core" @@ -609,7 +614,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x3c", "EventName": "CPU_CLK_UNHALTED.REF_TSC_P", - "PublicDescription": "Counts the number of reference cycles when the core is not in a halt state. The core enters the halt state when it is running the HLT instruction or the MWAIT instruction. This event is not affected by core frequency changes (for example, P states, TM2 transitions) but has the same incrementing frequency as the time stamp counter. This event can approximate elapsed time while the core was not in a halt state. It is counted on a dedicated fixed counter, leaving the four (eight when Hyperthreading is disabled) programmable counters available for other events. Note: On all current platforms this event stops counting during 'throttling (TM)' states duty off periods the processor is 'halted'. The counter update is done at a lower clock rate then the core clock the overflow status bit for this counter may appear 'sticky'. After the counter has overflowed and software clears the overflow status bit and resets the counter to less than MAX. The reset value to the counter is not clocked immediately so the overflow status bit will flip 'high (1)' and generate another PMI (if enabled) after which the reset value gets clocked into the counter. Therefore, software will get the interrupt, read the overflow status bit '1 for bit 34 while the counter value is less than MAX. Software should ignore this case. Available PDIST counters: 0", + "PublicDescription": "Counts the number of reference cycles when the core is not in a halt state. The core enters the halt state when it is running the HLT instruction or the MWAIT instruction. This event is not affected by core frequency changes (for example, P states, TM2 transitions) but has the same incrementing frequency as the time stamp counter. This event can approximate elapsed time while the core was not in a halt state. It is counted on a dedicated fixed counter, leaving the four (eight when Hyperthreading is disabled) programmable counters available for other events. Note: On all current platforms this event stops counting during 'throttling (TM)' states duty off periods the processor is 'halted'. The counter update is done at a lower clock rate then the core clock the overflow status bit for this counter may appear 'sticky'. After the counter has overflowed and software clears the overflow status bit and resets the counter to less than MAX. The reset value to the counter is not clocked immediately so the overflow status bit will flip 'high (1)' and generate another PMI (if enabled) after which the reset value gets clocked into the counter. Therefore, software will get the interrupt, read the overflow status bit '1 for bit 34 while the counter value is less than MAX. Software should ignore this case.", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -626,7 +631,7 @@ "BriefDescription": "Core cycles when the thread is not in halt state", "Counter": "Fixed counter 1", "EventName": "CPU_CLK_UNHALTED.THREAD", - "PublicDescription": "Counts the number of core cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. This event is a component in many key event ratios. The core frequency may change from time to time due to transitions associated with Enhanced Intel SpeedStep Technology or TM2. For this reason this event may have a changing ratio with regards to time. When the core frequency is constant, this event can approximate elapsed time while the core was not in the halt state. It is counted on a dedicated fixed counter, leaving the eight programmable counters available for other events. Available PDIST counters: 0", + "PublicDescription": "Counts the number of core cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. This event is a component in many key event ratios. The core frequency may change from time to time due to transitions associated with Enhanced Intel SpeedStep Technology or TM2. For this reason this event may have a changing ratio with regards to time. When the core frequency is constant, this event can approximate elapsed time while the core was not in the halt state. It is counted on a dedicated fixed counter, leaving the eight programmable counters available for other events.", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -644,7 +649,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x3c", "EventName": "CPU_CLK_UNHALTED.THREAD_P", - "PublicDescription": "This is an architectural event that counts the number of thread cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. The core frequency may change from time to time due to power or thermal throttling. For this reason, this event may have a changing ratio with regards to wall clock time. Available PDIST counters: 0", + "PublicDescription": "This is an architectural event that counts the number of thread cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. The core frequency may change from time to time due to power or thermal throttling. For this reason, this event may have a changing ratio with regards to wall clock time.", "SampleAfterValue": "2000003", "Unit": "cpu_core" }, @@ -654,7 +659,6 @@ "CounterMask": "8", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.CYCLES_L1D_MISS", - "PublicDescription": "Cycles while L1 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8", "Unit": "cpu_core" @@ -665,7 +669,6 @@ "CounterMask": "1", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.CYCLES_L2_MISS", - "PublicDescription": "Cycles while L2 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -676,7 +679,6 @@ "CounterMask": "16", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.CYCLES_MEM_ANY", - "PublicDescription": "Cycles while memory subsystem has an outstanding load. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x10", "Unit": "cpu_core" @@ -687,7 +689,6 @@ "CounterMask": "12", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.STALLS_L1D_MISS", - "PublicDescription": "Execution stalls while L1 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0xc", "Unit": "cpu_core" @@ -698,7 +699,6 @@ "CounterMask": "5", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.STALLS_L2_MISS", - "PublicDescription": "Execution stalls while L2 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x5", "Unit": "cpu_core" @@ -709,7 +709,6 @@ "CounterMask": "4", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.STALLS_TOTAL", - "PublicDescription": "Total execution stalls. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x4", "Unit": "cpu_core" @@ -719,7 +718,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.1_PORTS_UTIL", - "PublicDescription": "Counts cycles during which a total of 1 uop was executed on all ports and Reservation Station (RS) was not empty. Available PDIST counters: 0", + "PublicDescription": "Counts cycles during which a total of 1 uop was executed on all ports and Reservation Station (RS) was not empty.", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -729,7 +728,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.2_3_PORTS_UTIL", - "PublicDescription": "Cycles total of 2 or 3 uops are executed on all ports and Reservation Station (RS) was not empty. Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0xc", "Unit": "cpu_core" @@ -739,7 +737,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.2_PORTS_UTIL", - "PublicDescription": "Counts cycles during which a total of 2 uops were executed on all ports and Reservation Station (RS) was not empty. Available PDIST counters: 0", + "PublicDescription": "Counts cycles during which a total of 2 uops were executed on all ports and Reservation Station (RS) was not empty.", "SampleAfterValue": "2000003", "UMask": "0x4", "Unit": "cpu_core" @@ -749,7 +747,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.3_PORTS_UTIL", - "PublicDescription": "Cycles total of 3 uops are executed on all ports and Reservation Station (RS) was not empty. Available PDIST counters: 0", + "PublicDescription": "Cycles total of 3 uops are executed on all ports and Reservation Station (RS) was not empty.", "SampleAfterValue": "2000003", "UMask": "0x8", "Unit": "cpu_core" @@ -759,7 +757,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.4_PORTS_UTIL", - "PublicDescription": "Cycles total of 4 uops are executed on all ports and Reservation Station (RS) was not empty. Available PDIST counters: 0", + "PublicDescription": "Cycles total of 4 uops are executed on all ports and Reservation Station (RS) was not empty.", "SampleAfterValue": "2000003", "UMask": "0x10", "Unit": "cpu_core" @@ -770,7 +768,6 @@ "CounterMask": "5", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.BOUND_ON_LOADS", - "PublicDescription": "Execution stalls while memory subsystem has an outstanding load. Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x21", "Unit": "cpu_core" @@ -781,7 +778,7 @@ "CounterMask": "2", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.BOUND_ON_STORES", - "PublicDescription": "Counts cycles where the Store Buffer was full and no loads caused an execution stall. Available PDIST counters: 0", + "PublicDescription": "Counts cycles where the Store Buffer was full and no loads caused an execution stall.", "SampleAfterValue": "1000003", "UMask": "0x40", "Unit": "cpu_core" @@ -791,7 +788,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.EXE_BOUND_0_PORTS", - "PublicDescription": "Number of cycles total of 0 uops executed on all ports, Reservation Station (RS) was not empty, the Store Buffer (SB) was not full and there was no outstanding load. Available PDIST counters: 0", + "PublicDescription": "Number of cycles total of 0 uops executed on all ports, Reservation Station (RS) was not empty, the Store Buffer (SB) was not full and there was no outstanding load.", "SampleAfterValue": "1000003", "UMask": "0x80", "Unit": "cpu_core" @@ -801,7 +798,7 @@ "Counter": "0,1,2,3", "EventCode": "0x75", "EventName": "INST_DECODED.DECODERS", - "PublicDescription": "Number of decoders utilized in a cycle when the MITE (legacy decode pipeline) fetches instructions. Available PDIST counters: 0", + "PublicDescription": "Number of decoders utilized in a cycle when the MITE (legacy decode pipeline) fetches instructions.", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -846,7 +843,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc0", "EventName": "INST_RETIRED.MACRO_FUSED", - "PublicDescription": "INST_RETIRED.MACRO_FUSED Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x10", "Unit": "cpu_core" @@ -856,7 +852,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc0", "EventName": "INST_RETIRED.NOP", - "PublicDescription": "Counts all retired NOP or ENDBR32/64 or PREFETCHIT0/1 instructions Available PDIST counters: 0", + "PublicDescription": "Counts all retired NOP or ENDBR32/64 or PREFETCHIT0/1 instructions", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -875,7 +871,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc0", "EventName": "INST_RETIRED.REP_ITERATION", - "PublicDescription": "Number of iterations of Repeat (REP) string retired instructions such as MOVS, CMPS, and SCAS. Each has a byte, word, and doubleword version and string instructions can be repeated using a repetition prefix, REP, that allows their architectural execution to be repeated a number of times as specified by the RCX register. Note the number of iterations is implementation-dependent. Available PDIST counters: 0", + "PublicDescription": "Number of iterations of Repeat (REP) string retired instructions such as MOVS, CMPS, and SCAS. Each has a byte, word, and doubleword version and string instructions can be repeated using a repetition prefix, REP, that allows their architectural execution to be repeated a number of times as specified by the RCX register. Note the number of iterations is implementation-dependent.", "SampleAfterValue": "2000003", "UMask": "0x8", "Unit": "cpu_core" @@ -887,7 +883,7 @@ "EdgeDetect": "1", "EventCode": "0xad", "EventName": "INT_MISC.CLEARS_COUNT", - "PublicDescription": "Counts the number of speculative clears due to any type of branch misprediction or machine clears Available PDIST counters: 0", + "PublicDescription": "Counts the number of speculative clears due to any type of branch misprediction or machine clears", "SampleAfterValue": "500009", "UMask": "0x1", "Unit": "cpu_core" @@ -897,7 +893,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xad", "EventName": "INT_MISC.CLEAR_RESTEER_CYCLES", - "PublicDescription": "Cycles after recovery from a branch misprediction or machine clear till the first uop is issued from the resteered path. Available PDIST counters: 0", + "PublicDescription": "Cycles after recovery from a branch misprediction or machine clear till the first uop is issued from the resteered path.", "SampleAfterValue": "500009", "UMask": "0x80", "Unit": "cpu_core" @@ -907,7 +903,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xad", "EventName": "INT_MISC.RECOVERY_CYCLES", - "PublicDescription": "Counts core cycles when the Resource allocator was stalled due to recovery from an earlier branch misprediction or machine clear event. Available PDIST counters: 0", + "PublicDescription": "Counts core cycles when the Resource allocator was stalled due to recovery from an earlier branch misprediction or machine clear event.", "SampleAfterValue": "500009", "UMask": "0x1", "Unit": "cpu_core" @@ -919,7 +915,6 @@ "EventName": "INT_MISC.UNKNOWN_BRANCH_CYCLES", "MSRIndex": "0x3F7", "MSRValue": "0x7", - "PublicDescription": "Bubble cycles of BAClear (Unknown Branch). Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x40", "Unit": "cpu_core" @@ -929,7 +924,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xad", "EventName": "INT_MISC.UOP_DROPPING", - "PublicDescription": "Estimated number of Top-down Microarchitecture Analysis slots that got dropped due to non front-end reasons Available PDIST counters: 0", + "PublicDescription": "Estimated number of Top-down Microarchitecture Analysis slots that got dropped due to non front-end reasons", "SampleAfterValue": "1000003", "UMask": "0x10", "Unit": "cpu_core" @@ -939,7 +934,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.128BIT", - "PublicDescription": "INT_VEC_RETIRED.128BIT Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x13", "Unit": "cpu_core" @@ -949,7 +943,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.256BIT", - "PublicDescription": "INT_VEC_RETIRED.256BIT Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0xac", "Unit": "cpu_core" @@ -959,7 +952,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.ADD_128", - "PublicDescription": "Number of retired integer ADD/SUB (regular or horizontal), SAD 128-bit vector instructions. Available PDIST counters: 0", + "PublicDescription": "Number of retired integer ADD/SUB (regular or horizontal), SAD 128-bit vector instructions.", "SampleAfterValue": "1000003", "UMask": "0x3", "Unit": "cpu_core" @@ -969,7 +962,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.ADD_256", - "PublicDescription": "Number of retired integer ADD/SUB (regular or horizontal), SAD 256-bit vector instructions. Available PDIST counters: 0", + "PublicDescription": "Number of retired integer ADD/SUB (regular or horizontal), SAD 256-bit vector instructions.", "SampleAfterValue": "1000003", "UMask": "0xc", "Unit": "cpu_core" @@ -979,7 +972,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.MUL_256", - "PublicDescription": "INT_VEC_RETIRED.MUL_256 Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x80", "Unit": "cpu_core" @@ -989,7 +981,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.SHUFFLES", - "PublicDescription": "INT_VEC_RETIRED.SHUFFLES Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x40", "Unit": "cpu_core" @@ -999,7 +990,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.VNNI_128", - "PublicDescription": "INT_VEC_RETIRED.VNNI_128 Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x10", "Unit": "cpu_core" @@ -1009,7 +999,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.VNNI_256", - "PublicDescription": "INT_VEC_RETIRED.VNNI_256 Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x20", "Unit": "cpu_core" @@ -1028,7 +1017,7 @@ "Counter": "0,1,2,3", "EventCode": "0x03", "EventName": "LD_BLOCKS.ADDRESS_ALIAS", - "PublicDescription": "Counts the number of times a load got blocked due to false dependencies in MOB due to partial compare on address. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times a load got blocked due to false dependencies in MOB due to partial compare on address.", "SampleAfterValue": "100003", "UMask": "0x4", "Unit": "cpu_core" @@ -1047,7 +1036,7 @@ "Counter": "0,1,2,3", "EventCode": "0x03", "EventName": "LD_BLOCKS.NO_SR", - "PublicDescription": "Counts the number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use.", "SampleAfterValue": "100003", "UMask": "0x88", "Unit": "cpu_core" @@ -1066,7 +1055,7 @@ "Counter": "0,1,2,3", "EventCode": "0x03", "EventName": "LD_BLOCKS.STORE_FORWARD", - "PublicDescription": "Counts the number of times where store forwarding was prevented for a load operation. The most common case is a load blocked due to the address of memory access (partially) overlapping with a preceding uncompleted store. Note: See the table of not supported store forwards in the Optimization Guide. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times where store forwarding was prevented for a load operation. The most common case is a load blocked due to the address of memory access (partially) overlapping with a preceding uncompleted store. Note: See the table of not supported store forwards in the Optimization Guide.", "SampleAfterValue": "100003", "UMask": "0x82", "Unit": "cpu_core" @@ -1076,7 +1065,7 @@ "Counter": "0,1,2,3", "EventCode": "0x4c", "EventName": "LOAD_HIT_PREFETCH.SWPF", - "PublicDescription": "Counts all software-prefetch load dispatches that hit the fill buffer (FB) allocated for the software prefetch. It can also be incremented by some lock instructions. So it should only be used with profiling so that the locks can be excluded by ASM (Assembly File) inspection of the nearby instructions. Available PDIST counters: 0", + "PublicDescription": "Counts all software-prefetch load dispatches that hit the fill buffer (FB) allocated for the software prefetch. It can also be incremented by some lock instructions. So it should only be used with profiling so that the locks can be excluded by ASM (Assembly File) inspection of the nearby instructions.", "SampleAfterValue": "100003", "UMask": "0x1", "Unit": "cpu_core" @@ -1087,7 +1076,7 @@ "CounterMask": "1", "EventCode": "0xa8", "EventName": "LSD.CYCLES_ACTIVE", - "PublicDescription": "Counts the cycles when at least one uop is delivered by the LSD (Loop-stream detector). Available PDIST counters: 0", + "PublicDescription": "Counts the cycles when at least one uop is delivered by the LSD (Loop-stream detector).", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1098,7 +1087,7 @@ "CounterMask": "6", "EventCode": "0xa8", "EventName": "LSD.CYCLES_OK", - "PublicDescription": "Counts the cycles when optimal number of uops is delivered by the LSD (Loop-stream detector). Available PDIST counters: 0", + "PublicDescription": "Counts the cycles when optimal number of uops is delivered by the LSD (Loop-stream detector).", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1108,7 +1097,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa8", "EventName": "LSD.UOPS", - "PublicDescription": "Counts the number of uops delivered to the back-end by the LSD(Loop Stream Detector). Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops delivered to the back-end by the LSD(Loop Stream Detector).", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1128,7 +1117,7 @@ "EdgeDetect": "1", "EventCode": "0xc3", "EventName": "MACHINE_CLEARS.COUNT", - "PublicDescription": "Counts the number of machine clears (nukes) of any type. Available PDIST counters: 0", + "PublicDescription": "Counts the number of machine clears (nukes) of any type.", "SampleAfterValue": "100003", "UMask": "0x1", "Unit": "cpu_core" @@ -1184,7 +1173,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc3", "EventName": "MACHINE_CLEARS.SMC", - "PublicDescription": "Counts self-modifying code (SMC) detected, which causes a machine clear. Available PDIST counters: 0", + "PublicDescription": "Counts self-modifying code (SMC) detected, which causes a machine clear.", "SampleAfterValue": "100003", "UMask": "0x4", "Unit": "cpu_core" @@ -1194,7 +1183,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe0", "EventName": "MISC2_RETIRED.LFENCE", - "PublicDescription": "number of LFENCE retired instructions Available PDIST counters: 0", + "PublicDescription": "number of LFENCE retired instructions", "SampleAfterValue": "400009", "UMask": "0x20", "Unit": "cpu_core" @@ -1213,7 +1202,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xcc", "EventName": "MISC_RETIRED.LBR_INSERTS", - "PublicDescription": "Increments when an entry is added to the Last Branch Record (LBR) array (or removed from the array in case of RETURNs in call stack mode). The event requires LBR enable via IA32_DEBUGCTL MSR and branch type selection via MSR_LBR_SELECT. Available PDIST counters: 0", + "PublicDescription": "Increments when an entry is added to the Last Branch Record (LBR) array (or removed from the array in case of RETURNs in call stack mode). The event requires LBR enable via IA32_DEBUGCTL MSR and branch type selection via MSR_LBR_SELECT.", "SampleAfterValue": "100003", "UMask": "0x20", "Unit": "cpu_core" @@ -1223,7 +1212,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa2", "EventName": "RESOURCE_STALLS.SB", - "PublicDescription": "Counts allocation stall cycles caused by the store buffer (SB) being full. This counts cycles that the pipeline back-end blocked uop delivery from the front-end. Available PDIST counters: 0", + "PublicDescription": "Counts allocation stall cycles caused by the store buffer (SB) being full. This counts cycles that the pipeline back-end blocked uop delivery from the front-end.", "SampleAfterValue": "100003", "UMask": "0x8", "Unit": "cpu_core" @@ -1233,7 +1222,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa2", "EventName": "RESOURCE_STALLS.SCOREBOARD", - "PublicDescription": "Counts cycles where the pipeline is stalled due to serializing operations. Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x2", "Unit": "cpu_core" @@ -1243,7 +1231,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa5", "EventName": "RS.EMPTY", - "PublicDescription": "Counts cycles during which the reservation station (RS) is empty for this logical processor. This is usually caused when the front-end pipeline runs into starvation periods (e.g. branch mispredictions or i-cache misses) Available PDIST counters: 0", + "PublicDescription": "Counts cycles during which the reservation station (RS) is empty for this logical processor. This is usually caused when the front-end pipeline runs into starvation periods (e.g. branch mispredictions or i-cache misses)", "SampleAfterValue": "1000003", "UMask": "0x7", "Unit": "cpu_core" @@ -1256,7 +1244,7 @@ "EventCode": "0xa5", "EventName": "RS.EMPTY_COUNT", "Invert": "1", - "PublicDescription": "Counts end of periods where the Reservation Station (RS) was empty. Could be useful to closely sample on front-end latency issues (see the FRONTEND_RETIRED event of designated precise events) Available PDIST counters: 0", + "PublicDescription": "Counts end of periods where the Reservation Station (RS) was empty. Could be useful to closely sample on front-end latency issues (see the FRONTEND_RETIRED event of designated precise events)", "SampleAfterValue": "100003", "UMask": "0x7", "Unit": "cpu_core" @@ -1266,7 +1254,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa5", "EventName": "RS.EMPTY_RESOURCE", - "PublicDescription": "Cycles when RS was empty and a resource allocation stall is asserted Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1285,7 +1272,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa4", "EventName": "TOPDOWN.BACKEND_BOUND_SLOTS", - "PublicDescription": "This event counts a subset of the Topdown Slots event that were not consumed by the back-end pipeline due to lack of back-end resources, as a result of memory subsystem delays, execution units limitations, or other conditions. The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core, in processors that support Intel Hyper-Threading Technology. Software can use this event as the numerator for the Backend Bound metric (or top-level category) of the Top-down Microarchitecture Analysis method. Available PDIST counters: 0", + "PublicDescription": "This event counts a subset of the Topdown Slots event that were not consumed by the back-end pipeline due to lack of back-end resources, as a result of memory subsystem delays, execution units limitations, or other conditions. The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core, in processors that support Intel Hyper-Threading Technology. Software can use this event as the numerator for the Backend Bound metric (or top-level category) of the Top-down Microarchitecture Analysis method.", "SampleAfterValue": "10000003", "UMask": "0x2", "Unit": "cpu_core" @@ -1295,7 +1282,7 @@ "Counter": "0", "EventCode": "0xa4", "EventName": "TOPDOWN.BAD_SPEC_SLOTS", - "PublicDescription": "Number of slots of TMA method that were wasted due to incorrect speculation. It covers all types of control-flow or data-related mis-speculations. Available PDIST counters: 0", + "PublicDescription": "Number of slots of TMA method that were wasted due to incorrect speculation. It covers all types of control-flow or data-related mis-speculations.", "SampleAfterValue": "10000003", "UMask": "0x4", "Unit": "cpu_core" @@ -1305,7 +1292,7 @@ "Counter": "0", "EventCode": "0xa4", "EventName": "TOPDOWN.BR_MISPREDICT_SLOTS", - "PublicDescription": "Number of TMA slots that were wasted due to incorrect speculation by (any type of) branch mispredictions. This event estimates number of speculative operations that were issued but not retired as well as the out-of-order engine recovery past a branch misprediction. Available PDIST counters: 0", + "PublicDescription": "Number of TMA slots that were wasted due to incorrect speculation by (any type of) branch mispredictions. This event estimates number of speculative operations that were issued but not retired as well as the out-of-order engine recovery past a branch misprediction.", "SampleAfterValue": "10000003", "UMask": "0x8", "Unit": "cpu_core" @@ -1315,7 +1302,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa4", "EventName": "TOPDOWN.MEMORY_BOUND_SLOTS", - "PublicDescription": "TOPDOWN.MEMORY_BOUND_SLOTS Available PDIST counters: 0", "SampleAfterValue": "10000003", "UMask": "0x10", "Unit": "cpu_core" @@ -1324,7 +1310,7 @@ "BriefDescription": "TMA slots available for an unhalted logical processor. Fixed counter - architectural event", "Counter": "Fixed counter 3", "EventName": "TOPDOWN.SLOTS", - "PublicDescription": "Number of available slots for an unhalted logical processor. The event increments by machine-width of the narrowest pipeline as employed by the Top-down Microarchitecture Analysis method (TMA). The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core. Software can use this event as the denominator for the top-level metrics of the TMA method. This architectural event is counted on a designated fixed counter (Fixed Counter 3). Available PDIST counters: 0", + "PublicDescription": "Number of available slots for an unhalted logical processor. The event increments by machine-width of the narrowest pipeline as employed by the Top-down Microarchitecture Analysis method (TMA). The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core. Software can use this event as the denominator for the top-level metrics of the TMA method. This architectural event is counted on a designated fixed counter (Fixed Counter 3).", "SampleAfterValue": "10000003", "UMask": "0x4", "Unit": "cpu_core" @@ -1334,7 +1320,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa4", "EventName": "TOPDOWN.SLOTS_P", - "PublicDescription": "Counts the number of available slots for an unhalted logical processor. The event increments by machine-width of the narrowest pipeline as employed by the Top-down Microarchitecture Analysis method. The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core. Available PDIST counters: 0", + "PublicDescription": "Counts the number of available slots for an unhalted logical processor. The event increments by machine-width of the narrowest pipeline as employed by the Top-down Microarchitecture Analysis method. The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core.", "SampleAfterValue": "10000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1591,7 +1577,7 @@ "Counter": "0,1,2,3", "EventCode": "0x76", "EventName": "UOPS_DECODED.DEC0_UOPS", - "PublicDescription": "This event counts the number of not dec-by-all uops decoded by decoder 0. Available PDIST counters: 0", + "PublicDescription": "This event counts the number of not dec-by-all uops decoded by decoder 0.", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1601,7 +1587,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_0", - "PublicDescription": "Number of uops dispatch to execution port 0. Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution port 0.", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1611,7 +1597,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_1", - "PublicDescription": "Number of uops dispatch to execution port 1. Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution port 1.", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -1621,7 +1607,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_2_3_10", - "PublicDescription": "Number of uops dispatch to execution ports 2, 3 and 10 Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution ports 2, 3 and 10", "SampleAfterValue": "2000003", "UMask": "0x4", "Unit": "cpu_core" @@ -1631,7 +1617,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_4_9", - "PublicDescription": "Number of uops dispatch to execution ports 4 and 9 Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution ports 4 and 9", "SampleAfterValue": "2000003", "UMask": "0x10", "Unit": "cpu_core" @@ -1641,7 +1627,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_5_11", - "PublicDescription": "Number of uops dispatch to execution ports 5 and 11 Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution ports 5 and 11", "SampleAfterValue": "2000003", "UMask": "0x20", "Unit": "cpu_core" @@ -1651,7 +1637,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_6", - "PublicDescription": "Number of uops dispatch to execution port 6. Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution port 6.", "SampleAfterValue": "2000003", "UMask": "0x40", "Unit": "cpu_core" @@ -1661,7 +1647,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_7_8", - "PublicDescription": "Number of uops dispatch to execution ports 7 and 8. Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution ports 7 and 8.", "SampleAfterValue": "2000003", "UMask": "0x80", "Unit": "cpu_core" @@ -1671,7 +1657,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CORE", - "PublicDescription": "Counts the number of uops executed from any thread. Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops executed from any thread.", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -1682,7 +1668,7 @@ "CounterMask": "1", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_1", - "PublicDescription": "Counts cycles when at least 1 micro-op is executed from any thread on physical core. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least 1 micro-op is executed from any thread on physical core.", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -1693,7 +1679,7 @@ "CounterMask": "2", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_2", - "PublicDescription": "Counts cycles when at least 2 micro-ops are executed from any thread on physical core. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least 2 micro-ops are executed from any thread on physical core.", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -1704,7 +1690,7 @@ "CounterMask": "3", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_3", - "PublicDescription": "Counts cycles when at least 3 micro-ops are executed from any thread on physical core. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least 3 micro-ops are executed from any thread on physical core.", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -1715,7 +1701,7 @@ "CounterMask": "4", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_4", - "PublicDescription": "Counts cycles when at least 4 micro-ops are executed from any thread on physical core. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least 4 micro-ops are executed from any thread on physical core.", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -1726,7 +1712,7 @@ "CounterMask": "1", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CYCLES_GE_1", - "PublicDescription": "Cycles where at least 1 uop was executed per-thread. Available PDIST counters: 0", + "PublicDescription": "Cycles where at least 1 uop was executed per-thread.", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1737,7 +1723,7 @@ "CounterMask": "2", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CYCLES_GE_2", - "PublicDescription": "Cycles where at least 2 uops were executed per-thread. Available PDIST counters: 0", + "PublicDescription": "Cycles where at least 2 uops were executed per-thread.", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1748,7 +1734,7 @@ "CounterMask": "3", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CYCLES_GE_3", - "PublicDescription": "Cycles where at least 3 uops were executed per-thread. Available PDIST counters: 0", + "PublicDescription": "Cycles where at least 3 uops were executed per-thread.", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1759,7 +1745,7 @@ "CounterMask": "4", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CYCLES_GE_4", - "PublicDescription": "Cycles where at least 4 uops were executed per-thread. Available PDIST counters: 0", + "PublicDescription": "Cycles where at least 4 uops were executed per-thread.", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1771,7 +1757,7 @@ "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.STALLS", "Invert": "1", - "PublicDescription": "Counts cycles during which no uops were dispatched from the Reservation Station (RS) per thread. Available PDIST counters: 0", + "PublicDescription": "Counts cycles during which no uops were dispatched from the Reservation Station (RS) per thread.", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1781,7 +1767,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.THREAD", - "PublicDescription": "Counts the number of uops to be executed per-thread each cycle. Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1791,7 +1776,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.X87", - "PublicDescription": "Counts the number of x87 uops executed. Available PDIST counters: 0", + "PublicDescription": "Counts the number of x87 uops executed.", "SampleAfterValue": "2000003", "UMask": "0x10", "Unit": "cpu_core" @@ -1810,7 +1795,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xae", "EventName": "UOPS_ISSUED.ANY", - "PublicDescription": "Counts the number of uops that the Resource Allocation Table (RAT) issues to the Reservation Station (RS). Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops that the Resource Allocation Table (RAT) issues to the Reservation Station (RS).", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1821,7 +1806,6 @@ "CounterMask": "1", "EventCode": "0xae", "EventName": "UOPS_ISSUED.CYCLES", - "PublicDescription": "UOPS_ISSUED.CYCLES Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1840,7 +1824,7 @@ "CounterMask": "1", "EventCode": "0xc2", "EventName": "UOPS_RETIRED.CYCLES", - "PublicDescription": "Counts cycles where at least one uop has retired. Available PDIST counters: 0", + "PublicDescription": "Counts cycles where at least one uop has retired.", "SampleAfterValue": "1000003", "UMask": "0x2", "Unit": "cpu_core" @@ -1850,7 +1834,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc2", "EventName": "UOPS_RETIRED.HEAVY", - "PublicDescription": "Counts the number of retired micro-operations (uops) except the last uop of each instruction. An instruction that is decoded into less than two uops does not contribute to the count. Available PDIST counters: 0", + "PublicDescription": "Counts the number of retired micro-operations (uops) except the last uop of each instruction. An instruction that is decoded into less than two uops does not contribute to the count.", "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_core" @@ -1880,7 +1864,6 @@ "EventName": "UOPS_RETIRED.MS", "MSRIndex": "0x3F7", "MSRValue": "0x8", - "PublicDescription": "UOPS_RETIRED.MS Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x4", "Unit": "cpu_core" @@ -1890,7 +1873,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc2", "EventName": "UOPS_RETIRED.SLOTS", - "PublicDescription": "This event counts a subset of the Topdown Slots event that are utilized by operations that eventually get retired (committed) by the processor pipeline. Usually, this event positively correlates with higher performance for example, as measured by the instructions-per-cycle metric. Software can use this event as the numerator for the Retiring metric (or top-level category) of the Top-down Microarchitecture Analysis method. Available PDIST counters: 0", + "PublicDescription": "This event counts a subset of the Topdown Slots event that are utilized by operations that eventually get retired (committed) by the processor pipeline. Usually, this event positively correlates with higher performance for example, as measured by the instructions-per-cycle metric. Software can use this event as the numerator for the Retiring metric (or top-level category) of the Top-down Microarchitecture Analysis method.", "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" @@ -1902,7 +1885,7 @@ "EventCode": "0xc2", "EventName": "UOPS_RETIRED.STALLS", "Invert": "1", - "PublicDescription": "This event counts cycles without actually retired uops. Available PDIST counters: 0", + "PublicDescription": "This event counts cycles without actually retired uops.", "SampleAfterValue": "1000003", "UMask": "0x2", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/meteorlake/virtual-memory.json b/tools/perf/pmu-events/arch/x86/meteorlake/virtual-memory.json index f300129e9e2d..305b96b26a4e 100644 --- a/tools/perf/pmu-events/arch/x86/meteorlake/virtual-memory.json +++ b/tools/perf/pmu-events/arch/x86/meteorlake/virtual-memory.json @@ -13,7 +13,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.STLB_HIT", - "PublicDescription": "Counts loads that miss the DTLB (Data TLB) and hit the STLB (Second level TLB). Available PDIST counters: 0", + "PublicDescription": "Counts loads that miss the DTLB (Data TLB) and hit the STLB (Second level TLB).", "SampleAfterValue": "100003", "UMask": "0x20", "Unit": "cpu_core" @@ -24,7 +24,7 @@ "CounterMask": "1", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_ACTIVE", - "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a demand load. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a demand load.", "SampleAfterValue": "100003", "UMask": "0x10", "Unit": "cpu_core" @@ -43,7 +43,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED", - "PublicDescription": "Counts completed page walks (all page sizes) caused by demand data loads. This implies it missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (all page sizes) caused by demand data loads. This implies it missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0xe", "Unit": "cpu_core" @@ -53,7 +53,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_1G", - "PublicDescription": "Counts completed page walks (1G sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (1G sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x8", "Unit": "cpu_core" @@ -73,7 +73,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M", - "PublicDescription": "Counts completed page walks (2M/4M sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (2M/4M sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x4", "Unit": "cpu_core" @@ -93,7 +93,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_4K", - "PublicDescription": "Counts completed page walks (4K sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (4K sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x2", "Unit": "cpu_core" @@ -113,7 +113,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_PENDING", - "PublicDescription": "Counts the number of page walks outstanding for a demand load in the PMH (Page Miss Handler) each cycle. Available PDIST counters: 0", + "PublicDescription": "Counts the number of page walks outstanding for a demand load in the PMH (Page Miss Handler) each cycle.", "SampleAfterValue": "100003", "UMask": "0x10", "Unit": "cpu_core" @@ -132,7 +132,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.STLB_HIT", - "PublicDescription": "Counts stores that miss the DTLB (Data TLB) and hit the STLB (2nd Level TLB). Available PDIST counters: 0", + "PublicDescription": "Counts stores that miss the DTLB (Data TLB) and hit the STLB (2nd Level TLB).", "SampleAfterValue": "100003", "UMask": "0x20", "Unit": "cpu_core" @@ -143,7 +143,7 @@ "CounterMask": "1", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_ACTIVE", - "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a store. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a store.", "SampleAfterValue": "100003", "UMask": "0x10", "Unit": "cpu_core" @@ -162,7 +162,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED", - "PublicDescription": "Counts completed page walks (all page sizes) caused by demand data stores. This implies it missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (all page sizes) caused by demand data stores. This implies it missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0xe", "Unit": "cpu_core" @@ -172,7 +172,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_1G", - "PublicDescription": "Counts completed page walks (1G sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (1G sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x8", "Unit": "cpu_core" @@ -192,7 +192,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M", - "PublicDescription": "Counts completed page walks (2M/4M sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (2M/4M sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x4", "Unit": "cpu_core" @@ -212,7 +212,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_4K", - "PublicDescription": "Counts completed page walks (4K sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (4K sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x2", "Unit": "cpu_core" @@ -232,7 +232,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_PENDING", - "PublicDescription": "Counts the number of page walks outstanding for a store in the PMH (Page Miss Handler) each cycle. Available PDIST counters: 0", + "PublicDescription": "Counts the number of page walks outstanding for a store in the PMH (Page Miss Handler) each cycle.", "SampleAfterValue": "100003", "UMask": "0x10", "Unit": "cpu_core" @@ -260,7 +260,7 @@ "Counter": "0,1,2,3", "EventCode": "0x11", "EventName": "ITLB_MISSES.STLB_HIT", - "PublicDescription": "Counts instruction fetch requests that miss the ITLB (Instruction TLB) and hit the STLB (Second-level TLB). Available PDIST counters: 0", + "PublicDescription": "Counts instruction fetch requests that miss the ITLB (Instruction TLB) and hit the STLB (Second-level TLB).", "SampleAfterValue": "100003", "UMask": "0x20", "Unit": "cpu_core" @@ -271,7 +271,7 @@ "CounterMask": "1", "EventCode": "0x11", "EventName": "ITLB_MISSES.WALK_ACTIVE", - "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a code (instruction fetch) request. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a code (instruction fetch) request.", "SampleAfterValue": "100003", "UMask": "0x10", "Unit": "cpu_core" @@ -291,7 +291,7 @@ "Counter": "0,1,2,3", "EventCode": "0x11", "EventName": "ITLB_MISSES.WALK_COMPLETED", - "PublicDescription": "Counts completed page walks (all page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (all page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0xe", "Unit": "cpu_core" @@ -311,7 +311,7 @@ "Counter": "0,1,2,3", "EventCode": "0x11", "EventName": "ITLB_MISSES.WALK_COMPLETED_2M_4M", - "PublicDescription": "Counts completed page walks (2M/4M page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (2M/4M page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x4", "Unit": "cpu_core" @@ -331,7 +331,7 @@ "Counter": "0,1,2,3", "EventCode": "0x11", "EventName": "ITLB_MISSES.WALK_COMPLETED_4K", - "PublicDescription": "Counts completed page walks (4K page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (4K page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x2", "Unit": "cpu_core" @@ -351,7 +351,7 @@ "Counter": "0,1,2,3", "EventCode": "0x11", "EventName": "ITLB_MISSES.WALK_PENDING", - "PublicDescription": "Counts the number of page walks outstanding for an outstanding code (instruction fetch) request in the PMH (Page Miss Handler) each cycle. Available PDIST counters: 0", + "PublicDescription": "Counts the number of page walks outstanding for an outstanding code (instruction fetch) request in the PMH (Page Miss Handler) each cycle.", "SampleAfterValue": "100003", "UMask": "0x10", "Unit": "cpu_core" diff --git a/tools/perf/pmu-events/arch/x86/pantherlake/cache.json b/tools/perf/pmu-events/arch/x86/pantherlake/cache.json index c84f3d9fdb10..7098ea1d6d16 100644 --- a/tools/perf/pmu-events/arch/x86/pantherlake/cache.json +++ b/tools/perf/pmu-events/arch/x86/pantherlake/cache.json @@ -1,5 +1,136 @@ [ { + "BriefDescription": "Counts the number of cache lines replaced in L0 data cache.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x51", + "EventName": "L1D.L0_REPLACEMENT", + "PublicDescription": "Counts L0 data line replacements including opportunistic replacements, and replacements that require stall-for-replace or block-for-replace.", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cachelines replaced into the L1 d-cache. Successful replacements only (not blocked) and exclude WB-miss case", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x51", + "EventName": "L1D.L1_REPLACEMENT", + "PublicDescription": "Counts cachelines replaced into the L1 d-cache.", + "SampleAfterValue": "1000003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cachelines replaced into the L0 and L1 d-cache. Successful replacements only (not blocked) and exclude WB-miss case", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x51", + "EventName": "L1D.REPLACEMENT", + "PublicDescription": "Counts cachelines replaced into the L0 and L1 d-cache.", + "SampleAfterValue": "1000003", + "UMask": "0x5", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of cycles a demand request has waited due to L1D Fill Buffer (FB) unavailability.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x49", + "EventName": "L1D_MISS.FB_FULL", + "PublicDescription": "Counts number of cycles a demand request has waited due to L1D Fill Buffer (FB) unavailability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.", + "SampleAfterValue": "1000003", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of cycles a demand request has waited due to L1D due to lack of L2 resources.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x49", + "EventName": "L1D_MISS.L2_STALLS", + "PublicDescription": "Counts number of cycles a demand request has waited due to L1D due to lack of L2 resources. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.", + "SampleAfterValue": "1000003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of demand requests that missed L1D cache", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x49", + "EventName": "L1D_MISS.LOAD", + "PublicDescription": "Count occurrences (rising-edge) of DCACHE_PENDING sub-event0. Impl. sends per-port binary inc-bit the occupancy increases* (at FB alloc or promotion).", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of L1D misses that are outstanding", + "Counter": "2", + "EventCode": "0x48", + "EventName": "L1D_PENDING.LOAD", + "PublicDescription": "Counts number of L1D misses that are outstanding in each cycle, that is each cycle the number of Fill Buffers (FB) outstanding required by Demand Reads. FB either is held by demand loads, or it is held by non-demand loads and gets hit at least once by demand. The valid outstanding interval is defined until the FB deallocation by one of the following ways: from FB allocation, if FB is allocated by demand from the demand Hit FB, if it is allocated by hardware or software prefetch. Note: In the L1D, a Demand Read contains cacheable or noncacheable demand loads, including ones causing cache-line splits and reads due to page walks resulted from any request type.", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles with L1D load Misses outstanding.", + "Counter": "2", + "CounterMask": "1", + "EventCode": "0x48", + "EventName": "L1D_PENDING.LOAD_CYCLES", + "PublicDescription": "Counts duration of L1D miss outstanding in cycles.", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "L2 cache lines filling L2", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x25", + "EventName": "L2_LINES_IN.ALL", + "PublicDescription": "Counts the number of L2 cache lines filling the L2. Counting does not cover rejects.", + "SampleAfterValue": "100003", + "UMask": "0x1f", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts the number of cache lines filled into the L2 cache that are in Exclusive state", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x25", + "EventName": "L2_LINES_IN.E", + "PublicDescription": "Counts the number of cache lines filled into the L2 cache that are in Exclusive state. Counts on a per core basis.", + "SampleAfterValue": "1000003", + "UMask": "0x4", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Modified cache lines that are evicted by L2 cache when triggered by an L2 cache fill.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x26", + "EventName": "L2_LINES_OUT.NON_SILENT", + "PublicDescription": "Counts the number of lines that are evicted by L2 cache when triggered by an L2 cache fill. Those lines are in Modified state. Modified lines are written back to L3", + "SampleAfterValue": "200003", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Non-modified cache lines that are silently dropped by L2 cache.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x26", + "EventName": "L2_LINES_OUT.SILENT", + "PublicDescription": "Counts the number of lines that are silently dropped by L2 cache. These lines are typically in Shared or Exclusive state. A non-threaded event.", + "SampleAfterValue": "200003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cache lines that have been L2 hardware prefetched but not used by demand accesses", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x26", + "EventName": "L2_LINES_OUT.USELESS_HWPF", + "PublicDescription": "Counts the number of cache lines that have been prefetched by the L2 hardware prefetcher but not used by demand access when evicted from the L2 cache", + "SampleAfterValue": "200003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { "BriefDescription": "Counts the number of L2 cache accesses from front door requests for Code Read, Data Read, RFO, ITOM, and L2 Prefetches. Does not include rejects or recycles, per core event.", "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x24", @@ -9,6 +140,35 @@ "Unit": "cpu_atom" }, { + "BriefDescription": "All accesses to L2 cache [This event is alias to L2_RQSTS.REFERENCES, L2_RQSTS.ANY]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x24", + "EventName": "L2_REQUEST.ALL", + "PublicDescription": "Counts all requests that were hit or true misses in L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. [This event is alias to L2_RQSTS.REFERENCES, L2_RQSTS.ANY]", + "SampleAfterValue": "200003", + "UMask": "0xff", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts the number of L2 cache accesses from front door requests that resulted in a Hit. Does not include rejects or recycles, per core event.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x24", + "EventName": "L2_REQUEST.HIT", + "SampleAfterValue": "1000003", + "UMask": "0x1bf", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Read requests with true-miss in L2 cache [This event is alias to L2_RQSTS.MISS]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x24", + "EventName": "L2_REQUEST.MISS", + "PublicDescription": "Counts read requests of any type with true-miss in the L2 cache. True-miss excludes L2 misses that were merged with ongoing L2 misses. [This event is alias to L2_RQSTS.MISS]", + "SampleAfterValue": "200003", + "UMask": "0x3f", + "Unit": "cpu_core" + }, + { "BriefDescription": "L2 code requests", "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x24", @@ -29,6 +189,106 @@ "Unit": "cpu_core" }, { + "BriefDescription": "All accesses to L2 cache [This event is alias to L2_RQSTS.REFERENCES, L2_REQUEST.ALL]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x24", + "EventName": "L2_RQSTS.ANY", + "PublicDescription": "Counts all requests that were hit or true misses in L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. [This event is alias to L2_RQSTS.REFERENCES, L2_REQUEST.ALL]", + "SampleAfterValue": "200003", + "UMask": "0xff", + "Unit": "cpu_core" + }, + { + "BriefDescription": "L2 cache hits when fetching instructions, code reads.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x24", + "EventName": "L2_RQSTS.CODE_RD_HIT", + "PublicDescription": "Counts L2 cache hits when fetching instructions, code reads.", + "SampleAfterValue": "200003", + "UMask": "0x44", + "Unit": "cpu_core" + }, + { + "BriefDescription": "L2 cache misses when fetching instructions", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x24", + "EventName": "L2_RQSTS.CODE_RD_MISS", + "PublicDescription": "Counts L2 cache misses when fetching instructions.", + "SampleAfterValue": "200003", + "UMask": "0x24", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Demand Data Read requests that hit L2 cache", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x24", + "EventName": "L2_RQSTS.DEMAND_DATA_RD_HIT", + "PublicDescription": "Counts the number of demand Data Read requests initiated by load instructions that hit L2 cache.", + "SampleAfterValue": "200003", + "UMask": "0x41", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Demand Data Read miss L2 cache", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x24", + "EventName": "L2_RQSTS.DEMAND_DATA_RD_MISS", + "PublicDescription": "Counts demand Data Read requests with true-miss in the L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. An access is counted once.", + "SampleAfterValue": "200003", + "UMask": "0x21", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Read requests with true-miss in L2 cache [This event is alias to L2_REQUEST.MISS]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x24", + "EventName": "L2_RQSTS.MISS", + "PublicDescription": "Counts read requests of any type with true-miss in the L2 cache. True-miss excludes L2 misses that were merged with ongoing L2 misses. [This event is alias to L2_REQUEST.MISS]", + "SampleAfterValue": "200003", + "UMask": "0x3f", + "Unit": "cpu_core" + }, + { + "BriefDescription": "All accesses to L2 cache [This event is alias to L2_REQUEST.ALL,L2_RQSTS.ANY]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x24", + "EventName": "L2_RQSTS.REFERENCES", + "PublicDescription": "Counts all requests that were hit or true misses in L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. [This event is alias to L2_REQUEST.ALL,L2_RQSTS.ANY]", + "SampleAfterValue": "200003", + "UMask": "0xff", + "Unit": "cpu_core" + }, + { + "BriefDescription": "RFO requests that hit L2 cache", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x24", + "EventName": "L2_RQSTS.RFO_HIT", + "PublicDescription": "Counts the RFO (Read-for-Ownership) requests that hit L2 cache.", + "SampleAfterValue": "200003", + "UMask": "0x42", + "Unit": "cpu_core" + }, + { + "BriefDescription": "RFO requests that miss L2 cache", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x24", + "EventName": "L2_RQSTS.RFO_MISS", + "PublicDescription": "Counts the RFO (Read-for-Ownership) requests that miss L2 cache.", + "SampleAfterValue": "200003", + "UMask": "0x22", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles when L1D is locked", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x42", + "EventName": "LOCK_CYCLES.CACHE_LOCK_DURATION", + "PublicDescription": "This event counts the number of cycles when the L1D is locked.", + "SampleAfterValue": "2000003", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { "BriefDescription": "Counts the number of cacheable memory requests that miss in the LLC. Counts on a per core basis.", "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x2e", @@ -69,12 +329,67 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to an L1 demand load miss.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x34", + "EventName": "MEM_BOUND_STALLS_LOAD.ALL", + "SampleAfterValue": "1000003", + "UMask": "0x7f", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load which hit in the L2 cache.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x34", + "EventName": "MEM_BOUND_STALLS_LOAD.L2_HIT", + "PublicDescription": "Counts the number of cycles a core is stalled due to a demand load which hit in the L2 cache.", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles the core is stalled due to a demand load which hit in the L2 cache.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x34", + "EventName": "MEM_BOUND_STALLS_LOAD.L2_MISS", + "SampleAfterValue": "1000003", + "UMask": "0x7e", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to a demand load miss which hit in the LLC.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x34", + "EventName": "MEM_BOUND_STALLS_LOAD.LLC_HIT", + "SampleAfterValue": "1000003", + "UMask": "0x6", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to a demand load miss which missed all the local caches.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x34", + "EventName": "MEM_BOUND_STALLS_LOAD.LLC_MISS", + "SampleAfterValue": "1000003", + "UMask": "0x38", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to a demand load miss which missed all the caches. DRAM, MMIO or other LOCAL memory type provides the data.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x34", + "EventName": "MEM_BOUND_STALLS_LOAD.LLC_MISS_LOCALMEM", + "SampleAfterValue": "1000003", + "UMask": "0x10", + "Unit": "cpu_atom" + }, + { "BriefDescription": "Counts all retired load instructions.", "Counter": "0,1,2,3", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_INST_RETIRED.ALL_LOADS", - "PublicDescription": "Counts Instructions with at least one architecturally visible load retired. Available PDIST counters: 0", + "PublicDescription": "Counts Instructions with at least one architecturally visible load retired. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x81", "Unit": "cpu_core" @@ -85,16 +400,442 @@ "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_INST_RETIRED.ALL_STORES", - "PublicDescription": "Counts all retired store instructions. Available PDIST counters: 0", + "PublicDescription": "Counts all retired store instructions. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x82", "Unit": "cpu_core" }, { + "BriefDescription": "Retired software prefetch instructions.", + "Counter": "0,1,2,3", + "EventCode": "0xd0", + "EventName": "MEM_INST_RETIRED.ALL_SWPF", + "PublicDescription": "Counts all retired software prefetch instructions. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x84", + "Unit": "cpu_core" + }, + { + "BriefDescription": "All retired memory instructions.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "Data_LA": "1", + "EventCode": "0xd0", + "EventName": "MEM_INST_RETIRED.ANY", + "PublicDescription": "Counts all retired memory instructions - loads and stores. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x87", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired load instructions with locked access.", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd0", + "EventName": "MEM_INST_RETIRED.LOCK_LOADS", + "PublicDescription": "Counts retired load instructions with locked access. Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x21", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired load instructions that split across a cacheline boundary.", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd0", + "EventName": "MEM_INST_RETIRED.SPLIT_LOADS", + "PublicDescription": "Counts retired load instructions that split across a cacheline boundary. Available PDIST counters: 0,1", + "SampleAfterValue": "100003", + "UMask": "0x41", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired store instructions that split across a cacheline boundary.", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd0", + "EventName": "MEM_INST_RETIRED.SPLIT_STORES", + "PublicDescription": "Counts retired store instructions that split across a cacheline boundary. Available PDIST counters: 0,1", + "SampleAfterValue": "100003", + "UMask": "0x42", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired instructions that hit the STLB.", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd0", + "EventName": "MEM_INST_RETIRED.STLB_HIT_ANY", + "PublicDescription": "Number of retired instructions with a clean hit in the 2nd-level TLB (STLB). Available PDIST counters: 0,1", + "SampleAfterValue": "100003", + "UMask": "0xf", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired load instructions that hit the STLB.", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd0", + "EventName": "MEM_INST_RETIRED.STLB_HIT_LOADS", + "PublicDescription": "Number of retired load instructions with a clean hit in the 2nd-level TLB (STLB). Available PDIST counters: 0,1", + "SampleAfterValue": "100003", + "UMask": "0x9", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired store instructions that hit the STLB.", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd0", + "EventName": "MEM_INST_RETIRED.STLB_HIT_STORES", + "PublicDescription": "Number of retired store instructions that hit in the 2nd-level TLB (STLB). Available PDIST counters: 0,1", + "SampleAfterValue": "100003", + "UMask": "0xa", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired SWPF instructions that hit the STLB.", + "Counter": "0,1,2,3", + "EventCode": "0xd0", + "EventName": "MEM_INST_RETIRED.STLB_HIT_SWPF", + "PublicDescription": "Number of retired SWPF instructions that hit in the 2nd-level TLB (STLB). Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0xc", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired instructions that miss the STLB.", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd0", + "EventName": "MEM_INST_RETIRED.STLB_MISS_ANY", + "PublicDescription": "Number of retired instructions that (start a) miss in the 2nd-level TLB (STLB). Available PDIST counters: 0,1", + "SampleAfterValue": "100003", + "UMask": "0x17", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired load instructions that miss the STLB.", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd0", + "EventName": "MEM_INST_RETIRED.STLB_MISS_LOADS", + "PublicDescription": "Number of retired load instructions that (start a) miss in the 2nd-level TLB (STLB). Available PDIST counters: 0,1", + "SampleAfterValue": "100003", + "UMask": "0x11", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired store instructions that miss the STLB.", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd0", + "EventName": "MEM_INST_RETIRED.STLB_MISS_STORES", + "PublicDescription": "Number of retired store instructions that (start a) miss in the 2nd-level TLB (STLB). Available PDIST counters: 0,1", + "SampleAfterValue": "100003", + "UMask": "0x12", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired SWPF instructions that miss the STLB.", + "Counter": "0,1,2,3", + "EventCode": "0xd0", + "EventName": "MEM_INST_RETIRED.STLB_MISS_SWPF", + "PublicDescription": "Number of retired SWPF instructions that (start a) miss in the 2nd-level TLB (STLB). Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x14", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired load instructions whose data sources were a cross-core Snoop hits and forwards data from an in on-package core cache (induced by NI$)", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd2", + "EventName": "MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD", + "PublicDescription": "Counts retired load instructions whose data sources were a cross-core Snoop hits and forwards data from an in on-package core cache (induced by NI$) Available PDIST counters: 0,1", + "SampleAfterValue": "20011", + "UMask": "0x10", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired load instructions whose data sources were HitM responses from shared L3, Hit-with-FWD is normally excluded.", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd2", + "EventName": "MEM_LOAD_L3_HIT_RETIRED.XSNP_HITM", + "PublicDescription": "Counts retired load instructions whose data sources were HitM responses from shared L3, Hit-with-FWD is normally excluded. Available PDIST counters: 0,1", + "SampleAfterValue": "20011", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired load instructions whose data sources were L3 hit and cross-core snoop missed in on-pkg core cache.", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd2", + "EventName": "MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS", + "PublicDescription": "Counts the retired load instructions whose data sources were L3 hit and cross-core snoop missed in on-pkg core cache. Available PDIST counters: 0,1", + "SampleAfterValue": "20011", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired load instructions whose data sources were L3 and cross-core snoop hits in on-pkg core cache", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd2", + "EventName": "MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD", + "PublicDescription": "Counts retired load instructions whose data sources were L3 and cross-core snoop hits in on-pkg core cache. Available PDIST counters: 0,1", + "SampleAfterValue": "20011", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired load instructions which data source is memory side cache.", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd3", + "EventName": "MEM_LOAD_L3_MISS_RETIRED.MEMSIDE_CACHE", + "PublicDescription": "Retired load instructions which data source is memory side cache. Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired instructions with at least 1 uncacheable load or lock.", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd4", + "EventName": "MEM_LOAD_MISC_RETIRED.UC", + "PublicDescription": "Retired instructions with at least one load to uncacheable memory-type, or at least one cache-line split locked access (Bus Lock). Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of completed demand load requests that missed the L1, but hit the FB(fill buffer), because a preceding miss to the same cacheline initiated the line to be brought into L1, but data is not yet ready in L1.", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd1", + "EventName": "MEM_LOAD_RETIRED.FB_HIT", + "PublicDescription": "Counts retired load instructions with at least one uop was load missed in L1 but hit FB (Fill Buffers) due to preceding miss to the same cache line with data not ready. Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x40", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired load instructions with L1 cache hits as data sources", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd1", + "EventName": "MEM_LOAD_RETIRED.L1_HIT", + "PublicDescription": "Counts retired load instructions with at least one uop that hit in the L1 data cache. This event includes all SW prefetches and lock instructions regardless of the data source. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x101", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts retired load instructions with at least one uop that hit in the Level 0 of the L1 data cache. This event includes all SW prefetches and lock instructions regardless of the data source.", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd1", + "EventName": "MEM_LOAD_RETIRED.L1_HIT_L0", + "PublicDescription": "Counts retired load instructions with at least one uop that hit in the Level 0 of the L1 data cache. This event includes all SW prefetches and lock instructions regardless of the data source. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts retired load instructions with at least one uop that hit in the Level 1 of the L1 data cache.", + "Counter": "0,1,2,3", + "EventCode": "0xd1", + "EventName": "MEM_LOAD_RETIRED.L1_HIT_L1", + "PublicDescription": "Counts retired load instructions with at least one uop that hit in the Level 1 of the L1 data cache. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired load instructions missed L1 cache as data sources", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd1", + "EventName": "MEM_LOAD_RETIRED.L1_MISS", + "PublicDescription": "Counts retired load instructions with at least one uop that missed in the L1 cache. Available PDIST counters: 0,1", + "SampleAfterValue": "200003", + "UMask": "0x8", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired load instructions with L2 cache hits as data sources", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd1", + "EventName": "MEM_LOAD_RETIRED.L2_HIT", + "PublicDescription": "Counts retired load instructions with L2 cache hits as data sources. Available PDIST counters: 0,1", + "SampleAfterValue": "200003", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired load instructions missed L2 cache as data sources", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd1", + "EventName": "MEM_LOAD_RETIRED.L2_MISS", + "PublicDescription": "Counts retired load instructions missed L2 cache as data sources. Available PDIST counters: 0,1", + "SampleAfterValue": "100021", + "UMask": "0x10", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired load instructions with L3 cache hits as data sources", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd1", + "EventName": "MEM_LOAD_RETIRED.L3_HIT", + "PublicDescription": "Counts retired load instructions with at least one uop that hit in the L3 cache. Available PDIST counters: 0,1", + "SampleAfterValue": "100021", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired load instructions missed L3 cache as data sources", + "Counter": "0,1,2,3", + "Data_LA": "1", + "EventCode": "0xd1", + "EventName": "MEM_LOAD_RETIRED.L3_MISS", + "PublicDescription": "Counts retired load instructions with at least one uop that missed in the L3 cache. Available PDIST counters: 0,1", + "SampleAfterValue": "50021", + "UMask": "0x20", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts the number of load ops retired that miss the L3 cache and hit in DRAM", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xd3", + "EventName": "MEM_LOAD_UOPS_L3_MISS_RETIRED.LOCAL_DRAM", + "PublicDescription": "Counts the number of load ops retired that miss the L3 cache and hit in DRAM Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of load ops retired that miss the L3 cache and hit in memside cache.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xd3", + "EventName": "MEM_LOAD_UOPS_L3_MISS_RETIRED.MEMSIDE_CACHE", + "PublicDescription": "Counts the number of load ops retired that miss the L3 cache and hit in memside cache. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x40", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of load ops retired that hit the L1 data cache.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xd1", + "EventName": "MEM_LOAD_UOPS_RETIRED.L1_HIT", + "PublicDescription": "Counts the number of load ops retired that hit the L1 data cache. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of load ops retired that miss in the L1 data cache.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xd1", + "EventName": "MEM_LOAD_UOPS_RETIRED.L1_MISS", + "PublicDescription": "Counts the number of load ops retired that miss in the L1 data cache. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x40", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of load ops retired that hit in the L2 cache.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xd1", + "EventName": "MEM_LOAD_UOPS_RETIRED.L2_HIT", + "PublicDescription": "Counts the number of load ops retired that hit in the L2 cache. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x2", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of load ops retired that miss in the L2 cache.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xd1", + "EventName": "MEM_LOAD_UOPS_RETIRED.L2_MISS", + "PublicDescription": "Counts the number of load ops retired that miss in the L2 cache. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x80", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of load ops retired that hit in the L3 cache.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xd1", + "EventName": "MEM_LOAD_UOPS_RETIRED.L3_HIT", + "PublicDescription": "Counts the number of load ops retired that hit in the L3 cache. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x1c", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles that uops are blocked for any of the following reasons: load buffer, store buffer or RSV full.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x04", + "EventName": "MEM_SCHEDULER_BLOCK.ALL", + "SampleAfterValue": "1000003", + "UMask": "0x7", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles that uops are blocked due to a load buffer full condition.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x04", + "EventName": "MEM_SCHEDULER_BLOCK.LD_BUF", + "SampleAfterValue": "1000003", + "UMask": "0x2", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles that uops are blocked due to an RSV full condition.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x04", + "EventName": "MEM_SCHEDULER_BLOCK.RSV", + "SampleAfterValue": "1000003", + "UMask": "0x4", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of cycles that uops are blocked due to a store buffer full condition.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x04", + "EventName": "MEM_SCHEDULER_BLOCK.ST_BUF", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "MEM_STORE_RETIRED.L2_HIT", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x44", + "EventName": "MEM_STORE_RETIRED.L2_HIT", + "SampleAfterValue": "200003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of cache-lines required by retired stores whose Data Source is: Memory Side Cache", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x44", + "EventName": "MEM_STORE_RETIRED.MEMSIDE_CACHE", + "SampleAfterValue": "100021", + "Unit": "cpu_core" + }, + { "BriefDescription": "Counts the number of load ops retired.", "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.ALL_LOADS", + "PublicDescription": "Counts the number of load ops retired. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x81", "Unit": "cpu_atom" @@ -104,130 +845,246 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.ALL_STORES", + "PublicDescription": "Counts the number of store ops retired. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x82", "Unit": "cpu_atom" }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_1024", "MSRIndex": "0x3F6", "MSRValue": "0x400", + "PublicDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x5", "Unit": "cpu_atom" }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_128", "MSRIndex": "0x3F6", "MSRValue": "0x80", + "PublicDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x5", "Unit": "cpu_atom" }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_16", "MSRIndex": "0x3F6", "MSRValue": "0x10", + "PublicDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x5", "Unit": "cpu_atom" }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_2048", "MSRIndex": "0x3F6", "MSRValue": "0x800", + "PublicDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x5", "Unit": "cpu_atom" }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_256", "MSRIndex": "0x3F6", "MSRValue": "0x100", + "PublicDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x5", "Unit": "cpu_atom" }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_32", "MSRIndex": "0x3F6", "MSRValue": "0x20", + "PublicDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x5", "Unit": "cpu_atom" }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_4", "MSRIndex": "0x3F6", "MSRValue": "0x4", + "PublicDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x5", "Unit": "cpu_atom" }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_512", "MSRIndex": "0x3F6", "MSRValue": "0x200", + "PublicDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x5", "Unit": "cpu_atom" }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_64", "MSRIndex": "0x3F6", "MSRValue": "0x40", + "PublicDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x5", "Unit": "cpu_atom" }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_8", "MSRIndex": "0x3F6", "MSRValue": "0x8", + "PublicDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x5", "Unit": "cpu_atom" }, { + "BriefDescription": "Counts the number of load uops retired that performed one or more locks", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.LOCK_LOADS", + "PublicDescription": "Counts the number of load uops retired that performed one or more locks Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x21", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of memory uops retired that were splits.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.SPLIT", + "PublicDescription": "Counts the number of memory uops retired that were splits. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x43", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of retired split load uops.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.SPLIT_LOADS", + "PublicDescription": "Counts the number of retired split load uops. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x41", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of retired split store uops.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.SPLIT_STORES", + "PublicDescription": "Counts the number of retired split store uops. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x42", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of memory uops retired that missed in the second level TLB.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.STLB_MISS", + "PublicDescription": "Counts the number of memory uops retired that missed in the second level TLB. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x13", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of load uops retired that miss in the second Level TLB.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.STLB_MISS_LOADS", + "PublicDescription": "Counts the number of load uops retired that miss in the second Level TLB. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x11", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of store uops retired that miss in the second level TLB.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xd0", + "EventName": "MEM_UOPS_RETIRED.STLB_MISS_STORES", + "PublicDescription": "Counts the number of store uops retired that miss in the second level TLB. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x12", + "Unit": "cpu_atom" + }, + { "BriefDescription": "Counts the number of stores uops retired same as MEM_UOPS_RETIRED.ALL_STORES", "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.STORE_LATENCY", + "PublicDescription": "Counts the number of stores uops retired same as MEM_UOPS_RETIRED.ALL_STORES Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x6", "Unit": "cpu_atom" }, { + "BriefDescription": "Retired memory uops for any access", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xe5", + "EventName": "MEM_UOP_RETIRED.ANY", + "PublicDescription": "Number of retired micro-operations (uops) for load or store memory accesses", + "SampleAfterValue": "1000003", + "UMask": "0xf", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts writebacks of modified cachelines that were supplied by the L3 cache.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.COREWB_M.L3_HIT", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x7E001E00008", + "PublicDescription": "Counts writebacks of modified cachelines that were supplied by the L3 cache. Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts writebacks of non-modified cachelines that were supplied by the L3 cache.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.COREWB_NONM.L3_HIT", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x7E001E01000", + "PublicDescription": "Counts writebacks of non-modified cachelines that were supplied by the L3 cache. Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { "BriefDescription": "Counts demand data reads that have any type of response.", "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xB7", @@ -241,7 +1098,7 @@ }, { "BriefDescription": "Counts demand data reads that have any type of response.", - "Counter": "0,1,2,3", + "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_DATA_RD.ANY_RESPONSE", "MSRIndex": "0x1a6,0x1a7", @@ -252,6 +1109,30 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop hit in another cores caches, data forwarding is required as the data is modified.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x40001E00001", + "PublicDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop hit in another cores caches, data forwarding is required as the data is modified. Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop hit in another cores caches which forwarded the unmodified data to the requesting core.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x20001E00001", + "PublicDescription": "Counts demand data reads that were supplied by the L3 cache where a snoop hit in another cores caches which forwarded the unmodified data to the requesting core. Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { "BriefDescription": "Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that have any type of response.", "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xB7", @@ -265,7 +1146,7 @@ }, { "BriefDescription": "Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that have any type of response.", - "Counter": "0,1,2,3", + "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_RFO.ANY_RESPONSE", "MSRIndex": "0x1a6,0x1a7", @@ -274,5 +1155,221 @@ "SampleAfterValue": "100003", "UMask": "0x1", "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that were supplied by the L3 cache where a snoop hit in another cores caches, data forwarding is required as the data is modified.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.DEMAND_RFO.L3_HIT.SNOOP_HITM", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x40001E00002", + "PublicDescription": "Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that were supplied by the L3 cache where a snoop hit in another cores caches, data forwarding is required as the data is modified. Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts all data read, code read, RFO and ITOM requests including demands and prefetches to the core caches (L1 or L2) that were supplied by the L3 cache.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.READS_TO_CORE.L3_HIT", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x7E001E04477", + "PublicDescription": "Counts all data read, code read, RFO and ITOM requests including demands and prefetches to the core caches (L1 or L2) that were supplied by the L3 cache. Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Any memory transaction that reached the SQ.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x21", + "EventName": "OFFCORE_REQUESTS.ALL_REQUESTS", + "PublicDescription": "Counts memory transactions reached the super queue including requests initiated by the core, all L3 prefetches, page walks, etc..", + "SampleAfterValue": "100003", + "UMask": "0x80", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Demand and prefetch data reads", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x21", + "EventName": "OFFCORE_REQUESTS.DATA_RD", + "PublicDescription": "Counts the demand and prefetch data reads. All Core Data Reads include cacheable 'Demands' and L2 prefetchers (not L3 prefetchers). Counting also covers reads due to page walks resulted from any request type.", + "SampleAfterValue": "100003", + "UMask": "0x8", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cacheable and Non-Cacheable code read requests", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x21", + "EventName": "OFFCORE_REQUESTS.DEMAND_CODE_RD", + "PublicDescription": "Counts both cacheable and Non-Cacheable code read requests.", + "SampleAfterValue": "100003", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Demand Data Read requests sent to uncore", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x21", + "EventName": "OFFCORE_REQUESTS.DEMAND_DATA_RD", + "PublicDescription": "Counts the Demand Data Read requests sent to uncore. Use it in conjunction with OFFCORE_REQUESTS_OUTSTANDING to determine average latency in the uncore.", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Demand RFO requests including regular RFOs, locks, ItoM", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x21", + "EventName": "OFFCORE_REQUESTS.DEMAND_RFO", + "PublicDescription": "Counts the demand RFO (read for ownership) requests including regular RFOs, locks, ItoM.", + "SampleAfterValue": "100003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles when offcore outstanding cacheable Core Data Read transactions are present in SuperQueue (SQ), queue to uncore.", + "Counter": "0,1,2,3", + "CounterMask": "1", + "EventCode": "0x20", + "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", + "PublicDescription": "Counts cycles when offcore outstanding cacheable Core Data Read transactions are present in the super queue. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). See corresponding Umask under OFFCORE_REQUESTS.", + "SampleAfterValue": "1000003", + "UMask": "0x8", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles with offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore.", + "Counter": "0,1,2,3", + "CounterMask": "1", + "EventCode": "0x20", + "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_CODE_RD", + "PublicDescription": "Counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS.", + "SampleAfterValue": "1000003", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles where at least 1 outstanding demand data read request is pending.", + "Counter": "0,1,2,3", + "CounterMask": "1", + "EventCode": "0x20", + "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", + "SampleAfterValue": "2000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles with offcore outstanding demand rfo reads transactions in SuperQueue (SQ), queue to uncore.", + "Counter": "0,1,2,3", + "CounterMask": "1", + "EventCode": "0x20", + "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO", + "PublicDescription": "Counts the number of offcore outstanding demand rfo Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS.", + "SampleAfterValue": "1000003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Offcore outstanding cacheable Core Data Read transactions in SuperQueue (SQ), queue to uncore", + "Counter": "0,1,2,3", + "EventCode": "0x20", + "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DATA_RD", + "PublicDescription": "Counts the number of offcore outstanding cacheable Core Data Read transactions in the super queue every cycle. A transaction is considered to be in the Offcore outstanding state between L2 miss and transaction completion sent to requestor (SQ de-allocation). See corresponding Umask under OFFCORE_REQUESTS.", + "SampleAfterValue": "1000003", + "UMask": "0x8", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Offcore outstanding Code Reads transactions in the SuperQueue (SQ), queue to uncore, every cycle.", + "Counter": "0,1,2,3", + "EventCode": "0x20", + "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_CODE_RD", + "PublicDescription": "Counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS.", + "SampleAfterValue": "1000003", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { + "BriefDescription": "For every cycle, increments by the number of outstanding demand data read requests pending.", + "Counter": "0,1,2,3", + "EventCode": "0x20", + "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD", + "PublicDescription": "For every cycle, increments by the number of outstanding demand data read requests pending. Requests are considered outstanding from the time they miss the core's L2 cache until the transaction completion message is sent to the requestor.", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Store Read transactions pending for off-core. Highly correlated.", + "Counter": "0,1,2,3", + "EventCode": "0x20", + "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_RFO", + "PublicDescription": "Counts the number of off-core outstanding read-for-ownership (RFO) store transactions every cycle. An RFO transaction is considered to be in the Off-core outstanding state between L2 cache miss and transaction completion.", + "SampleAfterValue": "1000003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts bus locks, accounts for cache line split locks and UC locks.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x2c", + "EventName": "SQ_MISC.BUS_LOCK", + "PublicDescription": "Counts the more expensive bus lock needed to enforce cache coherency for certain memory accesses that need to be done atomically. Can be created by issuing an atomic instruction (via the LOCK prefix) which causes a cache line split or accesses uncacheable memory.", + "SampleAfterValue": "100003", + "UMask": "0x10", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts the number of PREFETCHNTA, PREFETCHW, PREFETCHT0, PREFETCHT1 or PREFETCHT2 instructions executed.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x40", + "EventName": "SW_PREFETCH_ACCESS.ANY", + "SampleAfterValue": "100003", + "UMask": "0xf", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of PREFETCHNTA instructions executed.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x40", + "EventName": "SW_PREFETCH_ACCESS.NTA", + "PublicDescription": "Counts the number of PREFETCHNTA instructions executed.", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of PREFETCHW instructions executed.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x40", + "EventName": "SW_PREFETCH_ACCESS.PREFETCHW", + "PublicDescription": "Counts the number of PREFETCHW instructions executed.", + "SampleAfterValue": "100003", + "UMask": "0x8", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of PREFETCHT0 instructions executed.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x40", + "EventName": "SW_PREFETCH_ACCESS.T0", + "PublicDescription": "Counts the number of PREFETCHT0 instructions executed.", + "SampleAfterValue": "100003", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of PREFETCHT1 or PREFETCHT2 instructions executed.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x40", + "EventName": "SW_PREFETCH_ACCESS.T1_T2", + "PublicDescription": "Counts the number of PREFETCHT1 or PREFETCHT2 instructions executed.", + "SampleAfterValue": "100003", + "UMask": "0x4", + "Unit": "cpu_core" } ] diff --git a/tools/perf/pmu-events/arch/x86/pantherlake/counter.json b/tools/perf/pmu-events/arch/x86/pantherlake/counter.json index 69f158a97707..432b6946ccbc 100644 --- a/tools/perf/pmu-events/arch/x86/pantherlake/counter.json +++ b/tools/perf/pmu-events/arch/x86/pantherlake/counter.json @@ -1,12 +1,17 @@ [ { "Unit": "cpu_atom", - "CountersNumFixed": "3", - "CountersNumGeneric": "39" + "CountersNumFixed": "7", + "CountersNumGeneric": "8" }, { "Unit": "cpu_core", "CountersNumFixed": "4", "CountersNumGeneric": "10" + }, + { + "Unit": "iMC", + "CountersNumFixed": "0", + "CountersNumGeneric": "5" } ]
\ No newline at end of file diff --git a/tools/perf/pmu-events/arch/x86/pantherlake/floating-point.json b/tools/perf/pmu-events/arch/x86/pantherlake/floating-point.json new file mode 100644 index 000000000000..57c26866bc79 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/pantherlake/floating-point.json @@ -0,0 +1,286 @@ +[ + { + "BriefDescription": "Cycles when floating-point divide unit is busy executing divide or square root operations.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "1", + "EventCode": "0xb0", + "EventName": "ARITH.FPDIV_ACTIVE", + "PublicDescription": "Counts cycles when divide unit is busy executing divide or square root operations. Accounts for floating-point operations only.", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts all microcode FP assists.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc1", + "EventName": "ASSISTS.FP", + "PublicDescription": "Counts all microcode Floating Point assists.", + "SampleAfterValue": "100003", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { + "BriefDescription": "ASSISTS.SSE_AVX_MIX", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc1", + "EventName": "ASSISTS.SSE_AVX_MIX", + "SampleAfterValue": "1000003", + "UMask": "0x10", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of FP-arith-uops dispatched on 1st VEC port (port 0). FP-arith-uops are of type ADD* / SUB* / MUL / FMA* / DPP.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xb3", + "EventName": "FP_ARITH_DISPATCHED.V0", + "SampleAfterValue": "2000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of FP-arith-uops dispatched on 2nd VEC port (port 1)", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xb3", + "EventName": "FP_ARITH_DISPATCHED.V1", + "SampleAfterValue": "2000003", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of FP-arith-uops dispatched on 3rd VEC port (port 5)", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xb3", + "EventName": "FP_ARITH_DISPATCHED.V2", + "SampleAfterValue": "2000003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of FP-arith-uops dispatched on 4th VEC port", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xb3", + "EventName": "FP_ARITH_DISPATCHED.V3", + "SampleAfterValue": "2000003", + "UMask": "0x8", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc7", + "EventName": "FP_ARITH_OPS_RETIRED.128B_PACKED_DOUBLE", + "PublicDescription": "Number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", + "SampleAfterValue": "100003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc7", + "EventName": "FP_ARITH_OPS_RETIRED.128B_PACKED_SINGLE", + "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", + "SampleAfterValue": "100003", + "UMask": "0x8", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc7", + "EventName": "FP_ARITH_OPS_RETIRED.256B_PACKED_DOUBLE", + "PublicDescription": "Number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", + "SampleAfterValue": "100003", + "UMask": "0x10", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc7", + "EventName": "FP_ARITH_OPS_RETIRED.256B_PACKED_SINGLE", + "PublicDescription": "Number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", + "SampleAfterValue": "100003", + "UMask": "0x20", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of SSE/AVX computational 128-bit packed single and 256-bit packed double precision FP instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, 1 for each element. Applies to SSE* and AVX* packed single precision and packed double precision FP instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB count twice as they perform 2 calculations per element.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc7", + "EventName": "FP_ARITH_OPS_RETIRED.4_FLOPS", + "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision and 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point and packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", + "SampleAfterValue": "100003", + "UMask": "0x18", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of SSE/AVX computational scalar floating-point instructions retired; some instructions will count twice as noted below. Applies to SSE* and AVX* scalar, double and single precision floating-point: ADD SUB MUL DIV MIN MAX RCP14 RSQRT14 RANGE SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform multiple calculations per element.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc7", + "EventName": "FP_ARITH_OPS_RETIRED.SCALAR", + "PublicDescription": "Number of SSE/AVX computational scalar single precision and double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", + "SampleAfterValue": "1000003", + "UMask": "0x3", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts number of SSE/AVX computational scalar double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc7", + "EventName": "FP_ARITH_OPS_RETIRED.SCALAR_DOUBLE", + "PublicDescription": "Number of SSE/AVX computational scalar double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts number of SSE/AVX computational scalar single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc7", + "EventName": "FP_ARITH_OPS_RETIRED.SCALAR_SINGLE", + "PublicDescription": "Number of SSE/AVX computational scalar single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", + "SampleAfterValue": "100003", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of any Vector retired FP arithmetic instructions", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc7", + "EventName": "FP_ARITH_OPS_RETIRED.VECTOR", + "PublicDescription": "Number of any Vector retired FP arithmetic instructions. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", + "SampleAfterValue": "1000003", + "UMask": "0x3c", + "Unit": "cpu_core" + }, + { + "BriefDescription": "FP_ARITH_OPS_RETIRED.VECTOR_128B", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc7", + "EventName": "FP_ARITH_OPS_RETIRED.VECTOR_128B", + "SampleAfterValue": "100003", + "UMask": "0xc", + "Unit": "cpu_core" + }, + { + "BriefDescription": "FP_ARITH_OPS_RETIRED.VECTOR_256B", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc7", + "EventName": "FP_ARITH_OPS_RETIRED.VECTOR_256B", + "SampleAfterValue": "100003", + "UMask": "0x30", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts the number of all types of floating point operations per uop with all default weighting", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc8", + "EventName": "FP_FLOPS_RETIRED.ALL", + "PublicDescription": "Counts the number of all types of floating point operations per uop with all default weighting Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x3", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of floating point operations that produce 32 bit single precision results", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc8", + "EventName": "FP_FLOPS_RETIRED.FP32", + "PublicDescription": "Counts the number of floating point operations that produce 32 bit single precision results Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x2", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of floating point operations that produce 64 bit double precision results", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc8", + "EventName": "FP_FLOPS_RETIRED.FP64", + "PublicDescription": "Counts the number of floating point operations that produce 64 bit double precision results Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of retired instructions whose sources are a packed 128 bit double precision floating point. This may be SSE or AVX.128 operations.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc7", + "EventName": "FP_INST_RETIRED.128B_DP", + "PublicDescription": "Counts the number of retired instructions whose sources are a packed 128 bit double precision floating point. This may be SSE or AVX.128 operations. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x8", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of retired instructions whose sources are a packed 128 bit single precision floating point. This may be SSE or AVX.128 operations.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc7", + "EventName": "FP_INST_RETIRED.128B_SP", + "PublicDescription": "Counts the number of retired instructions whose sources are a packed 128 bit single precision floating point. This may be SSE or AVX.128 operations. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x4", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of retired instructions whose sources are a packed 256 bit double precision floating point.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc7", + "EventName": "FP_INST_RETIRED.256B_DP", + "PublicDescription": "Counts the number of retired instructions whose sources are a packed 256 bit double precision floating point. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x20", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of retired instructions whose sources are a packed 256 bit single precision floating point.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc7", + "EventName": "FP_INST_RETIRED.256B_SP", + "PublicDescription": "Counts the number of retired instructions whose sources are a packed 256 bit single precision floating point. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x10", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of retired instructions whose sources are a scalar 32bit single precision floating point.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc7", + "EventName": "FP_INST_RETIRED.32B_SP", + "PublicDescription": "Counts the number of retired instructions whose sources are a scalar 32bit single precision floating point. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of retired instructions whose sources are a scalar 64 bit double precision floating point.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc7", + "EventName": "FP_INST_RETIRED.64B_DP", + "PublicDescription": "Counts the number of retired instructions whose sources are a scalar 64 bit double precision floating point. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x2", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the total number of floating point retired instructions.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc7", + "EventName": "FP_INST_RETIRED.ALL", + "PublicDescription": "Counts the total number of floating point retired instructions. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x3f", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of floating point operations retired that required microcode assist.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc3", + "EventName": "MACHINE_CLEARS.FP_ASSIST", + "PublicDescription": "Counts the number of floating point operations retired that required microcode assist, which is not a reflection of the number of FP operations, instructions or uops.", + "SampleAfterValue": "1000003", + "UMask": "0x4", + "Unit": "cpu_atom" + } +] diff --git a/tools/perf/pmu-events/arch/x86/pantherlake/frontend.json b/tools/perf/pmu-events/arch/x86/pantherlake/frontend.json index aedf631e3c0f..d36faa683d3f 100644 --- a/tools/perf/pmu-events/arch/x86/pantherlake/frontend.json +++ b/tools/perf/pmu-events/arch/x86/pantherlake/frontend.json @@ -1,5 +1,370 @@ [ { + "BriefDescription": "Counts the total number of BACLEARS due to all branch types including conditional and unconditional jumps, returns, and indirect branches.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xe6", + "EventName": "BACLEARS.ANY", + "PublicDescription": "Counts the total number of BACLEARS, which occur when the Branch Target Buffer (BTB) prediction or lack thereof, was corrected by a later branch predictor in the frontend. Includes BACLEARS due to all branch types including conditional and unconditional jumps, returns, and indirect branches.", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Clears due to Unknown Branches.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x60", + "EventName": "BACLEARS.ANY", + "PublicDescription": "Number of times the front-end is resteered when it finds a branch instruction in a fetch line. This is called Unknown Branch which occurs for the first time a branch instruction is fetched or when the branch is not tracked by the BPU (Branch Prediction Unit) anymore.", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts the number of BACLEARS due to a conditional jump.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xe6", + "EventName": "BACLEARS.COND", + "SampleAfterValue": "1000003", + "UMask": "0x10", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of BACLEARS due to an indirect branch.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xe6", + "EventName": "BACLEARS.INDIRECT", + "SampleAfterValue": "1000003", + "UMask": "0x2", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of BACLEARS due to a return branch.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xe6", + "EventName": "BACLEARS.RETURN", + "SampleAfterValue": "1000003", + "UMask": "0x8", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of BACLEARS due to a direct, unconditional jump.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xe6", + "EventName": "BACLEARS.UNCOND", + "SampleAfterValue": "1000003", + "UMask": "0x4", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Stalls caused by changing prefix length of the instruction.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x87", + "EventName": "DECODE.LCP", + "PublicDescription": "Counts cycles that the Instruction Length decoder (ILD) stalls occurred due to dynamically changing prefix length of the decoded instruction (by operand size prefix instruction 0x66, address size prefix instruction 0x67 or REX.W for Intel64). Count is proportional to the number of prefixes in a 16B-line. This may result in a three-cycle penalty for each LCP (Length changing prefix) in a 16-byte chunk.", + "SampleAfterValue": "500009", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles the Microcode Sequencer is busy.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x87", + "EventName": "DECODE.MS_BUSY", + "SampleAfterValue": "500009", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { + "BriefDescription": "DSB-to-MITE switch true penalty cycles.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x61", + "EventName": "DSB2MITE_SWITCHES.PENALTY_CYCLES", + "PublicDescription": "Decode Stream Buffer (DSB) is a Uop-cache that holds translations of previously fetched instructions that were decoded by the legacy x86 decode pipeline (MITE). This event counts fetch penalty cycles when a transition occurs from DSB to MITE.", + "SampleAfterValue": "100003", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired ANT branches", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc6", + "EventName": "FRONTEND_RETIRED.ANY_ANT", + "MSRIndex": "0x3F7", + "MSRValue": "0x9", + "PublicDescription": "Always Not Taken (ANT) conditional retired branches (no BTB entry and not mispredicted) Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x3", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired Instructions who experienced DSB miss.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc6", + "EventName": "FRONTEND_RETIRED.ANY_DSB_MISS", + "MSRIndex": "0x3F7", + "MSRValue": "0x1", + "PublicDescription": "Counts retired Instructions that experienced DSB (Decode stream buffer i.e. the decoded instruction-cache) miss. Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x3", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired Instructions who experienced a critical DSB miss.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc6", + "EventName": "FRONTEND_RETIRED.DSB_MISS", + "MSRIndex": "0x3F7", + "MSRValue": "0x11", + "PublicDescription": "Number of retired Instructions that experienced a critical DSB (Decode stream buffer i.e. the decoded instruction-cache) miss. Critical means stalls were exposed to the back-end as a result of the DSB miss. Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x3", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts the number of instructions retired that were tagged because empty issue slots were seen before the uop due to ITLB miss", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc6", + "EventName": "FRONTEND_RETIRED.ITLB_MISS", + "PublicDescription": "Counts the number of instructions retired that were tagged because empty issue slots were seen before the uop due to ITLB miss Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x10", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Retired Instructions who experienced iTLB true miss.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc6", + "EventName": "FRONTEND_RETIRED.ITLB_MISS", + "MSRIndex": "0x3F7", + "MSRValue": "0x14", + "PublicDescription": "Counts retired Instructions that experienced iTLB (Instruction TLB) true miss. Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x3", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired Instructions who experienced Instruction L1 Cache true miss.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc6", + "EventName": "FRONTEND_RETIRED.L1I_MISS", + "MSRIndex": "0x3F7", + "MSRValue": "0x12", + "PublicDescription": "Counts retired Instructions who experienced Instruction L1 Cache true miss. Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x3", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired Instructions who experienced Instruction L2 Cache true miss.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc6", + "EventName": "FRONTEND_RETIRED.L2_MISS", + "MSRIndex": "0x3F7", + "MSRValue": "0x13", + "PublicDescription": "Counts retired Instructions who experienced Instruction L2 Cache true miss. Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x3", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 128 cycles which was not interrupted by a back-end stall.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc6", + "EventName": "FRONTEND_RETIRED.LATENCY_GE_128", + "MSRIndex": "0x3F7", + "MSRValue": "0x608006", + "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 128 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x3", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 16 cycles which was not interrupted by a back-end stall.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc6", + "EventName": "FRONTEND_RETIRED.LATENCY_GE_16", + "MSRIndex": "0x3F7", + "MSRValue": "0x601006", + "PublicDescription": "Counts retired instructions that are delivered to the back-end after a front-end stall of at least 16 cycles. During this period the front-end delivered no uops. Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x3", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired instructions after front-end starvation of at least 2 cycles", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc6", + "EventName": "FRONTEND_RETIRED.LATENCY_GE_2", + "MSRIndex": "0x3F7", + "MSRValue": "0x600206", + "PublicDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of at least 2 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x3", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 256 cycles which was not interrupted by a back-end stall.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc6", + "EventName": "FRONTEND_RETIRED.LATENCY_GE_256", + "MSRIndex": "0x3F7", + "MSRValue": "0x610006", + "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 256 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x3", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired instructions that are fetched after an interval where the front-end had at least 1 bubble-slot for a period of 2 cycles which was not interrupted by a back-end stall.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc6", + "EventName": "FRONTEND_RETIRED.LATENCY_GE_2_BUBBLES_GE_1", + "MSRIndex": "0x3F7", + "MSRValue": "0x100206", + "PublicDescription": "Counts retired instructions that are delivered to the back-end after the front-end had at least 1 bubble-slot for a period of 2 cycles. A bubble-slot is an empty issue-pipeline slot while there was no RAT stall. Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x3", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 32 cycles which was not interrupted by a back-end stall.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc6", + "EventName": "FRONTEND_RETIRED.LATENCY_GE_32", + "MSRIndex": "0x3F7", + "MSRValue": "0x602006", + "PublicDescription": "Counts retired instructions that are delivered to the back-end after a front-end stall of at least 32 cycles. During this period the front-end delivered no uops. Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x3", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 4 cycles which was not interrupted by a back-end stall.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc6", + "EventName": "FRONTEND_RETIRED.LATENCY_GE_4", + "MSRIndex": "0x3F7", + "MSRValue": "0x600406", + "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 4 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x3", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 512 cycles which was not interrupted by a back-end stall.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc6", + "EventName": "FRONTEND_RETIRED.LATENCY_GE_512", + "MSRIndex": "0x3F7", + "MSRValue": "0x620006", + "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 512 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x3", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 64 cycles which was not interrupted by a back-end stall.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc6", + "EventName": "FRONTEND_RETIRED.LATENCY_GE_64", + "MSRIndex": "0x3F7", + "MSRValue": "0x604006", + "PublicDescription": "Counts retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 64 cycles which was not interrupted by a back-end stall. Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x3", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired instructions that are fetched after an interval where the front-end delivered no uops for a period of 8 cycles which was not interrupted by a back-end stall.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc6", + "EventName": "FRONTEND_RETIRED.LATENCY_GE_8", + "MSRIndex": "0x3F7", + "MSRValue": "0x600806", + "PublicDescription": "Counts retired instructions that are delivered to the back-end after a front-end stall of at least 8 cycles. During this period the front-end delivered no uops. Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x3", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Mispredicted Retired ANT branches", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc6", + "EventName": "FRONTEND_RETIRED.MISP_ANT", + "MSRIndex": "0x3F7", + "MSRValue": "0x9", + "PublicDescription": "ANT retired branches that got just mispredicted Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts flows delivered by the Microcode Sequencer", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc6", + "EventName": "FRONTEND_RETIRED.MS_FLOWS", + "MSRIndex": "0x3F7", + "MSRValue": "0x8", + "PublicDescription": "Counts flows delivered by the Microcode Sequencer Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x3", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired Instructions who experienced STLB (2nd level TLB) true miss.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc6", + "EventName": "FRONTEND_RETIRED.STLB_MISS", + "MSRIndex": "0x3F7", + "MSRValue": "0x15", + "PublicDescription": "Counts retired Instructions that experienced STLB (2nd level TLB) true miss. Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x3", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired instructions that caused clears due to being Unknown Branches.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc6", + "EventName": "FRONTEND_RETIRED.UNKNOWN_BRANCH", + "MSRIndex": "0x3F7", + "MSRValue": "0x17", + "PublicDescription": "Number retired branch instructions that caused the front-end to be resteered when it finds the instruction in a fetch line. This is called Unknown Branch which occurs for the first time a branch instruction is fetched or when the branch is not tracked by the BPU (Branch Prediction Unit) anymore. Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x3", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts the number of instructions retired that were tagged because empty issue slots were seen before the uop due to Instruction L1 cache miss, that hit in the L2 cache.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc9", + "EventName": "FRONTEND_RETIRED_SOURCE.ICACHE_L2_HIT", + "PublicDescription": "Counts the number of instructions retired that were tagged because empty issue slots were seen before the uop due to Instruction L1 cache miss, that hit in the L2 cache. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of instructions retired that were tagged because empty issue slots were seen before the uop due to ITLB miss that hit in the second level TLB.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc9", + "EventName": "FRONTEND_RETIRED_SOURCE.ITLB_STLB_HIT", + "PublicDescription": "Counts the number of instructions retired that were tagged because empty issue slots were seen before the uop due to ITLB miss that hit in the second level TLB. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x10", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of instructions retired that were tagged because empty issue slots were seen before the uop due to ITLB miss that also missed the second level TLB.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc9", + "EventName": "FRONTEND_RETIRED_SOURCE.ITLB_STLB_MISS", + "PublicDescription": "Counts the number of instructions retired that were tagged because empty issue slots were seen before the uop due to ITLB miss that also missed the second level TLB. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x20", + "Unit": "cpu_atom" + }, + { "BriefDescription": "Counts every time the code stream enters into a new cache line by walking sequential from the previous line or being redirected by a jump.", "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x80", @@ -9,6 +374,15 @@ "Unit": "cpu_atom" }, { + "BriefDescription": "Counts every time the code stream enters into a new cache line by walking sequential from the previous line or being redirected by a jump and the instruction cache registers bytes are present.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x80", + "EventName": "ICACHE.HIT", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, + { "BriefDescription": "Counts every time the code stream enters into a new cache line by walking sequential from the previous line or being redirected by a jump and the instruction cache registers bytes are not present. -", "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x80", @@ -18,6 +392,134 @@ "Unit": "cpu_atom" }, { + "BriefDescription": "Cycles where a code fetch is stalled due to L1 instruction cache miss.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x80", + "EventName": "ICACHE_DATA.STALLS", + "PublicDescription": "Counts cycles where a code line fetch is stalled due to an L1 instruction cache miss. The decode pipeline works at a 32 Byte granularity.", + "SampleAfterValue": "500009", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "ICACHE_DATA.STALL_PERIODS", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "1", + "EdgeDetect": "1", + "EventCode": "0x80", + "EventName": "ICACHE_DATA.STALL_PERIODS", + "SampleAfterValue": "500009", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles where a code fetch is stalled due to L1 instruction cache tag miss.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x83", + "EventName": "ICACHE_TAG.STALLS", + "PublicDescription": "Counts cycles where a code fetch is stalled due to L1 instruction cache tag miss.", + "SampleAfterValue": "200003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles Decode Stream Buffer (DSB) is delivering any Uop", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "1", + "EventCode": "0x79", + "EventName": "IDQ.DSB_CYCLES_ANY", + "PublicDescription": "Counts the number of cycles uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path.", + "SampleAfterValue": "2000003", + "UMask": "0x8", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles DSB is delivering optimal number of Uops", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "8", + "EventCode": "0x79", + "EventName": "IDQ.DSB_CYCLES_OK", + "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the DSB (Decode Stream Buffer) path. Count includes uops that may 'bypass' the IDQ.", + "SampleAfterValue": "2000003", + "UMask": "0x8", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x79", + "EventName": "IDQ.DSB_UOPS", + "PublicDescription": "Counts the number of uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path.", + "SampleAfterValue": "2000003", + "UMask": "0x8", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles MITE is delivering any Uop", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "1", + "EventCode": "0x79", + "EventName": "IDQ.MITE_CYCLES_ANY", + "PublicDescription": "Counts the number of cycles uops were delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB).", + "SampleAfterValue": "2000003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles MITE is delivering optimal number of Uops", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "8", + "EventCode": "0x79", + "EventName": "IDQ.MITE_CYCLES_OK", + "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB).", + "SampleAfterValue": "2000003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Uops delivered to Instruction Decode Queue (IDQ) from MITE path", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x79", + "EventName": "IDQ.MITE_UOPS", + "PublicDescription": "Counts the number of uops delivered to Instruction Decode Queue (IDQ) from the MITE path. This also means that uops are not being delivered from the Decode Stream Buffer (DSB).", + "SampleAfterValue": "2000003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles when uops are being delivered to IDQ while MS is busy", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "1", + "EventCode": "0x79", + "EventName": "IDQ.MS_CYCLES_ANY", + "PublicDescription": "Counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Uops maybe initiated by Decode Stream Buffer (DSB) or MITE.", + "SampleAfterValue": "2000003", + "UMask": "0x20", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of switches from DSB or MITE to the MS", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "1", + "EdgeDetect": "1", + "EventCode": "0x79", + "EventName": "IDQ.MS_SWITCHES", + "PublicDescription": "Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to the Microcode Sequencer.", + "SampleAfterValue": "100003", + "UMask": "0x20", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Uops initiated by MITE or Decode Stream Buffer (DSB) and delivered to Instruction Decode Queue (IDQ) while Microcode Sequencer (MS) is busy", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x79", + "EventName": "IDQ.MS_UOPS", + "PublicDescription": "Counts the number of uops initiated by MITE or Decode Stream Buffer (DSB) and delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Counting includes uops that may 'bypass' the IDQ.", + "SampleAfterValue": "1000003", + "UMask": "0x20", + "Unit": "cpu_core" + }, + { "BriefDescription": "This event counts a subset of the Topdown Slots event that when no operation was delivered to the back-end pipeline due to instruction fetch limitations when the back-end could have accepted more operations. Common examples include instruction cache misses or x86 instruction decode limitations.", "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x9c", @@ -26,5 +528,38 @@ "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles when optimal number of uops was delivered to the back-end when the back-end is not stalled", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "1", + "EventCode": "0x9c", + "EventName": "IDQ_BUBBLES.CYCLES_FE_WAS_OK", + "Invert": "1", + "PublicDescription": "Counts the number of cycles when the optimal number of uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls.", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles when no uops are delivered by the IDQ for 2 or more cycles when backend of the machine is not stalled - normally indicating a Fetch Latency issue", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x9c", + "EventName": "IDQ_BUBBLES.FETCH_LATENCY", + "PublicDescription": "Counts the number of cycles when no uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls for 2 or more cycles - normally indicating a Fetch Latency issue.", + "SampleAfterValue": "1000003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles when no uops are not delivered by the IDQ when backend of the machine is not stalled", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "8", + "EventCode": "0x9c", + "EventName": "IDQ_BUBBLES.STARVATION_CYCLES", + "PublicDescription": "Counts the number of cycles when no uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls.", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_core" } ] diff --git a/tools/perf/pmu-events/arch/x86/pantherlake/memory.json b/tools/perf/pmu-events/arch/x86/pantherlake/memory.json index 47daee8cc00f..397a15dbb964 100644 --- a/tools/perf/pmu-events/arch/x86/pantherlake/memory.json +++ b/tools/perf/pmu-events/arch/x86/pantherlake/memory.json @@ -1,5 +1,33 @@ [ { + "BriefDescription": "Counts the number of cycles that the head (oldest load) of the load buffer is stalled due to a core bound stall including a store address match, a DTLB miss or a page walk that detains the load from retiring.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x05", + "EventName": "LD_HEAD.L1_BOUND_AT_RET", + "SampleAfterValue": "1000003", + "UMask": "0xf4", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of machine clears due to memory ordering caused by a snoop from an external agent. Does not count internally generated machine clears such as those due to memory disambiguation.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc3", + "EventName": "MACHINE_CLEARS.MEMORY_ORDERING", + "SampleAfterValue": "1000003", + "UMask": "0x2", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Number of machine clears due to memory ordering conflicts.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc3", + "EventName": "MACHINE_CLEARS.MEMORY_ORDERING", + "PublicDescription": "Counts the number of Machine Clears detected dye to memory ordering. Memory Ordering Machine Clears may apply when a memory read may not conform to the memory ordering rules of the x86 architecture", + "SampleAfterValue": "100003", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { "BriefDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 1024 cycles.", "Counter": "2,3,4,5,6,7,8,9", "Data_LA": "1", @@ -7,7 +35,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_1024", "MSRIndex": "0x3F6", "MSRValue": "0x400", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 1024 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 1024 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0,1", "SampleAfterValue": "53", "UMask": "0x1", "Unit": "cpu_core" @@ -20,7 +48,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_128", "MSRIndex": "0x3F6", "MSRValue": "0x80", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 128 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 128 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0,1", "SampleAfterValue": "1009", "UMask": "0x1", "Unit": "cpu_core" @@ -33,7 +61,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_16", "MSRIndex": "0x3F6", "MSRValue": "0x10", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 16 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 16 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0,1", "SampleAfterValue": "20011", "UMask": "0x1", "Unit": "cpu_core" @@ -46,7 +74,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_2048", "MSRIndex": "0x3F6", "MSRValue": "0x800", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 2048 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 2048 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0,1", "SampleAfterValue": "23", "UMask": "0x1", "Unit": "cpu_core" @@ -59,7 +87,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_256", "MSRIndex": "0x3F6", "MSRValue": "0x100", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 256 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 256 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0,1", "SampleAfterValue": "503", "UMask": "0x1", "Unit": "cpu_core" @@ -72,7 +100,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_32", "MSRIndex": "0x3F6", "MSRValue": "0x20", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 32 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 32 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0,1", "SampleAfterValue": "100007", "UMask": "0x1", "Unit": "cpu_core" @@ -85,7 +113,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_4", "MSRIndex": "0x3F6", "MSRValue": "0x4", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 4 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 4 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0,1", "SampleAfterValue": "100003", "UMask": "0x1", "Unit": "cpu_core" @@ -98,7 +126,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_512", "MSRIndex": "0x3F6", "MSRValue": "0x200", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 512 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 512 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0,1", "SampleAfterValue": "101", "UMask": "0x1", "Unit": "cpu_core" @@ -111,7 +139,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_64", "MSRIndex": "0x3F6", "MSRValue": "0x40", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 64 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 64 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0,1", "SampleAfterValue": "2003", "UMask": "0x1", "Unit": "cpu_core" @@ -124,7 +152,7 @@ "EventName": "MEM_TRANS_RETIRED.LOAD_LATENCY_GT_8", "MSRIndex": "0x3F6", "MSRValue": "0x8", - "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 8 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0", + "PublicDescription": "Counts randomly selected loads when the latency from first dispatch to completion is greater than 8 cycles. Reported latency may be longer than just the memory latency. Available PDIST counters: 0,1", "SampleAfterValue": "50021", "UMask": "0x1", "Unit": "cpu_core" @@ -135,12 +163,32 @@ "Data_LA": "1", "EventCode": "0xcd", "EventName": "MEM_TRANS_RETIRED.STORE_SAMPLE", - "PublicDescription": "Counts Retired memory accesses with at least 1 store operation. This PEBS event is the precisely-distributed (PDist) trigger covering all stores uops for sampling by the PEBS Store Latency Facility. The facility is described in Intel SDM Volume 3 section 19.9.8 Available PDIST counters: 0", + "PublicDescription": "Counts Retired memory accesses with at least 1 store operation. This PEBS event is the precisely-distributed (PDist) trigger covering all stores uops for sampling by the PEBS Store Latency Facility. The facility is described in Intel SDM Volume 3 section 19.9.8 Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x2", "Unit": "cpu_core" }, { + "BriefDescription": "Counts misaligned loads that are 4K page splits.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x13", + "EventName": "MISALIGN_MEM_REF.LOAD_PAGE_SPLIT", + "PublicDescription": "Counts misaligned loads that are 4K page splits. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x2", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts misaligned stores that are 4K page splits.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x13", + "EventName": "MISALIGN_MEM_REF.STORE_PAGE_SPLIT", + "PublicDescription": "Counts misaligned stores that are 4K page splits. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x4", + "Unit": "cpu_atom" + }, + { "BriefDescription": "Counts demand data reads that were supplied by DRAM.", "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xB7", @@ -154,7 +202,7 @@ }, { "BriefDescription": "Counts demand data reads that were supplied by DRAM.", - "Counter": "0,1,2,3", + "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_DATA_RD.DRAM", "MSRIndex": "0x1a6,0x1a7", @@ -178,7 +226,7 @@ }, { "BriefDescription": "Counts demand data reads that were not supplied by the L3 cache.", - "Counter": "0,1,2,3", + "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_DATA_RD.L3_MISS", "MSRIndex": "0x1a6,0x1a7", @@ -202,7 +250,7 @@ }, { "BriefDescription": "Counts demand read for ownership (RFO) requests and software prefetches for exclusive ownership (PREFETCHW) that were not supplied by the L3 cache.", - "Counter": "0,1,2,3", + "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0x2A,0x2B", "EventName": "OCR.DEMAND_RFO.L3_MISS", "MSRIndex": "0x1a6,0x1a7", @@ -211,5 +259,35 @@ "SampleAfterValue": "100003", "UMask": "0x1", "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts demand data read requests that miss the L3 cache.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x21", + "EventName": "OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", + "SampleAfterValue": "100003", + "UMask": "0x10", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles where data return is pending for a Demand Data Read request who miss L3 cache.", + "Counter": "0,1,2,3", + "CounterMask": "1", + "EventCode": "0x20", + "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_L3_MISS_DEMAND_DATA_RD", + "PublicDescription": "Cycles with at least 1 Demand Data Read requests who miss L3 cache in the superQ.", + "SampleAfterValue": "1000003", + "UMask": "0x10", + "Unit": "cpu_core" + }, + { + "BriefDescription": "For every cycle, increments by the number of demand data read requests pending that are known to have missed the L3 cache.", + "Counter": "0,1,2,3", + "EventCode": "0x20", + "EventName": "OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD", + "PublicDescription": "For every cycle, increments by the number of demand data read requests pending that are known to have missed the L3 cache. Note that this does not capture all elapsed cycles while requests are outstanding - only cycles from when the requests were known by the requesting core to have missed the L3 cache.", + "SampleAfterValue": "2000003", + "UMask": "0x10", + "Unit": "cpu_core" } ] diff --git a/tools/perf/pmu-events/arch/x86/pantherlake/other.json b/tools/perf/pmu-events/arch/x86/pantherlake/other.json new file mode 100644 index 000000000000..d49651d4f112 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/pantherlake/other.json @@ -0,0 +1,44 @@ +[ + { + "BriefDescription": "Count all other hardware assists or traps that are not necessarily architecturally exposed (through a software handler) beyond FP; SSE-AVX mix and A/D assists who are counted by dedicated sub-events.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc1", + "EventName": "ASSISTS.HARDWARE", + "PublicDescription": "Count all other hardware assists or traps that are not necessarily architecturally exposed (through a software handler) beyond FP; SSE-AVX mix and A/D assists who are counted by dedicated sub-events. This includes, but not limited to, assists at EXE or MEM uop writeback like AVX* load/store/gather/scatter (non-FP GSSE-assist ) , assists generated by ROB like PEBS and RTIT, Uncore trap, RAR (Remote Action Request) and CET (Control flow Enforcement Technology) assists.", + "SampleAfterValue": "100003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "ASSISTS.PAGE_FAULT", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc1", + "EventName": "ASSISTS.PAGE_FAULT", + "SampleAfterValue": "1000003", + "UMask": "0x8", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts streaming stores that have any type of response.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x2A,0x2B", + "EventName": "OCR.STREAMING_WR.ANY_RESPONSE", + "MSRIndex": "0x1a6,0x1a7", + "MSRValue": "0x10800", + "PublicDescription": "Counts streaming stores that have any type of response. Available PDIST counters: 0", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles the uncore cannot take further requests", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "1", + "EventCode": "0x2d", + "EventName": "XQ.FULL", + "PublicDescription": "number of cycles when the thread is active and the uncore cannot take any further requests (for example prefetches, loads or stores initiated by the Core that miss the L2 cache).", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_core" + } +] diff --git a/tools/perf/pmu-events/arch/x86/pantherlake/pipeline.json b/tools/perf/pmu-events/arch/x86/pantherlake/pipeline.json index 2caf2f85327f..2d805ac98c5b 100644 --- a/tools/perf/pmu-events/arch/x86/pantherlake/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/pantherlake/pipeline.json @@ -1,10 +1,51 @@ [ { + "BriefDescription": "Cycles when divide unit is busy executing divide or square root operations.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "1", + "EventCode": "0xb0", + "EventName": "ARITH.DIV_ACTIVE", + "PublicDescription": "Counts cycles when divide unit is busy executing divide or square root operations. Accounts for integer and floating-point operations.", + "SampleAfterValue": "1000003", + "UMask": "0x9", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles when integer divide unit is busy executing divide or square root operations.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "1", + "EventCode": "0xb0", + "EventName": "ARITH.IDIV_ACTIVE", + "PublicDescription": "Counts cycles when divide unit is busy executing divide or square root operations. Accounts for integer operations only.", + "SampleAfterValue": "1000003", + "UMask": "0x8", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of occurrences where a microcode assist is invoked by hardware.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc1", + "EventName": "ASSISTS.ANY", + "PublicDescription": "Counts the number of occurrences where a microcode assist is invoked by hardware. Examples include AD (page Access Dirty), FP and AVX related assists.", + "SampleAfterValue": "100003", + "UMask": "0x1f", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts cycles where the pipeline is stalled due to serializing operations.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xa2", + "EventName": "BE_STALLS.SCOREBOARD", + "SampleAfterValue": "100003", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { "BriefDescription": "Counts the total number of branch instructions retired for all branch types.", "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.ALL_BRANCHES", - "PublicDescription": "Counts the total number of instructions in which the instruction pointer (IP) of the processor is resteered due to a branch instruction and the branch instruction successfully retires. All branch type instructions are accounted for.", + "PublicDescription": "Counts the total number of instructions in which the instruction pointer (IP) of the processor is resteered due to a branch instruction and the branch instruction successfully retires. All branch type instructions are accounted for. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "Unit": "cpu_atom" }, @@ -13,8 +54,242 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc4", "EventName": "BR_INST_RETIRED.ALL_BRANCHES", - "PublicDescription": "Counts all branch instructions retired. Available PDIST counters: 0", + "PublicDescription": "Counts all branch instructions retired. Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Conditional branch instructions retired.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.COND", + "PublicDescription": "Counts conditional branch instructions retired. Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "UMask": "0x7", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts the number of not taken conditional branch instructions retired.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.COND_NTAKEN", + "PublicDescription": "Counts the number of not taken conditional branch instructions retired. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x4", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Not taken branch instructions retired.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.COND_NTAKEN", + "PublicDescription": "Counts not taken branch instructions retired. Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Taken conditional branch instructions retired.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.COND_TAKEN", + "PublicDescription": "Counts taken conditional branch instructions retired. Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "UMask": "0x3", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Taken backward conditional branch instructions retired.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.COND_TAKEN_BWD", + "PublicDescription": "Counts taken backward conditional branch instructions retired. Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Taken forward conditional branch instructions retired.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.COND_TAKEN_FWD", + "PublicDescription": "Counts taken forward conditional branch instructions retired. Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Far branch instructions retired.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.FAR_BRANCH", + "PublicDescription": "Counts far branch instructions retired. Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to BR_INST_RETIRED.NEAR_INDIRECT]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "Deprecated": "1", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.INDIRECT", + "PublicDescription": "This event is deprecated. [This event is alias to BR_INST_RETIRED.NEAR_INDIRECT] Available PDIST counters: 0,1", + "SampleAfterValue": "100003", + "UMask": "0x50", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts the number of near CALL branch instructions retired.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.NEAR_CALL", + "PublicDescription": "Counts the number of near CALL branch instructions retired. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x30", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Direct and indirect near call instructions retired.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.NEAR_CALL", + "PublicDescription": "Counts both direct and indirect near call instructions retired. Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x30", + "Unit": "cpu_core" + }, + { + "BriefDescription": "near relative call instructions retired. [This event is alias to BR_INST_RETIRED.NEAR_REL_CALL]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.NEAR_DIRECT_CALL", + "PublicDescription": "Counts near relative call instructions retired. [This event is alias to BR_INST_RETIRED.NEAR_REL_CALL] Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x20", + "Unit": "cpu_core" + }, + { + "BriefDescription": "near relative jump instructions retired. [This event is alias to BR_INST_RETIRED.NEAR_REL_JMP]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.NEAR_DIRECT_JMP", + "PublicDescription": "Counts near relative jump instructions retired. [This event is alias to BR_INST_RETIRED.NEAR_REL_JMP] Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x80", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Indirect near branch instructions retired (excluding returns) [This event is alias to BR_INST_RETIRED.INDIRECT]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.NEAR_INDIRECT", + "PublicDescription": "Counts near indirect branch instructions retired excluding returns. TSX abort is an indirect branch. [This event is alias to BR_INST_RETIRED.INDIRECT] Available PDIST counters: 0,1", + "SampleAfterValue": "100003", + "UMask": "0x50", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Indirect near call instructions retired. [This event is alias to BR_INST_RETIRED.NEAR_IND_CALL]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.NEAR_INDIRECT_CALL", + "PublicDescription": "Counts indirect near call instructions retired. [This event is alias to BR_INST_RETIRED.NEAR_IND_CALL] Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x10", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Indirect near jump instructions retired. [This event is alias to BR_INST_RETIRED.NEAR_IND_JMP]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.NEAR_INDIRECT_JMP", + "PublicDescription": "Counts indirect near jump instructions retired. [This event is alias to BR_INST_RETIRED.NEAR_IND_JMP] Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x40", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to BR_INST_RETIRED.NEAR_INDIRECT_CALL]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "Deprecated": "1", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.NEAR_IND_CALL", + "PublicDescription": "This event is deprecated. [This event is alias to BR_INST_RETIRED.NEAR_INDIRECT_CALL] Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x10", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to BR_INST_RETIRED.NEAR_INDIRECT_JMP]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "Deprecated": "1", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.NEAR_IND_JMP", + "PublicDescription": "This event is deprecated. [This event is alias to BR_INST_RETIRED.NEAR_INDIRECT_JMP] Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x40", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Indirect and Direct Relative near jump instructions retired.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.NEAR_JMP", + "PublicDescription": "Counts near jump instructions retired. Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0xc0", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to BR_INST_RETIRED.NEAR_DIRECT_CALL]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "Deprecated": "1", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.NEAR_REL_CALL", + "PublicDescription": "This event is deprecated. [This event is alias to BR_INST_RETIRED.NEAR_DIRECT_CALL] Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x20", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to BR_INST_RETIRED.NEAR_DIRECT_JMP]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "Deprecated": "1", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.NEAR_REL_JMP", + "PublicDescription": "This event is deprecated. [This event is alias to BR_INST_RETIRED.NEAR_DIRECT_JMP] Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x80", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Return instructions retired.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.NEAR_RETURN", + "PublicDescription": "Counts return instructions retired. Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x8", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts the number of near taken branch instructions retired.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.NEAR_TAKEN", + "PublicDescription": "Counts the number of near taken branch instructions retired. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0xfb", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Taken branch instructions retired.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc4", + "EventName": "BR_INST_RETIRED.NEAR_TAKEN", + "PublicDescription": "Counts taken branch instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "400009", + "UMask": "0xfb", "Unit": "cpu_core" }, { @@ -22,7 +297,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.ALL_BRANCHES", - "PublicDescription": "Counts the total number of mispredicted branch instructions retired. All branch type instructions are accounted for. Prediction of the branch target address enables the processor to begin executing instructions before the non-speculative execution path is known. The branch prediction unit (BPU) predicts the target address based on the instruction pointer (IP) of the branch and on the execution path through which execution reached this IP. A branch misprediction occurs when the prediction is wrong, and results in discarding all instructions executed in the speculative path and re-fetching from the correct path.", + "PublicDescription": "Counts the total number of mispredicted branch instructions retired. All branch type instructions are accounted for. Prediction of the branch target address enables the processor to begin executing instructions before the non-speculative execution path is known. The branch prediction unit (BPU) predicts the target address based on the instruction pointer (IP) of the branch and on the execution path through which execution reached this IP. A branch misprediction occurs when the prediction is wrong, and results in discarding all instructions executed in the speculative path and re-fetching from the correct path. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "Unit": "cpu_atom" }, @@ -31,15 +306,438 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc5", "EventName": "BR_MISP_RETIRED.ALL_BRANCHES", - "PublicDescription": "Counts all the retired branch instructions that were mispredicted by the processor. A branch misprediction occurs when the processor incorrectly predicts the destination of the branch. When the misprediction is discovered at execution, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. Available PDIST counters: 0", + "PublicDescription": "Counts all the retired branch instructions that were mispredicted by the processor. A branch misprediction occurs when the processor incorrectly predicts the destination of the branch. When the misprediction is discovered at execution, all the instructions executed in the wrong (speculative) path must be discarded, and the processor must start fetching from the correct path. Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.ALL_BRANCHES_TPEBS]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "Deprecated": "1", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.ALL_BRANCHES_COST", + "PublicDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.ALL_BRANCHES_TPEBS] Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "Unit": "cpu_core" + }, + { + "BriefDescription": "All mispredicted branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. [This event is alias to BR_MISP_RETIRED.ALL_BRANCHES_COST]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.ALL_BRANCHES_TPEBS", + "PublicDescription": "All mispredicted branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. [This event is alias to BR_MISP_RETIRED.ALL_BRANCHES_COST] Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Mispredicted conditional branch instructions retired.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.COND", + "PublicDescription": "Counts mispredicted conditional branch instructions retired. Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "UMask": "0x7", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.COND_TPEBS]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "Deprecated": "1", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.COND_COST", + "PublicDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.COND_TPEBS] Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "UMask": "0x8007", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts the number of mispredicted not taken conditional branch instructions retired.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.COND_NTAKEN", + "PublicDescription": "Counts the number of mispredicted not taken conditional branch instructions retired. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x4", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Mispredicted non-taken conditional branch instructions retired.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.COND_NTAKEN", + "PublicDescription": "Counts the number of conditional branch instructions retired that were mispredicted and the branch direction was not taken. Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.COND_NTAKEN_TPEBS]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "Deprecated": "1", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.COND_NTAKEN_COST", + "PublicDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.COND_NTAKEN_TPEBS] Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "UMask": "0x8004", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Mispredicted non-taken conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. [This event is alias to BR_MISP_RETIRED.COND_NTAKEN_COST]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.COND_NTAKEN_TPEBS", + "PublicDescription": "Mispredicted non-taken conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. [This event is alias to BR_MISP_RETIRED.COND_NTAKEN_COST] Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "UMask": "0x8004", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts the number of mispredicted taken conditional branch instructions retired.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.COND_TAKEN", + "PublicDescription": "Counts the number of mispredicted taken conditional branch instructions retired. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x3", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "number of branch instructions retired that were mispredicted and taken.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.COND_TAKEN", + "PublicDescription": "Counts taken conditional mispredicted branch instructions retired. Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "UMask": "0x3", + "Unit": "cpu_core" + }, + { + "BriefDescription": "number of branch instructions retired that were mispredicted and taken backward.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.COND_TAKEN_BWD", + "PublicDescription": "Counts taken backward conditional mispredicted branch instructions retired. Available PDIST counters: 0,1", "SampleAfterValue": "400009", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.COND_TAKEN_BWD_TPEBS]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "Deprecated": "1", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.COND_TAKEN_BWD_COST", + "PublicDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.COND_TAKEN_BWD_TPEBS] Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "UMask": "0x8001", + "Unit": "cpu_core" + }, + { + "BriefDescription": "number of branch instructions retired that were mispredicted and taken backward. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. [This event is alias to BR_MISP_RETIRED.COND_TAKEN_BWD_COST]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.COND_TAKEN_BWD_TPEBS", + "PublicDescription": "number of branch instructions retired that were mispredicted and taken backward. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. [This event is alias to BR_MISP_RETIRED.COND_TAKEN_BWD_COST] Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "UMask": "0x8001", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.COND_TAKEN_TPEBS]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "Deprecated": "1", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.COND_TAKEN_COST", + "PublicDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.COND_TAKEN_TPEBS] Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "UMask": "0x8003", + "Unit": "cpu_core" + }, + { + "BriefDescription": "number of branch instructions retired that were mispredicted and taken forward.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.COND_TAKEN_FWD", + "PublicDescription": "Counts taken forward conditional mispredicted branch instructions retired. Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.COND_TAKEN_FWD_TPEBS]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "Deprecated": "1", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.COND_TAKEN_FWD_COST", + "PublicDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.COND_TAKEN_FWD_TPEBS] Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "UMask": "0x8002", + "Unit": "cpu_core" + }, + { + "BriefDescription": "number of branch instructions retired that were mispredicted and taken forward. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. [This event is alias to BR_MISP_RETIRED.COND_TAKEN_FWD_COST]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.COND_TAKEN_FWD_TPEBS", + "PublicDescription": "number of branch instructions retired that were mispredicted and taken forward. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. [This event is alias to BR_MISP_RETIRED.COND_TAKEN_FWD_COST] Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "UMask": "0x8002", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Mispredicted taken conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. [This event is alias to BR_MISP_RETIRED.COND_TAKEN_COST]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.COND_TAKEN_TPEBS", + "PublicDescription": "Mispredicted taken conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. [This event is alias to BR_MISP_RETIRED.COND_TAKEN_COST] Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "UMask": "0x8003", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Mispredicted conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. [This event is alias to BR_MISP_RETIRED.COND_COST]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.COND_TPEBS", + "PublicDescription": "Mispredicted conditional branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. [This event is alias to BR_MISP_RETIRED.COND_COST] Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "UMask": "0x8007", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.NEAR_INDIRECT]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "Deprecated": "1", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.INDIRECT", + "PublicDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.NEAR_INDIRECT] Available PDIST counters: 0,1", + "SampleAfterValue": "100003", + "UMask": "0x50", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.NEAR_INDIRECT_CALL]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "Deprecated": "1", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.INDIRECT_CALL", + "PublicDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.NEAR_INDIRECT_CALL] Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "UMask": "0x10", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.NEAR_INDIRECT_CALL_TPEBS]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "Deprecated": "1", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.INDIRECT_CALL_COST", + "PublicDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.NEAR_INDIRECT_CALL_TPEBS] Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "UMask": "0x8010", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.NEAR_INDIRECT_TPEBS]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "Deprecated": "1", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.INDIRECT_COST", + "PublicDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.NEAR_INDIRECT_TPEBS] Available PDIST counters: 0,1", + "SampleAfterValue": "100003", + "UMask": "0x8050", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.NEAR_INDIRECT_JMP]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "Deprecated": "1", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.INDIRECT_JMP", + "PublicDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.NEAR_INDIRECT_JMP] Available PDIST counters: 0,1", + "SampleAfterValue": "100003", + "UMask": "0x40", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.NEAR_INDIRECT_JMP_TPEBS]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "Deprecated": "1", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.INDIRECT_JMP_COST", + "PublicDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.NEAR_INDIRECT_JMP_TPEBS] Available PDIST counters: 0,1", + "SampleAfterValue": "100003", + "UMask": "0x8040", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Miss-predicted near indirect branch instructions retired (excluding returns) [This event is alias to BR_MISP_RETIRED.INDIRECT]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.NEAR_INDIRECT", + "PublicDescription": "Counts miss-predicted near indirect branch instructions retired excluding returns. TSX abort is an indirect branch. [This event is alias to BR_MISP_RETIRED.INDIRECT] Available PDIST counters: 0,1", + "SampleAfterValue": "100003", + "UMask": "0x50", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Mispredicted indirect CALL retired. [This event is alias to BR_MISP_RETIRED.INDIRECT_CALL]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.NEAR_INDIRECT_CALL", + "PublicDescription": "Counts retired mispredicted indirect (near taken) CALL instructions, including both register and memory indirect. [This event is alias to BR_MISP_RETIRED.INDIRECT_CALL] Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "UMask": "0x10", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Mispredicted indirect CALL retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. [This event is alias to BR_MISP_RETIRED.INDIRECT_CALL_COST]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.NEAR_INDIRECT_CALL_TPEBS", + "PublicDescription": "Mispredicted indirect CALL retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. [This event is alias to BR_MISP_RETIRED.INDIRECT_CALL_COST] Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "UMask": "0x8010", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Miss-predicted near indirect jump instructions retired. [This event is alias to BR_MISP_RETIRED.INDIRECT_JMP]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.NEAR_INDIRECT_JMP", + "PublicDescription": "Miss-predicted near indirect jump instructions retired. [This event is alias to BR_MISP_RETIRED.INDIRECT_JMP] Available PDIST counters: 0,1", + "SampleAfterValue": "100003", + "UMask": "0x40", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Miss-predicted near indirect jump instructions retired. Precise cost. [This event is alias to BR_MISP_RETIRED.INDIRECT_JMP_COST]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.NEAR_INDIRECT_JMP_TPEBS", + "PublicDescription": "Miss-predicted near indirect jump instructions retired. Precise cost. [This event is alias to BR_MISP_RETIRED.INDIRECT_JMP_COST] Available PDIST counters: 0,1", + "SampleAfterValue": "100003", + "UMask": "0x8040", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Mispredicted near indirect branch instructions retired (excluding returns). This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. [This event is alias to BR_MISP_RETIRED.INDIRECT_COST]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.NEAR_INDIRECT_TPEBS", + "PublicDescription": "Mispredicted near indirect branch instructions retired (excluding returns). This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. [This event is alias to BR_MISP_RETIRED.INDIRECT_COST] Available PDIST counters: 0,1", + "SampleAfterValue": "100003", + "UMask": "0x8050", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This event counts the number of mispredicted ret instructions retired. Non PEBS [This event is alias to BR_MISP_RETIRED.RET]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.NEAR_RETURN", + "PublicDescription": "This is a non-precise version (that is, does not use PEBS) of the event that counts mispredicted return instructions retired. [This event is alias to BR_MISP_RETIRED.RET] Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x8", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Mispredicted ret instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. [This event is alias to BR_MISP_RETIRED.RET_COST]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.NEAR_RETURN_TPEBS", + "PublicDescription": "Mispredicted ret instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. [This event is alias to BR_MISP_RETIRED.RET_COST] Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x8008", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of near branch instructions retired that were mispredicted and taken.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.NEAR_TAKEN", + "PublicDescription": "Counts number of near branch instructions retired that were mispredicted and taken. Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "UMask": "0xfb", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.NEAR_TAKEN_TPEBS]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "Deprecated": "1", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.NEAR_TAKEN_COST", + "PublicDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.NEAR_TAKEN_TPEBS] Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "UMask": "0x80fb", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Mispredicted taken near branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. [This event is alias to BR_MISP_RETIRED.NEAR_TAKEN_COST]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.NEAR_TAKEN_TPEBS", + "PublicDescription": "Mispredicted taken near branch instructions retired. This precise event may be used to get the misprediction cost via the Retire_Latency field of PEBS. It fires on the instruction that immediately follows the mispredicted branch. [This event is alias to BR_MISP_RETIRED.NEAR_TAKEN_COST] Available PDIST counters: 0,1", + "SampleAfterValue": "400009", + "UMask": "0x80fb", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.NEAR_RETURN]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "Deprecated": "1", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.RET", + "PublicDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.NEAR_RETURN] Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x8", + "Unit": "cpu_core" + }, + { + "BriefDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.NEAR_RETURN_TPEBS]", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "Deprecated": "1", + "EventCode": "0xc5", + "EventName": "BR_MISP_RETIRED.RET_COST", + "PublicDescription": "This event is deprecated. [This event is alias to BR_MISP_RETIRED.NEAR_RETURN_TPEBS] Available PDIST counters: 0,1", + "SampleAfterValue": "100007", + "UMask": "0x8008", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Core clocks when the thread is in the C0.1 light-weight slower wakeup time but more power saving optimized state.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xec", + "EventName": "CPU_CLK_UNHALTED.C01", + "PublicDescription": "Counts core clocks when the thread is in the C0.1 light-weight slower wakeup time but more power saving optimized state. This state can be entered via the TPAUSE or UMWAIT instructions.", + "SampleAfterValue": "2000003", + "UMask": "0x10", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Core clocks when the thread is in the C0.2 light-weight faster wakeup time but less power saving optimized state.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xec", + "EventName": "CPU_CLK_UNHALTED.C02", + "PublicDescription": "Counts core clocks when the thread is in the C0.2 light-weight faster wakeup time but less power saving optimized state. This state can be entered via the TPAUSE or UMWAIT instructions.", + "SampleAfterValue": "2000003", + "UMask": "0x20", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Core clocks when the thread is in the C0.1 or C0.2 or running a PAUSE in C0 ACPI state.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xec", + "EventName": "CPU_CLK_UNHALTED.C0_WAIT", + "PublicDescription": "Counts core clocks when the thread is in the C0.1 or C0.2 power saving optimized states (TPAUSE or UMWAIT instructions) or running the PAUSE instruction.", + "SampleAfterValue": "2000003", + "UMask": "0x70", "Unit": "cpu_core" }, { "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles.", "Counter": "Fixed counter 1", "EventName": "CPU_CLK_UNHALTED.CORE", - "SampleAfterValue": "1000003", + "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_atom" }, @@ -57,7 +755,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x3c", "EventName": "CPU_CLK_UNHALTED.CORE_P", - "SampleAfterValue": "1000003", + "SampleAfterValue": "2000003", "Unit": "cpu_atom" }, { @@ -70,10 +768,30 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Core clocks when a PAUSE is pending.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xec", + "EventName": "CPU_CLK_UNHALTED.PAUSE", + "SampleAfterValue": "2000003", + "UMask": "0x40", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of Pause instructions", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "1", + "EdgeDetect": "1", + "EventCode": "0xec", + "EventName": "CPU_CLK_UNHALTED.PAUSE_INST", + "SampleAfterValue": "2000003", + "UMask": "0x40", + "Unit": "cpu_core" + }, + { "BriefDescription": "Fixed Counter: Counts the number of unhalted reference clock cycles.", "Counter": "Fixed counter 2", "EventName": "CPU_CLK_UNHALTED.REF_TSC", - "SampleAfterValue": "1000003", + "SampleAfterValue": "2000003", "UMask": "0x3", "Unit": "cpu_atom" }, @@ -92,7 +810,7 @@ "EventCode": "0x3c", "EventName": "CPU_CLK_UNHALTED.REF_TSC_P", "PublicDescription": "Counts the number of reference cycles that the core is not in a halt state. The core enters the halt state when it is running the HLT instruction. This event is not affected by core frequency changes and increments at a fixed frequency that is also used for the Time Stamp Counter (TSC). This event uses a programmable general purpose performance counter.", - "SampleAfterValue": "1000003", + "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_atom" }, @@ -110,7 +828,7 @@ "BriefDescription": "Fixed Counter: Counts the number of unhalted core clock cycles.", "Counter": "Fixed counter 1", "EventName": "CPU_CLK_UNHALTED.THREAD", - "SampleAfterValue": "1000003", + "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_atom" }, @@ -128,7 +846,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x3c", "EventName": "CPU_CLK_UNHALTED.THREAD_P", - "SampleAfterValue": "1000003", + "SampleAfterValue": "2000003", "Unit": "cpu_atom" }, { @@ -141,11 +859,130 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Cycles while memory subsystem has an outstanding load.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "16", + "EventCode": "0xa3", + "EventName": "CYCLE_ACTIVITY.CYCLES_MEM_ANY", + "SampleAfterValue": "1000003", + "UMask": "0x10", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Total execution stalls.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "4", + "EventCode": "0xa3", + "EventName": "CYCLE_ACTIVITY.STALLS_TOTAL", + "SampleAfterValue": "1000003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Count number of times a load is depending on another load that had just write back its data or in previous or 2 cycles back. This event supports in-direct dependency through a single uop.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x02", + "EventName": "DEPENDENT_LOADS.ANY", + "SampleAfterValue": "1000003", + "UMask": "0x7", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles total of 1 uop is executed on all ports and Reservation Station was not empty.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xa6", + "EventName": "EXE_ACTIVITY.1_PORTS_UTIL", + "PublicDescription": "Counts cycles during which a total of 1 uop was executed on all ports and Reservation Station (RS) was not empty.", + "SampleAfterValue": "2000003", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles total of 2 or 3 uops are executed on all ports and Reservation Station (RS) was not empty.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xa6", + "EventName": "EXE_ACTIVITY.2_3_PORTS_UTIL", + "SampleAfterValue": "2000003", + "UMask": "0xc", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles total of 2 uops are executed on all ports and Reservation Station was not empty.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xa6", + "EventName": "EXE_ACTIVITY.2_PORTS_UTIL", + "PublicDescription": "Counts cycles during which a total of 2 uops were executed on all ports and Reservation Station (RS) was not empty.", + "SampleAfterValue": "2000003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles total of 3 uops are executed on all ports and Reservation Station was not empty.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xa6", + "EventName": "EXE_ACTIVITY.3_PORTS_UTIL", + "PublicDescription": "Cycles total of 3 uops are executed on all ports and Reservation Station (RS) was not empty.", + "SampleAfterValue": "2000003", + "UMask": "0x8", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles total of 4 uops are executed on all ports and Reservation Station was not empty.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xa6", + "EventName": "EXE_ACTIVITY.4_PORTS_UTIL", + "PublicDescription": "Cycles total of 4 uops are executed on all ports and Reservation Station (RS) was not empty.", + "SampleAfterValue": "2000003", + "UMask": "0x10", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Execution stalls while memory subsystem has an outstanding load.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "5", + "EventCode": "0xa6", + "EventName": "EXE_ACTIVITY.BOUND_ON_LOADS", + "SampleAfterValue": "2000003", + "UMask": "0x21", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles where the Store Buffer was full and no loads caused an execution stall.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "2", + "EventCode": "0xa6", + "EventName": "EXE_ACTIVITY.BOUND_ON_STORES", + "PublicDescription": "Counts cycles where the Store Buffer was full and no loads caused an execution stall.", + "SampleAfterValue": "1000003", + "UMask": "0x40", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles no uop executed while RS was not empty, the SB was not full and there was no outstanding load.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xa6", + "EventName": "EXE_ACTIVITY.EXE_BOUND_0_PORTS", + "PublicDescription": "Number of cycles total of 0 uops executed on all ports, Reservation Station (RS) was not empty, the Store Buffer (SB) was not full and there was no outstanding load.", + "SampleAfterValue": "1000003", + "UMask": "0x80", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Instruction decoders utilized in a cycle", + "Counter": "2", + "EventCode": "0x75", + "EventName": "INST_DECODED.DECODERS", + "PublicDescription": "Number of decoders utilized in a cycle when the MITE (legacy decode pipeline) fetches instructions.", + "SampleAfterValue": "2000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { "BriefDescription": "Fixed Counter: Counts the number of instructions retired.", "Counter": "Fixed counter 0", "EventName": "INST_RETIRED.ANY", "PublicDescription": "Fixed Counter: Counts the number of instructions retired. Available PDIST counters: 32", - "SampleAfterValue": "1000003", + "SampleAfterValue": "2000003", "UMask": "0x1", "Unit": "cpu_atom" }, @@ -163,7 +1000,8 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc0", "EventName": "INST_RETIRED.ANY_P", - "SampleAfterValue": "1000003", + "PublicDescription": "Counts the number of instructions retired. Available PDIST counters: 0,1", + "SampleAfterValue": "2000003", "Unit": "cpu_atom" }, { @@ -171,8 +1009,273 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc0", "EventName": "INST_RETIRED.ANY_P", - "PublicDescription": "Counts the number of X86 instructions retired - an Architectural PerfMon event. Counting continues during hardware interrupts, traps, and inside interrupt handlers. Notes: INST_RETIRED.ANY is counted by a designated fixed counter freeing up programmable counters to count other events. INST_RETIRED.ANY_P is counted by a programmable counter. Available PDIST counters: 0", + "PublicDescription": "Counts the number of X86 instructions retired - an Architectural PerfMon event. Counting continues during hardware interrupts, traps, and inside interrupt handlers. Notes: INST_RETIRED.ANY is counted by a designated fixed counter freeing up programmable counters to count other events. INST_RETIRED.ANY_P is counted by a programmable counter. Available PDIST counters: 0,1", + "SampleAfterValue": "2000003", + "Unit": "cpu_core" + }, + { + "BriefDescription": "retired macro-fused uops when there is a branch in the macro-fused pair (the two instructions that got macro-fused count once in this pmon)", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc0", + "EventName": "INST_RETIRED.BR_FUSED", + "PublicDescription": "retired macro-fused uops when there is a branch in the macro-fused pair (the two instructions that got macro-fused count once in this pmon) Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x10", + "Unit": "cpu_core" + }, + { + "BriefDescription": "INST_RETIRED.MACRO_FUSED", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc0", + "EventName": "INST_RETIRED.MACRO_FUSED", + "PublicDescription": "INST_RETIRED.MACRO_FUSED Available PDIST counters: 0,1", + "SampleAfterValue": "2000003", + "UMask": "0x30", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired NOP instructions.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc0", + "EventName": "INST_RETIRED.NOP", + "PublicDescription": "Counts all retired NOP or ENDBR32/64 or PREFETCHIT0/1 instructions Available PDIST counters: 0,1", + "SampleAfterValue": "2000003", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Precise instruction retired with PEBS precise-distribution", + "Counter": "Fixed counter 0", + "EventName": "INST_RETIRED.PREC_DIST", + "PublicDescription": "A version of INST_RETIRED that allows for a precise distribution of samples across instructions retired. It utilizes the Precise Distribution of Instructions Retired (PDIR++) feature to fix bias in how retired instructions get sampled. Use on Fixed Counter 0. Available PDIST counters: 32", + "SampleAfterValue": "2000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Iterations of Repeat string retired instructions.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc0", + "EventName": "INST_RETIRED.REP_ITERATION", + "PublicDescription": "Number of iterations of Repeat (REP) string retired instructions such as MOVS, CMPS, and SCAS. Each has a byte, word, and doubleword version and string instructions can be repeated using a repetition prefix, REP, that allows their architectural execution to be repeated a number of times as specified by the RCX register. Note the number of iterations is implementation-dependent. Available PDIST counters: 0,1", "SampleAfterValue": "2000003", + "UMask": "0x8", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Bubble cycles of BPClear.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xad", + "EventName": "INT_MISC.BPCLEAR_CYCLES", + "MSRIndex": "0x3F7", + "MSRValue": "0xB", + "SampleAfterValue": "1000003", + "UMask": "0x40", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Clears speculative count", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "1", + "EdgeDetect": "1", + "EventCode": "0xad", + "EventName": "INT_MISC.CLEARS_COUNT", + "PublicDescription": "Counts the number of speculative clears due to any type of branch misprediction or machine clears", + "SampleAfterValue": "500009", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts cycles after recovery from a branch misprediction or machine clear till the first uop is issued from the resteered path.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xad", + "EventName": "INT_MISC.CLEAR_RESTEER_CYCLES", + "PublicDescription": "Cycles after recovery from a branch misprediction or machine clear till the first uop is issued from the resteered path.", + "SampleAfterValue": "500009", + "UMask": "0x80", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Core cycles the allocator was stalled due to recovery from earlier clear event for this thread", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xad", + "EventName": "INT_MISC.RECOVERY_CYCLES", + "PublicDescription": "Counts core cycles when the Resource allocator was stalled due to recovery from an earlier branch misprediction or machine clear event.", + "SampleAfterValue": "500009", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Bubble cycles of BAClear (Unknown Branch).", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xad", + "EventName": "INT_MISC.UNKNOWN_BRANCH_CYCLES", + "MSRIndex": "0x3F7", + "MSRValue": "0x7", + "SampleAfterValue": "1000003", + "UMask": "0x40", + "Unit": "cpu_core" + }, + { + "BriefDescription": "TMA slots where uops got dropped", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xad", + "EventName": "INT_MISC.UOP_DROPPING", + "PublicDescription": "Estimated number of Top-down Microarchitecture Analysis slots that got dropped due to non front-end reasons", + "SampleAfterValue": "1000003", + "UMask": "0x10", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts the number of uops executed on integer port 0,1, 2, 3.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xb3", + "EventName": "INT_UOPS_EXECUTED.PRIMARY", + "SampleAfterValue": "1000003", + "UMask": "0x78", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Number of vector integer instructions retired of 128-bit vector-width.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xe7", + "EventName": "INT_VEC_RETIRED.128BIT", + "SampleAfterValue": "1000003", + "UMask": "0x13", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of vector integer instructions retired of 256-bit vector-width.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xe7", + "EventName": "INT_VEC_RETIRED.256BIT", + "SampleAfterValue": "1000003", + "UMask": "0xac", + "Unit": "cpu_core" + }, + { + "BriefDescription": "integer ADD, SUB, SAD 128-bit vector instructions.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xe7", + "EventName": "INT_VEC_RETIRED.ADD_128", + "PublicDescription": "Number of retired integer ADD/SUB (regular or horizontal), SAD 128-bit vector instructions.", + "SampleAfterValue": "1000003", + "UMask": "0x3", + "Unit": "cpu_core" + }, + { + "BriefDescription": "integer ADD, SUB, SAD 256-bit vector instructions.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xe7", + "EventName": "INT_VEC_RETIRED.ADD_256", + "PublicDescription": "Number of retired integer ADD/SUB (regular or horizontal), SAD 256-bit vector instructions.", + "SampleAfterValue": "1000003", + "UMask": "0xc", + "Unit": "cpu_core" + }, + { + "BriefDescription": "INT_VEC_RETIRED.MUL_256", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xe7", + "EventName": "INT_VEC_RETIRED.MUL_256", + "SampleAfterValue": "1000003", + "UMask": "0x80", + "Unit": "cpu_core" + }, + { + "BriefDescription": "INT_VEC_RETIRED.SHUFFLES", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xe7", + "EventName": "INT_VEC_RETIRED.SHUFFLES", + "SampleAfterValue": "1000003", + "UMask": "0x40", + "Unit": "cpu_core" + }, + { + "BriefDescription": "INT_VEC_RETIRED.VNNI_128", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xe7", + "EventName": "INT_VEC_RETIRED.VNNI_128", + "SampleAfterValue": "1000003", + "UMask": "0x10", + "Unit": "cpu_core" + }, + { + "BriefDescription": "INT_VEC_RETIRED.VNNI_256", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xe7", + "EventName": "INT_VEC_RETIRED.VNNI_256", + "SampleAfterValue": "1000003", + "UMask": "0x20", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts the number of retired loads that are blocked because it initially appears to be store forward blocked, but subsequently is shown not to be blocked based on 4K alias check.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x03", + "EventName": "LD_BLOCKS.ADDRESS_ALIAS", + "PublicDescription": "Counts the number of retired loads that are blocked because it initially appears to be store forward blocked, but subsequently is shown not to be blocked based on 4K alias check. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x4", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "False dependencies in MOB due to partial compare on address.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x03", + "EventName": "LD_BLOCKS.ADDRESS_ALIAS", + "PublicDescription": "Counts the number of times a load got blocked due to false dependencies in MOB due to partial compare on address.", + "SampleAfterValue": "100003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts the number of retired loads that are blocked for any of the following reasons: DTLB miss, address alias, store forward or data unknown (includes memory disambiguation blocks and ESP consuming load blocks).", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x03", + "EventName": "LD_BLOCKS.ALL", + "PublicDescription": "Counts the number of retired loads that are blocked for any of the following reasons: DTLB miss, address alias, store forward or data unknown (includes memory disambiguation blocks and ESP consuming load blocks). Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x10", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Bank conflicts in DCU due to limited lookup ports.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x03", + "EventName": "LD_BLOCKS.BANK_CONFLICT", + "PublicDescription": "Counts the number of times a load got blocked due to bank conflicts in DCU", + "SampleAfterValue": "100003", + "UMask": "0x40", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts the number of retired loads that are blocked because its address exactly matches an older store whose data is not ready.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x03", + "EventName": "LD_BLOCKS.DATA_UNKNOWN", + "PublicDescription": "Counts the number of retired loads that are blocked because its address exactly matches an older store whose data is not ready. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "The number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x03", + "EventName": "LD_BLOCKS.NO_SR", + "PublicDescription": "Counts the number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use.", + "SampleAfterValue": "100003", + "UMask": "0x88", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts the number of times a load got early blocked due to preceding store operation with unknown address or unknown data. Excluding in-line (immediate) wakeups", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x03", + "EventName": "LD_BLOCKS.STORE_EARLY", + "SampleAfterValue": "100003", + "UMask": "0xa1", "Unit": "cpu_core" }, { @@ -180,6 +1283,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x03", "EventName": "LD_BLOCKS.STORE_FORWARD", + "PublicDescription": "Counts the number of retired loads that are blocked because its address partially overlapped with an older store. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x2", "Unit": "cpu_atom" @@ -195,10 +1299,129 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Cycles Uops delivered by the LSD, but didn't come from the decoder.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "1", + "EventCode": "0xa8", + "EventName": "LSD.CYCLES_ACTIVE", + "PublicDescription": "Counts the cycles when at least one uop is delivered by the LSD (Loop-stream detector).", + "SampleAfterValue": "2000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles optimal number of Uops delivered by the LSD, but did not come from the decoder.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "8", + "EventCode": "0xa8", + "EventName": "LSD.CYCLES_OK", + "PublicDescription": "Counts the cycles when optimal number of uops is delivered by the LSD (Loop-stream detector).", + "SampleAfterValue": "2000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of Uops delivered by the LSD.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xa8", + "EventName": "LSD.UOPS", + "PublicDescription": "Counts the number of uops delivered to the back-end by the LSD(Loop Stream Detector).", + "SampleAfterValue": "2000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of machine clears (nukes) of any type.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "1", + "EdgeDetect": "1", + "EventCode": "0xc3", + "EventName": "MACHINE_CLEARS.COUNT", + "PublicDescription": "Counts the number of machine clears (nukes) of any type.", + "SampleAfterValue": "100003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts the number of machine clears due to memory ordering in which an internal load passes an older store within the same CPU.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc3", + "EventName": "MACHINE_CLEARS.DISAMBIGUATION", + "SampleAfterValue": "1000003", + "UMask": "0x8", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of machine clears due to a page fault. Counts both I-Side and D-Side (Loads/Stores) page faults. A page fault occurs when either the page is not present, or an access violation occurs.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xc3", + "EventName": "MACHINE_CLEARS.PAGE_FAULT", + "SampleAfterValue": "1000003", + "UMask": "0x20", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Self-modifying code (SMC) detected.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc3", + "EventName": "MACHINE_CLEARS.SMC", + "PublicDescription": "Counts self-modifying code (SMC) detected, which causes a machine clear.", + "SampleAfterValue": "100003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts cycles where no execution is happening due to loads waiting for L1 cache (that is: no execution & load in flight & no load missed L1 cache)", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x46", + "EventName": "MEMORY_STALLS.L1", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts cycles where no execution is happening due to loads waiting for L2 cache (that is: no execution & load in flight & load missed L1 & no load missed L2 cache)", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x46", + "EventName": "MEMORY_STALLS.L2", + "SampleAfterValue": "1000003", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts cycles where no execution is happening due to loads waiting for L3 cache (that is: no execution & load in flight & load missed L1 & load missed L2 cache & no load missed L3 Cache)", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x46", + "EventName": "MEMORY_STALLS.L3", + "SampleAfterValue": "1000003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts cycles where no execution is happening due to loads waiting for Memory (that is: no execution & load in flight & a load missed L3 cache)", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x46", + "EventName": "MEMORY_STALLS.MEM", + "SampleAfterValue": "1000003", + "UMask": "0x8", + "Unit": "cpu_core" + }, + { + "BriefDescription": "LFENCE instructions retired", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xe0", + "EventName": "MISC2_RETIRED.LFENCE", + "PublicDescription": "number of LFENCE retired instructions", + "SampleAfterValue": "400009", + "UMask": "0x20", + "Unit": "cpu_core" + }, + { "BriefDescription": "Counts the number of LBR entries recorded. Requires LBRs to be enabled in IA32_LBR_CTL.", "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe4", "EventName": "MISC_RETIRED.LBR_INSERTS", + "PublicDescription": "Counts the number of LBR entries recorded. Requires LBRs to be enabled in IA32_LBR_CTL. Available PDIST counters: 0,1", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_atom" @@ -208,12 +1431,83 @@ "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xe4", "EventName": "MISC_RETIRED.LBR_INSERTS", - "PublicDescription": "LBR record is inserted Available PDIST counters: 0", + "PublicDescription": "LBR record is inserted Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts the number of LFENCE instructions retired.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xe0", + "EventName": "MISC_RETIRED1.LFENCE", + "PublicDescription": "Counts the number of LFENCE instructions retired. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x2", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of accesses to KeyLocker cache.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xe1", + "EventName": "MISC_RETIRED2.KEYLOCKER_ACCESS", + "PublicDescription": "Counts the number of accesses to KeyLocker cache. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x10", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of misses to KeyLocker cache.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xe1", + "EventName": "MISC_RETIRED2.KEYLOCKER_MISS", + "PublicDescription": "Counts the number of misses to KeyLocker cache. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x11", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Cycles when Reservation Station (RS) is empty for the thread.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xa5", + "EventName": "RS.EMPTY", + "PublicDescription": "Counts cycles during which the reservation station (RS) is empty for this logical processor. This is usually caused when the front-end pipeline runs into starvation periods (e.g. branch mispredictions or i-cache misses)", + "SampleAfterValue": "1000003", + "UMask": "0x7", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts end of periods where the Reservation Station (RS) was empty.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "1", + "EdgeDetect": "1", + "EventCode": "0xa5", + "EventName": "RS.EMPTY_COUNT", + "Invert": "1", + "PublicDescription": "Counts end of periods where the Reservation Station (RS) was empty. Could be useful to closely sample on front-end latency issues (see the FRONTEND_RETIRED event of designated precise events)", + "SampleAfterValue": "100003", + "UMask": "0x7", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles when RS was empty and a resource allocation stall is asserted", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xa5", + "EventName": "RS.EMPTY_RESOURCE", "SampleAfterValue": "1000003", "UMask": "0x1", "Unit": "cpu_core" }, { + "BriefDescription": "Counts the number of issue slots in a UMWAIT or TPAUSE instruction where no uop issues due to the instruction putting the CPU into the C0.1 activity state. For Tremont, UMWAIT and TPAUSE will only put the CPU into C0.1 activity state (not C0.2 activity state).", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x75", + "EventName": "SERIALIZATION.C01_MS_SCB", + "SampleAfterValue": "1000003", + "UMask": "0x4", + "Unit": "cpu_atom" + }, + { "BriefDescription": "This event counts a subset of the Topdown Slots event that were not consumed by the back-end pipeline due to lack of back-end resources, as a result of memory subsystem delays, execution units limitations, or other conditions.", "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xa4", @@ -224,6 +1518,35 @@ "Unit": "cpu_core" }, { + "BriefDescription": "TMA slots wasted due to incorrect speculations.", + "Counter": "0", + "EventCode": "0xa4", + "EventName": "TOPDOWN.BAD_SPEC_SLOTS", + "PublicDescription": "Number of slots of TMA method that were wasted due to incorrect speculation. It covers all types of control-flow or data-related mis-speculations.", + "SampleAfterValue": "10000003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "TMA slots wasted due to incorrect speculation by branch mispredictions", + "Counter": "0", + "EventCode": "0xa4", + "EventName": "TOPDOWN.BR_MISPREDICT_SLOTS", + "PublicDescription": "Number of TMA slots that were wasted due to incorrect speculation by (any type of) branch mispredictions. This event estimates number of speculative operations that were issued but not retired as well as the out-of-order engine recovery past a branch misprediction.", + "SampleAfterValue": "10000003", + "UMask": "0x8", + "Unit": "cpu_core" + }, + { + "BriefDescription": "TOPDOWN.MEMORY_BOUND_SLOTS", + "Counter": "3", + "EventCode": "0xa4", + "EventName": "TOPDOWN.MEMORY_BOUND_SLOTS", + "SampleAfterValue": "10000003", + "UMask": "0x10", + "Unit": "cpu_core" + }, + { "BriefDescription": "TMA slots available for an unhalted logical processor. Fixed counter - architectural event", "Counter": "Fixed counter 3", "EventName": "TOPDOWN.SLOTS", @@ -244,7 +1567,7 @@ }, { "BriefDescription": "Fixed Counter: Counts the number of issue slots that were not consumed by the backend because allocation is stalled due to a mispredicted jump or a machine clear.", - "Counter": "36", + "Counter": "Fixed counter 4", "EventName": "TOPDOWN_BAD_SPECULATION.ALL", "SampleAfterValue": "1000003", "UMask": "0x5", @@ -279,7 +1602,7 @@ }, { "BriefDescription": "Fixed Counter: Counts the number of retirement slots not consumed due to front end stalls.", - "Counter": "37", + "Counter": "Fixed counter 5", "EventName": "TOPDOWN_FE_BOUND.ALL", "SampleAfterValue": "1000003", "UMask": "0x6", @@ -296,7 +1619,7 @@ }, { "BriefDescription": "Fixed Counter: Counts the number of consumed retirement slots.", - "Counter": "38", + "Counter": "Fixed counter 6", "EventName": "TOPDOWN_RETIRING.ALL", "SampleAfterValue": "1000003", "UMask": "0x7", @@ -313,6 +1636,237 @@ "Unit": "cpu_atom" }, { + "BriefDescription": "Number of non dec-by-all uops decoded by decoder", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x76", + "EventName": "UOPS_DECODED.DEC0_UOPS", + "PublicDescription": "This event counts the number of not dec-by-all uops decoded by decoder 0.", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Uops executed on INT EU ALU ports.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xb2", + "EventName": "UOPS_DISPATCHED.ALU", + "PublicDescription": "Number of ALU integer uops dispatch to execution.", + "SampleAfterValue": "2000003", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Uops executed on any INT EU ports", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xb2", + "EventName": "UOPS_DISPATCHED.INT_EU_ALL", + "PublicDescription": "Number of integer uops dispatched to execution.", + "SampleAfterValue": "2000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of Uops dispatched/executed by any of the 3 JEUs (all ups that hold the JEU including macro; micro jumps; fetch-from-eip)", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xb2", + "EventName": "UOPS_DISPATCHED.JMP", + "PublicDescription": "Number of jump uops dispatch to execution", + "SampleAfterValue": "2000003", + "UMask": "0x40", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Uops executed on Load ports", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xb2", + "EventName": "UOPS_DISPATCHED.LOAD", + "PublicDescription": "Number of Load uops dispatched to execution.", + "SampleAfterValue": "2000003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of (shift) 1-cycle Uops dispatched/executed by any of the Shift Eus", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xb2", + "EventName": "UOPS_DISPATCHED.SHIFT", + "PublicDescription": "Number of SHIFT integer uops dispatch to execution", + "SampleAfterValue": "2000003", + "UMask": "0x20", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of Uops dispatched/executed by Slow EU (e.g. 3+ cycles LEA, >1 cycles shift, iDIVs, CR; *H operation)", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xb2", + "EventName": "UOPS_DISPATCHED.SLOW", + "PublicDescription": "Number of Slow integer uops dispatch to execution.", + "SampleAfterValue": "2000003", + "UMask": "0x8", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of Uops dispatched on STA ports", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xb2", + "EventName": "UOPS_DISPATCHED.STA", + "PublicDescription": "Number of STA (Store Address) uops dispatch to execution", + "SampleAfterValue": "2000003", + "UMask": "0x80", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Uops executed on STD ports", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xb2", + "EventName": "UOPS_DISPATCHED.STD", + "PublicDescription": "Number of STD (Store Data) uops dispatch to execution", + "SampleAfterValue": "2000003", + "UMask": "0x10", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles where at least 1 uop was executed per-thread", + "Counter": "3", + "CounterMask": "1", + "EventCode": "0xb1", + "EventName": "UOPS_EXECUTED.CYCLES_GE_1", + "PublicDescription": "Cycles where at least 1 uop was executed per-thread.", + "SampleAfterValue": "2000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles where at least 2 uops were executed per-thread", + "Counter": "3", + "CounterMask": "2", + "EventCode": "0xb1", + "EventName": "UOPS_EXECUTED.CYCLES_GE_2", + "PublicDescription": "Cycles where at least 2 uops were executed per-thread.", + "SampleAfterValue": "2000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles where at least 3 uops were executed per-thread", + "Counter": "3", + "CounterMask": "3", + "EventCode": "0xb1", + "EventName": "UOPS_EXECUTED.CYCLES_GE_3", + "PublicDescription": "Cycles where at least 3 uops were executed per-thread.", + "SampleAfterValue": "2000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles where at least 4 uops were executed per-thread", + "Counter": "3", + "CounterMask": "4", + "EventCode": "0xb1", + "EventName": "UOPS_EXECUTED.CYCLES_GE_4", + "PublicDescription": "Cycles where at least 4 uops were executed per-thread.", + "SampleAfterValue": "2000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts number of cycles no uops were dispatched to be executed on this thread.", + "Counter": "3", + "CounterMask": "1", + "EventCode": "0xb1", + "EventName": "UOPS_EXECUTED.STALLS", + "Invert": "1", + "PublicDescription": "Counts cycles during which no uops were dispatched from the Reservation Station (RS) per thread.", + "SampleAfterValue": "2000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts the number of uops to be executed per-thread each cycle.", + "Counter": "3", + "EventCode": "0xb1", + "EventName": "UOPS_EXECUTED.THREAD", + "SampleAfterValue": "2000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts the number of x87 uops dispatched.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xb1", + "EventName": "UOPS_EXECUTED.X87", + "PublicDescription": "Counts the number of x87 uops executed.", + "SampleAfterValue": "2000003", + "UMask": "0x10", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Uops that RAT issues to RS", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xae", + "EventName": "UOPS_ISSUED.ANY", + "PublicDescription": "Counts the number of uops that the Resource Allocation Table (RAT) issues to the Reservation Station (RS).", + "SampleAfterValue": "2000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "UOPS_ISSUED.CYCLES", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "1", + "EventCode": "0xae", + "EventName": "UOPS_ISSUED.CYCLES", + "SampleAfterValue": "2000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles with retired uop(s).", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "1", + "EventCode": "0xc2", + "EventName": "UOPS_RETIRED.CYCLES", + "PublicDescription": "Counts cycles where at least one uop has retired.", + "SampleAfterValue": "1000003", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Retired uops except the last uop of each instruction.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc2", + "EventName": "UOPS_RETIRED.HEAVY", + "PublicDescription": "Counts the number of retired micro-operations (uops) except the last uop of each instruction. An instruction that is decoded into less than two uops does not contribute to the count.", + "SampleAfterValue": "2000003", + "UMask": "0x1", + "Unit": "cpu_core" + }, + { + "BriefDescription": "UOPS_RETIRED.MS", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0xc2", + "EventName": "UOPS_RETIRED.MS", + "MSRIndex": "0x3F7", + "MSRValue": "0x8", + "SampleAfterValue": "2000003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of non-speculative switches to the Microcode Sequencer (MS)", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "1", + "EdgeDetect": "1", + "EventCode": "0xc2", + "EventName": "UOPS_RETIRED.MS_SWITCHES", + "MSRIndex": "0x3F7", + "MSRValue": "0x8", + "PublicDescription": "Switches to the Microcode Sequencer", + "SampleAfterValue": "2000003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { "BriefDescription": "This event counts a subset of the Topdown Slots event that are utilized by operations that eventually get retired (committed) by the processor pipeline. Usually, this event positively correlates with higher performance for example, as measured by the instructions-per-cycle metric.", "Counter": "0,1,2,3,4,5,6,7,8,9", "EventCode": "0xc2", @@ -321,5 +1875,17 @@ "SampleAfterValue": "2000003", "UMask": "0x2", "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles without actually retired uops.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "1", + "EventCode": "0xc2", + "EventName": "UOPS_RETIRED.STALLS", + "Invert": "1", + "PublicDescription": "This event counts cycles without actually retired uops.", + "SampleAfterValue": "1000003", + "UMask": "0x2", + "Unit": "cpu_core" } ] diff --git a/tools/perf/pmu-events/arch/x86/pantherlake/uncore-memory.json b/tools/perf/pmu-events/arch/x86/pantherlake/uncore-memory.json new file mode 100644 index 000000000000..a881b99be5f3 --- /dev/null +++ b/tools/perf/pmu-events/arch/x86/pantherlake/uncore-memory.json @@ -0,0 +1,26 @@ +[ + { + "BriefDescription": "Read CAS command sent to DRAM", + "Counter": "0,1,2,3,4", + "EventCode": "0x22", + "EventName": "UNC_M_CAS_COUNT_RD", + "PerPkg": "1", + "Unit": "iMC" + }, + { + "BriefDescription": "Write CAS command sent to DRAM", + "Counter": "0,1,2,3,4", + "EventCode": "0x23", + "EventName": "UNC_M_CAS_COUNT_WR", + "PerPkg": "1", + "Unit": "iMC" + }, + { + "BriefDescription": "Total number of read and write byte transfers to/from DRAM, in 32B chunk, per DDR channel. Counter increments by 1 after sending or receiving 32B chunk data.", + "Counter": "0,1,2,3,4", + "EventCode": "0x3C", + "EventName": "UNC_M_TOTAL_DATA", + "PerPkg": "1", + "Unit": "iMC" + } +] diff --git a/tools/perf/pmu-events/arch/x86/pantherlake/virtual-memory.json b/tools/perf/pmu-events/arch/x86/pantherlake/virtual-memory.json index 690c5dff9d9e..8d56c16b2a39 100644 --- a/tools/perf/pmu-events/arch/x86/pantherlake/virtual-memory.json +++ b/tools/perf/pmu-events/arch/x86/pantherlake/virtual-memory.json @@ -1,5 +1,44 @@ [ { + "BriefDescription": "Counts the number of page walks initiated by a demand load that missed the first and second level TLBs.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x08", + "EventName": "DTLB_LOAD_MISSES.MISS_CAUSED_WALK", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of first level TLB misses but second level hits due to a demand load that did not start a page walk. Accounts for all page sizes. Will result in a DTLB write from STLB.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x08", + "EventName": "DTLB_LOAD_MISSES.STLB_HIT", + "SampleAfterValue": "1000003", + "UMask": "0x20", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Loads that miss the DTLB and hit the STLB.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x12", + "EventName": "DTLB_LOAD_MISSES.STLB_HIT", + "PublicDescription": "Counts loads that miss the DTLB (Data TLB) and hit the STLB (Second level TLB).", + "SampleAfterValue": "100003", + "UMask": "0x320", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles when at least one PMH is busy with a page walk for a demand load.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "1", + "EventCode": "0x12", + "EventName": "DTLB_LOAD_MISSES.WALK_ACTIVE", + "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a demand load.", + "SampleAfterValue": "100003", + "UMask": "0x10", + "Unit": "cpu_core" + }, + { "BriefDescription": "Counts the number of page walks completed due to load DTLB misses to any page size.", "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x08", @@ -20,6 +59,86 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Page walks completed due to a demand data load to a 1G page.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x12", + "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_1G", + "PublicDescription": "Counts completed page walks (1G sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", + "SampleAfterValue": "100003", + "UMask": "0x8", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Page walks completed due to a demand data load to a 2M/4M page.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x12", + "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M", + "PublicDescription": "Counts completed page walks (2M/4M sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", + "SampleAfterValue": "100003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Page walks completed due to a demand data load to a 4K page.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x12", + "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_4K", + "PublicDescription": "Counts completed page walks (4K sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", + "SampleAfterValue": "100003", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of page walks outstanding for a demand load in the PMH each cycle.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x12", + "EventName": "DTLB_LOAD_MISSES.WALK_PENDING", + "PublicDescription": "Counts the number of page walks outstanding for a demand load in the PMH (Page Miss Handler) each cycle.", + "SampleAfterValue": "100003", + "UMask": "0x10", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts the number of page walks initiated by a store that missed the first and second level TLBs.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x49", + "EventName": "DTLB_STORE_MISSES.MISS_CAUSED_WALK", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of first level TLB misses but second level hits due to stores that did not start a page walk. Accounts for all page sizes. Will result in a DTLB write from STLB.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x49", + "EventName": "DTLB_STORE_MISSES.STLB_HIT", + "PublicDescription": "Counts the number of first level TLB misses but second level hits due to a demand load that did not start a page walk. Accounts for all page sizes. Will result in a DTLB write from STLB.", + "SampleAfterValue": "1000003", + "UMask": "0x20", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Stores that miss the DTLB and hit the STLB.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x13", + "EventName": "DTLB_STORE_MISSES.STLB_HIT", + "PublicDescription": "Counts stores that miss the DTLB (Data TLB) and hit the STLB (2nd Level TLB).", + "SampleAfterValue": "100003", + "UMask": "0x320", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles when at least one PMH is busy with a page walk for a store.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "1", + "EventCode": "0x13", + "EventName": "DTLB_STORE_MISSES.WALK_ACTIVE", + "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a store.", + "SampleAfterValue": "100003", + "UMask": "0x10", + "Unit": "cpu_core" + }, + { "BriefDescription": "Counts the number of page walks completed due to store DTLB misses to any page size.", "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x49", @@ -40,6 +159,85 @@ "Unit": "cpu_core" }, { + "BriefDescription": "Page walks completed due to a demand data store to a 1G page.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x13", + "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_1G", + "PublicDescription": "Counts completed page walks (1G sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", + "SampleAfterValue": "100003", + "UMask": "0x8", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Page walks completed due to a demand data store to a 2M/4M page.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x13", + "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M", + "PublicDescription": "Counts completed page walks (2M/4M sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", + "SampleAfterValue": "100003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Page walks completed due to a demand data store to a 4K page.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x13", + "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_4K", + "PublicDescription": "Counts completed page walks (4K sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", + "SampleAfterValue": "100003", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Number of page walks outstanding for a store in the PMH each cycle.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x13", + "EventName": "DTLB_STORE_MISSES.WALK_PENDING", + "PublicDescription": "Counts the number of page walks outstanding for a store in the PMH (Page Miss Handler) each cycle.", + "SampleAfterValue": "100003", + "UMask": "0x10", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts the number of page walks initiated by a instruction fetch that missed the first and second level TLBs.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x85", + "EventName": "ITLB_MISSES.MISS_CAUSED_WALK", + "SampleAfterValue": "1000003", + "UMask": "0x1", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Counts the number of first level TLB misses but second level hits due to an instruction fetch that did not start a page walk. Account for all pages sizes. Will result in an ITLB write from STLB.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x85", + "EventName": "ITLB_MISSES.STLB_HIT", + "SampleAfterValue": "1000003", + "UMask": "0x20", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Instruction fetch requests that miss the ITLB and hit the STLB.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x11", + "EventName": "ITLB_MISSES.STLB_HIT", + "PublicDescription": "Counts instruction fetch requests that miss the ITLB (Instruction TLB) and hit the STLB (Second-level TLB).", + "SampleAfterValue": "100003", + "UMask": "0x120", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Cycles when at least one PMH is busy with a page walk for code (instruction fetch) request.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "CounterMask": "1", + "EventCode": "0x11", + "EventName": "ITLB_MISSES.WALK_ACTIVE", + "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a code (instruction fetch) request.", + "SampleAfterValue": "100003", + "UMask": "0x10", + "Unit": "cpu_core" + }, + { "BriefDescription": "Counts the number of page walks completed due to instruction fetch misses to any page size.", "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x85", @@ -58,5 +256,55 @@ "SampleAfterValue": "100003", "UMask": "0xe", "Unit": "cpu_core" + }, + { + "BriefDescription": "Code miss in all TLB levels causes a page walk that completes. (2M/4M)", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x11", + "EventName": "ITLB_MISSES.WALK_COMPLETED_2M_4M", + "PublicDescription": "Counts completed page walks (2M/4M page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault.", + "SampleAfterValue": "100003", + "UMask": "0x4", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Code miss in all TLB levels causes a page walk that completes. (4K)", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x11", + "EventName": "ITLB_MISSES.WALK_COMPLETED_4K", + "PublicDescription": "Counts completed page walks (4K page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault.", + "SampleAfterValue": "100003", + "UMask": "0x2", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts the number of page walks outstanding for iside in PMH every cycle.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x85", + "EventName": "ITLB_MISSES.WALK_PENDING", + "PublicDescription": "Counts the number of page walks outstanding for iside in PMH every cycle. A PMH page walk is outstanding from page walk start till PMH becomes idle again (ready to serve next walk). Includes EPT-walk intervals. Walks could be counted by edge detecting on this event, but would count restarted suspended walks.", + "SampleAfterValue": "1000003", + "UMask": "0x10", + "Unit": "cpu_atom" + }, + { + "BriefDescription": "Number of page walks outstanding for an outstanding code request in the PMH each cycle.", + "Counter": "0,1,2,3,4,5,6,7,8,9", + "EventCode": "0x11", + "EventName": "ITLB_MISSES.WALK_PENDING", + "PublicDescription": "Counts the number of page walks outstanding for an outstanding code (instruction fetch) request in the PMH (Page Miss Handler) each cycle.", + "SampleAfterValue": "100003", + "UMask": "0x10", + "Unit": "cpu_core" + }, + { + "BriefDescription": "Counts the number of retired loads that are blocked due to a first level TLB miss.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x03", + "EventName": "LD_BLOCKS.DTLB_MISS", + "PublicDescription": "Counts the number of retired loads that are blocked due to a first level TLB miss. Available PDIST counters: 0,1", + "SampleAfterValue": "1000003", + "UMask": "0x8", + "Unit": "cpu_atom" } ] diff --git a/tools/perf/pmu-events/arch/x86/rocketlake/rkl-metrics.json b/tools/perf/pmu-events/arch/x86/rocketlake/rkl-metrics.json index 71737a1a1997..79dc34157481 100644 --- a/tools/perf/pmu-events/arch/x86/rocketlake/rkl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/rocketlake/rkl-metrics.json @@ -1,63 +1,63 @@ [ { "BriefDescription": "C10 residency percent per package", - "MetricExpr": "cstate_pkg@c10\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c10\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C10_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C2 residency percent per package", - "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c2\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C2_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per package", - "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per core", - "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricExpr": "cstate_core@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per package", - "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per core", - "MetricExpr": "cstate_core@c7\\-residency@ / TSC", + "MetricExpr": "cstate_core@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per package", - "MetricExpr": "cstate_pkg@c7\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C8 residency percent per package", - "MetricExpr": "cstate_pkg@c8\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c8\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C8_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C9 residency percent per package", - "MetricExpr": "cstate_pkg@c9\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c9\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C9_Pkg_Residency", "ScaleUnit": "100%" @@ -85,7 +85,6 @@ }, { "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_4k_aliasing", @@ -134,6 +133,7 @@ }, { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", "MetricGroup": "BigFootprint;BvBC;Fed;Frontend;IcMiss;MemoryTLB", "MetricName": "tma_bottleneck_big_code", @@ -148,39 +148,44 @@ "PublicDescription": "Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound)" }, { + "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", + "MetricGroup": "BvCB;Cor;tma_issueComp", + "MetricName": "tma_bottleneck_compute_bound_est", + "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", + "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " + }, + { "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", "MetricGroup": "BvMB;Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_bottleneck_cache_memory_bandwidth", - "MetricThreshold": "tma_bottleneck_cache_memory_bandwidth > 20", + "MetricName": "tma_bottleneck_data_cache_memory_bandwidth", + "MetricThreshold": "tma_bottleneck_data_cache_memory_bandwidth > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_dependency / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_lock_latency / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_loads / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_stores / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "BvML;Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_bottleneck_cache_memory_latency", - "MetricThreshold": "tma_bottleneck_cache_memory_latency > 20", + "MetricName": "tma_bottleneck_data_cache_memory_latency", + "MetricThreshold": "tma_bottleneck_data_cache_memory_latency > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency" }, { - "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", - "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", - "MetricGroup": "BvCB;Cor;tma_issueComp", - "MetricName": "tma_bottleneck_compute_bound_est", - "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", - "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " - }, - { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end)", - "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_mispredicts_resteers) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_ms / (tma_dsb + tma_lsd + tma_mite + tma_ms))) - tma_bottleneck_big_code", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_mispredicts_resteers) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_ms)) - tma_bottleneck_big_code", "MetricGroup": "BvFB;Fed;FetchBW;Frontend", "MetricName": "tma_bottleneck_instruction_fetch_bw", "MetricThreshold": "tma_bottleneck_instruction_fetch_bw > 20" }, { "BriefDescription": "Total pipeline cost of irregular execution (e.g", - "MetricExpr": "100 * (tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_mispredicts_resteers) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_ms / (tma_dsb + tma_lsd + tma_mite + tma_ms)) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + tma_core_bound * RS_EVENTS.EMPTY_CYCLES / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_mispredicts_resteers) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_ms) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + tma_core_bound * RS_EVENTS.EMPTY_CYCLES / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", "MetricGroup": "Bad;BvIO;Cor;Ret;tma_issueMS", "MetricName": "tma_bottleneck_irregular_overhead", "MetricThreshold": "tma_bottleneck_irregular_overhead > 10", @@ -188,6 +193,7 @@ }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "BvMT;Mem;MemoryTLB;Offcore;tma_issueTLB", "MetricName": "tma_bottleneck_memory_data_tlbs", @@ -196,6 +202,7 @@ }, { "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))", "MetricGroup": "BvMS;LockCont;Mem;Offcore;tma_issueSyncxn", "MetricName": "tma_bottleneck_memory_synchronization", @@ -204,6 +211,7 @@ }, { "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", "MetricGroup": "Bad;BadSpec;BrMispredicts;BvMP;tma_issueBM", "MetricName": "tma_bottleneck_mispredictions", @@ -212,7 +220,8 @@ }, { "BriefDescription": "Total pipeline cost of remaining bottlenecks in the back-end", - "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_cache_memory_bandwidth + tma_bottleneck_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_data_cache_memory_bandwidth + tma_bottleneck_data_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", "MetricGroup": "BvOB;Cor;Offcore", "MetricName": "tma_bottleneck_other_bottlenecks", "MetricThreshold": "tma_bottleneck_other_bottlenecks > 20", @@ -220,6 +229,7 @@ }, { "BriefDescription": "Total pipeline cost of \"useful operations\" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_retiring - (BR_INST_RETIRED.ALL_BRANCHES + 2 * BR_INST_RETIRED.NEAR_CALL + INST_RETIRED.NOP) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", "MetricGroup": "BvUW;Ret", "MetricName": "tma_bottleneck_useful_work", @@ -427,7 +437,7 @@ "MetricGroup": "BvMB;MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -619,6 +629,7 @@ }, { "BriefDescription": "Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_frontend_bound * (tma_fetch_bandwidth / (tma_fetch_bandwidth + tma_fetch_latency)) * (tma_dsb / (tma_dsb + tma_lsd + tma_mite + tma_ms)))", "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", "MetricName": "tma_info_botlnk_l2_dsb_bandwidth", @@ -1074,7 +1085,7 @@ "MetricName": "tma_info_memory_tlb_store_stlb_mpki" }, { - "BriefDescription": "", + "BriefDescription": "Mem;Backend;CacheHits", "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" @@ -1098,6 +1109,12 @@ "MetricName": "tma_info_pipeline_fetch_mite" }, { + "BriefDescription": "Average number of uops fetched from MS per cycle", + "MetricExpr": "IDQ.MS_UOPS / cpu@IDQ.MS_UOPS\\,cmask\\=1@", + "MetricGroup": "Fed;FetchLat;MicroSeq", + "MetricName": "tma_info_pipeline_fetch_ms" + }, + { "BriefDescription": "Instructions per a microcode Assist invocation", "MetricExpr": "INST_RETIRED.ANY / ASSISTS.ANY", "MetricGroup": "MicroSeq;Pipeline;Ret;Retire", @@ -1113,7 +1130,7 @@ }, { "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / tma_info_system_time", + "MetricExpr": "tma_info_system_turbo_utilization * msr@tsc@ / 1e9 / tma_info_system_time", "MetricGroup": "Power;Summary", "MetricName": "tma_info_system_core_frequency" }, @@ -1125,7 +1142,7 @@ }, { "BriefDescription": "Average number of utilized CPUs", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", "MetricGroup": "Summary", "MetricName": "tma_info_system_cpus_utilized" }, @@ -1134,7 +1151,7 @@ "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / tma_info_system_time / 1e3", "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_mem_bandwidth, tma_sq_full" + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Giga Floating Point Operations Per Second", @@ -1165,6 +1182,7 @@ }, { "BriefDescription": "Average number of parallel data read requests to external memory", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "UNC_ARB_DAT_OCCUPANCY.RD / UNC_ARB_DAT_OCCUPANCY.RD@cmask\\=1@", "MetricGroup": "Mem;MemoryBW;SoC", "MetricName": "tma_info_system_mem_parallel_reads", @@ -1316,12 +1334,12 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", + "BriefDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", "MetricExpr": "min(2 * (MEM_INST_RETIRED.ALL_LOADS - MEM_LOAD_RETIRED.FB_HIT - MEM_LOAD_RETIRED.L1_MISS) * 20 / 100, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_thread_clks", "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_l1_latency_dependency", "MetricThreshold": "tma_l1_latency_dependency > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", + "PublicDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", "ScaleUnit": "100%" }, { @@ -1345,7 +1363,6 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks", "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", @@ -1359,7 +1376,7 @@ "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_cache_memory_latency, tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_data_cache_memory_latency, tma_mem_latency", "ScaleUnit": "100%" }, { @@ -1465,7 +1482,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { @@ -1474,7 +1491,7 @@ "MetricGroup": "BvML;MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -1542,7 +1559,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the Microcode Sequencer (MS) unit - see Microcode_Sequencer node for details.", - "MetricExpr": "cpu@IDQ.MS_UOPS\\,cmask\\=1@ / tma_info_core_core_clks / 2", + "MetricExpr": "cpu@IDQ.MS_UOPS\\,cmask\\=1@ / tma_info_core_core_clks / 3.3", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_ms", "MetricThreshold": "tma_ms > 0.05 & tma_fetch_bandwidth > 0.2", @@ -1676,7 +1693,7 @@ { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", "DefaultMetricgroupName": "TopdownL1", - "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", + "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound)", "MetricGroup": "BvUW;Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -1713,7 +1730,6 @@ }, { "BriefDescription": "This metric represents rate of split store accesses", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group", "MetricName": "tma_split_stores", @@ -1727,7 +1743,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { @@ -1741,7 +1757,6 @@ }, { "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_store_fwd_blk", diff --git a/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json b/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json index 823d8b7c4224..d40761903429 100644 --- a/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json +++ b/tools/perf/pmu-events/arch/x86/sandybridge/snb-metrics.json @@ -1,49 +1,49 @@ [ { "BriefDescription": "C2 residency percent per package", - "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c2\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C2_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per core", - "MetricExpr": "cstate_core@c3\\-residency@ / TSC", + "MetricExpr": "cstate_core@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per package", - "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per core", - "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricExpr": "cstate_core@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per package", - "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per core", - "MetricExpr": "cstate_core@c7\\-residency@ / TSC", + "MetricExpr": "cstate_core@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per package", - "MetricExpr": "cstate_pkg@c7\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Pkg_Residency", "ScaleUnit": "100%" @@ -71,7 +71,6 @@ }, { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "1 - (tma_frontend_bound + tma_bad_speculation + tma_retiring)", "MetricGroup": "BvOB;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", @@ -296,7 +295,7 @@ }, { "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / tma_info_system_time", + "MetricExpr": "tma_info_system_turbo_utilization * msr@tsc@ / 1e9 / tma_info_system_time", "MetricGroup": "Power;Summary", "MetricName": "tma_info_system_core_frequency" }, @@ -308,7 +307,7 @@ }, { "BriefDescription": "Average number of utilized CPUs", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", "MetricGroup": "Summary", "MetricName": "tma_info_system_cpus_utilized" }, diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/cache.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/cache.json index 21db53f9e9d6..c66324d41a89 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/cache.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/cache.json @@ -1,10 +1,72 @@ [ { + "BriefDescription": "Hit snoop reply with data, line invalidated.", + "Counter": "0,1,2,3", + "EventCode": "0x27", + "EventName": "CORE_SNOOP_RESPONSE.I_FWD_FE", + "PublicDescription": "Counts responses to snoops indicating the line will now be (I)nvalidated: removed from this core's cache, after the data is forwarded back to the requestor and indicating the data was found unmodified in the (FE) Forward or Exclusive State in this cores caches cache. A single snoop response from the core counts on all hyperthreads of the core.", + "SampleAfterValue": "1000003", + "UMask": "0x20" + }, + { + "BriefDescription": "HitM snoop reply with data, line invalidated.", + "Counter": "0,1,2,3", + "EventCode": "0x27", + "EventName": "CORE_SNOOP_RESPONSE.I_FWD_M", + "PublicDescription": "Counts responses to snoops indicating the line will now be (I)nvalidated: removed from this core's caches, after the data is forwarded back to the requestor, and indicating the data was found modified(M) in this cores caches cache (aka HitM response). A single snoop response from the core counts on all hyperthreads of the core.", + "SampleAfterValue": "1000003", + "UMask": "0x10" + }, + { + "BriefDescription": "Hit snoop reply without sending the data, line invalidated.", + "Counter": "0,1,2,3", + "EventCode": "0x27", + "EventName": "CORE_SNOOP_RESPONSE.I_HIT_FSE", + "PublicDescription": "Counts responses to snoops indicating the line will now be (I)nvalidated in this core's caches without forwarded back to the requestor. The line was in Forward, Shared or Exclusive (FSE) state in this cores caches. A single snoop response from the core counts on all hyperthreads of the core.", + "SampleAfterValue": "1000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Line not found snoop reply", + "Counter": "0,1,2,3", + "EventCode": "0x27", + "EventName": "CORE_SNOOP_RESPONSE.MISS", + "PublicDescription": "Counts responses to snoops indicating that the data was not found (IHitI) in this core's caches. A single snoop response from the core counts on all hyperthreads of the Core.", + "SampleAfterValue": "1000003", + "UMask": "0x1" + }, + { + "BriefDescription": "Hit snoop reply with data, line kept in Shared state.", + "Counter": "0,1,2,3", + "EventCode": "0x27", + "EventName": "CORE_SNOOP_RESPONSE.S_FWD_FE", + "PublicDescription": "Counts responses to snoops indicating the line may be kept on this core in the (S)hared state, after the data is forwarded back to the requestor, initially the data was found in the cache in the (FS) Forward or Shared state. A single snoop response from the core counts on all hyperthreads of the core.", + "SampleAfterValue": "1000003", + "UMask": "0x40" + }, + { + "BriefDescription": "HitM snoop reply with data, line kept in Shared state", + "Counter": "0,1,2,3", + "EventCode": "0x27", + "EventName": "CORE_SNOOP_RESPONSE.S_FWD_M", + "PublicDescription": "Counts responses to snoops indicating the line may be kept on this core in the (S)hared state, after the data is forwarded back to the requestor, initially the data was found in the cache in the (M)odified state. A single snoop response from the core counts on all hyperthreads of the core.", + "SampleAfterValue": "1000003", + "UMask": "0x8" + }, + { + "BriefDescription": "Hit snoop reply without sending the data, line kept in Shared state.", + "Counter": "0,1,2,3", + "EventCode": "0x27", + "EventName": "CORE_SNOOP_RESPONSE.S_HIT_FSE", + "PublicDescription": "Counts responses to snoops indicating the line was kept on this core in the (S)hared state, and that the data was found unmodified but not forwarded back to the requestor, initially the data was found in the cache in the (FSE) Forward, Shared state or Exclusive state. A single snoop response from the core counts on all hyperthreads of the core.", + "SampleAfterValue": "1000003", + "UMask": "0x4" + }, + { "BriefDescription": "L1D.HWPF_MISS", "Counter": "0,1,2,3", "EventCode": "0x51", "EventName": "L1D.HWPF_MISS", - "PublicDescription": "L1D.HWPF_MISS Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x20" }, @@ -13,7 +75,7 @@ "Counter": "0,1,2,3", "EventCode": "0x51", "EventName": "L1D.REPLACEMENT", - "PublicDescription": "Counts L1D data line replacements including opportunistic replacements, and replacements that require stall-for-replace or block-for-replace. Available PDIST counters: 0", + "PublicDescription": "Counts L1D data line replacements including opportunistic replacements, and replacements that require stall-for-replace or block-for-replace.", "SampleAfterValue": "100003", "UMask": "0x1" }, @@ -22,7 +84,7 @@ "Counter": "0,1,2,3", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.FB_FULL", - "PublicDescription": "Counts number of cycles a demand request has waited due to L1D Fill Buffer (FB) unavailability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses. Available PDIST counters: 0", + "PublicDescription": "Counts number of cycles a demand request has waited due to L1D Fill Buffer (FB) unavailability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.", "SampleAfterValue": "1000003", "UMask": "0x2" }, @@ -33,7 +95,7 @@ "EdgeDetect": "1", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.FB_FULL_PERIODS", - "PublicDescription": "Counts number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses. Available PDIST counters: 0", + "PublicDescription": "Counts number of phases a demand request has waited due to L1D Fill Buffer (FB) unavailability. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.", "SampleAfterValue": "1000003", "UMask": "0x2" }, @@ -43,7 +105,6 @@ "Deprecated": "1", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.L2_STALL", - "PublicDescription": "This event is deprecated. Refer to new event L1D_PEND_MISS.L2_STALLS Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x4" }, @@ -52,7 +113,7 @@ "Counter": "0,1,2,3", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.L2_STALLS", - "PublicDescription": "Counts number of cycles a demand request has waited due to L1D due to lack of L2 resources. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses. Available PDIST counters: 0", + "PublicDescription": "Counts number of cycles a demand request has waited due to L1D due to lack of L2 resources. Demand requests include cacheable/uncacheable demand load, store, lock or SW prefetch accesses.", "SampleAfterValue": "1000003", "UMask": "0x4" }, @@ -61,7 +122,7 @@ "Counter": "0,1,2,3", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.PENDING", - "PublicDescription": "Counts number of L1D misses that are outstanding in each cycle, that is each cycle the number of Fill Buffers (FB) outstanding required by Demand Reads. FB either is held by demand loads, or it is held by non-demand loads and gets hit at least once by demand. The valid outstanding interval is defined until the FB deallocation by one of the following ways: from FB allocation, if FB is allocated by demand from the demand Hit FB, if it is allocated by hardware or software prefetch. Note: In the L1D, a Demand Read contains cacheable or noncacheable demand loads, including ones causing cache-line splits and reads due to page walks resulted from any request type. Available PDIST counters: 0", + "PublicDescription": "Counts number of L1D misses that are outstanding in each cycle, that is each cycle the number of Fill Buffers (FB) outstanding required by Demand Reads. FB either is held by demand loads, or it is held by non-demand loads and gets hit at least once by demand. The valid outstanding interval is defined until the FB deallocation by one of the following ways: from FB allocation, if FB is allocated by demand from the demand Hit FB, if it is allocated by hardware or software prefetch. Note: In the L1D, a Demand Read contains cacheable or noncacheable demand loads, including ones causing cache-line splits and reads due to page walks resulted from any request type.", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -71,7 +132,7 @@ "CounterMask": "1", "EventCode": "0x48", "EventName": "L1D_PEND_MISS.PENDING_CYCLES", - "PublicDescription": "Counts duration of L1D miss outstanding in cycles. Available PDIST counters: 0", + "PublicDescription": "Counts duration of L1D miss outstanding in cycles.", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -80,7 +141,7 @@ "Counter": "0,1,2,3", "EventCode": "0x25", "EventName": "L2_LINES_IN.ALL", - "PublicDescription": "Counts the number of L2 cache lines filling the L2. Counting does not cover rejects. Available PDIST counters: 0", + "PublicDescription": "Counts the number of L2 cache lines filling the L2. Counting does not cover rejects.", "SampleAfterValue": "100003", "UMask": "0x1f" }, @@ -89,7 +150,7 @@ "Counter": "0,1,2,3", "EventCode": "0x26", "EventName": "L2_LINES_OUT.NON_SILENT", - "PublicDescription": "Counts the number of lines that are evicted by L2 cache when triggered by an L2 cache fill. Those lines are in Modified state. Modified lines are written back to L3 Available PDIST counters: 0", + "PublicDescription": "Counts the number of lines that are evicted by L2 cache when triggered by an L2 cache fill. Those lines are in Modified state. Modified lines are written back to L3", "SampleAfterValue": "200003", "UMask": "0x2" }, @@ -98,7 +159,7 @@ "Counter": "0,1,2,3", "EventCode": "0x26", "EventName": "L2_LINES_OUT.SILENT", - "PublicDescription": "Counts the number of lines that are silently dropped by L2 cache. These lines are typically in Shared or Exclusive state. A non-threaded event. Available PDIST counters: 0", + "PublicDescription": "Counts the number of lines that are silently dropped by L2 cache. These lines are typically in Shared or Exclusive state. A non-threaded event.", "SampleAfterValue": "200003", "UMask": "0x1" }, @@ -107,7 +168,7 @@ "Counter": "0,1,2,3", "EventCode": "0x26", "EventName": "L2_LINES_OUT.USELESS_HWPF", - "PublicDescription": "Counts the number of cache lines that have been prefetched by the L2 hardware prefetcher but not used by demand access when evicted from the L2 cache Available PDIST counters: 0", + "PublicDescription": "Counts the number of cache lines that have been prefetched by the L2 hardware prefetcher but not used by demand access when evicted from the L2 cache", "SampleAfterValue": "200003", "UMask": "0x4" }, @@ -116,7 +177,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_REQUEST.ALL", - "PublicDescription": "Counts all requests that were hit or true misses in L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. [This event is alias to L2_RQSTS.REFERENCES] Available PDIST counters: 0", + "PublicDescription": "Counts all requests that were hit or true misses in L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. [This event is alias to L2_RQSTS.REFERENCES]", "SampleAfterValue": "200003", "UMask": "0xff" }, @@ -125,7 +186,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_REQUEST.MISS", - "PublicDescription": "Counts read requests of any type with true-miss in the L2 cache. True-miss excludes L2 misses that were merged with ongoing L2 misses. [This event is alias to L2_RQSTS.MISS] Available PDIST counters: 0", + "PublicDescription": "Counts read requests of any type with true-miss in the L2 cache. True-miss excludes L2 misses that were merged with ongoing L2 misses. [This event is alias to L2_RQSTS.MISS]", "SampleAfterValue": "200003", "UMask": "0x3f" }, @@ -134,7 +195,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_CODE_RD", - "PublicDescription": "Counts the total number of L2 code requests. Available PDIST counters: 0", + "PublicDescription": "Counts the total number of L2 code requests.", "SampleAfterValue": "200003", "UMask": "0xe4" }, @@ -143,7 +204,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_DEMAND_DATA_RD", - "PublicDescription": "Counts Demand Data Read requests accessing the L2 cache. These requests may hit or miss L2 cache. True-miss exclude misses that were merged with ongoing L2 misses. An access is counted once. Available PDIST counters: 0", + "PublicDescription": "Counts Demand Data Read requests accessing the L2 cache. These requests may hit or miss L2 cache. True-miss exclude misses that were merged with ongoing L2 misses. An access is counted once.", "SampleAfterValue": "200003", "UMask": "0xe1" }, @@ -152,7 +213,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_DEMAND_MISS", - "PublicDescription": "Counts demand requests that miss L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts demand requests that miss L2 cache.", "SampleAfterValue": "200003", "UMask": "0x27" }, @@ -161,7 +222,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_DEMAND_REFERENCES", - "PublicDescription": "Counts demand requests to L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts demand requests to L2 cache.", "SampleAfterValue": "200003", "UMask": "0xe7" }, @@ -170,7 +231,6 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_HWPF", - "PublicDescription": "L2_RQSTS.ALL_HWPF Available PDIST counters: 0", "SampleAfterValue": "200003", "UMask": "0xf0" }, @@ -179,7 +239,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.ALL_RFO", - "PublicDescription": "Counts the total number of RFO (read for ownership) requests to L2 cache. L2 RFO requests include both L1D demand RFO misses as well as L1D RFO prefetches. Available PDIST counters: 0", + "PublicDescription": "Counts the total number of RFO (read for ownership) requests to L2 cache. L2 RFO requests include both L1D demand RFO misses as well as L1D RFO prefetches.", "SampleAfterValue": "200003", "UMask": "0xe2" }, @@ -188,7 +248,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.CODE_RD_HIT", - "PublicDescription": "Counts L2 cache hits when fetching instructions, code reads. Available PDIST counters: 0", + "PublicDescription": "Counts L2 cache hits when fetching instructions, code reads.", "SampleAfterValue": "200003", "UMask": "0xc4" }, @@ -197,7 +257,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.CODE_RD_MISS", - "PublicDescription": "Counts L2 cache misses when fetching instructions. Available PDIST counters: 0", + "PublicDescription": "Counts L2 cache misses when fetching instructions.", "SampleAfterValue": "200003", "UMask": "0x24" }, @@ -206,7 +266,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.DEMAND_DATA_RD_HIT", - "PublicDescription": "Counts the number of demand Data Read requests initiated by load instructions that hit L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts the number of demand Data Read requests initiated by load instructions that hit L2 cache.", "SampleAfterValue": "200003", "UMask": "0xc1" }, @@ -215,7 +275,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.DEMAND_DATA_RD_MISS", - "PublicDescription": "Counts demand Data Read requests with true-miss in the L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. An access is counted once. Available PDIST counters: 0", + "PublicDescription": "Counts demand Data Read requests with true-miss in the L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. An access is counted once.", "SampleAfterValue": "200003", "UMask": "0x21" }, @@ -224,7 +284,6 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.HWPF_MISS", - "PublicDescription": "L2_RQSTS.HWPF_MISS Available PDIST counters: 0", "SampleAfterValue": "200003", "UMask": "0x30" }, @@ -233,7 +292,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.MISS", - "PublicDescription": "Counts read requests of any type with true-miss in the L2 cache. True-miss excludes L2 misses that were merged with ongoing L2 misses. [This event is alias to L2_REQUEST.MISS] Available PDIST counters: 0", + "PublicDescription": "Counts read requests of any type with true-miss in the L2 cache. True-miss excludes L2 misses that were merged with ongoing L2 misses. [This event is alias to L2_REQUEST.MISS]", "SampleAfterValue": "200003", "UMask": "0x3f" }, @@ -242,7 +301,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.REFERENCES", - "PublicDescription": "Counts all requests that were hit or true misses in L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. [This event is alias to L2_REQUEST.ALL] Available PDIST counters: 0", + "PublicDescription": "Counts all requests that were hit or true misses in L2 cache. True-miss excludes misses that were merged with ongoing L2 misses. [This event is alias to L2_REQUEST.ALL]", "SampleAfterValue": "200003", "UMask": "0xff" }, @@ -251,7 +310,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.RFO_HIT", - "PublicDescription": "Counts the RFO (Read-for-Ownership) requests that hit L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts the RFO (Read-for-Ownership) requests that hit L2 cache.", "SampleAfterValue": "200003", "UMask": "0xc2" }, @@ -260,7 +319,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.RFO_MISS", - "PublicDescription": "Counts the RFO (Read-for-Ownership) requests that miss L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts the RFO (Read-for-Ownership) requests that miss L2 cache.", "SampleAfterValue": "200003", "UMask": "0x22" }, @@ -269,7 +328,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.SWPF_HIT", - "PublicDescription": "Counts Software prefetch requests that hit the L2 cache. Accounts for PREFETCHNTA and PREFETCHT0/1/2 instructions when FB is not full. Available PDIST counters: 0", + "PublicDescription": "Counts Software prefetch requests that hit the L2 cache. Accounts for PREFETCHNTA and PREFETCHT0/1/2 instructions when FB is not full.", "SampleAfterValue": "200003", "UMask": "0xc8" }, @@ -278,7 +337,7 @@ "Counter": "0,1,2,3", "EventCode": "0x24", "EventName": "L2_RQSTS.SWPF_MISS", - "PublicDescription": "Counts Software prefetch requests that miss the L2 cache. Accounts for PREFETCHNTA and PREFETCHT0/1/2 instructions when FB is not full. Available PDIST counters: 0", + "PublicDescription": "Counts Software prefetch requests that miss the L2 cache. Accounts for PREFETCHNTA and PREFETCHT0/1/2 instructions when FB is not full.", "SampleAfterValue": "200003", "UMask": "0x28" }, @@ -287,7 +346,7 @@ "Counter": "0,1,2,3", "EventCode": "0x23", "EventName": "L2_TRANS.L2_WB", - "PublicDescription": "Counts L2 writebacks that access L2 cache. Available PDIST counters: 0", + "PublicDescription": "Counts L2 writebacks that access L2 cache.", "SampleAfterValue": "200003", "UMask": "0x40" }, @@ -296,7 +355,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x2e", "EventName": "LONGEST_LAT_CACHE.MISS", - "PublicDescription": "Counts core-originated cacheable requests that miss the L3 cache (Longest Latency cache). Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches to the L1 and L2. It does not include hardware prefetches to the L3, and may not count other types of requests to the L3. Available PDIST counters: 0", + "PublicDescription": "Counts core-originated cacheable requests that miss the L3 cache (Longest Latency cache). Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches to the L1 and L2. It does not include hardware prefetches to the L3, and may not count other types of requests to the L3.", "SampleAfterValue": "100003", "UMask": "0x41" }, @@ -305,7 +364,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x2e", "EventName": "LONGEST_LAT_CACHE.REFERENCE", - "PublicDescription": "Counts core-originated cacheable requests to the L3 cache (Longest Latency cache). Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches to the L1 and L2. It does not include hardware prefetches to the L3, and may not count other types of requests to the L3. Available PDIST counters: 0", + "PublicDescription": "Counts core-originated cacheable requests to the L3 cache (Longest Latency cache). Requests include data and code reads, Reads-for-Ownership (RFOs), speculative accesses and hardware prefetches to the L1 and L2. It does not include hardware prefetches to the L3, and may not count other types of requests to the L3.", "SampleAfterValue": "100003", "UMask": "0x4f" }, @@ -394,7 +453,7 @@ "Counter": "0,1,2,3", "EventCode": "0x43", "EventName": "MEM_LOAD_COMPLETED.L1_MISS_ANY", - "PublicDescription": "Number of completed demand load requests that missed the L1 data cache including shadow misses (FB hits, merge to an ongoing L1D miss) Available PDIST counters: 0", + "PublicDescription": "Number of completed demand load requests that missed the L1 data cache including shadow misses (FB hits, merge to an ongoing L1D miss)", "SampleAfterValue": "1000003", "UMask": "0xfd" }, @@ -582,7 +641,6 @@ "Counter": "0,1,2,3", "EventCode": "0x44", "EventName": "MEM_STORE_RETIRED.L2_HIT", - "PublicDescription": "MEM_STORE_RETIRED.L2_HIT Available PDIST counters: 0", "SampleAfterValue": "200003", "UMask": "0x1" }, @@ -591,7 +649,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe5", "EventName": "MEM_UOP_RETIRED.ANY", - "PublicDescription": "Number of retired micro-operations (uops) for load or store memory accesses Available PDIST counters: 0", + "PublicDescription": "Number of retired micro-operations (uops) for load or store memory accesses", "SampleAfterValue": "1000003", "UMask": "0x3" }, @@ -1073,7 +1131,6 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.ALL_REQUESTS", - "PublicDescription": "OFFCORE_REQUESTS.ALL_REQUESTS Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x80" }, @@ -1082,7 +1139,7 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.DATA_RD", - "PublicDescription": "Counts the demand and prefetch data reads. All Core Data Reads include cacheable 'Demands' and L2 prefetchers (not L3 prefetchers). Counting also covers reads due to page walks resulted from any request type. Available PDIST counters: 0", + "PublicDescription": "Counts the demand and prefetch data reads. All Core Data Reads include cacheable 'Demands' and L2 prefetchers (not L3 prefetchers). Counting also covers reads due to page walks resulted from any request type.", "SampleAfterValue": "100003", "UMask": "0x8" }, @@ -1091,7 +1148,7 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.DEMAND_CODE_RD", - "PublicDescription": "Counts both cacheable and non-cacheable code read requests. Available PDIST counters: 0", + "PublicDescription": "Counts both cacheable and non-cacheable code read requests.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -1100,7 +1157,7 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.DEMAND_DATA_RD", - "PublicDescription": "Counts the Demand Data Read requests sent to uncore. Use it in conjunction with OFFCORE_REQUESTS_OUTSTANDING to determine average latency in the uncore. Available PDIST counters: 0", + "PublicDescription": "Counts the Demand Data Read requests sent to uncore. Use it in conjunction with OFFCORE_REQUESTS_OUTSTANDING to determine average latency in the uncore.", "SampleAfterValue": "100003", "UMask": "0x1" }, @@ -1109,7 +1166,7 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.DEMAND_RFO", - "PublicDescription": "Counts the demand RFO (read for ownership) requests including regular RFOs, locks, ItoM. Available PDIST counters: 0", + "PublicDescription": "Counts the demand RFO (read for ownership) requests including regular RFOs, locks, ItoM.", "SampleAfterValue": "100003", "UMask": "0x4" }, @@ -1119,7 +1176,6 @@ "Deprecated": "1", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.ALL_DATA_RD", - "PublicDescription": "This event is deprecated. Refer to new event OFFCORE_REQUESTS_OUTSTANDING.DATA_RD Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8" }, @@ -1129,7 +1185,6 @@ "CounterMask": "1", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD", - "PublicDescription": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DATA_RD Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8" }, @@ -1139,7 +1194,7 @@ "CounterMask": "1", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_CODE_RD", - "PublicDescription": "Counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS. Available PDIST counters: 0", + "PublicDescription": "Counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS.", "SampleAfterValue": "1000003", "UMask": "0x2" }, @@ -1149,7 +1204,6 @@ "CounterMask": "1", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_DATA_RD", - "PublicDescription": "Cycles where at least 1 outstanding demand data read request is pending. Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1159,7 +1213,6 @@ "CounterMask": "1", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO", - "PublicDescription": "OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x4" }, @@ -1168,7 +1221,6 @@ "Counter": "0,1,2,3", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DATA_RD", - "PublicDescription": "OFFCORE_REQUESTS_OUTSTANDING.DATA_RD Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8" }, @@ -1177,7 +1229,7 @@ "Counter": "0,1,2,3", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_CODE_RD", - "PublicDescription": "Counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS. Available PDIST counters: 0", + "PublicDescription": "Counts the number of offcore outstanding Code Reads transactions in the super queue every cycle. The 'Offcore outstanding' state of the transaction lasts from the L2 miss until the sending transaction completion to requestor (SQ deallocation). See the corresponding Umask under OFFCORE_REQUESTS.", "SampleAfterValue": "1000003", "UMask": "0x2" }, @@ -1186,7 +1238,7 @@ "Counter": "0,1,2,3", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.DEMAND_DATA_RD", - "PublicDescription": "For every cycle, increments by the number of outstanding demand data read requests pending. Requests are considered outstanding from the time they miss the core's L2 cache until the transaction completion message is sent to the requestor. Available PDIST counters: 0", + "PublicDescription": "For every cycle, increments by the number of outstanding demand data read requests pending. Requests are considered outstanding from the time they miss the core's L2 cache until the transaction completion message is sent to the requestor.", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -1195,7 +1247,7 @@ "Counter": "0,1,2,3", "EventCode": "0x2c", "EventName": "SQ_MISC.BUS_LOCK", - "PublicDescription": "Counts the more expensive bus lock needed to enforce cache coherency for certain memory accesses that need to be done atomically. Can be created by issuing an atomic instruction (via the LOCK prefix) which causes a cache line split or accesses uncacheable memory. Available PDIST counters: 0", + "PublicDescription": "Counts the more expensive bus lock needed to enforce cache coherency for certain memory accesses that need to be done atomically. Can be created by issuing an atomic instruction (via the LOCK prefix) which causes a cache line split or accesses uncacheable memory.", "SampleAfterValue": "100003", "UMask": "0x10" }, @@ -1204,7 +1256,6 @@ "Counter": "0,1,2,3", "EventCode": "0x40", "EventName": "SW_PREFETCH_ACCESS.ANY", - "PublicDescription": "Counts the number of PREFETCHNTA, PREFETCHW, PREFETCHT0, PREFETCHT1 or PREFETCHT2 instructions executed. Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0xf" }, @@ -1213,7 +1264,7 @@ "Counter": "0,1,2,3", "EventCode": "0x40", "EventName": "SW_PREFETCH_ACCESS.NTA", - "PublicDescription": "Counts the number of PREFETCHNTA instructions executed. Available PDIST counters: 0", + "PublicDescription": "Counts the number of PREFETCHNTA instructions executed.", "SampleAfterValue": "100003", "UMask": "0x1" }, @@ -1222,7 +1273,7 @@ "Counter": "0,1,2,3", "EventCode": "0x40", "EventName": "SW_PREFETCH_ACCESS.PREFETCHW", - "PublicDescription": "Counts the number of PREFETCHW instructions executed. Available PDIST counters: 0", + "PublicDescription": "Counts the number of PREFETCHW instructions executed.", "SampleAfterValue": "100003", "UMask": "0x8" }, @@ -1231,7 +1282,7 @@ "Counter": "0,1,2,3", "EventCode": "0x40", "EventName": "SW_PREFETCH_ACCESS.T0", - "PublicDescription": "Counts the number of PREFETCHT0 instructions executed. Available PDIST counters: 0", + "PublicDescription": "Counts the number of PREFETCHT0 instructions executed.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -1240,7 +1291,7 @@ "Counter": "0,1,2,3", "EventCode": "0x40", "EventName": "SW_PREFETCH_ACCESS.T1_T2", - "PublicDescription": "Counts the number of PREFETCHT1 or PREFETCHT2 instructions executed. Available PDIST counters: 0", + "PublicDescription": "Counts the number of PREFETCHT1 or PREFETCHT2 instructions executed.", "SampleAfterValue": "100003", "UMask": "0x4" } diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/floating-point.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/floating-point.json index 8c9207750c82..bc475e163227 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/floating-point.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/floating-point.json @@ -5,7 +5,6 @@ "CounterMask": "1", "EventCode": "0xb0", "EventName": "ARITH.FPDIV_ACTIVE", - "PublicDescription": "ARITH.FPDIV_ACTIVE Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -14,7 +13,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc1", "EventName": "ASSISTS.FP", - "PublicDescription": "Counts all microcode Floating Point assists. Available PDIST counters: 0", + "PublicDescription": "Counts all microcode Floating Point assists.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -23,7 +22,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc1", "EventName": "ASSISTS.SSE_AVX_MIX", - "PublicDescription": "ASSISTS.SSE_AVX_MIX Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x10" }, @@ -32,7 +30,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.PORT_0", - "PublicDescription": "FP_ARITH_DISPATCHED.PORT_0 [This event is alias to FP_ARITH_DISPATCHED.V0] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -41,7 +38,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.PORT_1", - "PublicDescription": "FP_ARITH_DISPATCHED.PORT_1 [This event is alias to FP_ARITH_DISPATCHED.V1] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -50,7 +46,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.PORT_5", - "PublicDescription": "FP_ARITH_DISPATCHED.PORT_5 [This event is alias to FP_ARITH_DISPATCHED.V2] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x4" }, @@ -59,7 +54,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.V0", - "PublicDescription": "FP_ARITH_DISPATCHED.V0 [This event is alias to FP_ARITH_DISPATCHED.PORT_0] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -68,7 +62,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.V1", - "PublicDescription": "FP_ARITH_DISPATCHED.V1 [This event is alias to FP_ARITH_DISPATCHED.PORT_1] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -77,7 +70,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb3", "EventName": "FP_ARITH_DISPATCHED.V2", - "PublicDescription": "FP_ARITH_DISPATCHED.V2 [This event is alias to FP_ARITH_DISPATCHED.PORT_5] Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x4" }, @@ -86,7 +78,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE", - "PublicDescription": "Number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 128-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x4" }, @@ -95,7 +87,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.128B_PACKED_SINGLE", - "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x8" }, @@ -104,7 +96,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.256B_PACKED_DOUBLE", - "PublicDescription": "Number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 4 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x10" }, @@ -113,7 +105,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.256B_PACKED_SINGLE", - "PublicDescription": "Number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 256-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RCP DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x20" }, @@ -122,7 +114,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.4_FLOPS", - "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision and 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point and packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 128-bit packed single precision and 256-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 2 or/and 4 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point and packed double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX RCP14 RSQRT14 SQRT DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x18" }, @@ -131,7 +123,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.512B_PACKED_DOUBLE", - "PublicDescription": "Number of SSE/AVX computational 512-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT14 RCP14 FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 512-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT14 RCP14 FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x40" }, @@ -140,7 +132,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.512B_PACKED_SINGLE", - "PublicDescription": "Number of SSE/AVX computational 512-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 16 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT14 RCP14 FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 512-bit packed single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 16 computation operations, one for each element. Applies to SSE* and AVX* packed single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT14 RCP14 FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x80" }, @@ -149,7 +141,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.8_FLOPS", - "PublicDescription": "Number of SSE/AVX computational 256-bit packed single precision and 512-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed single precision and double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RSQRT14 RCP RCP14 DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational 256-bit packed single precision and 512-bit packed double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 8 computation operations, one for each element. Applies to SSE* and AVX* packed single precision and double precision floating-point instructions: ADD SUB HADD HSUB SUBADD MUL DIV MIN MAX SQRT RSQRT RSQRT14 RCP RCP14 DPP FM(N)ADD/SUB. DPP and FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x60" }, @@ -158,7 +150,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.SCALAR", - "PublicDescription": "Number of SSE/AVX computational scalar single precision and double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational scalar single precision and double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "1000003", "UMask": "0x3" }, @@ -167,7 +159,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.SCALAR_DOUBLE", - "PublicDescription": "Number of SSE/AVX computational scalar double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational scalar double precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar double precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x1" }, @@ -176,7 +168,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.SCALAR_SINGLE", - "PublicDescription": "Number of SSE/AVX computational scalar single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of SSE/AVX computational scalar single precision floating-point instructions retired; some instructions will count twice as noted below. Each count represents 1 computational operation. Applies to SSE* and AVX* scalar single precision floating-point instructions: ADD SUB MUL DIV MIN MAX SQRT RSQRT RCP FM(N)ADD/SUB. FM(N)ADD/SUB instructions count twice as they perform 2 calculations per element. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -185,7 +177,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc7", "EventName": "FP_ARITH_INST_RETIRED.VECTOR", - "PublicDescription": "Number of any Vector retired FP arithmetic instructions. The DAZ and FTZ flags in the MXCSR register need to be set when using these events. Available PDIST counters: 0", + "PublicDescription": "Number of any Vector retired FP arithmetic instructions. The DAZ and FTZ flags in the MXCSR register need to be set when using these events.", "SampleAfterValue": "1000003", "UMask": "0xfc" }, @@ -194,7 +186,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xcf", "EventName": "FP_ARITH_INST_RETIRED2.128B_PACKED_HALF", - "PublicDescription": "FP_ARITH_INST_RETIRED2.128B_PACKED_HALF Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x4" }, @@ -203,7 +194,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xcf", "EventName": "FP_ARITH_INST_RETIRED2.256B_PACKED_HALF", - "PublicDescription": "FP_ARITH_INST_RETIRED2.256B_PACKED_HALF Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x8" }, @@ -212,7 +202,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xcf", "EventName": "FP_ARITH_INST_RETIRED2.512B_PACKED_HALF", - "PublicDescription": "FP_ARITH_INST_RETIRED2.512B_PACKED_HALF Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x10" }, @@ -221,7 +210,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xcf", "EventName": "FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF", - "PublicDescription": "FP_ARITH_INST_RETIRED2.COMPLEX_SCALAR_HALF Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -230,7 +218,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xcf", "EventName": "FP_ARITH_INST_RETIRED2.SCALAR", - "PublicDescription": "FP_ARITH_INST_RETIRED2.SCALAR Available PDIST counters: 0", + "PublicDescription": "FP_ARITH_INST_RETIRED2.SCALAR", "SampleAfterValue": "100003", "UMask": "0x3" }, @@ -239,7 +227,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xcf", "EventName": "FP_ARITH_INST_RETIRED2.SCALAR_HALF", - "PublicDescription": "FP_ARITH_INST_RETIRED2.SCALAR_HALF Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x1" }, @@ -248,7 +235,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xcf", "EventName": "FP_ARITH_INST_RETIRED2.VECTOR", - "PublicDescription": "FP_ARITH_INST_RETIRED2.VECTOR Available PDIST counters: 0", + "PublicDescription": "FP_ARITH_INST_RETIRED2.VECTOR", "SampleAfterValue": "100003", "UMask": "0x1c" } diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/frontend.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/frontend.json index 9fe9d62b867a..793c486ffabe 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/frontend.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/frontend.json @@ -4,7 +4,7 @@ "Counter": "0,1,2,3", "EventCode": "0x60", "EventName": "BACLEARS.ANY", - "PublicDescription": "Number of times the front-end is resteered when it finds a branch instruction in a fetch line. This is called Unknown Branch which occurs for the first time a branch instruction is fetched or when the branch is not tracked by the BPU (Branch Prediction Unit) anymore. Available PDIST counters: 0", + "PublicDescription": "Number of times the front-end is resteered when it finds a branch instruction in a fetch line. This is called Unknown Branch which occurs for the first time a branch instruction is fetched or when the branch is not tracked by the BPU (Branch Prediction Unit) anymore.", "SampleAfterValue": "100003", "UMask": "0x1" }, @@ -13,7 +13,7 @@ "Counter": "0,1,2,3", "EventCode": "0x87", "EventName": "DECODE.LCP", - "PublicDescription": "Counts cycles that the Instruction Length decoder (ILD) stalls occurred due to dynamically changing prefix length of the decoded instruction (by operand size prefix instruction 0x66, address size prefix instruction 0x67 or REX.W for Intel64). Count is proportional to the number of prefixes in a 16B-line. This may result in a three-cycle penalty for each LCP (Length changing prefix) in a 16-byte chunk. Available PDIST counters: 0", + "PublicDescription": "Counts cycles that the Instruction Length decoder (ILD) stalls occurred due to dynamically changing prefix length of the decoded instruction (by operand size prefix instruction 0x66, address size prefix instruction 0x67 or REX.W for Intel64). Count is proportional to the number of prefixes in a 16B-line. This may result in a three-cycle penalty for each LCP (Length changing prefix) in a 16-byte chunk.", "SampleAfterValue": "500009", "UMask": "0x1" }, @@ -22,7 +22,6 @@ "Counter": "0,1,2,3", "EventCode": "0x87", "EventName": "DECODE.MS_BUSY", - "PublicDescription": "Cycles the Microcode Sequencer is busy. Available PDIST counters: 0", "SampleAfterValue": "500009", "UMask": "0x2" }, @@ -31,7 +30,7 @@ "Counter": "0,1,2,3", "EventCode": "0x61", "EventName": "DSB2MITE_SWITCHES.PENALTY_CYCLES", - "PublicDescription": "Decode Stream Buffer (DSB) is a Uop-cache that holds translations of previously fetched instructions that were decoded by the legacy x86 decode pipeline (MITE). This event counts fetch penalty cycles when a transition occurs from DSB to MITE. Available PDIST counters: 0", + "PublicDescription": "Decode Stream Buffer (DSB) is a Uop-cache that holds translations of previously fetched instructions that were decoded by the legacy x86 decode pipeline (MITE). This event counts fetch penalty cycles when a transition occurs from DSB to MITE.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -249,7 +248,7 @@ "Counter": "0,1,2,3", "EventCode": "0x80", "EventName": "ICACHE_DATA.STALLS", - "PublicDescription": "Counts cycles where a code line fetch is stalled due to an L1 instruction cache miss. The decode pipeline works at a 32 Byte granularity. Available PDIST counters: 0", + "PublicDescription": "Counts cycles where a code line fetch is stalled due to an L1 instruction cache miss. The decode pipeline works at a 32 Byte granularity.", "SampleAfterValue": "500009", "UMask": "0x4" }, @@ -260,7 +259,6 @@ "EdgeDetect": "1", "EventCode": "0x80", "EventName": "ICACHE_DATA.STALL_PERIODS", - "PublicDescription": "ICACHE_DATA.STALL_PERIODS Available PDIST counters: 0", "SampleAfterValue": "500009", "UMask": "0x4" }, @@ -269,7 +267,7 @@ "Counter": "0,1,2,3", "EventCode": "0x83", "EventName": "ICACHE_TAG.STALLS", - "PublicDescription": "Counts cycles where a code fetch is stalled due to L1 instruction cache tag miss. Available PDIST counters: 0", + "PublicDescription": "Counts cycles where a code fetch is stalled due to L1 instruction cache tag miss.", "SampleAfterValue": "200003", "UMask": "0x4" }, @@ -279,7 +277,7 @@ "CounterMask": "1", "EventCode": "0x79", "EventName": "IDQ.DSB_CYCLES_ANY", - "PublicDescription": "Counts the number of cycles uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles uops were delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path.", "SampleAfterValue": "2000003", "UMask": "0x8" }, @@ -289,7 +287,7 @@ "CounterMask": "6", "EventCode": "0x79", "EventName": "IDQ.DSB_CYCLES_OK", - "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the DSB (Decode Stream Buffer) path. Count includes uops that may 'bypass' the IDQ. Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the DSB (Decode Stream Buffer) path. Count includes uops that may 'bypass' the IDQ.", "SampleAfterValue": "2000003", "UMask": "0x8" }, @@ -298,7 +296,7 @@ "Counter": "0,1,2,3", "EventCode": "0x79", "EventName": "IDQ.DSB_UOPS", - "PublicDescription": "Counts the number of uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path. Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops delivered to Instruction Decode Queue (IDQ) from the Decode Stream Buffer (DSB) path.", "SampleAfterValue": "2000003", "UMask": "0x8" }, @@ -308,7 +306,7 @@ "CounterMask": "1", "EventCode": "0x79", "EventName": "IDQ.MITE_CYCLES_ANY", - "PublicDescription": "Counts the number of cycles uops were delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB). Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles uops were delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB).", "SampleAfterValue": "2000003", "UMask": "0x4" }, @@ -318,7 +316,7 @@ "CounterMask": "6", "EventCode": "0x79", "EventName": "IDQ.MITE_CYCLES_OK", - "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB). Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles where optimal number of uops was delivered to the Instruction Decode Queue (IDQ) from the MITE (legacy decode pipeline) path. During these cycles uops are not being delivered from the Decode Stream Buffer (DSB).", "SampleAfterValue": "2000003", "UMask": "0x4" }, @@ -327,7 +325,7 @@ "Counter": "0,1,2,3", "EventCode": "0x79", "EventName": "IDQ.MITE_UOPS", - "PublicDescription": "Counts the number of uops delivered to Instruction Decode Queue (IDQ) from the MITE path. This also means that uops are not being delivered from the Decode Stream Buffer (DSB). Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops delivered to Instruction Decode Queue (IDQ) from the MITE path. This also means that uops are not being delivered from the Decode Stream Buffer (DSB).", "SampleAfterValue": "2000003", "UMask": "0x4" }, @@ -337,7 +335,7 @@ "CounterMask": "1", "EventCode": "0x79", "EventName": "IDQ.MS_CYCLES_ANY", - "PublicDescription": "Counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Uops maybe initiated by Decode Stream Buffer (DSB) or MITE. Available PDIST counters: 0", + "PublicDescription": "Counts cycles during which uops are being delivered to Instruction Decode Queue (IDQ) while the Microcode Sequencer (MS) is busy. Uops maybe initiated by Decode Stream Buffer (DSB) or MITE.", "SampleAfterValue": "2000003", "UMask": "0x20" }, @@ -348,7 +346,7 @@ "EdgeDetect": "1", "EventCode": "0x79", "EventName": "IDQ.MS_SWITCHES", - "PublicDescription": "Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to the Microcode Sequencer. Available PDIST counters: 0", + "PublicDescription": "Number of switches from DSB (Decode Stream Buffer) or MITE (legacy decode pipeline) to the Microcode Sequencer.", "SampleAfterValue": "100003", "UMask": "0x20" }, @@ -357,7 +355,7 @@ "Counter": "0,1,2,3", "EventCode": "0x79", "EventName": "IDQ.MS_UOPS", - "PublicDescription": "Counts the total number of uops delivered by the Microcode Sequencer (MS). Available PDIST counters: 0", + "PublicDescription": "Counts the total number of uops delivered by the Microcode Sequencer (MS).", "SampleAfterValue": "1000003", "UMask": "0x20" }, @@ -366,7 +364,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x9c", "EventName": "IDQ_BUBBLES.CORE", - "PublicDescription": "Counts the number of uops not delivered to by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_UOPS_NOT_DELIVERED.CORE] Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops not delivered to by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_UOPS_NOT_DELIVERED.CORE]", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -376,7 +374,7 @@ "CounterMask": "6", "EventCode": "0x9c", "EventName": "IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE", - "PublicDescription": "Counts the number of cycles when no uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE] Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles when no uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE]", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -387,7 +385,7 @@ "EventCode": "0x9c", "EventName": "IDQ_BUBBLES.CYCLES_FE_WAS_OK", "Invert": "1", - "PublicDescription": "Counts the number of cycles when the optimal number of uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK] Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles when the optimal number of uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK]", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -396,7 +394,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x9c", "EventName": "IDQ_UOPS_NOT_DELIVERED.CORE", - "PublicDescription": "Counts the number of uops not delivered to by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_BUBBLES.CORE] Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops not delivered to by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_BUBBLES.CORE]", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -406,7 +404,7 @@ "CounterMask": "6", "EventCode": "0x9c", "EventName": "IDQ_UOPS_NOT_DELIVERED.CYCLES_0_UOPS_DELIV.CORE", - "PublicDescription": "Counts the number of cycles when no uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE] Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles when no uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_BUBBLES.CYCLES_0_UOPS_DELIV.CORE]", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -417,7 +415,7 @@ "EventCode": "0x9c", "EventName": "IDQ_UOPS_NOT_DELIVERED.CYCLES_FE_WAS_OK", "Invert": "1", - "PublicDescription": "Counts the number of cycles when the optimal number of uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_BUBBLES.CYCLES_FE_WAS_OK] Available PDIST counters: 0", + "PublicDescription": "Counts the number of cycles when the optimal number of uops were delivered by the Instruction Decode Queue (IDQ) to the back-end of the pipeline when there was no back-end stalls. This event counts for one SMT thread in a given cycle. [This event is alias to IDQ_BUBBLES.CYCLES_FE_WAS_OK]", "SampleAfterValue": "1000003", "UMask": "0x1" } diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/memory.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/memory.json index 7c3f9b76d367..5e6c1f05c981 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/memory.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/memory.json @@ -5,7 +5,6 @@ "CounterMask": "6", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.STALLS_L3_MISS", - "PublicDescription": "Execution stalls while L3 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x6" }, @@ -14,7 +13,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc3", "EventName": "MACHINE_CLEARS.MEMORY_ORDERING", - "PublicDescription": "Counts the number of Machine Clears detected dye to memory ordering. Memory Ordering Machine Clears may apply when a memory read may not conform to the memory ordering rules of the x86 architecture Available PDIST counters: 0", + "PublicDescription": "Counts the number of Machine Clears detected dye to memory ordering. Memory Ordering Machine Clears may apply when a memory read may not conform to the memory ordering rules of the x86 architecture", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -24,7 +23,6 @@ "CounterMask": "2", "EventCode": "0x47", "EventName": "MEMORY_ACTIVITY.CYCLES_L1D_MISS", - "PublicDescription": "Cycles while L1 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x2" }, @@ -34,7 +32,6 @@ "CounterMask": "3", "EventCode": "0x47", "EventName": "MEMORY_ACTIVITY.STALLS_L1D_MISS", - "PublicDescription": "Execution stalls while L1 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x3" }, @@ -44,7 +41,7 @@ "CounterMask": "5", "EventCode": "0x47", "EventName": "MEMORY_ACTIVITY.STALLS_L2_MISS", - "PublicDescription": "Execution stalls while L2 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock). Available PDIST counters: 0", + "PublicDescription": "Execution stalls while L2 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock).", "SampleAfterValue": "1000003", "UMask": "0x5" }, @@ -54,7 +51,7 @@ "CounterMask": "9", "EventCode": "0x47", "EventName": "MEMORY_ACTIVITY.STALLS_L3_MISS", - "PublicDescription": "Execution stalls while L3 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock). Available PDIST counters: 0", + "PublicDescription": "Execution stalls while L3 cache miss demand cacheable load request is outstanding (will not count for uncacheable demand requests e.g. bus lock).", "SampleAfterValue": "1000003", "UMask": "0x9" }, @@ -478,7 +475,6 @@ "Counter": "0,1,2,3", "EventCode": "0x21", "EventName": "OFFCORE_REQUESTS.L3_MISS_DEMAND_DATA_RD", - "PublicDescription": "Counts demand data read requests that miss the L3 cache. Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x10" }, @@ -487,7 +483,7 @@ "Counter": "0,1,2,3", "EventCode": "0x20", "EventName": "OFFCORE_REQUESTS_OUTSTANDING.L3_MISS_DEMAND_DATA_RD", - "PublicDescription": "For every cycle, increments by the number of demand data read requests pending that are known to have missed the L3 cache. Note that this does not capture all elapsed cycles while requests are outstanding - only cycles from when the requests were known by the requesting core to have missed the L3 cache. Available PDIST counters: 0", + "PublicDescription": "For every cycle, increments by the number of demand data read requests pending that are known to have missed the L3 cache. Note that this does not capture all elapsed cycles while requests are outstanding - only cycles from when the requests were known by the requesting core to have missed the L3 cache.", "SampleAfterValue": "2000003", "UMask": "0x10" }, @@ -505,7 +501,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc9", "EventName": "RTM_RETIRED.ABORTED_EVENTS", - "PublicDescription": "Counts the number of times an RTM execution aborted due to none of the previous 3 categories (e.g. interrupt). Available PDIST counters: 0", + "PublicDescription": "Counts the number of times an RTM execution aborted due to none of the previous 3 categories (e.g. interrupt).", "SampleAfterValue": "100003", "UMask": "0x80" }, @@ -514,7 +510,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc9", "EventName": "RTM_RETIRED.ABORTED_MEM", - "PublicDescription": "Counts the number of times an RTM execution aborted due to various memory events (e.g. read/write capacity and conflicts). Available PDIST counters: 0", + "PublicDescription": "Counts the number of times an RTM execution aborted due to various memory events (e.g. read/write capacity and conflicts).", "SampleAfterValue": "100003", "UMask": "0x8" }, @@ -523,7 +519,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc9", "EventName": "RTM_RETIRED.ABORTED_MEMTYPE", - "PublicDescription": "Counts the number of times an RTM execution aborted due to incompatible memory type. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times an RTM execution aborted due to incompatible memory type.", "SampleAfterValue": "100003", "UMask": "0x40" }, @@ -532,7 +528,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc9", "EventName": "RTM_RETIRED.ABORTED_UNFRIENDLY", - "PublicDescription": "Counts the number of times an RTM execution aborted due to HLE-unfriendly instructions. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times an RTM execution aborted due to HLE-unfriendly instructions.", "SampleAfterValue": "100003", "UMask": "0x20" }, @@ -541,7 +537,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc9", "EventName": "RTM_RETIRED.COMMIT", - "PublicDescription": "Counts the number of times RTM commit succeeded. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times RTM commit succeeded.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -550,7 +546,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc9", "EventName": "RTM_RETIRED.START", - "PublicDescription": "Counts the number of times we entered an RTM region. Does not count nested transactions. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times we entered an RTM region. Does not count nested transactions.", "SampleAfterValue": "100003", "UMask": "0x1" }, @@ -559,7 +555,7 @@ "Counter": "0,1,2,3", "EventCode": "0x54", "EventName": "TX_MEM.ABORT_CAPACITY_READ", - "PublicDescription": "Speculatively counts the number of Transactional Synchronization Extensions (TSX) aborts due to a data capacity limitation for transactional reads Available PDIST counters: 0", + "PublicDescription": "Speculatively counts the number of Transactional Synchronization Extensions (TSX) aborts due to a data capacity limitation for transactional reads", "SampleAfterValue": "100003", "UMask": "0x80" }, @@ -568,7 +564,7 @@ "Counter": "0,1,2,3", "EventCode": "0x54", "EventName": "TX_MEM.ABORT_CAPACITY_WRITE", - "PublicDescription": "Speculatively counts the number of Transactional Synchronization Extensions (TSX) aborts due to a data capacity limitation for transactional writes. Available PDIST counters: 0", + "PublicDescription": "Speculatively counts the number of Transactional Synchronization Extensions (TSX) aborts due to a data capacity limitation for transactional writes.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -577,7 +573,7 @@ "Counter": "0,1,2,3", "EventCode": "0x54", "EventName": "TX_MEM.ABORT_CONFLICT", - "PublicDescription": "Counts the number of times a TSX line had a cache conflict. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times a TSX line had a cache conflict.", "SampleAfterValue": "100003", "UMask": "0x1" } diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/other.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/other.json index a58d65556609..21f49f609ed4 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/other.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/other.json @@ -4,11 +4,35 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc1", "EventName": "ASSISTS.PAGE_FAULT", - "PublicDescription": "ASSISTS.PAGE_FAULT Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8" }, { + "BriefDescription": "HW_INTERRUPTS.MASKED", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xcb", + "EventName": "HW_INTERRUPTS.MASKED", + "SampleAfterValue": "100003", + "UMask": "0x2" + }, + { + "BriefDescription": "HW_INTERRUPTS.PENDING_AND_MASKED", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xcb", + "EventName": "HW_INTERRUPTS.PENDING_AND_MASKED", + "SampleAfterValue": "100003", + "UMask": "0x4" + }, + { + "BriefDescription": "Number of hardware interrupts received by the processor.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xcb", + "EventName": "HW_INTERRUPTS.RECEIVED", + "PublicDescription": "Counts the number of hardware interruptions received by the processor.", + "SampleAfterValue": "203", + "UMask": "0x1" + }, + { "BriefDescription": "Counts streaming stores that have any type of response.", "Counter": "0,1,2,3", "EventCode": "0x2A,0x2B", @@ -25,7 +49,7 @@ "CounterMask": "1", "EventCode": "0x2d", "EventName": "XQ.FULL_CYCLES", - "PublicDescription": "number of cycles when the thread is active and the uncore cannot take any further requests (for example prefetches, loads or stores initiated by the Core that miss the L2 cache). Available PDIST counters: 0", + "PublicDescription": "number of cycles when the thread is active and the uncore cannot take any further requests (for example prefetches, loads or stores initiated by the Core that miss the L2 cache).", "SampleAfterValue": "1000003", "UMask": "0x1" } diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/pipeline.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/pipeline.json index 48bec483b49a..1fa7957956df 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/pipeline.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/pipeline.json @@ -6,7 +6,6 @@ "Deprecated": "1", "EventCode": "0xb0", "EventName": "ARITH.DIVIDER_ACTIVE", - "PublicDescription": "This event is deprecated. Refer to new event ARITH.DIV_ACTIVE Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x9" }, @@ -16,7 +15,7 @@ "CounterMask": "1", "EventCode": "0xb0", "EventName": "ARITH.DIV_ACTIVE", - "PublicDescription": "Counts cycles when divide unit is busy executing divide or square root operations. Accounts for integer and floating-point operations. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when divide unit is busy executing divide or square root operations. Accounts for integer and floating-point operations.", "SampleAfterValue": "1000003", "UMask": "0x9" }, @@ -27,7 +26,6 @@ "Deprecated": "1", "EventCode": "0xb0", "EventName": "ARITH.FP_DIVIDER_ACTIVE", - "PublicDescription": "This event is deprecated. Refer to new event ARITH.FPDIV_ACTIVE Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -37,7 +35,6 @@ "CounterMask": "1", "EventCode": "0xb0", "EventName": "ARITH.IDIV_ACTIVE", - "PublicDescription": "This event counts the cycles the integer divider is busy. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8" }, @@ -48,7 +45,6 @@ "Deprecated": "1", "EventCode": "0xb0", "EventName": "ARITH.INT_DIVIDER_ACTIVE", - "PublicDescription": "This event is deprecated. Refer to new event ARITH.IDIV_ACTIVE Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8" }, @@ -57,7 +53,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc1", "EventName": "ASSISTS.ANY", - "PublicDescription": "Counts the number of occurrences where a microcode assist is invoked by hardware. Examples include AD (page Access Dirty), FP and AVX related assists. Available PDIST counters: 0", + "PublicDescription": "Counts the number of occurrences where a microcode assist is invoked by hardware. Examples include AD (page Access Dirty), FP and AVX related assists.", "SampleAfterValue": "100003", "UMask": "0x1b" }, @@ -217,7 +213,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.C01", - "PublicDescription": "Counts core clocks when the thread is in the C0.1 light-weight slower wakeup time but more power saving optimized state. This state can be entered via the TPAUSE or UMWAIT instructions. Available PDIST counters: 0", + "PublicDescription": "Counts core clocks when the thread is in the C0.1 light-weight slower wakeup time but more power saving optimized state. This state can be entered via the TPAUSE or UMWAIT instructions.", "SampleAfterValue": "2000003", "UMask": "0x10" }, @@ -226,7 +222,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.C02", - "PublicDescription": "Counts core clocks when the thread is in the C0.2 light-weight faster wakeup time but less power saving optimized state. This state can be entered via the TPAUSE or UMWAIT instructions. Available PDIST counters: 0", + "PublicDescription": "Counts core clocks when the thread is in the C0.2 light-weight faster wakeup time but less power saving optimized state. This state can be entered via the TPAUSE or UMWAIT instructions.", "SampleAfterValue": "2000003", "UMask": "0x20" }, @@ -235,7 +231,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.C0_WAIT", - "PublicDescription": "Counts core clocks when the thread is in the C0.1 or C0.2 power saving optimized states (TPAUSE or UMWAIT instructions) or running the PAUSE instruction. Available PDIST counters: 0", + "PublicDescription": "Counts core clocks when the thread is in the C0.1 or C0.2 power saving optimized states (TPAUSE or UMWAIT instructions) or running the PAUSE instruction.", "SampleAfterValue": "2000003", "UMask": "0x70" }, @@ -244,7 +240,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.DISTRIBUTED", - "PublicDescription": "This event distributes cycle counts between active hyperthreads, i.e., those in C0. A hyperthread becomes inactive when it executes the HLT or MWAIT instructions. If all other hyperthreads are inactive (or disabled or do not exist), all counts are attributed to this hyperthread. To obtain the full count when the Core is active, sum the counts from each hyperthread. Available PDIST counters: 0", + "PublicDescription": "This event distributes cycle counts between active hyperthreads, i.e., those in C0. A hyperthread becomes inactive when it executes the HLT or MWAIT instructions. If all other hyperthreads are inactive (or disabled or do not exist), all counts are attributed to this hyperthread. To obtain the full count when the Core is active, sum the counts from each hyperthread.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -253,7 +249,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x3c", "EventName": "CPU_CLK_UNHALTED.ONE_THREAD_ACTIVE", - "PublicDescription": "Counts Core crystal clock cycles when current thread is unhalted and the other thread is halted. Available PDIST counters: 0", + "PublicDescription": "Counts Core crystal clock cycles when current thread is unhalted and the other thread is halted.", "SampleAfterValue": "25003", "UMask": "0x2" }, @@ -262,7 +258,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.PAUSE", - "PublicDescription": "CPU_CLK_UNHALTED.PAUSE Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x40" }, @@ -273,7 +268,6 @@ "EdgeDetect": "1", "EventCode": "0xec", "EventName": "CPU_CLK_UNHALTED.PAUSE_INST", - "PublicDescription": "CPU_CLK_UNHALTED.PAUSE_INST Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x40" }, @@ -282,7 +276,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x3c", "EventName": "CPU_CLK_UNHALTED.REF_DISTRIBUTED", - "PublicDescription": "This event distributes Core crystal clock cycle counts between active hyperthreads, i.e., those in C0 sleep-state. A hyperthread becomes inactive when it executes the HLT or MWAIT instructions. If one thread is active in a core, all counts are attributed to this hyperthread. To obtain the full count when the Core is active, sum the counts from each hyperthread. Available PDIST counters: 0", + "PublicDescription": "This event distributes Core crystal clock cycle counts between active hyperthreads, i.e., those in C0 sleep-state. A hyperthread becomes inactive when it executes the HLT or MWAIT instructions. If one thread is active in a core, all counts are attributed to this hyperthread. To obtain the full count when the Core is active, sum the counts from each hyperthread.", "SampleAfterValue": "2000003", "UMask": "0x8" }, @@ -299,7 +293,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x3c", "EventName": "CPU_CLK_UNHALTED.REF_TSC_P", - "PublicDescription": "Counts the number of reference cycles when the core is not in a halt state. The core enters the halt state when it is running the HLT instruction or the MWAIT instruction. This event is not affected by core frequency changes (for example, P states, TM2 transitions) but has the same incrementing frequency as the time stamp counter. This event can approximate elapsed time while the core was not in a halt state. It is counted on a dedicated fixed counter, leaving the four (eight when Hyperthreading is disabled) programmable counters available for other events. Note: On all current platforms this event stops counting during 'throttling (TM)' states duty off periods the processor is 'halted'. The counter update is done at a lower clock rate then the core clock the overflow status bit for this counter may appear 'sticky'. After the counter has overflowed and software clears the overflow status bit and resets the counter to less than MAX. The reset value to the counter is not clocked immediately so the overflow status bit will flip 'high (1)' and generate another PMI (if enabled) after which the reset value gets clocked into the counter. Therefore, software will get the interrupt, read the overflow status bit '1 for bit 34 while the counter value is less than MAX. Software should ignore this case. Available PDIST counters: 0", + "PublicDescription": "Counts the number of reference cycles when the core is not in a halt state. The core enters the halt state when it is running the HLT instruction or the MWAIT instruction. This event is not affected by core frequency changes (for example, P states, TM2 transitions) but has the same incrementing frequency as the time stamp counter. This event can approximate elapsed time while the core was not in a halt state. It is counted on a dedicated fixed counter, leaving the four (eight when Hyperthreading is disabled) programmable counters available for other events. Note: On all current platforms this event stops counting during 'throttling (TM)' states duty off periods the processor is 'halted'. The counter update is done at a lower clock rate then the core clock the overflow status bit for this counter may appear 'sticky'. After the counter has overflowed and software clears the overflow status bit and resets the counter to less than MAX. The reset value to the counter is not clocked immediately so the overflow status bit will flip 'high (1)' and generate another PMI (if enabled) after which the reset value gets clocked into the counter. Therefore, software will get the interrupt, read the overflow status bit '1 for bit 34 while the counter value is less than MAX. Software should ignore this case.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -316,7 +310,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x3c", "EventName": "CPU_CLK_UNHALTED.THREAD_P", - "PublicDescription": "This is an architectural event that counts the number of thread cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. The core frequency may change from time to time due to power or thermal throttling. For this reason, this event may have a changing ratio with regards to wall clock time. Available PDIST counters: 0", + "PublicDescription": "This is an architectural event that counts the number of thread cycles while the thread is not in a halt state. The thread enters the halt state when it is running the HLT instruction. The core frequency may change from time to time due to power or thermal throttling. For this reason, this event may have a changing ratio with regards to wall clock time.", "SampleAfterValue": "2000003" }, { @@ -325,7 +319,6 @@ "CounterMask": "8", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.CYCLES_L1D_MISS", - "PublicDescription": "Cycles while L1 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x8" }, @@ -335,7 +328,6 @@ "CounterMask": "1", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.CYCLES_L2_MISS", - "PublicDescription": "Cycles while L2 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -345,7 +337,6 @@ "CounterMask": "16", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.CYCLES_MEM_ANY", - "PublicDescription": "Cycles while memory subsystem has an outstanding load. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x10" }, @@ -355,7 +346,6 @@ "CounterMask": "12", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.STALLS_L1D_MISS", - "PublicDescription": "Execution stalls while L1 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0xc" }, @@ -365,7 +355,6 @@ "CounterMask": "5", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.STALLS_L2_MISS", - "PublicDescription": "Execution stalls while L2 cache miss demand load is outstanding. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x5" }, @@ -375,7 +364,6 @@ "CounterMask": "4", "EventCode": "0xa3", "EventName": "CYCLE_ACTIVITY.STALLS_TOTAL", - "PublicDescription": "Total execution stalls. Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x4" }, @@ -384,7 +372,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb7", "EventName": "EXE.AMX_BUSY", - "PublicDescription": "Counts the cycles where the AMX (Advance Matrix Extension) unit is busy performing an operation. Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -393,7 +380,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.1_PORTS_UTIL", - "PublicDescription": "Counts cycles during which a total of 1 uop was executed on all ports and Reservation Station (RS) was not empty. Available PDIST counters: 0", + "PublicDescription": "Counts cycles during which a total of 1 uop was executed on all ports and Reservation Station (RS) was not empty.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -402,7 +389,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.2_3_PORTS_UTIL", - "PublicDescription": "Cycles total of 2 or 3 uops are executed on all ports and Reservation Station (RS) was not empty. Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0xc" }, @@ -411,7 +397,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.2_PORTS_UTIL", - "PublicDescription": "Counts cycles during which a total of 2 uops were executed on all ports and Reservation Station (RS) was not empty. Available PDIST counters: 0", + "PublicDescription": "Counts cycles during which a total of 2 uops were executed on all ports and Reservation Station (RS) was not empty.", "SampleAfterValue": "2000003", "UMask": "0x4" }, @@ -420,7 +406,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.3_PORTS_UTIL", - "PublicDescription": "Cycles total of 3 uops are executed on all ports and Reservation Station (RS) was not empty. Available PDIST counters: 0", + "PublicDescription": "Cycles total of 3 uops are executed on all ports and Reservation Station (RS) was not empty.", "SampleAfterValue": "2000003", "UMask": "0x8" }, @@ -429,7 +415,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.4_PORTS_UTIL", - "PublicDescription": "Cycles total of 4 uops are executed on all ports and Reservation Station (RS) was not empty. Available PDIST counters: 0", + "PublicDescription": "Cycles total of 4 uops are executed on all ports and Reservation Station (RS) was not empty.", "SampleAfterValue": "2000003", "UMask": "0x10" }, @@ -439,7 +425,6 @@ "CounterMask": "5", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.BOUND_ON_LOADS", - "PublicDescription": "Execution stalls while memory subsystem has an outstanding load. Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x21" }, @@ -449,7 +434,7 @@ "CounterMask": "2", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.BOUND_ON_STORES", - "PublicDescription": "Counts cycles where the Store Buffer was full and no loads caused an execution stall. Available PDIST counters: 0", + "PublicDescription": "Counts cycles where the Store Buffer was full and no loads caused an execution stall.", "SampleAfterValue": "1000003", "UMask": "0x40" }, @@ -458,7 +443,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa6", "EventName": "EXE_ACTIVITY.EXE_BOUND_0_PORTS", - "PublicDescription": "Number of cycles total of 0 uops executed on all ports, Reservation Station (RS) was not empty, the Store Buffer (SB) was not full and there was no outstanding load. Available PDIST counters: 0", + "PublicDescription": "Number of cycles total of 0 uops executed on all ports, Reservation Station (RS) was not empty, the Store Buffer (SB) was not full and there was no outstanding load.", "SampleAfterValue": "1000003", "UMask": "0x80" }, @@ -467,7 +452,7 @@ "Counter": "0,1,2,3", "EventCode": "0x75", "EventName": "INST_DECODED.DECODERS", - "PublicDescription": "Number of decoders utilized in a cycle when the MITE (legacy decode pipeline) fetches instructions. Available PDIST counters: 0", + "PublicDescription": "Number of decoders utilized in a cycle when the MITE (legacy decode pipeline) fetches instructions.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -492,7 +477,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc0", "EventName": "INST_RETIRED.MACRO_FUSED", - "PublicDescription": "INST_RETIRED.MACRO_FUSED Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x10" }, @@ -501,7 +485,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc0", "EventName": "INST_RETIRED.NOP", - "PublicDescription": "Counts all retired NOP or ENDBR32/64 instructions Available PDIST counters: 0", + "PublicDescription": "Counts all retired NOP or ENDBR32/64 instructions", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -518,7 +502,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc0", "EventName": "INST_RETIRED.REP_ITERATION", - "PublicDescription": "Number of iterations of Repeat (REP) string retired instructions such as MOVS, CMPS, and SCAS. Each has a byte, word, and doubleword version and string instructions can be repeated using a repetition prefix, REP, that allows their architectural execution to be repeated a number of times as specified by the RCX register. Note the number of iterations is implementation-dependent. Available PDIST counters: 0", + "PublicDescription": "Number of iterations of Repeat (REP) string retired instructions such as MOVS, CMPS, and SCAS. Each has a byte, word, and doubleword version and string instructions can be repeated using a repetition prefix, REP, that allows their architectural execution to be repeated a number of times as specified by the RCX register. Note the number of iterations is implementation-dependent.", "SampleAfterValue": "2000003", "UMask": "0x8" }, @@ -529,7 +513,7 @@ "EdgeDetect": "1", "EventCode": "0xad", "EventName": "INT_MISC.CLEARS_COUNT", - "PublicDescription": "Counts the number of speculative clears due to any type of branch misprediction or machine clears Available PDIST counters: 0", + "PublicDescription": "Counts the number of speculative clears due to any type of branch misprediction or machine clears", "SampleAfterValue": "500009", "UMask": "0x1" }, @@ -538,7 +522,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xad", "EventName": "INT_MISC.CLEAR_RESTEER_CYCLES", - "PublicDescription": "Cycles after recovery from a branch misprediction or machine clear till the first uop is issued from the resteered path. Available PDIST counters: 0", + "PublicDescription": "Cycles after recovery from a branch misprediction or machine clear till the first uop is issued from the resteered path.", "SampleAfterValue": "500009", "UMask": "0x80" }, @@ -547,7 +531,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xad", "EventName": "INT_MISC.MBA_STALLS", - "PublicDescription": "INT_MISC.MBA_STALLS Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x20" }, @@ -556,7 +539,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xad", "EventName": "INT_MISC.RECOVERY_CYCLES", - "PublicDescription": "Counts core cycles when the Resource allocator was stalled due to recovery from an earlier branch misprediction or machine clear event. Available PDIST counters: 0", + "PublicDescription": "Counts core cycles when the Resource allocator was stalled due to recovery from an earlier branch misprediction or machine clear event.", "SampleAfterValue": "500009", "UMask": "0x1" }, @@ -567,7 +550,6 @@ "EventName": "INT_MISC.UNKNOWN_BRANCH_CYCLES", "MSRIndex": "0x3F7", "MSRValue": "0x7", - "PublicDescription": "Bubble cycles of BAClear (Unknown Branch). Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x40" }, @@ -576,7 +558,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xad", "EventName": "INT_MISC.UOP_DROPPING", - "PublicDescription": "Estimated number of Top-down Microarchitecture Analysis slots that got dropped due to non front-end reasons Available PDIST counters: 0", + "PublicDescription": "Estimated number of Top-down Microarchitecture Analysis slots that got dropped due to non front-end reasons", "SampleAfterValue": "1000003", "UMask": "0x10" }, @@ -585,7 +567,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.128BIT", - "PublicDescription": "INT_VEC_RETIRED.128BIT Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x13" }, @@ -594,7 +575,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.256BIT", - "PublicDescription": "INT_VEC_RETIRED.256BIT Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0xac" }, @@ -603,7 +583,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.ADD_128", - "PublicDescription": "Number of retired integer ADD/SUB (regular or horizontal), SAD 128-bit vector instructions. Available PDIST counters: 0", + "PublicDescription": "Number of retired integer ADD/SUB (regular or horizontal), SAD 128-bit vector instructions.", "SampleAfterValue": "1000003", "UMask": "0x3" }, @@ -612,7 +592,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.ADD_256", - "PublicDescription": "Number of retired integer ADD/SUB (regular or horizontal), SAD 256-bit vector instructions. Available PDIST counters: 0", + "PublicDescription": "Number of retired integer ADD/SUB (regular or horizontal), SAD 256-bit vector instructions.", "SampleAfterValue": "1000003", "UMask": "0xc" }, @@ -621,7 +601,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.MUL_256", - "PublicDescription": "INT_VEC_RETIRED.MUL_256 Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x80" }, @@ -630,7 +609,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.SHUFFLES", - "PublicDescription": "INT_VEC_RETIRED.SHUFFLES Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x40" }, @@ -639,7 +617,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.VNNI_128", - "PublicDescription": "INT_VEC_RETIRED.VNNI_128 Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x10" }, @@ -648,7 +625,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe7", "EventName": "INT_VEC_RETIRED.VNNI_256", - "PublicDescription": "INT_VEC_RETIRED.VNNI_256 Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x20" }, @@ -657,7 +633,7 @@ "Counter": "0,1,2,3", "EventCode": "0x03", "EventName": "LD_BLOCKS.ADDRESS_ALIAS", - "PublicDescription": "Counts the number of times a load got blocked due to false dependencies in MOB due to partial compare on address. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times a load got blocked due to false dependencies in MOB due to partial compare on address.", "SampleAfterValue": "100003", "UMask": "0x4" }, @@ -666,7 +642,7 @@ "Counter": "0,1,2,3", "EventCode": "0x03", "EventName": "LD_BLOCKS.NO_SR", - "PublicDescription": "Counts the number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times that split load operations are temporarily blocked because all resources for handling the split accesses are in use.", "SampleAfterValue": "100003", "UMask": "0x88" }, @@ -675,7 +651,7 @@ "Counter": "0,1,2,3", "EventCode": "0x03", "EventName": "LD_BLOCKS.STORE_FORWARD", - "PublicDescription": "Counts the number of times where store forwarding was prevented for a load operation. The most common case is a load blocked due to the address of memory access (partially) overlapping with a preceding uncompleted store. Note: See the table of not supported store forwards in the Optimization Guide. Available PDIST counters: 0", + "PublicDescription": "Counts the number of times where store forwarding was prevented for a load operation. The most common case is a load blocked due to the address of memory access (partially) overlapping with a preceding uncompleted store. Note: See the table of not supported store forwards in the Optimization Guide.", "SampleAfterValue": "100003", "UMask": "0x82" }, @@ -684,7 +660,7 @@ "Counter": "0,1,2,3", "EventCode": "0x4c", "EventName": "LOAD_HIT_PREFETCH.SWPF", - "PublicDescription": "Counts all software-prefetch load dispatches that hit the fill buffer (FB) allocated for the software prefetch. It can also be incremented by some lock instructions. So it should only be used with profiling so that the locks can be excluded by ASM (Assembly File) inspection of the nearby instructions. Available PDIST counters: 0", + "PublicDescription": "Counts all software-prefetch load dispatches that hit the fill buffer (FB) allocated for the software prefetch. It can also be incremented by some lock instructions. So it should only be used with profiling so that the locks can be excluded by ASM (Assembly File) inspection of the nearby instructions.", "SampleAfterValue": "100003", "UMask": "0x1" }, @@ -694,7 +670,7 @@ "CounterMask": "1", "EventCode": "0xa8", "EventName": "LSD.CYCLES_ACTIVE", - "PublicDescription": "Counts the cycles when at least one uop is delivered by the LSD (Loop-stream detector). Available PDIST counters: 0", + "PublicDescription": "Counts the cycles when at least one uop is delivered by the LSD (Loop-stream detector).", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -704,7 +680,7 @@ "CounterMask": "6", "EventCode": "0xa8", "EventName": "LSD.CYCLES_OK", - "PublicDescription": "Counts the cycles when optimal number of uops is delivered by the LSD (Loop-stream detector). Available PDIST counters: 0", + "PublicDescription": "Counts the cycles when optimal number of uops is delivered by the LSD (Loop-stream detector).", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -713,7 +689,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa8", "EventName": "LSD.UOPS", - "PublicDescription": "Counts the number of uops delivered to the back-end by the LSD(Loop Stream Detector). Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops delivered to the back-end by the LSD(Loop Stream Detector).", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -724,7 +700,7 @@ "EdgeDetect": "1", "EventCode": "0xc3", "EventName": "MACHINE_CLEARS.COUNT", - "PublicDescription": "Counts the number of machine clears (nukes) of any type. Available PDIST counters: 0", + "PublicDescription": "Counts the number of machine clears (nukes) of any type.", "SampleAfterValue": "100003", "UMask": "0x1" }, @@ -733,7 +709,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc3", "EventName": "MACHINE_CLEARS.SMC", - "PublicDescription": "Counts self-modifying code (SMC) detected, which causes a machine clear. Available PDIST counters: 0", + "PublicDescription": "Counts self-modifying code (SMC) detected, which causes a machine clear.", "SampleAfterValue": "100003", "UMask": "0x4" }, @@ -742,7 +718,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xe0", "EventName": "MISC2_RETIRED.LFENCE", - "PublicDescription": "number of LFENCE retired instructions Available PDIST counters: 0", + "PublicDescription": "number of LFENCE retired instructions", "SampleAfterValue": "400009", "UMask": "0x20" }, @@ -751,7 +727,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xcc", "EventName": "MISC_RETIRED.LBR_INSERTS", - "PublicDescription": "Increments when an entry is added to the Last Branch Record (LBR) array (or removed from the array in case of RETURNs in call stack mode). The event requires LBR enable via IA32_DEBUGCTL MSR and branch type selection via MSR_LBR_SELECT. Available PDIST counters: 0", + "PublicDescription": "Increments when an entry is added to the Last Branch Record (LBR) array (or removed from the array in case of RETURNs in call stack mode). The event requires LBR enable via IA32_DEBUGCTL MSR and branch type selection via MSR_LBR_SELECT.", "SampleAfterValue": "100003", "UMask": "0x20" }, @@ -760,7 +736,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa2", "EventName": "RESOURCE_STALLS.SB", - "PublicDescription": "Counts allocation stall cycles caused by the store buffer (SB) being full. This counts cycles that the pipeline back-end blocked uop delivery from the front-end. Available PDIST counters: 0", + "PublicDescription": "Counts allocation stall cycles caused by the store buffer (SB) being full. This counts cycles that the pipeline back-end blocked uop delivery from the front-end.", "SampleAfterValue": "100003", "UMask": "0x8" }, @@ -769,7 +745,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa2", "EventName": "RESOURCE_STALLS.SCOREBOARD", - "PublicDescription": "Counts cycles where the pipeline is stalled due to serializing operations. Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -778,7 +753,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa5", "EventName": "RS.EMPTY", - "PublicDescription": "Counts cycles during which the reservation station (RS) is empty for this logical processor. This is usually caused when the front-end pipeline runs into starvation periods (e.g. branch mispredictions or i-cache misses) Available PDIST counters: 0", + "PublicDescription": "Counts cycles during which the reservation station (RS) is empty for this logical processor. This is usually caused when the front-end pipeline runs into starvation periods (e.g. branch mispredictions or i-cache misses)", "SampleAfterValue": "1000003", "UMask": "0x7" }, @@ -790,7 +765,7 @@ "EventCode": "0xa5", "EventName": "RS.EMPTY_COUNT", "Invert": "1", - "PublicDescription": "Counts end of periods where the Reservation Station (RS) was empty. Could be useful to closely sample on front-end latency issues (see the FRONTEND_RETIRED event of designated precise events) Available PDIST counters: 0", + "PublicDescription": "Counts end of periods where the Reservation Station (RS) was empty. Could be useful to closely sample on front-end latency issues (see the FRONTEND_RETIRED event of designated precise events)", "SampleAfterValue": "100003", "UMask": "0x7" }, @@ -799,7 +774,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa5", "EventName": "RS.EMPTY_RESOURCE", - "PublicDescription": "Cycles when Reservation Station (RS) is empty due to a resource in the back-end Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -812,7 +786,6 @@ "EventCode": "0xa5", "EventName": "RS_EMPTY.COUNT", "Invert": "1", - "PublicDescription": "This event is deprecated. Refer to new event RS.EMPTY_COUNT Available PDIST counters: 0", "SampleAfterValue": "100003", "UMask": "0x7" }, @@ -822,7 +795,6 @@ "Deprecated": "1", "EventCode": "0xa5", "EventName": "RS_EMPTY.CYCLES", - "PublicDescription": "This event is deprecated. Refer to new event RS.EMPTY Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x7" }, @@ -831,7 +803,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa4", "EventName": "TOPDOWN.BACKEND_BOUND_SLOTS", - "PublicDescription": "Number of slots in TMA method where no micro-operations were being issued from front-end to back-end of the machine due to lack of back-end resources. Available PDIST counters: 0", + "PublicDescription": "Number of slots in TMA method where no micro-operations were being issued from front-end to back-end of the machine due to lack of back-end resources.", "SampleAfterValue": "10000003", "UMask": "0x2" }, @@ -840,7 +812,7 @@ "Counter": "0", "EventCode": "0xa4", "EventName": "TOPDOWN.BAD_SPEC_SLOTS", - "PublicDescription": "Number of slots of TMA method that were wasted due to incorrect speculation. It covers all types of control-flow or data-related mis-speculations. Available PDIST counters: 0", + "PublicDescription": "Number of slots of TMA method that were wasted due to incorrect speculation. It covers all types of control-flow or data-related mis-speculations.", "SampleAfterValue": "10000003", "UMask": "0x4" }, @@ -849,7 +821,7 @@ "Counter": "0", "EventCode": "0xa4", "EventName": "TOPDOWN.BR_MISPREDICT_SLOTS", - "PublicDescription": "Number of TMA slots that were wasted due to incorrect speculation by (any type of) branch mispredictions. This event estimates number of speculative operations that were issued but not retired as well as the out-of-order engine recovery past a branch misprediction. Available PDIST counters: 0", + "PublicDescription": "Number of TMA slots that were wasted due to incorrect speculation by (any type of) branch mispredictions. This event estimates number of speculative operations that were issued but not retired as well as the out-of-order engine recovery past a branch misprediction.", "SampleAfterValue": "10000003", "UMask": "0x8" }, @@ -858,7 +830,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa4", "EventName": "TOPDOWN.MEMORY_BOUND_SLOTS", - "PublicDescription": "TOPDOWN.MEMORY_BOUND_SLOTS Available PDIST counters: 0", "SampleAfterValue": "10000003", "UMask": "0x10" }, @@ -875,7 +846,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xa4", "EventName": "TOPDOWN.SLOTS_P", - "PublicDescription": "Counts the number of available slots for an unhalted logical processor. The event increments by machine-width of the narrowest pipeline as employed by the Top-down Microarchitecture Analysis method. The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core. Available PDIST counters: 0", + "PublicDescription": "Counts the number of available slots for an unhalted logical processor. The event increments by machine-width of the narrowest pipeline as employed by the Top-down Microarchitecture Analysis method. The count is distributed among unhalted logical processors (hyper-threads) who share the same physical core.", "SampleAfterValue": "10000003", "UMask": "0x1" }, @@ -884,7 +855,6 @@ "Counter": "0,1,2,3", "EventCode": "0x76", "EventName": "UOPS_DECODED.DEC0_UOPS", - "PublicDescription": "UOPS_DECODED.DEC0_UOPS Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x1" }, @@ -893,7 +863,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_0", - "PublicDescription": "Number of uops dispatch to execution port 0. Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution port 0.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -902,7 +872,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_1", - "PublicDescription": "Number of uops dispatch to execution port 1. Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution port 1.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -911,7 +881,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_2_3_10", - "PublicDescription": "Number of uops dispatch to execution ports 2, 3 and 10 Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution ports 2, 3 and 10", "SampleAfterValue": "2000003", "UMask": "0x4" }, @@ -920,7 +890,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_4_9", - "PublicDescription": "Number of uops dispatch to execution ports 4 and 9 Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution ports 4 and 9", "SampleAfterValue": "2000003", "UMask": "0x10" }, @@ -929,7 +899,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_5_11", - "PublicDescription": "Number of uops dispatch to execution ports 5 and 11 Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution ports 5 and 11", "SampleAfterValue": "2000003", "UMask": "0x20" }, @@ -938,7 +908,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_6", - "PublicDescription": "Number of uops dispatch to execution port 6. Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution port 6.", "SampleAfterValue": "2000003", "UMask": "0x40" }, @@ -947,7 +917,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb2", "EventName": "UOPS_DISPATCHED.PORT_7_8", - "PublicDescription": "Number of uops dispatch to execution ports 7 and 8. Available PDIST counters: 0", + "PublicDescription": "Number of uops dispatch to execution ports 7 and 8.", "SampleAfterValue": "2000003", "UMask": "0x80" }, @@ -956,7 +926,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CORE", - "PublicDescription": "Counts the number of uops executed from any thread. Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops executed from any thread.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -966,7 +936,7 @@ "CounterMask": "1", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_1", - "PublicDescription": "Counts cycles when at least 1 micro-op is executed from any thread on physical core. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least 1 micro-op is executed from any thread on physical core.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -976,7 +946,7 @@ "CounterMask": "2", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_2", - "PublicDescription": "Counts cycles when at least 2 micro-ops are executed from any thread on physical core. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least 2 micro-ops are executed from any thread on physical core.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -986,7 +956,7 @@ "CounterMask": "3", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_3", - "PublicDescription": "Counts cycles when at least 3 micro-ops are executed from any thread on physical core. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least 3 micro-ops are executed from any thread on physical core.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -996,7 +966,7 @@ "CounterMask": "4", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CORE_CYCLES_GE_4", - "PublicDescription": "Counts cycles when at least 4 micro-ops are executed from any thread on physical core. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least 4 micro-ops are executed from any thread on physical core.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -1006,7 +976,7 @@ "CounterMask": "1", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CYCLES_GE_1", - "PublicDescription": "Cycles where at least 1 uop was executed per-thread. Available PDIST counters: 0", + "PublicDescription": "Cycles where at least 1 uop was executed per-thread.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1016,7 +986,7 @@ "CounterMask": "2", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CYCLES_GE_2", - "PublicDescription": "Cycles where at least 2 uops were executed per-thread. Available PDIST counters: 0", + "PublicDescription": "Cycles where at least 2 uops were executed per-thread.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1026,7 +996,7 @@ "CounterMask": "3", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CYCLES_GE_3", - "PublicDescription": "Cycles where at least 3 uops were executed per-thread. Available PDIST counters: 0", + "PublicDescription": "Cycles where at least 3 uops were executed per-thread.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1036,7 +1006,7 @@ "CounterMask": "4", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.CYCLES_GE_4", - "PublicDescription": "Cycles where at least 4 uops were executed per-thread. Available PDIST counters: 0", + "PublicDescription": "Cycles where at least 4 uops were executed per-thread.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1047,7 +1017,7 @@ "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.STALLS", "Invert": "1", - "PublicDescription": "Counts cycles during which no uops were dispatched from the Reservation Station (RS) per thread. Available PDIST counters: 0", + "PublicDescription": "Counts cycles during which no uops were dispatched from the Reservation Station (RS) per thread.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1059,7 +1029,6 @@ "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.STALL_CYCLES", "Invert": "1", - "PublicDescription": "This event is deprecated. Refer to new event UOPS_EXECUTED.STALLS Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1068,7 +1037,6 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.THREAD", - "PublicDescription": "Counts the number of uops to be executed per-thread each cycle. Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1077,7 +1045,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xb1", "EventName": "UOPS_EXECUTED.X87", - "PublicDescription": "Counts the number of x87 uops executed. Available PDIST counters: 0", + "PublicDescription": "Counts the number of x87 uops executed.", "SampleAfterValue": "2000003", "UMask": "0x10" }, @@ -1086,7 +1054,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xae", "EventName": "UOPS_ISSUED.ANY", - "PublicDescription": "Counts the number of uops that the Resource Allocation Table (RAT) issues to the Reservation Station (RS). Available PDIST counters: 0", + "PublicDescription": "Counts the number of uops that the Resource Allocation Table (RAT) issues to the Reservation Station (RS).", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1096,7 +1064,6 @@ "CounterMask": "1", "EventCode": "0xae", "EventName": "UOPS_ISSUED.CYCLES", - "PublicDescription": "UOPS_ISSUED.CYCLES Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1106,7 +1073,7 @@ "CounterMask": "1", "EventCode": "0xc2", "EventName": "UOPS_RETIRED.CYCLES", - "PublicDescription": "Counts cycles where at least one uop has retired. Available PDIST counters: 0", + "PublicDescription": "Counts cycles where at least one uop has retired.", "SampleAfterValue": "1000003", "UMask": "0x2" }, @@ -1115,7 +1082,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc2", "EventName": "UOPS_RETIRED.HEAVY", - "PublicDescription": "Counts the number of retired micro-operations (uops) except the last uop of each instruction. An instruction that is decoded into less than two uops does not contribute to the count. Available PDIST counters: 0", + "PublicDescription": "Counts the number of retired micro-operations (uops) except the last uop of each instruction. An instruction that is decoded into less than two uops does not contribute to the count.", "SampleAfterValue": "2000003", "UMask": "0x1" }, @@ -1126,7 +1093,6 @@ "EventName": "UOPS_RETIRED.MS", "MSRIndex": "0x3F7", "MSRValue": "0x8", - "PublicDescription": "UOPS_RETIRED.MS Available PDIST counters: 0", "SampleAfterValue": "2000003", "UMask": "0x4" }, @@ -1135,7 +1101,7 @@ "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xc2", "EventName": "UOPS_RETIRED.SLOTS", - "PublicDescription": "Counts the retirement slots used each cycle. Available PDIST counters: 0", + "PublicDescription": "Counts the retirement slots used each cycle.", "SampleAfterValue": "2000003", "UMask": "0x2" }, @@ -1146,7 +1112,7 @@ "EventCode": "0xc2", "EventName": "UOPS_RETIRED.STALLS", "Invert": "1", - "PublicDescription": "This event counts cycles without actually retired uops. Available PDIST counters: 0", + "PublicDescription": "This event counts cycles without actually retired uops.", "SampleAfterValue": "1000003", "UMask": "0x2" }, @@ -1158,7 +1124,6 @@ "EventCode": "0xc2", "EventName": "UOPS_RETIRED.STALL_CYCLES", "Invert": "1", - "PublicDescription": "This event is deprecated. Refer to new event UOPS_RETIRED.STALLS Available PDIST counters: 0", "SampleAfterValue": "1000003", "UMask": "0x2" } diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json index fe3f288be10e..b6a28227a58a 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/spr-metrics.json @@ -1,28 +1,28 @@ [ { "BriefDescription": "C1 residency percent per core", - "MetricExpr": "cstate_core@c1\\-residency@ / TSC", + "MetricExpr": "cstate_core@c1\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C1_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C2 residency percent per package", - "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c2\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C2_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per core", - "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricExpr": "cstate_core@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per package", - "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Pkg_Residency", "ScaleUnit": "100%" @@ -40,6 +40,18 @@ "ScaleUnit": "1per_instr" }, { + "BriefDescription": "The average number of cores that are in cstate C0 as observed by the power control unit (PCU)", + "MetricExpr": "UNC_P_POWER_STATE_OCCUPANCY_CORES_C0 / UNC_P_CLOCKTICKS * #num_packages", + "MetricGroup": "cpu_cstate", + "MetricName": "cpu_cstate_c0" + }, + { + "BriefDescription": "The average number of cores are in cstate C6 as observed by the power control unit (PCU)", + "MetricExpr": "UNC_P_POWER_STATE_OCCUPANCY_CORES_C6 / UNC_P_CLOCKTICKS * #num_packages", + "MetricGroup": "cpu_cstate", + "MetricName": "cpu_cstate_c6" + }, + { "BriefDescription": "CPU operating frequency (in GHz)", "MetricExpr": "CPU_CLK_UNHALTED.THREAD / CPU_CLK_UNHALTED.REF_TSC * #SYSTEM_TSC_FREQ / 1e9", "MetricName": "cpu_operating_frequency", @@ -91,6 +103,12 @@ "ScaleUnit": "1MB/s" }, { + "BriefDescription": "Bandwidth of inbound IO reads that are initiated by end device controllers that are requesting memory from the CPU and miss the L3 cache", + "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_MISS_PCIRDCUR * 64 / 1e6 / duration_time", + "MetricName": "io_bandwidth_read_l3_miss", + "ScaleUnit": "1MB/s" + }, + { "BriefDescription": "Bandwidth of IO reads that are initiated by end device controllers that are requesting memory from the local CPU socket", "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_PCIRDCUR_LOCAL * 64 / 1e6 / duration_time", "MetricName": "io_bandwidth_read_local", @@ -109,6 +127,12 @@ "ScaleUnit": "1MB/s" }, { + "BriefDescription": "Bandwidth of inbound IO writes that are initiated by end device controllers that are writing memory to the CPU", + "MetricExpr": "(UNC_CHA_TOR_INSERTS.IO_MISS_ITOM + UNC_CHA_TOR_INSERTS.IO_MISS_ITOMCACHENEAR) * 64 / 1e6 / duration_time", + "MetricName": "io_bandwidth_write_l3_miss", + "ScaleUnit": "1MB/s" + }, + { "BriefDescription": "Bandwidth of IO writes that are initiated by end device controllers that are writing memory to the local CPU socket", "MetricExpr": "(UNC_CHA_TOR_INSERTS.IO_ITOM_LOCAL + UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR_LOCAL) * 64 / 1e6 / duration_time", "MetricName": "io_bandwidth_write_local", @@ -123,19 +147,19 @@ { "BriefDescription": "Percentage of inbound full cacheline writes initiated by end device controllers that miss the L3 cache", "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_MISS_ITOM / UNC_CHA_TOR_INSERTS.IO_ITOM", - "MetricName": "io_percent_of_inbound_full_writes_that_miss_l3", + "MetricName": "io_full_write_l3_miss", "ScaleUnit": "100%" }, { "BriefDescription": "Percentage of inbound partial cacheline writes initiated by end device controllers that miss the L3 cache", "MetricExpr": "(UNC_CHA_TOR_INSERTS.IO_MISS_ITOMCACHENEAR + UNC_CHA_TOR_INSERTS.IO_MISS_RFO) / (UNC_CHA_TOR_INSERTS.IO_ITOMCACHENEAR + UNC_CHA_TOR_INSERTS.IO_RFO)", - "MetricName": "io_percent_of_inbound_partial_writes_that_miss_l3", + "MetricName": "io_partial_write_l3_miss", "ScaleUnit": "100%" }, { "BriefDescription": "Percentage of inbound reads initiated by end device controllers that miss the L3 cache", "MetricExpr": "UNC_CHA_TOR_INSERTS.IO_MISS_PCIRDCUR / UNC_CHA_TOR_INSERTS.IO_PCIRDCUR", - "MetricName": "io_percent_of_inbound_reads_that_miss_l3", + "MetricName": "io_read_l3_miss", "ScaleUnit": "100%" }, { @@ -395,7 +419,7 @@ { "BriefDescription": "This category represents fraction of slots where no uops are being delivered due to a lack of required resources for accepting new uops in the Backend", "DefaultMetricgroupName": "TopdownL1", - "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", + "MetricExpr": "topdown\\-be\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound)", "MetricGroup": "BvOB;Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_backend_bound", "MetricThreshold": "tma_backend_bound > 0.2", @@ -430,39 +454,39 @@ "PublicDescription": "Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound)" }, { + "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", + "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * tma_amx_busy / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", + "MetricGroup": "BvCB;Cor;tma_issueComp", + "MetricName": "tma_bottleneck_compute_bound_est", + "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", + "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " + }, + { "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks", - "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", "MetricGroup": "BvMB;Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_bottleneck_cache_memory_bandwidth", - "MetricThreshold": "tma_bottleneck_cache_memory_bandwidth > 20", + "MetricName": "tma_bottleneck_data_cache_memory_bandwidth", + "MetricThreshold": "tma_bottleneck_data_cache_memory_bandwidth > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks", - "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_dependency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_lock_latency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_loads / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_stores / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_l1_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_dependency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_lock_latency / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_loads / (tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_stores / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)) + tma_memory_bound * (tma_store_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "BvML;Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_bottleneck_cache_memory_latency", - "MetricThreshold": "tma_bottleneck_cache_memory_latency > 20", + "MetricName": "tma_bottleneck_data_cache_memory_latency", + "MetricThreshold": "tma_bottleneck_data_cache_memory_latency > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency" }, { - "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", - "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * tma_amx_busy / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", - "MetricGroup": "BvCB;Cor;tma_issueComp", - "MetricName": "tma_bottleneck_compute_bound_est", - "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", - "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " - }, - { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end)", - "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - (1 - INST_RETIRED.REP_ITERATION / cpu@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_ms / (tma_dsb + tma_mite + tma_ms))) - tma_bottleneck_big_code", + "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - (1 - INST_RETIRED.REP_ITERATION / cpu@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_ms)) - tma_bottleneck_big_code", "MetricGroup": "BvFB;Fed;FetchBW;Frontend", "MetricName": "tma_bottleneck_instruction_fetch_bw", "MetricThreshold": "tma_bottleneck_instruction_fetch_bw > 20" }, { "BriefDescription": "Total pipeline cost of irregular execution (e.g", - "MetricExpr": "100 * ((1 - INST_RETIRED.REP_ITERATION / cpu@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_ms / (tma_dsb + tma_mite + tma_ms)) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + RS.EMPTY_RESOURCE / tma_info_thread_clks * tma_ports_utilized_0) / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricExpr": "100 * ((1 - INST_RETIRED.REP_ITERATION / cpu@UOPS_RETIRED.MS\\,cmask\\=1@) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * tma_other_mispredicts / tma_branch_mispredicts) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_ms) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + RS.EMPTY_RESOURCE / tma_info_thread_clks * tma_ports_utilized_0) / (tma_amx_busy + tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", "MetricGroup": "Bad;BvIO;Cor;Ret;tma_issueMS", "MetricName": "tma_bottleneck_irregular_overhead", "MetricThreshold": "tma_bottleneck_irregular_overhead > 10", @@ -470,7 +494,7 @@ }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", - "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", + "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "BvMT;Mem;MemoryTLB;Offcore;tma_issueTLB", "MetricName": "tma_bottleneck_memory_data_tlbs", "MetricThreshold": "tma_bottleneck_memory_data_tlbs > 20", @@ -478,7 +502,7 @@ }, { "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)", - "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) * tma_remote_cache / (tma_local_mem + tma_remote_cache + tma_remote_mem) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))", + "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) * tma_remote_cache / (tma_local_mem + tma_remote_cache + tma_remote_mem) + tma_l3_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_cxl_mem_bound + tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))", "MetricGroup": "BvMS;LockCont;Mem;Offcore;tma_issueSyncxn", "MetricName": "tma_bottleneck_memory_synchronization", "MetricThreshold": "tma_bottleneck_memory_synchronization > 10", @@ -494,7 +518,7 @@ }, { "BriefDescription": "Total pipeline cost of remaining bottlenecks in the back-end", - "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_cache_memory_bandwidth + tma_bottleneck_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", + "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_data_cache_memory_bandwidth + tma_bottleneck_data_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", "MetricGroup": "BvOB;Cor;Offcore", "MetricName": "tma_bottleneck_other_bottlenecks", "MetricThreshold": "tma_bottleneck_other_bottlenecks > 20", @@ -510,7 +534,7 @@ { "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Branch Misprediction", "DefaultMetricgroupName": "TopdownL2", - "MetricExpr": "topdown\\-br\\-mispredict / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", + "MetricExpr": "topdown\\-br\\-mispredict / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound)", "MetricGroup": "BadSpec;BrMispredicts;BvMP;Default;TmaL2;TopdownL2;tma_L2_group;tma_bad_speculation_group;tma_issueBM", "MetricName": "tma_branch_mispredicts", "MetricThreshold": "tma_branch_mispredicts > 0.1 & tma_bad_speculation > 0.15", @@ -611,7 +635,6 @@ }, { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to contested accesses", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(76.6 * tma_info_system_core_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) + 74.6 * tma_info_system_core_frequency * MEM_LOAD_L3_HIT_RETIRED.XSNP_MISS) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "BvMS;DataSharing;LockCont;Offcore;Snoop;TopdownL4;tma_L4_group;tma_issueSyncxn;tma_l3_bound_group", "MetricName": "tma_contested_accesses", @@ -631,6 +654,15 @@ "ScaleUnit": "100%" }, { + "BriefDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external CXL Memory by loads (e.g", + "MetricExpr": "(((1 - ((19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS))) / (19 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + 10 * (MEM_LOAD_L3_MISS_RETIRED.LOCAL_DRAM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_FWD * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) + MEM_LOAD_L3_MISS_RETIRED.REMOTE_HITM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS)) + (25 * (MEM_LOAD_RETIRED.LOCAL_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0) + 33 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS) if #has_pmem > 0 else 0))) if #has_pmem > 0 else 1)) * (MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks) if 1e6 * (MEM_LOAD_L3_MISS_RETIRED.REMOTE_PMM + MEM_LOAD_RETIRED.LOCAL_PMM) > MEM_LOAD_RETIRED.L1_MISS else 0) if #has_pmem > 0 else 0)", + "MetricGroup": "MemoryBound;Server;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", + "MetricName": "tma_cxl_mem_bound", + "MetricThreshold": "tma_cxl_mem_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", + "PublicDescription": "This metric roughly estimates (based on idle latencies) how often the CPU was stalled on accesses to external CXL Memory by loads (e.g. 3D-Xpoint (Crystal Ridge, a.k.a. IXP) memory, PMM - Persistent Memory Module [from CLX to SPR] or any other CXL Type3 Memory [EMR onwards]).", + "ScaleUnit": "100%" + }, + { "BriefDescription": "This metric estimates fraction of cycles while the memory subsystem was handling synchronizations due to data-sharing accesses", "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "74.6 * tma_info_system_core_frequency * (MEM_LOAD_L3_HIT_RETIRED.XSNP_NO_FWD + MEM_LOAD_L3_HIT_RETIRED.XSNP_FWD * (1 - OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM / (OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HITM + OCR.DEMAND_DATA_RD.L3_HIT.SNOOP_HIT_WITH_FWD))) * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", @@ -660,7 +692,7 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled on accesses to external memory (DRAM) by loads", - "MetricExpr": "MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks", + "MetricExpr": "(MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks - tma_cxl_mem_bound if #has_pmem > 0 else MEMORY_ACTIVITY.STALLS_L3_MISS / tma_info_thread_clks)", "MetricGroup": "MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_dram_bound", "MetricThreshold": "tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2)", @@ -718,7 +750,7 @@ "MetricGroup": "BvMB;MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -846,7 +878,7 @@ { "BriefDescription": "This metric represents fraction of slots where the CPU was retiring heavy-weight operations -- instructions that require two or more uops or micro-coded sequences", "DefaultMetricgroupName": "TopdownL2", - "MetricExpr": "topdown\\-heavy\\-ops / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", + "MetricExpr": "topdown\\-heavy\\-ops / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound)", "MetricGroup": "Default;Retire;TmaL2;TopdownL2;tma_L2_group;tma_retiring_group", "MetricName": "tma_heavy_operations", "MetricThreshold": "tma_heavy_operations > 0.1", @@ -1357,19 +1389,19 @@ { "BriefDescription": "Off-core accesses per kilo instruction for modified write requests", "MetricExpr": "1e3 * OCR.MODIFIED_WRITE.ANY_RESPONSE / tma_info_inst_mix_instructions", - "MetricGroup": "Offcore", + "MetricGroup": "Offcore;Server", "MetricName": "tma_info_memory_mix_offcore_mwrite_any_pki" }, { "BriefDescription": "Off-core accesses per kilo instruction for reads-to-core requests (speculative; including in-core HW prefetches)", "MetricExpr": "1e3 * OCR.READS_TO_CORE.ANY_RESPONSE / tma_info_inst_mix_instructions", - "MetricGroup": "CacheHits;Offcore", + "MetricGroup": "CacheHits;Offcore;Server", "MetricName": "tma_info_memory_mix_offcore_read_any_pki" }, { "BriefDescription": "L3 cache misses per kilo instruction for reads-to-core requests (speculative; including in-core HW prefetches)", "MetricExpr": "1e3 * OCR.READS_TO_CORE.L3_MISS / tma_info_inst_mix_instructions", - "MetricGroup": "Offcore", + "MetricGroup": "Offcore;Server", "MetricName": "tma_info_memory_mix_offcore_read_l3m_pki" }, { @@ -1395,21 +1427,21 @@ { "BriefDescription": "Average DRAM BW for Reads-to-Core (R2C) covering for memory attached to local- and remote-socket", "MetricExpr": "64 * OCR.READS_TO_CORE.DRAM / 1e9 / tma_info_system_time", - "MetricGroup": "HPC;Mem;MemoryBW;SoC", + "MetricGroup": "HPC;Mem;MemoryBW;Offcore;Server", "MetricName": "tma_info_memory_soc_r2c_dram_bw", "PublicDescription": "Average DRAM BW for Reads-to-Core (R2C) covering for memory attached to local- and remote-socket. See R2C_Offcore_BW." }, { "BriefDescription": "Average L3-cache miss BW for Reads-to-Core (R2C)", "MetricExpr": "64 * OCR.READS_TO_CORE.L3_MISS / 1e9 / tma_info_system_time", - "MetricGroup": "HPC;Mem;MemoryBW;SoC", + "MetricGroup": "HPC;Mem;MemoryBW;Offcore;Server", "MetricName": "tma_info_memory_soc_r2c_l3m_bw", "PublicDescription": "Average L3-cache miss BW for Reads-to-Core (R2C). This covering going to DRAM or other memory off-chip memory tears. See R2C_Offcore_BW." }, { "BriefDescription": "Average Off-core access BW for Reads-to-Core (R2C)", "MetricExpr": "64 * OCR.READS_TO_CORE.ANY_RESPONSE / 1e9 / tma_info_system_time", - "MetricGroup": "HPC;Mem;MemoryBW;SoC", + "MetricGroup": "HPC;Mem;MemoryBW;Offcore;Server", "MetricName": "tma_info_memory_soc_r2c_offcore_bw", "PublicDescription": "Average Off-core access BW for Reads-to-Core (R2C). R2C account for demand or prefetch load/RFO/code access that fill data into the Core caches." }, @@ -1439,7 +1471,7 @@ "MetricName": "tma_info_memory_tlb_store_stlb_mpki" }, { - "BriefDescription": "", + "BriefDescription": "Mem;Backend;CacheHits", "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" @@ -1486,7 +1518,7 @@ }, { "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / tma_info_system_time", + "MetricExpr": "tma_info_system_turbo_utilization * msr@tsc@ / 1e9 / tma_info_system_time", "MetricGroup": "Power;Summary", "MetricName": "tma_info_system_core_frequency" }, @@ -1498,16 +1530,28 @@ }, { "BriefDescription": "Average number of utilized CPUs", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", "MetricGroup": "Summary", "MetricName": "tma_info_system_cpus_utilized" }, { + "BriefDescription": "Average 3DXP Memory Bandwidth Use for reads [GB / sec]", + "MetricExpr": "(64 * UNC_M_PMM_RPQ_INSERTS / 1e9 / tma_info_system_time if #has_pmem > 0 else 0)", + "MetricGroup": "MemOffcore;MemoryBW;Server;SoC", + "MetricName": "tma_info_system_cxl_mem_read_bw" + }, + { + "BriefDescription": "Average 3DXP Memory Bandwidth Use for Writes [GB / sec]", + "MetricExpr": "(64 * UNC_M_PMM_WPQ_INSERTS / 1e9 / tma_info_system_time if #has_pmem > 0 else 0)", + "MetricGroup": "MemOffcore;MemoryBW;Server;SoC", + "MetricName": "tma_info_system_cxl_mem_write_bw" + }, + { "BriefDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]", "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / tma_info_system_time", "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_mem_bandwidth, tma_sq_full" + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Giga Floating Point Operations Per Second", @@ -1572,8 +1616,14 @@ "PublicDescription": "Average number of parallel data read requests to external memory. Accounts for demand loads and L1/L2 prefetches" }, { + "BriefDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]", + "MetricExpr": "(1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD_PMM / UNC_CHA_TOR_INSERTS.IA_MISS_DRD_PMM) / uncore_cha_0@event\\=0x1@ if #has_pmem > 0 else 0)", + "MetricGroup": "MemOffcore;MemoryLat;Server;SoC", + "MetricName": "tma_info_system_mem_pmm_read_latency", + "PublicDescription": "Average latency of data read request to external 3D X-Point memory [in nanoseconds]. Accounts for demand loads and L1/L2 data-read prefetches" + }, + { "BriefDescription": "Average latency of data read request to external memory (in nanoseconds)", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "1e9 * (UNC_CHA_TOR_OCCUPANCY.IA_MISS_DRD / UNC_CHA_TOR_INSERTS.IA_MISS_DRD) / (tma_info_system_socket_clks / tma_info_system_time)", "MetricGroup": "Mem;MemoryLat;SoC", "MetricName": "tma_info_system_mem_read_latency", @@ -1734,12 +1784,12 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", + "BriefDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", "MetricExpr": "min(2 * (MEM_INST_RETIRED.ALL_LOADS - MEM_LOAD_RETIRED.FB_HIT - MEM_LOAD_RETIRED.L1_MISS) * 20 / 100, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - MEMORY_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_thread_clks", "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_l1_latency_dependency", "MetricThreshold": "tma_l1_latency_dependency > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", + "PublicDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", "ScaleUnit": "100%" }, { @@ -1753,7 +1803,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles with demand load accesses that hit the L2 cache under unloaded scenarios (possibly L2 latency limited)", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "4.4 * tma_info_system_core_frequency * MEM_LOAD_RETIRED.L2_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2) / tma_info_thread_clks", "MetricGroup": "MemoryLat;TopdownL4;tma_L4_group;tma_l2_bound_group", "MetricName": "tma_l2_hit_latency", @@ -1772,12 +1821,11 @@ }, { "BriefDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited)", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "32.6 * tma_info_system_core_frequency * (MEM_LOAD_RETIRED.L3_HIT * (1 + MEM_LOAD_RETIRED.FB_HIT / MEM_LOAD_RETIRED.L1_MISS / 2)) / tma_info_thread_clks", "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_cache_memory_latency, tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_data_cache_memory_latency, tma_mem_latency", "ScaleUnit": "100%" }, { @@ -1860,6 +1908,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(16 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (10 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_thread_clks", "MetricGroup": "LockCont;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", "MetricName": "tma_lock_latency", @@ -1892,7 +1941,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { @@ -1901,13 +1950,13 @@ "MetricGroup": "BvML;MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%" }, { "BriefDescription": "This metric represents fraction of slots the Memory subsystem within the Backend was a bottleneck", "DefaultMetricgroupName": "TopdownL2", - "MetricExpr": "topdown\\-mem\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", + "MetricExpr": "topdown\\-mem\\-bound / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound)", "MetricGroup": "Backend;Default;TmaL2;TopdownL2;tma_L2_group;tma_backend_bound_group", "MetricName": "tma_memory_bound", "MetricThreshold": "tma_memory_bound > 0.2 & tma_backend_bound > 0.2", @@ -1917,7 +1966,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to LFENCE Instructions.", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "13 * MISC2_RETIRED.LFENCE / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group", "MetricName": "tma_memory_fence", @@ -1970,7 +2018,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the Microcode Sequencer (MS) unit - see Microcode_Sequencer node for details.", - "MetricExpr": "max(IDQ.MS_CYCLES_ANY, cpu@UOPS_RETIRED.MS\\,cmask\\=1@ / (UOPS_RETIRED.SLOTS / UOPS_ISSUED.ANY)) / tma_info_core_core_clks / 2", + "MetricExpr": "max(IDQ.MS_CYCLES_ANY, cpu@UOPS_RETIRED.MS\\,cmask\\=1@ / (UOPS_RETIRED.SLOTS / UOPS_ISSUED.ANY)) / tma_info_core_core_clks / 2.4", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_ms", "MetricThreshold": "tma_ms > 0.05 & tma_fetch_bandwidth > 0.2", @@ -2005,6 +2053,7 @@ }, { "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_int_operations + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches))", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_other_light_ops", @@ -2066,6 +2115,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_3_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIV_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_3_PORTS_UTIL) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", @@ -2075,6 +2125,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed no uops on any execution port (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricConstraint": "NO_THRESHOLD_AND_NMI", "MetricExpr": "(EXE_ACTIVITY.EXE_BOUND_0_PORTS + max(RS.EMPTY_RESOURCE - RESOURCE_STALLS.SCOREBOARD, 0)) / tma_info_thread_clks * (CYCLE_ACTIVITY.STALLS_TOTAL - EXE_ACTIVITY.BOUND_ON_LOADS) / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_0", @@ -2084,6 +2135,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles where the CPU executed total of 1 uop per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", + "MetricConstraint": "NO_THRESHOLD_AND_NMI", "MetricExpr": "EXE_ACTIVITY.1_PORTS_UTIL / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issueL1;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_1", @@ -2093,7 +2145,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 2 uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "EXE_ACTIVITY.2_PORTS_UTIL / tma_info_thread_clks", "MetricGroup": "PortsUtil;TopdownL4;tma_L4_group;tma_issue2P;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_2", @@ -2103,7 +2154,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles CPU executed total of 3 or more uops per cycle on all execution ports (Logical Processor cycles since ICL, Physical Core cycles otherwise)", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "UOPS_EXECUTED.CYCLES_GE_3 / tma_info_thread_clks", "MetricGroup": "BvCB;PortsUtil;TopdownL4;tma_L4_group;tma_ports_utilization_group", "MetricName": "tma_ports_utilized_3m", @@ -2132,7 +2182,7 @@ { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", "DefaultMetricgroupName": "TopdownL1", - "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", + "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound)", "MetricGroup": "BvUW;Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -2160,7 +2210,6 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to PAUSE Instructions", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "CPU_CLK_UNHALTED.PAUSE / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_serializing_operation_group", "MetricName": "tma_slow_pause", @@ -2192,7 +2241,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-cache.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-cache.json index a38db3e253f2..1bdda3c3ccbf 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-cache.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-cache.json @@ -312,6 +312,17 @@ "Unit": "CHA" }, { + "BriefDescription": "Distress signal asserted : DPT Remote", + "Counter": "0,1,2,3", + "EventCode": "0xaf", + "EventName": "UNC_CHA_DISTRESS_ASSERTED.DPT_NONLOCAL", + "Experimental": "1", + "PerPkg": "1", + "PublicDescription": "Distress signal asserted : DPT Remote : Counts the number of cycles either the local or incoming distress signals are asserted. : Dynamic Prefetch Throttle received by this tile", + "UMask": "0x8", + "Unit": "CHA" + }, + { "BriefDescription": "Egress Blocking due to Ordering requirements : Down", "Counter": "0,1,2,3", "EventCode": "0xba", diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-memory.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-memory.json index 68be01dad7c9..30044177ccf8 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-memory.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-memory.json @@ -2770,6 +2770,88 @@ "Unit": "iMC" }, { + "BriefDescription": "Number of DRAM Refreshes Issued", + "Counter": "0,1,2,3", + "EventCode": "0x45", + "EventName": "UNC_M_DRAM_REFRESH.HIGH", + "Experimental": "1", + "PerPkg": "1", + "PublicDescription": "Number of DRAM Refreshes Issued : Counts the number of refreshes issued.", + "UMask": "0x24", + "Unit": "iMC" + }, + { + "BriefDescription": "Number of DRAM Refreshes Issued", + "Counter": "0,1,2,3", + "EventCode": "0x45", + "EventName": "UNC_M_DRAM_REFRESH.HIGH_ALL", + "Experimental": "1", + "PerPkg": "1", + "UMask": "0x24", + "Unit": "iMC" + }, + { + "BriefDescription": "Number of DRAM Refreshes Issued", + "Counter": "0,1,2,3", + "EventCode": "0x45", + "EventName": "UNC_M_DRAM_REFRESH.HIGH_PCH0", + "Experimental": "1", + "PerPkg": "1", + "UMask": "0x4", + "Unit": "iMC" + }, + { + "BriefDescription": "Number of DRAM Refreshes Issued", + "Counter": "0,1,2,3", + "EventCode": "0x45", + "EventName": "UNC_M_DRAM_REFRESH.HIGH_PCH1", + "Experimental": "1", + "PerPkg": "1", + "UMask": "0x20", + "Unit": "iMC" + }, + { + "BriefDescription": "Number of DRAM Refreshes Issued", + "Counter": "0,1,2,3", + "EventCode": "0x45", + "EventName": "UNC_M_DRAM_REFRESH.PANIC", + "Experimental": "1", + "PerPkg": "1", + "PublicDescription": "Number of DRAM Refreshes Issued : Counts the number of refreshes issued.", + "UMask": "0x12", + "Unit": "iMC" + }, + { + "BriefDescription": "Number of DRAM Refreshes Issued", + "Counter": "0,1,2,3", + "EventCode": "0x45", + "EventName": "UNC_M_DRAM_REFRESH.PANIC_ALL", + "Experimental": "1", + "PerPkg": "1", + "UMask": "0x12", + "Unit": "iMC" + }, + { + "BriefDescription": "Number of DRAM Refreshes Issued", + "Counter": "0,1,2,3", + "EventCode": "0x45", + "EventName": "UNC_M_DRAM_REFRESH.PANIC_PCH0", + "Experimental": "1", + "PerPkg": "1", + "UMask": "0x2", + "Unit": "iMC" + }, + { + "BriefDescription": "Number of DRAM Refreshes Issued", + "Counter": "0,1,2,3", + "EventCode": "0x45", + "EventName": "UNC_M_DRAM_REFRESH.PANIC_PCH1", + "Experimental": "1", + "PerPkg": "1", + "UMask": "0x10", + "Unit": "iMC" + }, + { "BriefDescription": "ECC Correctable Errors", "Counter": "0,1,2,3", "EventCode": "0x09", @@ -3048,6 +3130,28 @@ "Unit": "iMC" }, { + "BriefDescription": "Throttle Cycles for Rank 0", + "Counter": "0,1,2,3", + "EventCode": "0x46", + "EventName": "UNC_M_POWER_THROTTLE_CYCLES.SLOT0", + "Experimental": "1", + "PerPkg": "1", + "PublicDescription": "Throttle Cycles for Rank 0 : Counts the number of cycles while the iMC is being throttled by either thermal constraints or by the PCU throttling. It is not possible to distinguish between the two. This can be filtered by rank. If multiple ranks are selected and are being throttled at the same time, the counter will only increment by 1. : Thermal throttling is performed per DIMM. We support 3 DIMMs per channel. This ID allows us to filter by ID.", + "UMask": "0x1", + "Unit": "iMC" + }, + { + "BriefDescription": "Throttle Cycles for Rank 0", + "Counter": "0,1,2,3", + "EventCode": "0x46", + "EventName": "UNC_M_POWER_THROTTLE_CYCLES.SLOT1", + "Experimental": "1", + "PerPkg": "1", + "PublicDescription": "Throttle Cycles for Rank 0 : Counts the number of cycles while the iMC is being throttled by either thermal constraints or by the PCU throttling. It is not possible to distinguish between the two. This can be filtered by rank. If multiple ranks are selected and are being throttled at the same time, the counter will only increment by 1.", + "UMask": "0x2", + "Unit": "iMC" + }, + { "BriefDescription": "Precharge due to read, write, underfill, or PGT.", "Counter": "0,1,2,3", "EventCode": "0x03", diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-power.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-power.json index 9482ddaea4d1..71c35b165a3e 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-power.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/uncore-power.json @@ -178,7 +178,6 @@ "Counter": "0,1,2,3", "EventCode": "0x35", "EventName": "UNC_P_POWER_STATE_OCCUPANCY_CORES_C0", - "Experimental": "1", "PerPkg": "1", "PublicDescription": "Number of cores in C0 : This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", "Unit": "PCU" @@ -198,7 +197,6 @@ "Counter": "0,1,2,3", "EventCode": "0x37", "EventName": "UNC_P_POWER_STATE_OCCUPANCY_CORES_C6", - "Experimental": "1", "PerPkg": "1", "PublicDescription": "Number of cores in C6 : This is an occupancy event that tracks the number of cores that are in the chosen C-State. It can be used by itself to get the average number of cores in that C-state with thresholding to generate histograms, or with other PCU events and occupancy triggering to capture other details.", "Unit": "PCU" diff --git a/tools/perf/pmu-events/arch/x86/sapphirerapids/virtual-memory.json b/tools/perf/pmu-events/arch/x86/sapphirerapids/virtual-memory.json index 3d3f88600e26..609a9549cbf3 100644 --- a/tools/perf/pmu-events/arch/x86/sapphirerapids/virtual-memory.json +++ b/tools/perf/pmu-events/arch/x86/sapphirerapids/virtual-memory.json @@ -4,7 +4,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.STLB_HIT", - "PublicDescription": "Counts loads that miss the DTLB (Data TLB) and hit the STLB (Second level TLB). Available PDIST counters: 0", + "PublicDescription": "Counts loads that miss the DTLB (Data TLB) and hit the STLB (Second level TLB).", "SampleAfterValue": "100003", "UMask": "0x20" }, @@ -14,7 +14,7 @@ "CounterMask": "1", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_ACTIVE", - "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a demand load. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a demand load.", "SampleAfterValue": "100003", "UMask": "0x10" }, @@ -23,7 +23,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED", - "PublicDescription": "Counts completed page walks (all page sizes) caused by demand data loads. This implies it missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (all page sizes) caused by demand data loads. This implies it missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0xe" }, @@ -32,7 +32,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_1G", - "PublicDescription": "Counts completed page walks (1G sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (1G sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x8" }, @@ -41,7 +41,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M", - "PublicDescription": "Counts completed page walks (2M/4M sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (2M/4M sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x4" }, @@ -50,7 +50,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_COMPLETED_4K", - "PublicDescription": "Counts completed page walks (4K sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (4K sizes) caused by demand data loads. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -59,7 +59,7 @@ "Counter": "0,1,2,3", "EventCode": "0x12", "EventName": "DTLB_LOAD_MISSES.WALK_PENDING", - "PublicDescription": "Counts the number of page walks outstanding for a demand load in the PMH (Page Miss Handler) each cycle. Available PDIST counters: 0", + "PublicDescription": "Counts the number of page walks outstanding for a demand load in the PMH (Page Miss Handler) each cycle.", "SampleAfterValue": "100003", "UMask": "0x10" }, @@ -68,7 +68,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.STLB_HIT", - "PublicDescription": "Counts stores that miss the DTLB (Data TLB) and hit the STLB (2nd Level TLB). Available PDIST counters: 0", + "PublicDescription": "Counts stores that miss the DTLB (Data TLB) and hit the STLB (2nd Level TLB).", "SampleAfterValue": "100003", "UMask": "0x20" }, @@ -78,7 +78,7 @@ "CounterMask": "1", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_ACTIVE", - "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a store. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a store.", "SampleAfterValue": "100003", "UMask": "0x10" }, @@ -87,7 +87,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED", - "PublicDescription": "Counts completed page walks (all page sizes) caused by demand data stores. This implies it missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (all page sizes) caused by demand data stores. This implies it missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0xe" }, @@ -96,7 +96,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_1G", - "PublicDescription": "Counts completed page walks (1G sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (1G sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x8" }, @@ -105,7 +105,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M", - "PublicDescription": "Counts completed page walks (2M/4M sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (2M/4M sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x4" }, @@ -114,7 +114,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_COMPLETED_4K", - "PublicDescription": "Counts completed page walks (4K sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (4K sizes) caused by demand data stores. This implies address translations missed in the DTLB and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -123,7 +123,7 @@ "Counter": "0,1,2,3", "EventCode": "0x13", "EventName": "DTLB_STORE_MISSES.WALK_PENDING", - "PublicDescription": "Counts the number of page walks outstanding for a store in the PMH (Page Miss Handler) each cycle. Available PDIST counters: 0", + "PublicDescription": "Counts the number of page walks outstanding for a store in the PMH (Page Miss Handler) each cycle.", "SampleAfterValue": "100003", "UMask": "0x10" }, @@ -132,7 +132,7 @@ "Counter": "0,1,2,3", "EventCode": "0x11", "EventName": "ITLB_MISSES.STLB_HIT", - "PublicDescription": "Counts instruction fetch requests that miss the ITLB (Instruction TLB) and hit the STLB (Second-level TLB). Available PDIST counters: 0", + "PublicDescription": "Counts instruction fetch requests that miss the ITLB (Instruction TLB) and hit the STLB (Second-level TLB).", "SampleAfterValue": "100003", "UMask": "0x20" }, @@ -142,7 +142,7 @@ "CounterMask": "1", "EventCode": "0x11", "EventName": "ITLB_MISSES.WALK_ACTIVE", - "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a code (instruction fetch) request. Available PDIST counters: 0", + "PublicDescription": "Counts cycles when at least one PMH (Page Miss Handler) is busy with a page walk for a code (instruction fetch) request.", "SampleAfterValue": "100003", "UMask": "0x10" }, @@ -151,7 +151,7 @@ "Counter": "0,1,2,3", "EventCode": "0x11", "EventName": "ITLB_MISSES.WALK_COMPLETED", - "PublicDescription": "Counts completed page walks (all page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (all page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0xe" }, @@ -160,7 +160,7 @@ "Counter": "0,1,2,3", "EventCode": "0x11", "EventName": "ITLB_MISSES.WALK_COMPLETED_2M_4M", - "PublicDescription": "Counts completed page walks (2M/4M page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (2M/4M page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x4" }, @@ -169,7 +169,7 @@ "Counter": "0,1,2,3", "EventCode": "0x11", "EventName": "ITLB_MISSES.WALK_COMPLETED_4K", - "PublicDescription": "Counts completed page walks (4K page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault. Available PDIST counters: 0", + "PublicDescription": "Counts completed page walks (4K page sizes) caused by a code fetch. This implies it missed in the ITLB (Instruction TLB) and further levels of TLB. The page walk can end with or without a fault.", "SampleAfterValue": "100003", "UMask": "0x2" }, @@ -178,7 +178,7 @@ "Counter": "0,1,2,3", "EventCode": "0x11", "EventName": "ITLB_MISSES.WALK_PENDING", - "PublicDescription": "Counts the number of page walks outstanding for an outstanding code (instruction fetch) request in the PMH (Page Miss Handler) each cycle. Available PDIST counters: 0", + "PublicDescription": "Counts the number of page walks outstanding for an outstanding code (instruction fetch) request in the PMH (Page Miss Handler) each cycle.", "SampleAfterValue": "100003", "UMask": "0x10" } diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/cache.json b/tools/perf/pmu-events/arch/x86/sierraforest/cache.json index 877052db1490..b2650e8ae252 100644 --- a/tools/perf/pmu-events/arch/x86/sierraforest/cache.json +++ b/tools/perf/pmu-events/arch/x86/sierraforest/cache.json @@ -163,6 +163,14 @@ "UMask": "0x6" }, { + "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to a demand load miss which hit in the LLC, no snoop was required. LLC provides the data. If the core has access to an L3 cache, an LLC hit refers to an L3 cache hit, otherwise it counts zeros.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0x34", + "EventName": "MEM_BOUND_STALLS_LOAD.LLC_HIT_NOSNOOP", + "SampleAfterValue": "1000003", + "UMask": "0x2" + }, + { "BriefDescription": "Counts the number of unhalted cycles when the core is stalled due to a demand load miss which missed all the local caches. If the core has access to an L3 cache, an LLC miss refers to an L3 cache miss, otherwise it is an L2 cache miss.", "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0x34", @@ -179,6 +187,14 @@ "UMask": "0x80" }, { + "BriefDescription": "Counts the total number of load ops retired that miss the L3 cache.", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xd3", + "EventName": "MEM_LOAD_UOPS_L3_MISS_RETIRED.ALL", + "SampleAfterValue": "1000003", + "UMask": "0xff" + }, + { "BriefDescription": "Counts the number of load ops retired that miss the L3 cache and hit in DRAM", "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xd3", @@ -187,6 +203,31 @@ "UMask": "0x1" }, { + "BriefDescription": "Counts the number of load ops retired that miss the L3 cache and hit in a Remote DRAM", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xd3", + "EventName": "MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_DRAM_OR_NOFWD", + "PublicDescription": "Counts the number of load ops retired that miss the L3 cache and hit in a Remote DRAM, OR had a Remote snoop miss/no fwd and hit in the Local Dram", + "SampleAfterValue": "1000003", + "UMask": "0x2" + }, + { + "BriefDescription": "Counts the number of load ops retired that miss the L3 cache and hit in a Remote Cache and modified data was forwarded", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xd3", + "EventName": "MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD_HITM", + "SampleAfterValue": "1000003", + "UMask": "0x8" + }, + { + "BriefDescription": "Counts the number of load ops retired that miss the L3 cache and hit in a Remote Cache and non-modified data was forwarded", + "Counter": "0,1,2,3,4,5,6,7", + "EventCode": "0xd3", + "EventName": "MEM_LOAD_UOPS_L3_MISS_RETIRED.REMOTE_FWD_NONM", + "SampleAfterValue": "1000003", + "UMask": "0x4" + }, + { "BriefDescription": "Counts the number of load ops retired that hit the L1 data cache.", "Counter": "0,1,2,3,4,5,6,7", "EventCode": "0xd1", @@ -286,7 +327,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_1024", @@ -297,7 +338,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_128", @@ -308,7 +349,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_16", @@ -319,7 +360,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_2048", @@ -330,7 +371,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_256", @@ -341,7 +382,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_32", @@ -352,7 +393,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_4", @@ -363,7 +404,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_512", @@ -374,7 +415,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_64", @@ -385,7 +426,7 @@ }, { "BriefDescription": "Counts the number of tagged load uops retired that exceed the latency threshold defined in MEC_CR_PEBS_LD_LAT_THRESHOLD - Only counts with PEBS enabled.", - "Counter": "0,1", + "Counter": "0,1,2,3,4,5,6,7", "Data_LA": "1", "EventCode": "0xd0", "EventName": "MEM_UOPS_RETIRED.LOAD_LATENCY_GT_8", diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/srf-metrics.json b/tools/perf/pmu-events/arch/x86/sierraforest/srf-metrics.json index b9f3c611d87b..ca2c55917e55 100644 --- a/tools/perf/pmu-events/arch/x86/sierraforest/srf-metrics.json +++ b/tools/perf/pmu-events/arch/x86/sierraforest/srf-metrics.json @@ -1,56 +1,56 @@ [ { "BriefDescription": "C10 residency percent per package", - "MetricExpr": "cstate_pkg@c10\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c10\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C10_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C1 residency percent per core", - "MetricExpr": "cstate_core@c1\\-residency@ / TSC", + "MetricExpr": "cstate_core@c1\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C1_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C2 residency percent per package", - "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c2\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C2_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per package", - "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per core", - "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricExpr": "cstate_core@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per package", - "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per core", - "MetricExpr": "cstate_core@c7\\-residency@ / TSC", + "MetricExpr": "cstate_core@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C8 residency percent per package", - "MetricExpr": "cstate_pkg@c8\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c8\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C8_Pkg_Residency", "ScaleUnit": "100%" @@ -735,7 +735,7 @@ }, { "BriefDescription": "Average CPU Utilization", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", "MetricName": "tma_info_system_cpu_utilization" }, { @@ -747,7 +747,7 @@ }, { "BriefDescription": "Fraction of cycles spent in Kernel mode", - "MetricExpr": "cpu@CPU_CLK_UNHALTED.CORE_P@k / CPU_CLK_UNHALTED.CORE", + "MetricExpr": "CPU_CLK_UNHALTED.CORE_P:k / CPU_CLK_UNHALTED.CORE", "MetricGroup": "Summary", "MetricName": "tma_info_system_kernel_utilization" }, diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/uncore-interconnect.json b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-interconnect.json index 952b6de3fefc..251e5d20fefe 100644 --- a/tools/perf/pmu-events/arch/x86/sierraforest/uncore-interconnect.json +++ b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-interconnect.json @@ -839,12 +839,20 @@ "Counter": "0,1,2,3", "EventCode": "0x1F", "EventName": "UNC_I_MISC1.LOST_FWD", - "Experimental": "1", "PerPkg": "1", "UMask": "0x10", "Unit": "IRP" }, { + "BriefDescription": "Misc Events - Set 1 : Received Invalid : Secondary received a transfer that did not have sufficient MESI state", + "Counter": "0,1,2,3", + "EventCode": "0x1F", + "EventName": "UNC_I_MISC1.SEC_RCVD_INVLD", + "PerPkg": "1", + "UMask": "0x20", + "Unit": "IRP" + }, + { "BriefDescription": "Snoop Hit E/S responses", "Counter": "0,1,2,3", "EventCode": "0x12", diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/uncore-io.json b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-io.json index f4f956966e16..2ea2637df3fb 100644 --- a/tools/perf/pmu-events/arch/x86/sierraforest/uncore-io.json +++ b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-io.json @@ -1321,7 +1321,6 @@ "FCMask": "0x01", "PerPkg": "1", "PortMask": "0x0FF", - "PublicDescription": "-", "UMask": "0x4", "Unit": "IIO" }, diff --git a/tools/perf/pmu-events/arch/x86/sierraforest/uncore-memory.json b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-memory.json index c7e9dbe02eb0..a9fd7a34b24b 100644 --- a/tools/perf/pmu-events/arch/x86/sierraforest/uncore-memory.json +++ b/tools/perf/pmu-events/arch/x86/sierraforest/uncore-memory.json @@ -60,6 +60,33 @@ "BriefDescription": "CAS count for SubChannel 0 regular reads", "Counter": "0,1,2,3", "EventCode": "0x05", + "EventName": "UNC_M_CAS_COUNT_SCH0.RD_NON_UNDERFILL", + "PerPkg": "1", + "UMask": "0xc3", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 0 auto-precharge reads", + "Counter": "0,1,2,3", + "EventCode": "0x05", + "EventName": "UNC_M_CAS_COUNT_SCH0.RD_PRE_REG", + "PerPkg": "1", + "UMask": "0xc2", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 0 auto-precharge underfill reads", + "Counter": "0,1,2,3", + "EventCode": "0x05", + "EventName": "UNC_M_CAS_COUNT_SCH0.RD_PRE_UNDERFILL", + "PerPkg": "1", + "UMask": "0xc8", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 0 regular reads", + "Counter": "0,1,2,3", + "EventCode": "0x05", "EventName": "UNC_M_CAS_COUNT_SCH0.RD_REG", "PerPkg": "1", "UMask": "0xc1", @@ -75,6 +102,15 @@ "Unit": "IMC" }, { + "BriefDescription": "CAS count for SubChannel 0 underfill reads", + "Counter": "0,1,2,3", + "EventCode": "0x05", + "EventName": "UNC_M_CAS_COUNT_SCH0.RD_UNDERFILL_ALL", + "PerPkg": "1", + "UMask": "0xcc", + "Unit": "IMC" + }, + { "BriefDescription": "CAS count for SubChannel 0, all writes", "Counter": "0,1,2,3", "EventCode": "0x05", @@ -125,6 +161,33 @@ "BriefDescription": "CAS count for SubChannel 1 regular reads", "Counter": "0,1,2,3", "EventCode": "0x06", + "EventName": "UNC_M_CAS_COUNT_SCH1.RD_NON_UNDERFILL", + "PerPkg": "1", + "UMask": "0xc3", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 1 auto-precharge reads", + "Counter": "0,1,2,3", + "EventCode": "0x06", + "EventName": "UNC_M_CAS_COUNT_SCH1.RD_PRE_REG", + "PerPkg": "1", + "UMask": "0xc2", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 1 auto-precharge underfill reads", + "Counter": "0,1,2,3", + "EventCode": "0x06", + "EventName": "UNC_M_CAS_COUNT_SCH1.RD_PRE_UNDERFILL", + "PerPkg": "1", + "UMask": "0xc8", + "Unit": "IMC" + }, + { + "BriefDescription": "CAS count for SubChannel 1 regular reads", + "Counter": "0,1,2,3", + "EventCode": "0x06", "EventName": "UNC_M_CAS_COUNT_SCH1.RD_REG", "PerPkg": "1", "UMask": "0xc1", @@ -140,6 +203,15 @@ "Unit": "IMC" }, { + "BriefDescription": "CAS count for SubChannel 1 underfill reads", + "Counter": "0,1,2,3", + "EventCode": "0x06", + "EventName": "UNC_M_CAS_COUNT_SCH1.RD_UNDERFILL_ALL", + "PerPkg": "1", + "UMask": "0xcc", + "Unit": "IMC" + }, + { "BriefDescription": "CAS count for SubChannel 1, all writes", "Counter": "0,1,2,3", "EventCode": "0x06", @@ -195,7 +267,6 @@ "EventName": "UNC_M_MR4_2XREF_CYCLES.SCH0_DIMM0", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x1", "Unit": "IMC" }, @@ -206,7 +277,6 @@ "EventName": "UNC_M_MR4_2XREF_CYCLES.SCH0_DIMM1", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x2", "Unit": "IMC" }, @@ -217,7 +287,6 @@ "EventName": "UNC_M_MR4_2XREF_CYCLES.SCH1_DIMM0", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x4", "Unit": "IMC" }, @@ -228,7 +297,6 @@ "EventName": "UNC_M_MR4_2XREF_CYCLES.SCH1_DIMM1", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x8", "Unit": "IMC" }, @@ -239,7 +307,6 @@ "EventName": "UNC_M_PDC_MR4ACTIVE_CYCLES.SCH0_DIMM0", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x1", "Unit": "IMC" }, @@ -250,7 +317,6 @@ "EventName": "UNC_M_PDC_MR4ACTIVE_CYCLES.SCH0_DIMM1", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x2", "Unit": "IMC" }, @@ -261,7 +327,6 @@ "EventName": "UNC_M_PDC_MR4ACTIVE_CYCLES.SCH1_DIMM0", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x4", "Unit": "IMC" }, @@ -272,7 +337,6 @@ "EventName": "UNC_M_PDC_MR4ACTIVE_CYCLES.SCH1_DIMM1", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x8", "Unit": "IMC" }, @@ -283,7 +347,6 @@ "EventName": "UNC_M_POWERDOWN_CYCLES.SCH0_RANK0", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x1", "Unit": "IMC" }, @@ -294,7 +357,6 @@ "EventName": "UNC_M_POWERDOWN_CYCLES.SCH0_RANK1", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x2", "Unit": "IMC" }, @@ -305,7 +367,6 @@ "EventName": "UNC_M_POWERDOWN_CYCLES.SCH0_RANK2", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x4", "Unit": "IMC" }, @@ -316,7 +377,6 @@ "EventName": "UNC_M_POWERDOWN_CYCLES.SCH0_RANK3", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x8", "Unit": "IMC" }, @@ -327,7 +387,6 @@ "EventName": "UNC_M_POWERDOWN_CYCLES.SCH1_RANK0", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x10", "Unit": "IMC" }, @@ -338,7 +397,6 @@ "EventName": "UNC_M_POWERDOWN_CYCLES.SCH1_RANK1", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x20", "Unit": "IMC" }, @@ -349,7 +407,6 @@ "EventName": "UNC_M_POWERDOWN_CYCLES.SCH1_RANK2", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x40", "Unit": "IMC" }, @@ -360,7 +417,6 @@ "EventName": "UNC_M_POWERDOWN_CYCLES.SCH1_RANK3", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x80", "Unit": "IMC" }, @@ -371,7 +427,6 @@ "EventName": "UNC_M_POWER_CHANNEL_PPD_CYCLES", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "Unit": "IMC" }, { @@ -381,7 +436,6 @@ "EventName": "UNC_M_POWER_CRITICAL_THROTTLE_CYCLES.SLOT0", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x1", "Unit": "IMC" }, @@ -392,7 +446,6 @@ "EventName": "UNC_M_POWER_CRITICAL_THROTTLE_CYCLES.SLOT1", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x2", "Unit": "IMC" }, @@ -423,7 +476,6 @@ "EventName": "UNC_M_POWER_THROTTLE_CYCLES.MR4BLKEN", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x8", "Unit": "IMC" }, @@ -434,7 +486,6 @@ "EventName": "UNC_M_POWER_THROTTLE_CYCLES.RAPLBLK", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x4", "Unit": "IMC" }, @@ -617,7 +668,6 @@ "EventName": "UNC_M_SELF_REFRESH.ENTER_SUCCESS", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "UNC_M_SELF_REFRESH.ENTER_SUCCESS", "UMask": "0x2", "Unit": "IMC" }, @@ -628,7 +678,6 @@ "EventName": "UNC_M_SELF_REFRESH.ENTER_SUCCESS_CYCLES", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x1", "Unit": "IMC" }, @@ -639,7 +688,6 @@ "EventName": "UNC_M_THROTTLE_CRIT_CYCLES.SLOT0", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x1", "Unit": "IMC" }, @@ -650,7 +698,6 @@ "EventName": "UNC_M_THROTTLE_CRIT_CYCLES.SLOT1", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x2", "Unit": "IMC" }, @@ -661,7 +708,6 @@ "EventName": "UNC_M_THROTTLE_HIGH_CYCLES.SLOT0", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x1", "Unit": "IMC" }, @@ -672,7 +718,6 @@ "EventName": "UNC_M_THROTTLE_HIGH_CYCLES.SLOT1", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x2", "Unit": "IMC" }, @@ -683,7 +728,6 @@ "EventName": "UNC_M_THROTTLE_LOW_CYCLES.SLOT0", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x1", "Unit": "IMC" }, @@ -694,7 +738,6 @@ "EventName": "UNC_M_THROTTLE_LOW_CYCLES.SLOT1", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x2", "Unit": "IMC" }, @@ -705,7 +748,6 @@ "EventName": "UNC_M_THROTTLE_MID_CYCLES.SLOT0", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x1", "Unit": "IMC" }, @@ -716,7 +758,6 @@ "EventName": "UNC_M_THROTTLE_MID_CYCLES.SLOT1", "Experimental": "1", "PerPkg": "1", - "PublicDescription": "-", "UMask": "0x2", "Unit": "IMC" }, diff --git a/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json b/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json index 2d3a037e88b5..707bd8790ed2 100644 --- a/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/skylake/skl-metrics.json @@ -1,49 +1,49 @@ [ { "BriefDescription": "C2 residency percent per package", - "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c2\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C2_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per core", - "MetricExpr": "cstate_core@c3\\-residency@ / TSC", + "MetricExpr": "cstate_core@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per package", - "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per core", - "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricExpr": "cstate_core@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per package", - "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per core", - "MetricExpr": "cstate_core@c7\\-residency@ / TSC", + "MetricExpr": "cstate_core@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per package", - "MetricExpr": "cstate_pkg@c7\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Pkg_Residency", "ScaleUnit": "100%" @@ -80,6 +80,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", @@ -117,6 +118,7 @@ }, { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", "MetricGroup": "BigFootprint;BvBC;Fed;Frontend;IcMiss;MemoryTLB", "MetricName": "tma_bottleneck_big_code", @@ -131,31 +133,35 @@ "PublicDescription": "Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound)" }, { + "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", + "MetricGroup": "BvCB;Cor;tma_issueComp", + "MetricName": "tma_bottleneck_compute_bound_est", + "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", + "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " + }, + { "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", "MetricGroup": "BvMB;Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_bottleneck_cache_memory_bandwidth", - "MetricThreshold": "tma_bottleneck_cache_memory_bandwidth > 20", + "MetricName": "tma_bottleneck_data_cache_memory_bandwidth", + "MetricThreshold": "tma_bottleneck_data_cache_memory_bandwidth > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_dependency / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_lock_latency / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_loads / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_stores / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))", "MetricGroup": "BvML;Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_bottleneck_cache_memory_latency", - "MetricThreshold": "tma_bottleneck_cache_memory_latency > 20", + "MetricName": "tma_bottleneck_data_cache_memory_latency", + "MetricThreshold": "tma_bottleneck_data_cache_memory_latency > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency" }, { - "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", - "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", - "MetricGroup": "BvCB;Cor;tma_issueComp", - "MetricName": "tma_bottleneck_compute_bound_est", - "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", - "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " - }, - { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_bottleneck_big_code", "MetricGroup": "BvFB;Fed;FetchBW;Frontend", "MetricName": "tma_bottleneck_instruction_fetch_bw", @@ -163,6 +169,7 @@ }, { "BriefDescription": "Total pipeline cost of irregular execution (e.g", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + tma_core_bound * RS_EVENTS.EMPTY_CYCLES / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", "MetricGroup": "Bad;BvIO;Cor;Ret;tma_issueMS", "MetricName": "tma_bottleneck_irregular_overhead", @@ -171,6 +178,7 @@ }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))", "MetricGroup": "BvMT;Mem;MemoryTLB;Offcore;tma_issueTLB", "MetricName": "tma_bottleneck_memory_data_tlbs", @@ -179,6 +187,7 @@ }, { "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))", "MetricGroup": "BvMS;LockCont;Mem;Offcore;tma_issueSyncxn", "MetricName": "tma_bottleneck_memory_synchronization", @@ -187,6 +196,7 @@ }, { "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", "MetricGroup": "Bad;BadSpec;BrMispredicts;BvMP;tma_issueBM", "MetricName": "tma_bottleneck_mispredictions", @@ -195,7 +205,8 @@ }, { "BriefDescription": "Total pipeline cost of remaining bottlenecks in the back-end", - "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_cache_memory_bandwidth + tma_bottleneck_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_data_cache_memory_bandwidth + tma_bottleneck_data_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", "MetricGroup": "BvOB;Cor;Offcore", "MetricName": "tma_bottleneck_other_bottlenecks", "MetricThreshold": "tma_bottleneck_other_bottlenecks > 20", @@ -203,6 +214,7 @@ }, { "BriefDescription": "Total pipeline cost of \"useful operations\" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_retiring - (BR_INST_RETIRED.ALL_BRANCHES + 2 * BR_INST_RETIRED.NEAR_CALL + INST_RETIRED.NOP) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", "MetricGroup": "BvUW;Ret", "MetricName": "tma_bottleneck_useful_work", @@ -230,6 +242,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_cisc", @@ -391,7 +404,7 @@ "MetricGroup": "BvMB;MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -454,7 +467,6 @@ }, { "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "FP_ARITH_INST_RETIRED.VECTOR / UOPS_RETIRED.RETIRE_SLOTS", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_vector", @@ -520,6 +532,7 @@ }, { "BriefDescription": "Branch Misprediction Cost: Cycles representing fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "tma_bottleneck_mispredictions * tma_info_thread_slots / 4 / BR_MISP_RETIRED.ALL_BRANCHES / 100", "MetricGroup": "Bad;BrMispredicts;tma_issueBM", "MetricName": "tma_info_bad_spec_branch_misprediction_cost", @@ -555,6 +568,7 @@ }, { "BriefDescription": "Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_frontend_bound * (tma_fetch_bandwidth / (tma_fetch_bandwidth + tma_fetch_latency)) * (tma_dsb / (tma_dsb + tma_mite)))", "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", "MetricName": "tma_info_botlnk_l2_dsb_bandwidth", @@ -572,6 +586,7 @@ }, { "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL", "MetricName": "tma_info_botlnk_l2_ic_misses", @@ -713,7 +728,6 @@ }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + FP_ARITH_INST_RETIRED.VECTOR)", "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_iparith", @@ -975,7 +989,7 @@ "MetricName": "tma_info_memory_tlb_store_stlb_mpki" }, { - "BriefDescription": "", + "BriefDescription": "Mem;Backend;CacheHits", "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" @@ -993,6 +1007,12 @@ "MetricName": "tma_info_pipeline_fetch_mite" }, { + "BriefDescription": "Average number of uops fetched from MS per cycle", + "MetricExpr": "IDQ.MS_UOPS / cpu@IDQ.MS_UOPS\\,cmask\\=1@", + "MetricGroup": "Fed;FetchLat;MicroSeq", + "MetricName": "tma_info_pipeline_fetch_ms" + }, + { "BriefDescription": "Instructions per a microcode Assist invocation", "MetricExpr": "INST_RETIRED.ANY / (FP_ASSIST.ANY + OTHER_ASSISTS.ANY)", "MetricGroup": "MicroSeq;Pipeline;Ret;Retire", @@ -1008,7 +1028,7 @@ }, { "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / tma_info_system_time", + "MetricExpr": "tma_info_system_turbo_utilization * msr@tsc@ / 1e9 / tma_info_system_time", "MetricGroup": "Power;Summary", "MetricName": "tma_info_system_core_frequency" }, @@ -1020,7 +1040,7 @@ }, { "BriefDescription": "Average number of utilized CPUs", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", "MetricGroup": "Summary", "MetricName": "tma_info_system_cpus_utilized" }, @@ -1029,7 +1049,7 @@ "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / tma_info_system_time / 1e3", "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_mem_bandwidth, tma_sq_full" + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Giga Floating Point Operations Per Second", @@ -1175,12 +1195,13 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", + "BriefDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "min(2 * (MEM_INST_RETIRED.ALL_LOADS - MEM_LOAD_RETIRED.FB_HIT - MEM_LOAD_RETIRED.L1_MISS) * 20 / 100, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_thread_clks", "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_l1_latency_dependency", "MetricThreshold": "tma_l1_latency_dependency > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", + "PublicDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", "ScaleUnit": "100%" }, { @@ -1217,7 +1238,7 @@ "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_cache_memory_latency, tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_data_cache_memory_latency, tma_mem_latency", "ScaleUnit": "100%" }, { @@ -1241,6 +1262,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_load_op_utilization", @@ -1267,6 +1289,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles to walk the memory paging structures to cache translation of 1 GB pages for data load accesses.", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "tma_load_stlb_miss * DTLB_LOAD_MISSES.WALK_COMPLETED_1G / (DTLB_LOAD_MISSES.WALK_COMPLETED_4K + DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M + DTLB_LOAD_MISSES.WALK_COMPLETED_1G)", "MetricGroup": "MemoryTLB;TopdownL6;tma_L6_group;tma_load_stlb_miss_group", "MetricName": "tma_load_stlb_miss_1g", @@ -1275,6 +1298,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles to walk the memory paging structures to cache translation of 2 or 4 MB pages for data load accesses.", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "tma_load_stlb_miss * DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M / (DTLB_LOAD_MISSES.WALK_COMPLETED_4K + DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M + DTLB_LOAD_MISSES.WALK_COMPLETED_1G)", "MetricGroup": "MemoryTLB;TopdownL6;tma_L6_group;tma_load_stlb_miss_group", "MetricName": "tma_load_stlb_miss_2m", @@ -1283,6 +1307,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles to walk the memory paging structures to cache translation of 4 KB pages for data load accesses.", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "tma_load_stlb_miss * DTLB_LOAD_MISSES.WALK_COMPLETED_4K / (DTLB_LOAD_MISSES.WALK_COMPLETED_4K + DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M + DTLB_LOAD_MISSES.WALK_COMPLETED_1G)", "MetricGroup": "MemoryTLB;TopdownL6;tma_L6_group;tma_load_stlb_miss_group", "MetricName": "tma_load_stlb_miss_4k", @@ -1291,6 +1316,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(12 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (9 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_thread_clks", "MetricGroup": "LockCont;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", "MetricName": "tma_lock_latency", @@ -1315,7 +1341,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { @@ -1324,7 +1350,7 @@ "MetricGroup": "BvML;MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -1348,7 +1374,6 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", @@ -1358,6 +1383,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks", "MetricGroup": "BadSpec;BrMispredicts;BvMP;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueBM", "MetricName": "tma_mispredicts_resteers", @@ -1412,6 +1438,7 @@ }, { "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches))", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_other_light_ops", @@ -1421,6 +1448,7 @@ }, { "BriefDescription": "This metric estimates fraction of slots the CPU was stalled due to other cases of misprediction (non-retired x86 branches or other types).", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "max(tma_branch_mispredicts * (1 - BR_MISP_RETIRED.ALL_BRANCHES / (INT_MISC.CLEARS_COUNT - MACHINE_CLEARS.COUNT)), 0.0001)", "MetricGroup": "BrMispredicts;BvIO;TopdownL3;tma_L3_group;tma_branch_mispredicts_group", "MetricName": "tma_other_mispredicts", @@ -1429,6 +1457,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Nukes (Machine Clears) not related to memory ordering.", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "max(tma_machine_clears * (1 - MACHINE_CLEARS.MEMORY_ORDERING / MACHINE_CLEARS.COUNT), 0.0001)", "MetricGroup": "BvIO;Machine_Clears;TopdownL3;tma_L3_group;tma_machine_clears_group", "MetricName": "tma_other_nukes", @@ -1509,6 +1538,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", @@ -1595,7 +1625,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { @@ -1652,6 +1682,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles to walk the memory paging structures to cache translation of 1 GB pages for data store accesses.", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "tma_store_stlb_miss * DTLB_STORE_MISSES.WALK_COMPLETED_1G / (DTLB_STORE_MISSES.WALK_COMPLETED_4K + DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M + DTLB_STORE_MISSES.WALK_COMPLETED_1G)", "MetricGroup": "MemoryTLB;TopdownL6;tma_L6_group;tma_store_stlb_miss_group", "MetricName": "tma_store_stlb_miss_1g", @@ -1660,6 +1691,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles to walk the memory paging structures to cache translation of 2 or 4 MB pages for data store accesses.", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "tma_store_stlb_miss * DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M / (DTLB_STORE_MISSES.WALK_COMPLETED_4K + DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M + DTLB_STORE_MISSES.WALK_COMPLETED_1G)", "MetricGroup": "MemoryTLB;TopdownL6;tma_L6_group;tma_store_stlb_miss_group", "MetricName": "tma_store_stlb_miss_2m", @@ -1668,6 +1700,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles to walk the memory paging structures to cache translation of 4 KB pages for data store accesses.", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "tma_store_stlb_miss * DTLB_STORE_MISSES.WALK_COMPLETED_4K / (DTLB_STORE_MISSES.WALK_COMPLETED_4K + DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M + DTLB_STORE_MISSES.WALK_COMPLETED_1G)", "MetricGroup": "MemoryTLB;TopdownL6;tma_L6_group;tma_store_stlb_miss_group", "MetricName": "tma_store_stlb_miss_4k", diff --git a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json index 7cc7b076c3e2..56c2caaafef4 100644 --- a/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json +++ b/tools/perf/pmu-events/arch/x86/skylakex/skx-metrics.json @@ -1,49 +1,49 @@ [ { "BriefDescription": "C2 residency percent per package", - "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c2\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C2_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per core", - "MetricExpr": "cstate_core@c3\\-residency@ / TSC", + "MetricExpr": "cstate_core@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per package", - "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per core", - "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricExpr": "cstate_core@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per package", - "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per core", - "MetricExpr": "cstate_core@c7\\-residency@ / TSC", + "MetricExpr": "cstate_core@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per package", - "MetricExpr": "cstate_pkg@c7\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Pkg_Residency", "ScaleUnit": "100%" @@ -301,6 +301,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution ports for ALU operations.", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_0 + UOPS_DISPATCHED_PORT.PORT_1 + UOPS_DISPATCHED_PORT.PORT_5 + UOPS_DISPATCHED_PORT.PORT_6) / tma_info_thread_slots", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_alu_op_utilization", @@ -338,6 +339,7 @@ }, { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", "MetricGroup": "BigFootprint;BvBC;Fed;Frontend;IcMiss;MemoryTLB", "MetricName": "tma_bottleneck_big_code", @@ -352,31 +354,35 @@ "PublicDescription": "Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound)" }, { + "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", + "MetricGroup": "BvCB;Cor;tma_issueComp", + "MetricName": "tma_bottleneck_compute_bound_est", + "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", + "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " + }, + { "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", "MetricGroup": "BvMB;Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_bottleneck_cache_memory_bandwidth", - "MetricThreshold": "tma_bottleneck_cache_memory_bandwidth > 20", + "MetricName": "tma_bottleneck_data_cache_memory_bandwidth", + "MetricThreshold": "tma_bottleneck_data_cache_memory_bandwidth > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_dependency / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_lock_latency / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_loads / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_stores / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))", "MetricGroup": "BvML;Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_bottleneck_cache_memory_latency", - "MetricThreshold": "tma_bottleneck_cache_memory_latency > 20", + "MetricName": "tma_bottleneck_data_cache_memory_latency", + "MetricThreshold": "tma_bottleneck_data_cache_memory_latency > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency" }, { - "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", - "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", - "MetricGroup": "BvCB;Cor;tma_issueComp", - "MetricName": "tma_bottleneck_compute_bound_est", - "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", - "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " - }, - { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)) - tma_bottleneck_big_code", "MetricGroup": "BvFB;Fed;FetchBW;Frontend", "MetricName": "tma_bottleneck_instruction_fetch_bw", @@ -384,6 +390,7 @@ }, { "BriefDescription": "Total pipeline cost of irregular execution (e.g", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + tma_mispredicts_resteers * (10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts)) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + tma_core_bound * RS_EVENTS.EMPTY_CYCLES / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", "MetricGroup": "Bad;BvIO;Cor;Ret;tma_issueMS", "MetricName": "tma_bottleneck_irregular_overhead", @@ -392,6 +399,7 @@ }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency)))", "MetricGroup": "BvMT;Mem;MemoryTLB;Offcore;tma_issueTLB", "MetricName": "tma_bottleneck_memory_data_tlbs", @@ -400,6 +408,7 @@ }, { "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) * tma_remote_cache / (tma_local_mem + tma_remote_cache + tma_remote_mem) + tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))", "MetricGroup": "BvMS;LockCont;Mem;Offcore;tma_issueSyncxn", "MetricName": "tma_bottleneck_memory_synchronization", @@ -408,6 +417,7 @@ }, { "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", "MetricGroup": "Bad;BadSpec;BrMispredicts;BvMP;tma_issueBM", "MetricName": "tma_bottleneck_mispredictions", @@ -416,7 +426,8 @@ }, { "BriefDescription": "Total pipeline cost of remaining bottlenecks in the back-end", - "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_cache_memory_bandwidth + tma_bottleneck_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_data_cache_memory_bandwidth + tma_bottleneck_data_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", "MetricGroup": "BvOB;Cor;Offcore", "MetricName": "tma_bottleneck_other_bottlenecks", "MetricThreshold": "tma_bottleneck_other_bottlenecks > 20", @@ -424,6 +435,7 @@ }, { "BriefDescription": "Total pipeline cost of \"useful operations\" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_retiring - (BR_INST_RETIRED.ALL_BRANCHES + 2 * BR_INST_RETIRED.NEAR_CALL + INST_RETIRED.NOP) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", "MetricGroup": "BvUW;Ret", "MetricName": "tma_bottleneck_useful_work", @@ -451,6 +463,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU retired uops originated from CISC (complex instruction set computer) instruction", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "max(0, tma_microcode_sequencer - tma_assists)", "MetricGroup": "TopdownL4;tma_L4_group;tma_microcode_sequencer_group", "MetricName": "tma_cisc", @@ -612,7 +625,7 @@ "MetricGroup": "BvMB;MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -675,7 +688,6 @@ }, { "BriefDescription": "This metric approximates arithmetic floating-point (FP) vector uops fraction the CPU has retired aggregated across all vector widths", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@ / UOPS_RETIRED.RETIRE_SLOTS", "MetricGroup": "Compute;Flops;TopdownL4;tma_L4_group;tma_fp_arith_group;tma_issue2P", "MetricName": "tma_fp_vector", @@ -750,6 +762,7 @@ }, { "BriefDescription": "Branch Misprediction Cost: Cycles representing fraction of TMA slots wasted per non-speculative branch misprediction (retired JEClear)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "tma_bottleneck_mispredictions * tma_info_thread_slots / 4 / BR_MISP_RETIRED.ALL_BRANCHES / 100", "MetricGroup": "Bad;BrMispredicts;tma_issueBM", "MetricName": "tma_info_bad_spec_branch_misprediction_cost", @@ -785,6 +798,7 @@ }, { "BriefDescription": "Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_frontend_bound * (tma_fetch_bandwidth / (tma_fetch_bandwidth + tma_fetch_latency)) * (tma_dsb / (tma_dsb + tma_mite)))", "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", "MetricName": "tma_info_botlnk_l2_dsb_bandwidth", @@ -802,6 +816,7 @@ }, { "BriefDescription": "Total pipeline cost of Instruction Cache misses - subset of the Big_Code Bottleneck", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_fetch_latency * tma_icache_misses / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", "MetricGroup": "Fed;FetchLat;IcMiss;tma_issueFL", "MetricName": "tma_info_botlnk_l2_ic_misses", @@ -943,7 +958,6 @@ }, { "BriefDescription": "Instructions per FP Arithmetic instruction (lower number means higher occurrence rate)", - "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "INST_RETIRED.ANY / (FP_ARITH_INST_RETIRED.SCALAR + cpu@FP_ARITH_INST_RETIRED.128B_PACKED_DOUBLE\\,umask\\=0xfc@)", "MetricGroup": "Flops;InsType", "MetricName": "tma_info_inst_mix_iparith", @@ -1225,7 +1239,7 @@ "MetricName": "tma_info_memory_tlb_store_stlb_mpki" }, { - "BriefDescription": "", + "BriefDescription": "Mem;Backend;CacheHits", "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" @@ -1243,6 +1257,12 @@ "MetricName": "tma_info_pipeline_fetch_mite" }, { + "BriefDescription": "Average number of uops fetched from MS per cycle", + "MetricExpr": "IDQ.MS_UOPS / cpu@IDQ.MS_UOPS\\,cmask\\=1@", + "MetricGroup": "Fed;FetchLat;MicroSeq", + "MetricName": "tma_info_pipeline_fetch_ms" + }, + { "BriefDescription": "Instructions per a microcode Assist invocation", "MetricExpr": "INST_RETIRED.ANY / (FP_ASSIST.ANY + OTHER_ASSISTS.ANY)", "MetricGroup": "MicroSeq;Pipeline;Ret;Retire", @@ -1258,7 +1278,7 @@ }, { "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / tma_info_system_time", + "MetricExpr": "tma_info_system_turbo_utilization * msr@tsc@ / 1e9 / tma_info_system_time", "MetricGroup": "Power;Summary", "MetricName": "tma_info_system_core_frequency" }, @@ -1270,7 +1290,7 @@ }, { "BriefDescription": "Average number of utilized CPUs", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", "MetricGroup": "Summary", "MetricName": "tma_info_system_cpus_utilized" }, @@ -1279,7 +1299,7 @@ "MetricExpr": "64 * (UNC_M_CAS_COUNT.RD + UNC_M_CAS_COUNT.WR) / 1e9 / tma_info_system_time", "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_mem_bandwidth, tma_sq_full" + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Giga Floating Point Operations Per Second", @@ -1475,12 +1495,13 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", + "BriefDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "min(2 * (MEM_INST_RETIRED.ALL_LOADS - MEM_LOAD_RETIRED.FB_HIT - MEM_LOAD_RETIRED.L1_MISS) * 20 / 100, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_thread_clks", "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_l1_latency_dependency", "MetricThreshold": "tma_l1_latency_dependency > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", + "PublicDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", "ScaleUnit": "100%" }, { @@ -1517,7 +1538,7 @@ "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_cache_memory_latency, tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_data_cache_memory_latency, tma_mem_latency", "ScaleUnit": "100%" }, { @@ -1541,6 +1562,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles CPU dispatched uops on execution port for Load operations", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(UOPS_DISPATCHED_PORT.PORT_2 + UOPS_DISPATCHED_PORT.PORT_3 + UOPS_DISPATCHED_PORT.PORT_7 - UOPS_DISPATCHED_PORT.PORT_4) / (2 * tma_info_core_core_clks)", "MetricGroup": "TopdownL5;tma_L5_group;tma_ports_utilized_3m_group", "MetricName": "tma_load_op_utilization", @@ -1567,6 +1589,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles to walk the memory paging structures to cache translation of 1 GB pages for data load accesses.", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "tma_load_stlb_miss * DTLB_LOAD_MISSES.WALK_COMPLETED_1G / (DTLB_LOAD_MISSES.WALK_COMPLETED_4K + DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M + DTLB_LOAD_MISSES.WALK_COMPLETED_1G)", "MetricGroup": "MemoryTLB;TopdownL6;tma_L6_group;tma_load_stlb_miss_group", "MetricName": "tma_load_stlb_miss_1g", @@ -1575,6 +1598,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles to walk the memory paging structures to cache translation of 2 or 4 MB pages for data load accesses.", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "tma_load_stlb_miss * DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M / (DTLB_LOAD_MISSES.WALK_COMPLETED_4K + DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M + DTLB_LOAD_MISSES.WALK_COMPLETED_1G)", "MetricGroup": "MemoryTLB;TopdownL6;tma_L6_group;tma_load_stlb_miss_group", "MetricName": "tma_load_stlb_miss_2m", @@ -1583,6 +1607,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles to walk the memory paging structures to cache translation of 4 KB pages for data load accesses.", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "tma_load_stlb_miss * DTLB_LOAD_MISSES.WALK_COMPLETED_4K / (DTLB_LOAD_MISSES.WALK_COMPLETED_4K + DTLB_LOAD_MISSES.WALK_COMPLETED_2M_4M + DTLB_LOAD_MISSES.WALK_COMPLETED_1G)", "MetricGroup": "MemoryTLB;TopdownL6;tma_L6_group;tma_load_stlb_miss_group", "MetricName": "tma_load_stlb_miss_4k", @@ -1600,6 +1625,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU spent handling cache misses due to lock operations", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "(12 * max(0, MEM_INST_RETIRED.LOCK_LOADS - L2_RQSTS.ALL_RFO) + MEM_INST_RETIRED.LOCK_LOADS / MEM_INST_RETIRED.ALL_STORES * (11 * L2_RQSTS.RFO_HIT + min(CPU_CLK_UNHALTED.THREAD, OFFCORE_REQUESTS_OUTSTANDING.CYCLES_WITH_DEMAND_RFO))) / tma_info_thread_clks", "MetricGroup": "LockCont;Offcore;TopdownL4;tma_L4_group;tma_issueRFO;tma_l1_bound_group", "MetricName": "tma_lock_latency", @@ -1624,7 +1650,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { @@ -1633,7 +1659,7 @@ "MetricGroup": "BvML;MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -1657,7 +1683,6 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU was retiring uops fetched by the Microcode Sequencer (MS) unit", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "UOPS_RETIRED.RETIRE_SLOTS / UOPS_ISSUED.ANY * IDQ.MS_UOPS / tma_info_thread_slots", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_heavy_operations_group;tma_issueMC;tma_issueMS", "MetricName": "tma_microcode_sequencer", @@ -1667,6 +1692,7 @@ }, { "BriefDescription": "This metric represents fraction of cycles the CPU was stalled due to Branch Resteers as a result of Branch Misprediction at execution stage", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "BR_MISP_RETIRED.ALL_BRANCHES / (BR_MISP_RETIRED.ALL_BRANCHES + MACHINE_CLEARS.COUNT) * INT_MISC.CLEAR_RESTEER_CYCLES / tma_info_thread_clks", "MetricGroup": "BadSpec;BrMispredicts;BvMP;TopdownL4;tma_L4_group;tma_branch_resteers_group;tma_issueBM", "MetricName": "tma_mispredicts_resteers", @@ -1721,6 +1747,7 @@ }, { "BriefDescription": "This metric represents the remaining light uops fraction the CPU has executed - remaining means not covered by other sibling nodes", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "max(0, tma_light_operations - (tma_fp_arith + tma_memory_operations + tma_fused_instructions + tma_non_fused_branches))", "MetricGroup": "Pipeline;TopdownL3;tma_L3_group;tma_light_operations_group", "MetricName": "tma_other_light_ops", @@ -1730,6 +1757,7 @@ }, { "BriefDescription": "This metric estimates fraction of slots the CPU was stalled due to other cases of misprediction (non-retired x86 branches or other types).", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "max(tma_branch_mispredicts * (1 - BR_MISP_RETIRED.ALL_BRANCHES / (INT_MISC.CLEARS_COUNT - MACHINE_CLEARS.COUNT)), 0.0001)", "MetricGroup": "BrMispredicts;BvIO;TopdownL3;tma_L3_group;tma_branch_mispredicts_group", "MetricName": "tma_other_mispredicts", @@ -1738,6 +1766,7 @@ }, { "BriefDescription": "This metric represents fraction of slots the CPU has wasted due to Nukes (Machine Clears) not related to memory ordering.", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "max(tma_machine_clears * (1 - MACHINE_CLEARS.MEMORY_ORDERING / MACHINE_CLEARS.COUNT), 0.0001)", "MetricGroup": "BvIO;Machine_Clears;TopdownL3;tma_L3_group;tma_machine_clears_group", "MetricName": "tma_other_nukes", @@ -1818,6 +1847,7 @@ }, { "BriefDescription": "This metric estimates fraction of cycles the CPU performance was potentially limited due to Core computation issues (non divider-related)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "((tma_ports_utilized_0 * tma_info_thread_clks + (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL)) / tma_info_thread_clks if ARITH.DIVIDER_ACTIVE < CYCLE_ACTIVITY.STALLS_TOTAL - CYCLE_ACTIVITY.STALLS_MEM_ANY else (EXE_ACTIVITY.1_PORTS_UTIL + tma_retiring * EXE_ACTIVITY.2_PORTS_UTIL) / tma_info_thread_clks)", "MetricGroup": "PortsUtil;TopdownL3;tma_L3_group;tma_core_bound_group", "MetricName": "tma_ports_utilization", @@ -1923,7 +1953,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { @@ -1980,6 +2010,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles to walk the memory paging structures to cache translation of 1 GB pages for data store accesses.", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "tma_store_stlb_miss * DTLB_STORE_MISSES.WALK_COMPLETED_1G / (DTLB_STORE_MISSES.WALK_COMPLETED_4K + DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M + DTLB_STORE_MISSES.WALK_COMPLETED_1G)", "MetricGroup": "MemoryTLB;TopdownL6;tma_L6_group;tma_store_stlb_miss_group", "MetricName": "tma_store_stlb_miss_1g", @@ -1988,6 +2019,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles to walk the memory paging structures to cache translation of 2 or 4 MB pages for data store accesses.", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "tma_store_stlb_miss * DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M / (DTLB_STORE_MISSES.WALK_COMPLETED_4K + DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M + DTLB_STORE_MISSES.WALK_COMPLETED_1G)", "MetricGroup": "MemoryTLB;TopdownL6;tma_L6_group;tma_store_stlb_miss_group", "MetricName": "tma_store_stlb_miss_2m", @@ -1996,6 +2028,7 @@ }, { "BriefDescription": "This metric estimates the fraction of cycles to walk the memory paging structures to cache translation of 4 KB pages for data store accesses.", + "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "tma_store_stlb_miss * DTLB_STORE_MISSES.WALK_COMPLETED_4K / (DTLB_STORE_MISSES.WALK_COMPLETED_4K + DTLB_STORE_MISSES.WALK_COMPLETED_2M_4M + DTLB_STORE_MISSES.WALK_COMPLETED_1G)", "MetricGroup": "MemoryTLB;TopdownL6;tma_L6_group;tma_store_stlb_miss_group", "MetricName": "tma_store_stlb_miss_4k", diff --git a/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json b/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json index 2db7a70f7a07..908da985c594 100644 --- a/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json +++ b/tools/perf/pmu-events/arch/x86/tigerlake/tgl-metrics.json @@ -1,63 +1,63 @@ [ { "BriefDescription": "C10 residency percent per package", - "MetricExpr": "cstate_pkg@c10\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c10\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C10_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C2 residency percent per package", - "MetricExpr": "cstate_pkg@c2\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c2\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C2_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C3 residency percent per package", - "MetricExpr": "cstate_pkg@c3\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c3\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C3_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per core", - "MetricExpr": "cstate_core@c6\\-residency@ / TSC", + "MetricExpr": "cstate_core@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C6 residency percent per package", - "MetricExpr": "cstate_pkg@c6\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c6\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C6_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per core", - "MetricExpr": "cstate_core@c7\\-residency@ / TSC", + "MetricExpr": "cstate_core@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Core_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C7 residency percent per package", - "MetricExpr": "cstate_pkg@c7\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c7\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C7_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C8 residency percent per package", - "MetricExpr": "cstate_pkg@c8\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c8\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C8_Pkg_Residency", "ScaleUnit": "100%" }, { "BriefDescription": "C9 residency percent per package", - "MetricExpr": "cstate_pkg@c9\\-residency@ / TSC", + "MetricExpr": "cstate_pkg@c9\\-residency@ / msr@tsc@", "MetricGroup": "Power", "MetricName": "C9_Pkg_Residency", "ScaleUnit": "100%" @@ -85,7 +85,6 @@ }, { "BriefDescription": "This metric estimates how often memory load accesses were aliased by preceding stores (in program order) with a 4K address offset", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "LD_BLOCKS_PARTIAL.ADDRESS_ALIAS / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_4k_aliasing", @@ -134,6 +133,7 @@ }, { "BriefDescription": "Total pipeline cost of instruction fetch related bottlenecks by large code footprint programs (i-side cache; TLB and BTB misses)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * tma_fetch_latency * (tma_itlb_misses + tma_icache_misses + tma_unknown_branches) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches)", "MetricGroup": "BigFootprint;BvBC;Fed;Frontend;IcMiss;MemoryTLB", "MetricName": "tma_bottleneck_big_code", @@ -148,39 +148,44 @@ "PublicDescription": "Total pipeline cost of instructions used for program control-flow - a subset of the Retiring category in TMA. Examples include function calls; loops and alignments. (A lower bound)" }, { + "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", + "MetricGroup": "BvCB;Cor;tma_issueComp", + "MetricName": "tma_bottleneck_compute_bound_est", + "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", + "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " + }, + { "BriefDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_bandwidth / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_sq_full / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_fb_full / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)))", "MetricGroup": "BvMB;Mem;MemoryBW;Offcore;tma_issueBW", - "MetricName": "tma_bottleneck_cache_memory_bandwidth", - "MetricThreshold": "tma_bottleneck_cache_memory_bandwidth > 20", + "MetricName": "tma_bottleneck_data_cache_memory_bandwidth", + "MetricThreshold": "tma_bottleneck_data_cache_memory_bandwidth > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Bandwidth related bottlenecks. Related metrics: tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_memory_bound * (tma_dram_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_mem_latency / (tma_mem_bandwidth + tma_mem_latency)) + tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l3_hit_latency / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full)) + tma_memory_bound * tma_l2_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_l1_latency_dependency / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_lock_latency / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_l1_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_loads / (tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_split_stores / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_store_latency / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "BvML;Mem;MemoryLat;Offcore;tma_issueLat", - "MetricName": "tma_bottleneck_cache_memory_latency", - "MetricThreshold": "tma_bottleneck_cache_memory_latency > 20", + "MetricName": "tma_bottleneck_data_cache_memory_latency", + "MetricThreshold": "tma_bottleneck_data_cache_memory_latency > 20", "PublicDescription": "Total pipeline cost of external Memory- or Cache-Latency related bottlenecks. Related metrics: tma_l3_hit_latency, tma_mem_latency" }, { - "BriefDescription": "Total pipeline cost when the execution is compute-bound - an estimation", - "MetricExpr": "100 * (tma_core_bound * tma_divider / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_core_bound * (tma_ports_utilization / (tma_divider + tma_ports_utilization + tma_serializing_operation)) * (tma_ports_utilized_3m / (tma_ports_utilized_0 + tma_ports_utilized_1 + tma_ports_utilized_2 + tma_ports_utilized_3m)))", - "MetricGroup": "BvCB;Cor;tma_issueComp", - "MetricName": "tma_bottleneck_compute_bound_est", - "MetricThreshold": "tma_bottleneck_compute_bound_est > 20", - "PublicDescription": "Total pipeline cost when the execution is compute-bound - an estimation. Covers Core Bound when High ILP as well as when long-latency execution units are busy. Related metrics: " - }, - { "BriefDescription": "Total pipeline cost of instruction fetch bandwidth related bottlenecks (when the front-end could not sustain operations delivery to the back-end)", - "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_mispredicts_resteers) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_ms / (tma_dsb + tma_lsd + tma_mite + tma_ms))) - tma_bottleneck_big_code", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_frontend_bound - (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_mispredicts_resteers) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_ms)) - tma_bottleneck_big_code", "MetricGroup": "BvFB;Fed;FetchBW;Frontend", "MetricName": "tma_bottleneck_instruction_fetch_bw", "MetricThreshold": "tma_bottleneck_instruction_fetch_bw > 20" }, { "BriefDescription": "Total pipeline cost of irregular execution (e.g", - "MetricExpr": "100 * (tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_mispredicts_resteers) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_fetch_bandwidth * tma_ms / (tma_dsb + tma_lsd + tma_mite + tma_ms)) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + tma_core_bound * RS_EVENTS.EMPTY_CYCLES / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 * (tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * (tma_fetch_latency * (tma_ms_switches + tma_branch_resteers * (tma_clears_resteers + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_mispredicts_resteers) / (tma_clears_resteers + tma_mispredicts_resteers + tma_unknown_branches)) / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches) + tma_ms) + 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts * tma_branch_mispredicts + tma_machine_clears * tma_other_nukes / tma_other_nukes + tma_core_bound * (tma_serializing_operation + tma_core_bound * RS_EVENTS.EMPTY_CYCLES / tma_info_thread_clks * tma_ports_utilized_0) / (tma_divider + tma_ports_utilization + tma_serializing_operation) + tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", "MetricGroup": "Bad;BvIO;Cor;Ret;tma_issueMS", "MetricName": "tma_bottleneck_irregular_overhead", "MetricThreshold": "tma_bottleneck_irregular_overhead > 10", @@ -188,6 +193,7 @@ }, { "BriefDescription": "Total pipeline cost of Memory Address Translation related bottlenecks (data-side TLBs)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_memory_bound * (tma_l1_bound / max(tma_memory_bound, tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_load / max(tma_l1_bound, tma_4k_aliasing + tma_dtlb_load + tma_fb_full + tma_l1_latency_dependency + tma_lock_latency + tma_split_loads + tma_store_fwd_blk)) + tma_memory_bound * (tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound)) * (tma_dtlb_store / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores)))", "MetricGroup": "BvMT;Mem;MemoryTLB;Offcore;tma_issueTLB", "MetricName": "tma_bottleneck_memory_data_tlbs", @@ -196,6 +202,7 @@ }, { "BriefDescription": "Total pipeline cost of Memory Synchronization related bottlenecks (data transfers and coherency updates across processors)", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_memory_bound * (tma_l3_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * (tma_contested_accesses + tma_data_sharing) / (tma_contested_accesses + tma_data_sharing + tma_l3_hit_latency + tma_sq_full) + tma_store_bound / (tma_dram_bound + tma_l1_bound + tma_l2_bound + tma_l3_bound + tma_store_bound) * tma_false_sharing / (tma_dtlb_store + tma_false_sharing + tma_split_stores + tma_store_latency + tma_streaming_stores - tma_store_latency)) + tma_machine_clears * (1 - tma_other_nukes / tma_other_nukes))", "MetricGroup": "BvMS;LockCont;Mem;Offcore;tma_issueSyncxn", "MetricName": "tma_bottleneck_memory_synchronization", @@ -204,6 +211,7 @@ }, { "BriefDescription": "Total pipeline cost of Branch Misprediction related bottlenecks", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (1 - 10 * tma_microcode_sequencer * tma_other_mispredicts / tma_branch_mispredicts) * (tma_branch_mispredicts + tma_fetch_latency * tma_mispredicts_resteers / (tma_branch_resteers + tma_dsb_switches + tma_icache_misses + tma_itlb_misses + tma_lcp + tma_ms_switches))", "MetricGroup": "Bad;BadSpec;BrMispredicts;BvMP;tma_issueBM", "MetricName": "tma_bottleneck_mispredictions", @@ -212,7 +220,8 @@ }, { "BriefDescription": "Total pipeline cost of remaining bottlenecks in the back-end", - "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_cache_memory_bandwidth + tma_bottleneck_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", + "MetricConstraint": "NO_GROUP_EVENTS", + "MetricExpr": "100 - (tma_bottleneck_big_code + tma_bottleneck_instruction_fetch_bw + tma_bottleneck_mispredictions + tma_bottleneck_data_cache_memory_bandwidth + tma_bottleneck_data_cache_memory_latency + tma_bottleneck_memory_data_tlbs + tma_bottleneck_memory_synchronization + tma_bottleneck_compute_bound_est + tma_bottleneck_irregular_overhead + tma_bottleneck_branching_overhead + tma_bottleneck_useful_work)", "MetricGroup": "BvOB;Cor;Offcore", "MetricName": "tma_bottleneck_other_bottlenecks", "MetricThreshold": "tma_bottleneck_other_bottlenecks > 20", @@ -220,6 +229,7 @@ }, { "BriefDescription": "Total pipeline cost of \"useful operations\" - the portion of Retiring category not covered by Branching_Overhead nor Irregular_Overhead.", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_retiring - (BR_INST_RETIRED.ALL_BRANCHES + 2 * BR_INST_RETIRED.NEAR_CALL + INST_RETIRED.NOP) / tma_info_thread_slots - tma_microcode_sequencer / (tma_few_uops_instructions + tma_microcode_sequencer) * (tma_assists / tma_microcode_sequencer) * tma_heavy_operations)", "MetricGroup": "BvUW;Ret", "MetricName": "tma_bottleneck_useful_work", @@ -427,7 +437,7 @@ "MetricGroup": "BvMB;MemoryBW;TopdownL4;tma_L4_group;tma_issueBW;tma_issueSL;tma_issueSmSt;tma_l1_bound_group", "MetricName": "tma_fb_full", "MetricThreshold": "tma_fb_full > 0.3", - "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", + "PublicDescription": "This metric does a *rough estimation* of how often L1D Fill Buffer unavailability limited additional L1D miss memory access requests to proceed. The higher the metric value; the deeper the memory hierarchy level the misses are satisfied from (metric values >1 are valid). Often it hints on approaching bandwidth limits (to L2 cache; L3 cache or external memory). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_info_system_dram_bw_use, tma_mem_bandwidth, tma_sq_full, tma_store_latency, tma_streaming_stores", "ScaleUnit": "100%" }, { @@ -619,6 +629,7 @@ }, { "BriefDescription": "Total pipeline cost of DSB (uop cache) hits - subset of the Instruction_Fetch_BW Bottleneck", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "100 * (tma_frontend_bound * (tma_fetch_bandwidth / (tma_fetch_bandwidth + tma_fetch_latency)) * (tma_dsb / (tma_dsb + tma_lsd + tma_mite + tma_ms)))", "MetricGroup": "DSB;Fed;FetchBW;tma_issueFB", "MetricName": "tma_info_botlnk_l2_dsb_bandwidth", @@ -1074,7 +1085,7 @@ "MetricName": "tma_info_memory_tlb_store_stlb_mpki" }, { - "BriefDescription": "", + "BriefDescription": "Mem;Backend;CacheHits", "MetricExpr": "UOPS_EXECUTED.THREAD / (UOPS_EXECUTED.CORE_CYCLES_GE_1 / 2 if #SMT_on else cpu@UOPS_EXECUTED.THREAD\\,cmask\\=1@)", "MetricGroup": "Cor;Pipeline;PortsUtil;SMT", "MetricName": "tma_info_pipeline_execute" @@ -1098,6 +1109,12 @@ "MetricName": "tma_info_pipeline_fetch_mite" }, { + "BriefDescription": "Average number of uops fetched from MS per cycle", + "MetricExpr": "IDQ.MS_UOPS / cpu@IDQ.MS_UOPS\\,cmask\\=1@", + "MetricGroup": "Fed;FetchLat;MicroSeq", + "MetricName": "tma_info_pipeline_fetch_ms" + }, + { "BriefDescription": "Instructions per a microcode Assist invocation", "MetricExpr": "INST_RETIRED.ANY / ASSISTS.ANY", "MetricGroup": "MicroSeq;Pipeline;Ret;Retire", @@ -1113,7 +1130,7 @@ }, { "BriefDescription": "Measured Average Core Frequency for unhalted processors [GHz]", - "MetricExpr": "tma_info_system_turbo_utilization * TSC / 1e9 / tma_info_system_time", + "MetricExpr": "tma_info_system_turbo_utilization * msr@tsc@ / 1e9 / tma_info_system_time", "MetricGroup": "Power;Summary", "MetricName": "tma_info_system_core_frequency" }, @@ -1125,7 +1142,7 @@ }, { "BriefDescription": "Average number of utilized CPUs", - "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / TSC", + "MetricExpr": "CPU_CLK_UNHALTED.REF_TSC / msr@tsc@", "MetricGroup": "Summary", "MetricName": "tma_info_system_cpus_utilized" }, @@ -1134,7 +1151,7 @@ "MetricExpr": "64 * (UNC_ARB_TRK_REQUESTS.ALL + UNC_ARB_COH_TRK_REQUESTS.ALL) / 1e6 / tma_info_system_time / 1e3", "MetricGroup": "HPC;MemOffcore;MemoryBW;SoC;tma_issueBW", "MetricName": "tma_info_system_dram_bw_use", - "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_mem_bandwidth, tma_sq_full" + "PublicDescription": "Average external Memory Bandwidth Use for reads and writes [GB / sec]. Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_mem_bandwidth, tma_sq_full" }, { "BriefDescription": "Giga Floating Point Operations Per Second", @@ -1165,6 +1182,7 @@ }, { "BriefDescription": "Average number of parallel data read requests to external memory", + "MetricConstraint": "NO_GROUP_EVENTS", "MetricExpr": "UNC_ARB_DAT_OCCUPANCY.RD / UNC_ARB_DAT_OCCUPANCY.RD@cmask\\=1@", "MetricGroup": "Mem;MemoryBW;SoC", "MetricName": "tma_info_system_mem_parallel_reads", @@ -1316,12 +1334,12 @@ "ScaleUnit": "100%" }, { - "BriefDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", + "BriefDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache", "MetricExpr": "min(2 * (MEM_INST_RETIRED.ALL_LOADS - MEM_LOAD_RETIRED.FB_HIT - MEM_LOAD_RETIRED.L1_MISS) * 20 / 100, max(CYCLE_ACTIVITY.CYCLES_MEM_ANY - CYCLE_ACTIVITY.CYCLES_L1D_MISS, 0)) / tma_info_thread_clks", "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_l1_latency_dependency", "MetricThreshold": "tma_l1_latency_dependency > 0.1 & (tma_l1_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", + "PublicDescription": "This metric ([SKL+] roughly; [LNL]) estimates fraction of cycles with demand load accesses that hit the L1D cache. The short latency of the L1D cache may be exposed in pointer-chasing memory access patterns as an example. Sample with: MEM_LOAD_RETIRED.L1_HIT", "ScaleUnit": "100%" }, { @@ -1345,7 +1363,6 @@ }, { "BriefDescription": "This metric estimates how often the CPU was stalled due to loads accesses to L3 cache or contended with a sibling Core", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "(CYCLE_ACTIVITY.STALLS_L2_MISS - CYCLE_ACTIVITY.STALLS_L3_MISS) / tma_info_thread_clks", "MetricGroup": "CacheHits;MemoryBound;TmaL3mem;TopdownL3;tma_L3_group;tma_memory_bound_group", "MetricName": "tma_l3_bound", @@ -1359,7 +1376,7 @@ "MetricGroup": "BvML;MemoryLat;TopdownL4;tma_L4_group;tma_issueLat;tma_l3_bound_group", "MetricName": "tma_l3_hit_latency", "MetricThreshold": "tma_l3_hit_latency > 0.1 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_cache_memory_latency, tma_mem_latency", + "PublicDescription": "This metric estimates fraction of cycles with demand load accesses that hit the L3 cache under unloaded scenarios (possibly L3 latency limited). Avoiding private cache misses (i.e. L2 misses/L3 hits) will improve the latency; reduce contention with sibling physical cores and increase performance. Note the value of this node may overlap with its siblings. Sample with: MEM_LOAD_RETIRED.L3_HIT_PS. Related metrics: tma_bottleneck_data_cache_memory_latency, tma_mem_latency", "ScaleUnit": "100%" }, { @@ -1465,7 +1482,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueBW", "MetricName": "tma_mem_bandwidth", "MetricThreshold": "tma_mem_bandwidth > 0.2 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", + "PublicDescription": "This metric estimates fraction of cycles where the core's performance was likely hurt due to approaching bandwidth limits of external memory - DRAM ([SPR-HBM] and/or HBM). The underlying heuristic assumes that a similar off-core traffic is generated by all IA cores. This metric does not aggregate non-data-read requests by this logical processor; requests from other IA Logical Processors/Physical Cores/sockets; or other non-IA devices like GPU; hence the maximum external memory bandwidth limits may or may not be approached when this metric is flagged (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_sq_full", "ScaleUnit": "100%" }, { @@ -1474,7 +1491,7 @@ "MetricGroup": "BvML;MemoryLat;Offcore;TopdownL4;tma_L4_group;tma_dram_bound_group;tma_issueLat", "MetricName": "tma_mem_latency", "MetricThreshold": "tma_mem_latency > 0.1 & (tma_dram_bound > 0.1 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_cache_memory_latency, tma_l3_hit_latency", + "PublicDescription": "This metric estimates fraction of cycles where the performance was likely hurt due to latency from external memory - DRAM ([SPR-HBM] and/or HBM). This metric does not aggregate requests from other Logical Processors/Physical Cores/sockets (see Uncore counters for that). Related metrics: tma_bottleneck_data_cache_memory_latency, tma_l3_hit_latency", "ScaleUnit": "100%" }, { @@ -1542,7 +1559,7 @@ }, { "BriefDescription": "This metric represents Core fraction of cycles in which CPU was likely limited due to the Microcode Sequencer (MS) unit - see Microcode_Sequencer node for details.", - "MetricExpr": "cpu@IDQ.MS_UOPS\\,cmask\\=1@ / tma_info_core_core_clks / 2", + "MetricExpr": "cpu@IDQ.MS_UOPS\\,cmask\\=1@ / tma_info_core_core_clks / 3.3", "MetricGroup": "MicroSeq;TopdownL3;tma_L3_group;tma_fetch_bandwidth_group", "MetricName": "tma_ms", "MetricThreshold": "tma_ms > 0.05 & tma_fetch_bandwidth > 0.2", @@ -1676,7 +1693,7 @@ { "BriefDescription": "This category represents fraction of slots utilized by useful work i.e. issued uops that eventually get retired", "DefaultMetricgroupName": "TopdownL1", - "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound) + 0 * tma_info_thread_slots", + "MetricExpr": "topdown\\-retiring / (topdown\\-fe\\-bound + topdown\\-bad\\-spec + topdown\\-retiring + topdown\\-be\\-bound)", "MetricGroup": "BvUW;Default;TmaL1;TopdownL1;tma_L1_group", "MetricName": "tma_retiring", "MetricThreshold": "tma_retiring > 0.7 | tma_heavy_operations > 0.1", @@ -1713,7 +1730,6 @@ }, { "BriefDescription": "This metric represents rate of split store accesses", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "MEM_INST_RETIRED.SPLIT_STORES / tma_info_core_core_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_issueSpSt;tma_store_bound_group", "MetricName": "tma_split_stores", @@ -1727,7 +1743,7 @@ "MetricGroup": "BvMB;MemoryBW;Offcore;TopdownL4;tma_L4_group;tma_issueBW;tma_l3_bound_group", "MetricName": "tma_sq_full", "MetricThreshold": "tma_sq_full > 0.3 & (tma_l3_bound > 0.05 & (tma_memory_bound > 0.2 & tma_backend_bound > 0.2))", - "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", + "PublicDescription": "This metric measures fraction of cycles where the Super Queue (SQ) was full taking into account all request-types and both hardware SMT threads (Logical Processors). Related metrics: tma_bottleneck_data_cache_memory_bandwidth, tma_fb_full, tma_info_system_dram_bw_use, tma_mem_bandwidth", "ScaleUnit": "100%" }, { @@ -1741,7 +1757,6 @@ }, { "BriefDescription": "This metric roughly estimates fraction of cycles when the memory subsystem had loads blocked since they could not forward data from earlier (in program order) overlapping stores", - "MetricConstraint": "NO_GROUP_EVENTS_NMI", "MetricExpr": "13 * LD_BLOCKS.STORE_FORWARD / tma_info_thread_clks", "MetricGroup": "TopdownL4;tma_L4_group;tma_l1_bound_group", "MetricName": "tma_store_fwd_blk", diff --git a/tools/perf/python/ilist.py b/tools/perf/python/ilist.py new file mode 100755 index 000000000000..9d6465c60df3 --- /dev/null +++ b/tools/perf/python/ilist.py @@ -0,0 +1,495 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) +"""Interactive perf list.""" + +from abc import ABC, abstractmethod +import argparse +from dataclasses import dataclass +import math +from typing import Any, Dict, Optional, Tuple +import perf +from textual import on +from textual.app import App, ComposeResult +from textual.binding import Binding +from textual.containers import Horizontal, HorizontalGroup, Vertical, VerticalScroll +from textual.css.query import NoMatches +from textual.command import SearchIcon +from textual.screen import ModalScreen +from textual.widgets import Button, Footer, Header, Input, Label, Sparkline, Static, Tree +from textual.widgets.tree import TreeNode + + +def get_info(info: Dict[str, str], key: str): + return (info[key] + "\n") if key in info else "" + + +class TreeValue(ABC): + """Abstraction for the data of value in the tree.""" + + @abstractmethod + def name(self) -> str: + pass + + @abstractmethod + def description(self) -> str: + pass + + @abstractmethod + def matches(self, query: str) -> bool: + pass + + @abstractmethod + def parse(self) -> perf.evlist: + pass + + @abstractmethod + def value(self, evlist: perf.evlist, evsel: perf.evsel, cpu: int, thread: int) -> float: + pass + + +@dataclass +class Metric(TreeValue): + """A metric in the tree.""" + metric_name: str + + def name(self) -> str: + return self.metric_name + + def description(self) -> str: + """Find and format metric description.""" + for metric in perf.metrics(): + if metric["MetricName"] != self.metric_name: + continue + desc = get_info(metric, "BriefDescription") + desc += get_info(metric, "PublicDescription") + desc += get_info(metric, "MetricExpr") + desc += get_info(metric, "MetricThreshold") + return desc + return "description" + + def matches(self, query: str) -> bool: + return query in self.metric_name + + def parse(self) -> perf.evlist: + return perf.parse_metrics(self.metric_name) + + def value(self, evlist: perf.evlist, evsel: perf.evsel, cpu: int, thread: int) -> float: + val = evlist.compute_metric(self.metric_name, cpu, thread) + return 0 if math.isnan(val) else val + + +@dataclass +class PmuEvent(TreeValue): + """A PMU and event within the tree.""" + pmu: str + event: str + + def name(self) -> str: + if self.event.startswith(self.pmu) or ':' in self.event: + return self.event + else: + return f"{self.pmu}/{self.event}/" + + def description(self) -> str: + """Find and format event description for {pmu}/{event}/.""" + for p in perf.pmus(): + if p.name() != self.pmu: + continue + for info in p.events(): + if "name" not in info or info["name"] != self.event: + continue + + desc = get_info(info, "topic") + desc += get_info(info, "event_type_desc") + desc += get_info(info, "desc") + desc += get_info(info, "long_desc") + desc += get_info(info, "encoding_desc") + return desc + return "description" + + def matches(self, query: str) -> bool: + return query in self.pmu or query in self.event + + def parse(self) -> perf.evlist: + return perf.parse_events(self.name()) + + def value(self, evlist: perf.evlist, evsel: perf.evsel, cpu: int, thread: int) -> float: + return evsel.read(cpu, thread).val + + +class ErrorScreen(ModalScreen[bool]): + """Pop up dialog for errors.""" + + CSS = """ + ErrorScreen { + align: center middle; + } + """ + + def __init__(self, error: str): + self.error = error + super().__init__() + + def compose(self) -> ComposeResult: + yield Button(f"Error: {self.error}", variant="primary", id="error") + + def on_button_pressed(self, event: Button.Pressed) -> None: + self.dismiss(True) + + +class SearchScreen(ModalScreen[str]): + """Pop up dialog for search.""" + + CSS = """ + SearchScreen Horizontal { + align: center middle; + margin-top: 1; + } + SearchScreen Input { + width: 1fr; + } + """ + + def compose(self) -> ComposeResult: + yield Horizontal(SearchIcon(), Input(placeholder="Event name")) + + def on_input_submitted(self, event: Input.Submitted) -> None: + """Handle the user pressing Enter in the input field.""" + self.dismiss(event.value) + + +class Counter(HorizontalGroup): + """Two labels for a CPU and its counter value.""" + + CSS = """ + Label { + gutter: 1; + } + """ + + def __init__(self, cpu: int) -> None: + self.cpu = cpu + super().__init__() + + def compose(self) -> ComposeResult: + label = f"cpu{self.cpu}" if self.cpu >= 0 else "total" + yield Label(label + " ") + yield Label("0", id=f"counter_{label}") + + +class CounterSparkline(HorizontalGroup): + """A Sparkline for a performance counter.""" + + def __init__(self, cpu: int) -> None: + self.cpu = cpu + super().__init__() + + def compose(self) -> ComposeResult: + label = f"cpu{self.cpu}" if self.cpu >= 0 else "total" + yield Label(label) + yield Sparkline([], summary_function=max, id=f"sparkline_{label}") + + +class IListApp(App): + TITLE = "Interactive Perf List" + + BINDINGS = [ + Binding(key="s", action="search", description="Search", + tooltip="Search events and PMUs"), + Binding(key="n", action="next", description="Next", + tooltip="Next search result or item"), + Binding(key="p", action="prev", description="Previous", + tooltip="Previous search result or item"), + Binding(key="c", action="collapse", description="Collapse", + tooltip="Collapse the current PMU"), + Binding(key="^q", action="quit", description="Quit", + tooltip="Quit the app"), + ] + + CSS = """ + /* Make the 'total' sparkline a different color. */ + #sparkline_total > .sparkline--min-color { + color: $accent; + } + #sparkline_total > .sparkline--max-color { + color: $accent 30%; + } + /* + * Make the active_search initially not displayed with the text in + * the middle of the line. + */ + #active_search { + display: none; + width: 100%; + text-align: center; + } + """ + + def __init__(self, interval: float) -> None: + self.interval = interval + self.evlist = None + self.selected: Optional[TreeValue] = None + self.search_results: list[TreeNode[TreeValue]] = [] + self.cur_search_result: TreeNode[TreeValue] | None = None + super().__init__() + + def expand_and_select(self, node: TreeNode[Any]) -> None: + """Expand select a node in the tree.""" + if node.parent: + node.parent.expand() + if node.parent.parent: + node.parent.parent.expand() + node.expand() + node.tree.select_node(node) + node.tree.scroll_to_node(node) + + def set_searched_tree_node(self, previous: bool) -> None: + """Set the cur_search_result node to either the next or previous.""" + l = len(self.search_results) + + if l < 1: + tree: Tree[TreeValue] = self.query_one("#root", Tree) + if previous: + tree.action_cursor_up() + else: + tree.action_cursor_down() + return + + if self.cur_search_result: + idx = self.search_results.index(self.cur_search_result) + if previous: + idx = idx - 1 if idx > 0 else l - 1 + else: + idx = idx + 1 if idx < l - 1 else 0 + else: + idx = l - 1 if previous else 0 + + node = self.search_results[idx] + if node == self.cur_search_result: + return + + self.cur_search_result = node + self.expand_and_select(node) + + def action_search(self) -> None: + """Search was chosen.""" + def set_initial_focus(event: str | None) -> None: + """Sets the focus after the SearchScreen is dismissed.""" + + search_label = self.query_one("#active_search", Label) + search_label.display = True if event else False + if not event: + return + event = event.lower() + search_label.update(f'Searching for events matching "{event}"') + + tree: Tree[str] = self.query_one("#root", Tree) + + def find_search_results(event: str, node: TreeNode[str], + cursor_seen: bool = False, + match_after_cursor: Optional[TreeNode[str]] = None + ) -> Tuple[bool, Optional[TreeNode[str]]]: + """Find nodes that match the search remembering the one after the cursor.""" + if not cursor_seen and node == tree.cursor_node: + cursor_seen = True + if node.data and node.data.matches(event): + if cursor_seen and not match_after_cursor: + match_after_cursor = node + self.search_results.append(node) + + if node.children: + for child in node.children: + (cursor_seen, match_after_cursor) = \ + find_search_results(event, child, cursor_seen, match_after_cursor) + return (cursor_seen, match_after_cursor) + + self.search_results.clear() + (_, self.cur_search_result) = find_search_results(event, tree.root) + if len(self.search_results) < 1: + self.push_screen(ErrorScreen(f"Failed to find pmu/event or metric {event}")) + search_label.display = False + elif self.cur_search_result: + self.expand_and_select(self.cur_search_result) + else: + self.set_searched_tree_node(previous=False) + + self.push_screen(SearchScreen(), set_initial_focus) + + def action_next(self) -> None: + """Next was chosen.""" + self.set_searched_tree_node(previous=False) + + def action_prev(self) -> None: + """Previous was chosen.""" + self.set_searched_tree_node(previous=True) + + def action_collapse(self) -> None: + """Collapse the part of the tree currently on.""" + tree: Tree[str] = self.query_one("#root", Tree) + node = tree.cursor_node + if node and node.parent: + node.parent.collapse_all() + node.tree.scroll_to_node(node.parent) + + def update_counts(self) -> None: + """Called every interval to update counts.""" + if not self.selected or not self.evlist: + return + + def update_count(cpu: int, count: int): + # Update the raw count display. + counter: Label = self.query(f"#counter_cpu{cpu}" if cpu >= 0 else "#counter_total") + if not counter: + return + counter = counter.first(Label) + counter.update(str(count)) + + # Update the sparkline. + line: Sparkline = self.query(f"#sparkline_cpu{cpu}" if cpu >= 0 else "#sparkline_total") + if not line: + return + line = line.first(Sparkline) + # If there are more events than the width, remove the front event. + if len(line.data) > line.size.width: + line.data.pop(0) + line.data.append(count) + line.mutate_reactive(Sparkline.data) + + # Update the total and each CPU counts, assume there's just 1 evsel. + total = 0 + self.evlist.disable() + for evsel in self.evlist: + for cpu in evsel.cpus(): + aggr = 0 + for thread in evsel.threads(): + aggr += self.selected.value(self.evlist, evsel, cpu, thread) + update_count(cpu, aggr) + total += aggr + update_count(-1, total) + self.evlist.enable() + + def on_mount(self) -> None: + """When App starts set up periodic event updating.""" + self.update_counts() + self.set_interval(self.interval, self.update_counts) + + def set_selected(self, value: TreeValue) -> None: + """Updates the event/description and starts the counters.""" + try: + label_name = self.query_one("#event_name", Label) + event_description = self.query_one("#event_description", Static) + lines = self.query_one("#lines") + except NoMatches: + # A race with rendering, ignore the update as we can't + # mount the assumed output widgets. + return + + self.selected = value + + # Remove previous event information. + if self.evlist: + self.evlist.disable() + self.evlist.close() + old_lines = self.query(CounterSparkline) + for line in old_lines: + line.remove() + old_counters = self.query(Counter) + for counter in old_counters: + counter.remove() + + # Update event/metric text and description. + label_name.update(value.name()) + event_description.update(value.description()) + + # Open the event. + try: + self.evlist = value.parse() + if self.evlist: + self.evlist.open() + self.evlist.enable() + except: + self.evlist = None + + if not self.evlist: + self.push_screen(ErrorScreen(f"Failed to open {value.name()}")) + return + + # Add spark lines for all the CPUs. Note, must be done after + # open so that the evlist CPUs have been computed by propagate + # maps. + line = CounterSparkline(cpu=-1) + lines.mount(line) + for cpu in self.evlist.all_cpus(): + line = CounterSparkline(cpu) + lines.mount(line) + line = Counter(cpu=-1) + lines.mount(line) + for cpu in self.evlist.all_cpus(): + line = Counter(cpu) + lines.mount(line) + + def compose(self) -> ComposeResult: + """Draws the app.""" + def metric_event_tree() -> Tree: + """Create tree of PMUs and metricgroups with events or metrics under.""" + tree: Tree[TreeValue] = Tree("Root", id="root") + pmus = tree.root.add("PMUs") + for pmu in perf.pmus(): + pmu_name = pmu.name().lower() + pmu_node = pmus.add(pmu_name) + try: + for event in sorted(pmu.events(), key=lambda x: x["name"]): + if "name" in event: + e = event["name"].lower() + if "alias" in event: + pmu_node.add_leaf(f'{e} ({event["alias"]})', + data=PmuEvent(pmu_name, e)) + else: + pmu_node.add_leaf(e, data=PmuEvent(pmu_name, e)) + except: + # Reading events may fail with EPERM, ignore. + pass + metrics = tree.root.add("Metrics") + groups = set() + for metric in perf.metrics(): + groups.update(metric["MetricGroup"]) + + def add_metrics_to_tree(node: TreeNode[TreeValue], parent: str): + for metric in sorted(perf.metrics(), key=lambda x: x["MetricName"]): + if parent in metric["MetricGroup"]: + name = metric["MetricName"] + node.add_leaf(name, data=Metric(name)) + child_group_name = f'{name}_group' + if child_group_name in groups: + add_metrics_to_tree(node.add(child_group_name), child_group_name) + + for group in sorted(groups): + if group.endswith('_group'): + continue + add_metrics_to_tree(metrics.add(group), group) + + tree.root.expand() + return tree + + yield Header(id="header") + yield Horizontal(Vertical(metric_event_tree(), id="events"), + Vertical(Label("event name", id="event_name"), + Static("description", markup=False, id="event_description"), + )) + yield Label(id="active_search") + yield VerticalScroll(id="lines") + yield Footer(id="footer") + + @on(Tree.NodeSelected) + def on_tree_node_selected(self, event: Tree.NodeSelected[TreeValue]) -> None: + """Called when a tree node is selected, selecting the event.""" + if event.node.data: + self.set_selected(event.node.data) + + +if __name__ == "__main__": + ap = argparse.ArgumentParser() + ap.add_argument('-I', '--interval', help="Counter update interval in seconds", default=0.1) + args = ap.parse_args() + app = IListApp(float(args.interval)) + app.run() diff --git a/tools/perf/scripts/perl/Perf-Trace-Util/Build b/tools/perf/scripts/perl/Perf-Trace-Util/Build index 9b0e5a8b5070..01a1a0ed51ae 100644 --- a/tools/perf/scripts/perl/Perf-Trace-Util/Build +++ b/tools/perf/scripts/perl/Perf-Trace-Util/Build @@ -2,7 +2,7 @@ perf-util-y += Context.o CFLAGS_Context.o += $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-bad-function-cast -Wno-declaration-after-statement -Wno-switch-enum CFLAGS_Context.o += -Wno-unused-parameter -Wno-nested-externs -Wno-undef -CFLAGS_Context.o += -Wno-switch-default -Wno-shadow +CFLAGS_Context.o += -Wno-switch-default -Wno-shadow -Wno-thread-safety-analysis ifeq ($(CC_NO_CLANG), 1) CFLAGS_Context.o += -Wno-unused-command-line-argument diff --git a/tools/perf/tests/Build b/tools/perf/tests/Build index 3e8394be15ae..af67f8ef74b4 100644 --- a/tools/perf/tests/Build +++ b/tools/perf/tests/Build @@ -20,7 +20,6 @@ perf-test-y += hists_link.o perf-test-y += hists_filter.o perf-test-y += hists_output.o perf-test-y += hists_cumulate.o -perf-test-y += python-use.o perf-test-y += bp_signal.o perf-test-y += bp_signal_overflow.o perf-test-y += bp_account.o @@ -75,7 +74,6 @@ ifeq ($(SRCARCH),$(filter $(SRCARCH),x86 arm arm64 powerpc)) perf-test-$(CONFIG_DWARF_UNWIND) += dwarf-unwind.o endif -CFLAGS_python-use.o += -DPYTHONPATH="BUILD_STR($(OUTPUT)python)" -DPYTHON="BUILD_STR($(PYTHON_WORD))" CFLAGS_dwarf-unwind.o += -fno-optimize-sibling-calls perf-test-y += workloads/ diff --git a/tools/perf/tests/builtin-test.c b/tools/perf/tests/builtin-test.c index 85142dfb3e01..0d2fb7a4ae5b 100644 --- a/tools/perf/tests/builtin-test.c +++ b/tools/perf/tests/builtin-test.c @@ -84,7 +84,6 @@ static struct test_suite *generic_tests[] = { &suite__syscall_openat_tp_fields, #endif &suite__hists_link, - &suite__python_use, &suite__bp_signal, &suite__bp_signal_overflow, &suite__bp_accounting, @@ -152,6 +151,7 @@ static struct test_workload *workloads[] = { &workload__brstack, &workload__datasym, &workload__landlock, + &workload__traploop, }; #define workloads__for_each(workload) \ diff --git a/tools/perf/tests/code-reading.c b/tools/perf/tests/code-reading.c index 9c2091310191..4c9fbf6965c4 100644 --- a/tools/perf/tests/code-reading.c +++ b/tools/perf/tests/code-reading.c @@ -2,6 +2,7 @@ #include <errno.h> #include <linux/kconfig.h> #include <linux/kernel.h> +#include <linux/rbtree.h> #include <linux/types.h> #include <inttypes.h> #include <stdlib.h> @@ -39,11 +40,64 @@ #define BUFSZ 1024 #define READLEN 128 -struct state { - u64 done[1024]; - size_t done_cnt; +struct tested_section { + struct rb_node rb_node; + u64 addr; + char path[PATH_MAX]; }; +static bool tested_code_insert_or_exists(const char *path, u64 addr, + struct rb_root *tested_sections) +{ + struct rb_node **node = &tested_sections->rb_node; + struct rb_node *parent = NULL; + struct tested_section *data; + + while (*node) { + int cmp; + + parent = *node; + data = rb_entry(*node, struct tested_section, rb_node); + cmp = strcmp(path, data->path); + if (!cmp) { + if (addr < data->addr) + cmp = -1; + else if (addr > data->addr) + cmp = 1; + else + return true; /* already tested */ + } + + if (cmp < 0) + node = &(*node)->rb_left; + else + node = &(*node)->rb_right; + } + + data = zalloc(sizeof(*data)); + if (!data) + return true; + + data->addr = addr; + strlcpy(data->path, path, sizeof(data->path)); + rb_link_node(&data->rb_node, parent, node); + rb_insert_color(&data->rb_node, tested_sections); + return false; +} + +static void tested_sections__free(struct rb_root *root) +{ + while (!RB_EMPTY_ROOT(root)) { + struct rb_node *node = rb_first(root); + struct tested_section *ts = rb_entry(node, + struct tested_section, + rb_node); + + rb_erase(node, root); + free(ts); + } +} + static size_t read_objdump_chunk(const char **line, unsigned char **buf, size_t *buf_len) { @@ -316,13 +370,15 @@ static void dump_buf(unsigned char *buf, size_t len) } static int read_object_code(u64 addr, size_t len, u8 cpumode, - struct thread *thread, struct state *state) + struct thread *thread, + struct rb_root *tested_sections) { struct addr_location al; unsigned char buf1[BUFSZ] = {0}; unsigned char buf2[BUFSZ] = {0}; size_t ret_len; u64 objdump_addr; + u64 skip_addr; const char *objdump_name; char decomp_name[KMOD_DECOMP_LEN]; bool decomp = false; @@ -350,6 +406,18 @@ static int read_object_code(u64 addr, size_t len, u8 cpumode, goto out; } + /* + * Don't retest the same addresses. objdump struggles with kcore - try + * each map only once even if the address is different. + */ + skip_addr = dso__is_kcore(dso) ? map__start(al.map) : al.addr; + if (tested_code_insert_or_exists(dso__long_name(dso), skip_addr, + tested_sections)) { + pr_debug("Already tested %s @ %#"PRIx64" - skipping\n", + dso__long_name(dso), skip_addr); + goto out; + } + pr_debug("On file address is: %#"PRIx64"\n", al.addr); if (len > BUFSZ) @@ -387,24 +455,6 @@ static int read_object_code(u64 addr, size_t len, u8 cpumode, goto out; } - /* objdump struggles with kcore - try each map only once */ - if (dso__is_kcore(dso)) { - size_t d; - - for (d = 0; d < state->done_cnt; d++) { - if (state->done[d] == map__start(al.map)) { - pr_debug("kcore map tested already"); - pr_debug(" - skipping\n"); - goto out; - } - } - if (state->done_cnt >= ARRAY_SIZE(state->done)) { - pr_debug("Too many kcore maps - skipping\n"); - goto out; - } - state->done[state->done_cnt++] = map__start(al.map); - } - objdump_name = dso__long_name(dso); if (dso__needs_decompress(dso)) { if (dso__decompress_kmodule_path(dso, objdump_name, @@ -471,9 +521,9 @@ out: return err; } -static int process_sample_event(struct machine *machine, - struct evlist *evlist, - union perf_event *event, struct state *state) +static int process_sample_event(struct machine *machine, struct evlist *evlist, + union perf_event *event, + struct rb_root *tested_sections) { struct perf_sample sample; struct thread *thread; @@ -494,7 +544,8 @@ static int process_sample_event(struct machine *machine, goto out; } - ret = read_object_code(sample.ip, READLEN, sample.cpumode, thread, state); + ret = read_object_code(sample.ip, READLEN, sample.cpumode, thread, + tested_sections); thread__put(thread); out: perf_sample__exit(&sample); @@ -502,10 +553,11 @@ out: } static int process_event(struct machine *machine, struct evlist *evlist, - union perf_event *event, struct state *state) + union perf_event *event, struct rb_root *tested_sections) { if (event->header.type == PERF_RECORD_SAMPLE) - return process_sample_event(machine, evlist, event, state); + return process_sample_event(machine, evlist, event, + tested_sections); if (event->header.type == PERF_RECORD_THROTTLE || event->header.type == PERF_RECORD_UNTHROTTLE) @@ -525,7 +577,7 @@ static int process_event(struct machine *machine, struct evlist *evlist, } static int process_events(struct machine *machine, struct evlist *evlist, - struct state *state) + struct rb_root *tested_sections) { union perf_event *event; struct mmap *md; @@ -537,7 +589,7 @@ static int process_events(struct machine *machine, struct evlist *evlist, continue; while ((event = perf_mmap__read_event(&md->core)) != NULL) { - ret = process_event(machine, evlist, event, state); + ret = process_event(machine, evlist, event, tested_sections); perf_mmap__consume(&md->core); if (ret < 0) return ret; @@ -637,9 +689,7 @@ static int do_test_code_reading(bool try_kcore) .uses_mmap = true, }, }; - struct state state = { - .done_cnt = 0, - }; + struct rb_root tested_sections = RB_ROOT; struct perf_thread_map *threads = NULL; struct perf_cpu_map *cpus = NULL; struct evlist *evlist = NULL; @@ -773,7 +823,7 @@ static int do_test_code_reading(bool try_kcore) evlist__disable(evlist); - ret = process_events(machine, evlist, &state); + ret = process_events(machine, evlist, &tested_sections); if (ret < 0) goto out_put; @@ -793,6 +843,7 @@ out_err: perf_thread_map__put(threads); machine__delete(machine); perf_env__exit(&host_env); + tested_sections__free(&tested_sections); return err; } diff --git a/tools/perf/tests/make b/tools/perf/tests/make index c574a678c28a..b650ce8864ed 100644 --- a/tools/perf/tests/make +++ b/tools/perf/tests/make @@ -73,9 +73,9 @@ make_extra_tests := EXTRA_TESTS=1 make_jevents_all := JEVENTS_ARCH=all make_no_bpf_skel := BUILD_BPF_SKEL=0 make_gen_vmlinux_h := GEN_VMLINUX_H=1 -make_no_libperl := NO_LIBPERL=1 +make_libperl := LIBPERL=1 make_no_libpython := NO_LIBPYTHON=1 -make_no_scripts := NO_LIBPYTHON=1 NO_LIBPERL=1 +make_no_scripts := NO_LIBPYTHON=1 make_no_slang := NO_SLANG=1 make_no_gtk2 := NO_GTK2=1 make_no_ui := NO_SLANG=1 NO_GTK2=1 @@ -118,7 +118,7 @@ make_install_prefix_slash := install prefix=/tmp/krava/ make_static := LDFLAGS=-static NO_PERF_READ_VDSO32=1 NO_PERF_READ_VDSOX32=1 NO_JVMTI=1 NO_LIBTRACEEVENT=1 NO_LIBELF=1 # all the NO_* variable combined -make_minimal := NO_LIBPERL=1 NO_LIBPYTHON=1 NO_GTK2=1 +make_minimal := NO_LIBPYTHON=1 NO_GTK2=1 make_minimal += NO_DEMANGLE=1 NO_LIBELF=1 NO_BACKTRACE=1 make_minimal += NO_LIBNUMA=1 NO_LIBBIONIC=1 NO_LIBDW=1 make_minimal += NO_LIBDW_DWARF_UNWIND=1 NO_AUXTRACE=1 NO_LIBBPF=1 @@ -143,7 +143,7 @@ run += make_extra_tests run += make_jevents_all run += make_no_bpf_skel run += make_gen_vmlinux_h -run += make_no_libperl +run += make_libperl run += make_no_libpython run += make_no_scripts run += make_no_slang diff --git a/tools/perf/tests/parse-events.c b/tools/perf/tests/parse-events.c index bb8004397650..67550cc60555 100644 --- a/tools/perf/tests/parse-events.c +++ b/tools/perf/tests/parse-events.c @@ -1736,6 +1736,53 @@ static int test__intel_pt(struct evlist *evlist) return TEST_OK; } +static bool test__acr_valid(void) +{ + struct perf_pmu *pmu = NULL; + + while ((pmu = perf_pmus__scan_core(pmu)) != NULL) { + if (perf_pmu__has_format(pmu, "acr_mask")) + return true; + } + + return false; +} + +static int test__ratio_to_prev(struct evlist *evlist) +{ + struct evsel *evsel; + int ret; + + TEST_ASSERT_VAL("wrong number of entries", 2 * perf_pmus__num_core_pmus() == evlist->core.nr_entries); + + evlist__for_each_entry(evlist, evsel) { + if (!perf_pmu__has_format(evsel->pmu, "acr_mask")) + return TEST_OK; + + if (evsel == evlist__first(evlist)) { + TEST_ASSERT_VAL("wrong config2", 0 == evsel->core.attr.config2); + TEST_ASSERT_VAL("wrong leader", evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 2); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 0); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_CPU_CYCLES, "cycles"); + } else { + TEST_ASSERT_VAL("wrong config2", 0 == evsel->core.attr.config2); + TEST_ASSERT_VAL("wrong leader", !evsel__is_group_leader(evsel)); + TEST_ASSERT_VAL("wrong core.nr_members", evsel->core.nr_members == 0); + TEST_ASSERT_VAL("wrong group_idx", evsel__group_idx(evsel) == 1); + ret = assert_hw(&evsel->core, PERF_COUNT_HW_INSTRUCTIONS, "instructions"); + } + if (ret) + return ret; + /* + * The period value gets configured within evlist__config, + * while this test executes only parse events method. + */ + TEST_ASSERT_VAL("wrong period", 0 == evsel->core.attr.sample_period); + } + return TEST_OK; +} + static int test__checkevent_complex_name(struct evlist *evlist) { struct evsel *evsel = evlist__first(evlist); @@ -2249,6 +2296,13 @@ static const struct evlist_test test__events[] = { .check = test__checkevent_tracepoint, /* 4 */ }, + { + .name = "{cycles,instructions/period=200000,ratio-to-prev=2.0/}", + .valid = test__acr_valid, + .check = test__ratio_to_prev, + /* 5 */ + }, + }; static const struct evlist_test test__events_pmu[] = { diff --git a/tools/perf/tests/perf-record.c b/tools/perf/tests/perf-record.c index 0b3c37e66871..efbd9cd60c63 100644 --- a/tools/perf/tests/perf-record.c +++ b/tools/perf/tests/perf-record.c @@ -13,15 +13,19 @@ #include "tests.h" #include "util/mmap.h" #include "util/sample.h" +#include "util/cpumap.h" static int sched__get_first_possible_cpu(pid_t pid, cpu_set_t *maskp) { - int i, cpu = -1, nrcpus = 1024; + int i, cpu = -1; + int nrcpus = cpu__max_cpu().cpu; + size_t size = CPU_ALLOC_SIZE(nrcpus); + realloc: - CPU_ZERO(maskp); + CPU_ZERO_S(size, maskp); - if (sched_getaffinity(pid, sizeof(*maskp), maskp) == -1) { - if (errno == EINVAL && nrcpus < (1024 << 8)) { + if (sched_getaffinity(pid, size, maskp) == -1) { + if (errno == EINVAL && nrcpus < (cpu__max_cpu().cpu << 8)) { nrcpus = nrcpus << 2; goto realloc; } @@ -30,11 +34,11 @@ realloc: } for (i = 0; i < nrcpus; i++) { - if (CPU_ISSET(i, maskp)) { + if (CPU_ISSET_S(i, size, maskp)) { if (cpu == -1) cpu = i; else - CPU_CLR(i, maskp); + CPU_CLR_S(i, size, maskp); } } @@ -50,8 +54,9 @@ static int test__PERF_RECORD(struct test_suite *test __maybe_unused, int subtest .no_buffering = true, .mmap_pages = 256, }; - cpu_set_t cpu_mask; - size_t cpu_mask_size = sizeof(cpu_mask); + int nrcpus = cpu__max_cpu().cpu; + cpu_set_t *cpu_mask; + size_t cpu_mask_size; struct evlist *evlist = evlist__new_dummy(); struct evsel *evsel; struct perf_sample sample; @@ -69,12 +74,22 @@ static int test__PERF_RECORD(struct test_suite *test __maybe_unused, int subtest int total_events = 0, nr_events[PERF_RECORD_MAX] = { 0, }; char sbuf[STRERR_BUFSIZE]; + cpu_mask = CPU_ALLOC(nrcpus); + if (!cpu_mask) { + pr_debug("failed to create cpumask\n"); + goto out; + } + + cpu_mask_size = CPU_ALLOC_SIZE(nrcpus); + CPU_ZERO_S(cpu_mask_size, cpu_mask); + perf_sample__init(&sample, /*all=*/false); if (evlist == NULL) /* Fallback for kernels lacking PERF_COUNT_SW_DUMMY */ evlist = evlist__new_default(); if (evlist == NULL) { pr_debug("Not enough memory to create evlist\n"); + CPU_FREE(cpu_mask); goto out; } @@ -111,10 +126,11 @@ static int test__PERF_RECORD(struct test_suite *test __maybe_unused, int subtest evsel__set_sample_bit(evsel, TIME); evlist__config(evlist, &opts, NULL); - err = sched__get_first_possible_cpu(evlist->workload.pid, &cpu_mask); + err = sched__get_first_possible_cpu(evlist->workload.pid, cpu_mask); if (err < 0) { pr_debug("sched__get_first_possible_cpu: %s\n", str_error_r(errno, sbuf, sizeof(sbuf))); + evlist__cancel_workload(evlist); goto out_delete_evlist; } @@ -123,9 +139,10 @@ static int test__PERF_RECORD(struct test_suite *test __maybe_unused, int subtest /* * So that we can check perf_sample.cpu on all the samples. */ - if (sched_setaffinity(evlist->workload.pid, cpu_mask_size, &cpu_mask) < 0) { + if (sched_setaffinity(evlist->workload.pid, cpu_mask_size, cpu_mask) < 0) { pr_debug("sched_setaffinity: %s\n", str_error_r(errno, sbuf, sizeof(sbuf))); + evlist__cancel_workload(evlist); goto out_delete_evlist; } @@ -137,6 +154,7 @@ static int test__PERF_RECORD(struct test_suite *test __maybe_unused, int subtest if (err < 0) { pr_debug("perf_evlist__open: %s\n", str_error_r(errno, sbuf, sizeof(sbuf))); + evlist__cancel_workload(evlist); goto out_delete_evlist; } @@ -149,6 +167,7 @@ static int test__PERF_RECORD(struct test_suite *test __maybe_unused, int subtest if (err < 0) { pr_debug("evlist__mmap: %s\n", str_error_r(errno, sbuf, sizeof(sbuf))); + evlist__cancel_workload(evlist); goto out_delete_evlist; } @@ -328,6 +347,7 @@ found_exit: ++errs; } out_delete_evlist: + CPU_FREE(cpu_mask); evlist__delete(evlist); out: perf_sample__exit(&sample); diff --git a/tools/perf/tests/python-use.c b/tools/perf/tests/python-use.c deleted file mode 100644 index 0ebc22ac8d5b..000000000000 --- a/tools/perf/tests/python-use.c +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Just test if we can load the python binding. - */ - -#include <stdio.h> -#include <stdlib.h> -#include <linux/compiler.h> -#include "tests.h" -#include "util/debug.h" - -static int test__python_use(struct test_suite *test __maybe_unused, int subtest __maybe_unused) -{ - char *cmd; - int ret; - - if (asprintf(&cmd, "echo \"import sys ; sys.path.insert(0, '%s'); import perf\" | %s %s", - PYTHONPATH, PYTHON, verbose > 0 ? "" : "2> /dev/null") < 0) - return -1; - - pr_debug("python usage test: \"%s\"\n", cmd); - ret = system(cmd) ? -1 : 0; - free(cmd); - return ret; -} - -DEFINE_SUITE("'import perf' in python", python_use); diff --git a/tools/perf/tests/shell/amd-ibs-swfilt.sh b/tools/perf/tests/shell/amd-ibs-swfilt.sh index 7045ec72ba4c..e7f66df05c4b 100755 --- a/tools/perf/tests/shell/amd-ibs-swfilt.sh +++ b/tools/perf/tests/shell/amd-ibs-swfilt.sh @@ -1,6 +1,10 @@ #!/bin/bash # AMD IBS software filtering +ParanoidAndNotRoot() { + [ "$(id -u)" != 0 ] && [ "$(cat /proc/sys/kernel/perf_event_paranoid)" -gt $1 ] +} + echo "check availability of IBS swfilt" # check if IBS PMU is available @@ -16,6 +20,7 @@ if [ ! -f /sys/bus/event_source/devices/ibs_op/format/swfilt ]; then fi echo "run perf record with modifier and swfilt" +err=0 # setting any modifiers should fail perf record -B -e ibs_op//u -o /dev/null true 2> /dev/null @@ -31,11 +36,17 @@ if [ $? -ne 0 ]; then exit 1 fi -# setting it with swfilt=1 should be fine -perf record -B -e ibs_op/swfilt=1/k -o /dev/null true -if [ $? -ne 0 ]; then - echo "[FAIL] IBS op PMU cannot handle swfilt for exclude_user" - exit 1 +if ! ParanoidAndNotRoot 1 +then + # setting it with swfilt=1 should be fine + perf record -B -e ibs_op/swfilt=1/k -o /dev/null true + if [ $? -ne 0 ]; then + echo "[FAIL] IBS op PMU cannot handle swfilt for exclude_user" + exit 1 + fi +else + echo "[SKIP] not root and perf_event_paranoid too high for exclude_user" + err=2 fi # check ibs_fetch PMU as well @@ -46,10 +57,16 @@ if [ $? -ne 0 ]; then fi # check system wide recording -perf record -aB --synth=no -e ibs_op/swfilt/k -o /dev/null true -if [ $? -ne 0 ]; then - echo "[FAIL] IBS op PMU cannot handle swfilt in system-wide mode" - exit 1 +if ! ParanoidAndNotRoot 0 +then + perf record -aB --synth=no -e ibs_op/swfilt/k -o /dev/null true + if [ $? -ne 0 ]; then + echo "[FAIL] IBS op PMU cannot handle swfilt in system-wide mode" + exit 1 + fi +else + echo "[SKIP] not root and perf_event_paranoid too high for system-wide/exclude_user" + err=2 fi echo "check number of samples with swfilt" @@ -60,8 +77,16 @@ if [ ${kernel_sample} -ne 0 ]; then exit 1 fi -user_sample=$(perf record -e ibs_fetch/swfilt/k -o- true | perf script -i- -F misc | grep -c ^U) -if [ ${user_sample} -ne 0 ]; then - echo "[FAIL] unexpected user samples: " ${user_sample} - exit 1 +if ! ParanoidAndNotRoot 1 +then + user_sample=$(perf record -e ibs_fetch/swfilt/k -o- true | perf script -i- -F misc | grep -c ^U) + if [ ${user_sample} -ne 0 ]; then + echo "[FAIL] unexpected user samples: " ${user_sample} + exit 1 + fi +else + echo "[SKIP] not root and perf_event_paranoid too high for exclude_user" + err=2 fi + +exit $err diff --git a/tools/perf/tests/shell/attr/test-stat-default b/tools/perf/tests/shell/attr/test-stat-default index e47fb4944679..8dd27c1fb661 100644 --- a/tools/perf/tests/shell/attr/test-stat-default +++ b/tools/perf/tests/shell/attr/test-stat-default @@ -227,3 +227,10 @@ fd=28 type=4 config=270 optional=1 + +# PERF_TYPE_RAW / INT_MISC.UOP_DROPPING +[event29:base-stat] +fd=29 +type=4 +config=4269 +optional=1
\ No newline at end of file diff --git a/tools/perf/tests/shell/attr/test-stat-detailed-1 b/tools/perf/tests/shell/attr/test-stat-detailed-1 index 3d500d3e0c5c..12a2ebf4e64a 100644 --- a/tools/perf/tests/shell/attr/test-stat-detailed-1 +++ b/tools/perf/tests/shell/attr/test-stat-detailed-1 @@ -269,3 +269,10 @@ fd=32 type=3 config=65538 optional=1 + +# PERF_TYPE_RAW / INT_MISC.UOP_DROPPING +[event33:base-stat] +fd=33 +type=4 +config=4269 +optional=1
\ No newline at end of file diff --git a/tools/perf/tests/shell/attr/test-stat-detailed-2 b/tools/perf/tests/shell/attr/test-stat-detailed-2 index 01777a63752f..66ea25b7d38f 100644 --- a/tools/perf/tests/shell/attr/test-stat-detailed-2 +++ b/tools/perf/tests/shell/attr/test-stat-detailed-2 @@ -329,3 +329,10 @@ fd=38 type=3 config=65540 optional=1 + +# PERF_TYPE_RAW / INT_MISC.UOP_DROPPING +[event39:base-stat] +fd=39 +type=4 +config=4269 +optional=1
\ No newline at end of file diff --git a/tools/perf/tests/shell/attr/test-stat-detailed-3 b/tools/perf/tests/shell/attr/test-stat-detailed-3 index 8400abd7e1e4..4a27bbfb9f87 100644 --- a/tools/perf/tests/shell/attr/test-stat-detailed-3 +++ b/tools/perf/tests/shell/attr/test-stat-detailed-3 @@ -349,3 +349,10 @@ fd=40 type=3 config=66048 optional=1 + +# PERF_TYPE_RAW / INT_MISC.UOP_DROPPING +[event41:base-stat] +fd=41 +type=4 +config=4269 +optional=1
\ No newline at end of file diff --git a/tools/perf/tests/shell/base_probe/test_adding_blacklisted.sh b/tools/perf/tests/shell/base_probe/test_adding_blacklisted.sh index 8226449ac5c3..f74aab5c5d7f 100755 --- a/tools/perf/tests/shell/base_probe/test_adding_blacklisted.sh +++ b/tools/perf/tests/shell/base_probe/test_adding_blacklisted.sh @@ -13,11 +13,12 @@ # they must be skipped. # -# include working environment -. ../common/init.sh - +DIR_PATH="$(dirname $0)" TEST_RESULT=0 +# include working environment +. "$DIR_PATH/../common/init.sh" + # skip if not supported BLACKFUNC_LIST=`head -n 5 /sys/kernel/debug/kprobes/blacklist 2> /dev/null | cut -f2` if [ -z "$BLACKFUNC_LIST" ]; then @@ -53,7 +54,8 @@ for BLACKFUNC in $BLACKFUNC_LIST; do PERF_EXIT_CODE=$? # check for bad DWARF polluting the result - ../common/check_all_patterns_found.pl "$REGEX_MISSING_DECL_LINE" >/dev/null < $LOGS_DIR/adding_blacklisted.err + "$DIR_PATH/../common/check_all_patterns_found.pl" \ + "$REGEX_MISSING_DECL_LINE" >/dev/null < $LOGS_DIR/adding_blacklisted.err if [ $? -eq 0 ]; then SKIP_DWARF=1 @@ -73,7 +75,11 @@ for BLACKFUNC in $BLACKFUNC_LIST; do fi fi else - ../common/check_all_lines_matched.pl "$REGEX_SKIP_MESSAGE" "$REGEX_NOT_FOUND_MESSAGE" "$REGEX_ERROR_MESSAGE" "$REGEX_SCOPE_FAIL" "$REGEX_INVALID_ARGUMENT" "$REGEX_SYMBOL_FAIL" "$REGEX_OUT_SECTION" < $LOGS_DIR/adding_blacklisted.err + "$DIR_PATH/../common/check_all_lines_matched.pl" \ + "$REGEX_SKIP_MESSAGE" "$REGEX_NOT_FOUND_MESSAGE" \ + "$REGEX_ERROR_MESSAGE" "$REGEX_SCOPE_FAIL" \ + "$REGEX_INVALID_ARGUMENT" "$REGEX_SYMBOL_FAIL" \ + "$REGEX_OUT_SECTION" < $LOGS_DIR/adding_blacklisted.err CHECK_EXIT_CODE=$? SKIP_DWARF=0 @@ -94,7 +100,9 @@ fi $CMD_PERF list probe:\* > $LOGS_DIR/adding_blacklisted_list.log PERF_EXIT_CODE=$? -../common/check_all_lines_matched.pl "$RE_LINE_EMPTY" "List of pre-defined events" "Metric Groups:" < $LOGS_DIR/adding_blacklisted_list.log +"$DIR_PATH/../common/check_all_lines_matched.pl" \ + "$RE_LINE_EMPTY" "List of pre-defined events" "Metric Groups:" \ + < $LOGS_DIR/adding_blacklisted_list.log CHECK_EXIT_CODE=$? print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "listing blacklisted probe (should NOT be listed)" diff --git a/tools/perf/tests/shell/base_probe/test_adding_kernel.sh b/tools/perf/tests/shell/base_probe/test_adding_kernel.sh index df288cf90cd6..555a825d55f2 100755 --- a/tools/perf/tests/shell/base_probe/test_adding_kernel.sh +++ b/tools/perf/tests/shell/base_probe/test_adding_kernel.sh @@ -13,13 +13,14 @@ # and removing. # -# include working environment -. ../common/init.sh - +DIR_PATH="$(dirname $0)" TEST_RESULT=0 +# include working environment +. "$DIR_PATH/../common/init.sh" + # shellcheck source=lib/probe_vfs_getname.sh -. "$(dirname "$0")/../lib/probe_vfs_getname.sh" +. "$DIR_PATH/../lib/probe_vfs_getname.sh" TEST_PROBE=${TEST_PROBE:-"inode_permission"} @@ -44,7 +45,9 @@ for opt in "" "-a" "--add"; do $CMD_PERF probe $opt $TEST_PROBE 2> $LOGS_DIR/adding_kernel_add$opt.err PERF_EXIT_CODE=$? - ../common/check_all_patterns_found.pl "Added new events?:" "probe:$TEST_PROBE" "on $TEST_PROBE" < $LOGS_DIR/adding_kernel_add$opt.err + "$DIR_PATH/../common/check_all_patterns_found.pl" \ + "Added new events?:" "probe:$TEST_PROBE" "on $TEST_PROBE" \ + < $LOGS_DIR/adding_kernel_add$opt.err CHECK_EXIT_CODE=$? print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "adding probe $TEST_PROBE :: $opt" @@ -58,7 +61,10 @@ done $CMD_PERF list probe:\* > $LOGS_DIR/adding_kernel_list.log PERF_EXIT_CODE=$? -../common/check_all_lines_matched.pl "$RE_LINE_EMPTY" "List of pre-defined events" "probe:${TEST_PROBE}(?:_\d+)?\s+\[Tracepoint event\]" "Metric Groups:" < $LOGS_DIR/adding_kernel_list.log +"$DIR_PATH/../common/check_all_lines_matched.pl" \ + "$RE_LINE_EMPTY" "List of pre-defined events" \ + "probe:${TEST_PROBE}(?:_\d+)?\s+\[Tracepoint event\]" \ + "Metric Groups:" < $LOGS_DIR/adding_kernel_list.log CHECK_EXIT_CODE=$? print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "listing added probe :: perf list" @@ -71,7 +77,9 @@ print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "listing added probe :: perf list $CMD_PERF probe -l > $LOGS_DIR/adding_kernel_list-l.log PERF_EXIT_CODE=$? -../common/check_all_patterns_found.pl "\s*probe:${TEST_PROBE}(?:_\d+)?\s+\(on ${TEST_PROBE}(?:[:\+]$RE_NUMBER_HEX)?@.+\)" < $LOGS_DIR/adding_kernel_list-l.log +"$DIR_PATH/../common/check_all_patterns_found.pl" \ + "\s*probe:${TEST_PROBE}(?:_\d+)?\s+\(on ${TEST_PROBE}(?:[:\+]$RE_NUMBER_HEX)?@.+\)" \ + < $LOGS_DIR/adding_kernel_list-l.log CHECK_EXIT_CODE=$? if [ $NO_DEBUGINFO ] ; then @@ -93,9 +101,13 @@ REGEX_STAT_VALUES="\s*\d+\s+probe:$TEST_PROBE" # the value should be greater than 1 REGEX_STAT_VALUE_NONZERO="\s*[1-9][0-9]*\s+probe:$TEST_PROBE" REGEX_STAT_TIME="\s*$RE_NUMBER\s+seconds (?:time elapsed|user|sys)" -../common/check_all_lines_matched.pl "$REGEX_STAT_HEADER" "$REGEX_STAT_VALUES" "$REGEX_STAT_TIME" "$RE_LINE_COMMENT" "$RE_LINE_EMPTY" < $LOGS_DIR/adding_kernel_using_probe.log +"$DIR_PATH/../common/check_all_lines_matched.pl" \ + "$REGEX_STAT_HEADER" "$REGEX_STAT_VALUES" "$REGEX_STAT_TIME" \ + "$RE_LINE_COMMENT" "$RE_LINE_EMPTY" < $LOGS_DIR/adding_kernel_using_probe.log CHECK_EXIT_CODE=$? -../common/check_all_patterns_found.pl "$REGEX_STAT_HEADER" "$REGEX_STAT_VALUE_NONZERO" "$REGEX_STAT_TIME" < $LOGS_DIR/adding_kernel_using_probe.log +"$DIR_PATH/../common/check_all_patterns_found.pl" \ + "$REGEX_STAT_HEADER" "$REGEX_STAT_VALUE_NONZERO" "$REGEX_STAT_TIME" \ + < $LOGS_DIR/adding_kernel_using_probe.log (( CHECK_EXIT_CODE += $? )) print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "using added probe" @@ -108,7 +120,8 @@ print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "using added probe" $CMD_PERF probe -d $TEST_PROBE\* 2> $LOGS_DIR/adding_kernel_removing.err PERF_EXIT_CODE=$? -../common/check_all_lines_matched.pl "Removed event: probe:$TEST_PROBE" < $LOGS_DIR/adding_kernel_removing.err +"$DIR_PATH/../common/check_all_lines_matched.pl" \ + "Removed event: probe:$TEST_PROBE" < $LOGS_DIR/adding_kernel_removing.err CHECK_EXIT_CODE=$? print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "deleting added probe" @@ -121,7 +134,9 @@ print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "deleting added probe" $CMD_PERF list probe:\* > $LOGS_DIR/adding_kernel_list_removed.log PERF_EXIT_CODE=$? -../common/check_all_lines_matched.pl "$RE_LINE_EMPTY" "List of pre-defined events" "Metric Groups:" < $LOGS_DIR/adding_kernel_list_removed.log +"$DIR_PATH/../common/check_all_lines_matched.pl" \ + "$RE_LINE_EMPTY" "List of pre-defined events" "Metric Groups:" \ + < $LOGS_DIR/adding_kernel_list_removed.log CHECK_EXIT_CODE=$? print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "listing removed probe (should NOT be listed)" @@ -135,7 +150,9 @@ $CMD_PERF probe -n --add $TEST_PROBE 2> $LOGS_DIR/adding_kernel_dryrun.err PERF_EXIT_CODE=$? # check for the output (should be the same as usual) -../common/check_all_patterns_found.pl "Added new events?:" "probe:$TEST_PROBE" "on $TEST_PROBE" < $LOGS_DIR/adding_kernel_dryrun.err +"$DIR_PATH/../common/check_all_patterns_found.pl" \ + "Added new events?:" "probe:$TEST_PROBE" "on $TEST_PROBE" \ + < $LOGS_DIR/adding_kernel_dryrun.err CHECK_EXIT_CODE=$? # check that no probe was added in real @@ -152,7 +169,9 @@ print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "dry run :: adding probe" $CMD_PERF probe --add $TEST_PROBE 2> $LOGS_DIR/adding_kernel_forceadd_01.err PERF_EXIT_CODE=$? -../common/check_all_patterns_found.pl "Added new events?:" "probe:$TEST_PROBE" "on $TEST_PROBE" < $LOGS_DIR/adding_kernel_forceadd_01.err +"$DIR_PATH/../common/check_all_patterns_found.pl" \ + "Added new events?:" "probe:$TEST_PROBE" "on $TEST_PROBE" \ + < $LOGS_DIR/adding_kernel_forceadd_01.err CHECK_EXIT_CODE=$? print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "force-adding probes :: first probe adding" @@ -162,7 +181,9 @@ print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "force-adding probes :: first pro ! $CMD_PERF probe --add $TEST_PROBE 2> $LOGS_DIR/adding_kernel_forceadd_02.err PERF_EXIT_CODE=$? -../common/check_all_patterns_found.pl "Error: event \"$TEST_PROBE\" already exists." "Error: Failed to add events." < $LOGS_DIR/adding_kernel_forceadd_02.err +"$DIR_PATH/../common/check_all_patterns_found.pl" \ + "Error: event \"$TEST_PROBE\" already exists." \ + "Error: Failed to add events." < $LOGS_DIR/adding_kernel_forceadd_02.err CHECK_EXIT_CODE=$? print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "force-adding probes :: second probe adding (without force)" @@ -173,7 +194,9 @@ NO_OF_PROBES=`$CMD_PERF probe -l $TEST_PROBE| wc -l` $CMD_PERF probe --force --add $TEST_PROBE 2> $LOGS_DIR/adding_kernel_forceadd_03.err PERF_EXIT_CODE=$? -../common/check_all_patterns_found.pl "Added new events?:" "probe:${TEST_PROBE}_${NO_OF_PROBES}" "on $TEST_PROBE" < $LOGS_DIR/adding_kernel_forceadd_03.err +"$DIR_PATH/../common/check_all_patterns_found.pl" \ + "Added new events?:" "probe:${TEST_PROBE}_${NO_OF_PROBES}" \ + "on $TEST_PROBE" < $LOGS_DIR/adding_kernel_forceadd_03.err CHECK_EXIT_CODE=$? print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "force-adding probes :: second probe adding (with force)" @@ -187,7 +210,9 @@ $CMD_PERF stat -e probe:$TEST_PROBE -e probe:${TEST_PROBE}_${NO_OF_PROBES} -x';' PERF_EXIT_CODE=$? REGEX_LINE="$RE_NUMBER;+probe:${TEST_PROBE}_?(?:$NO_OF_PROBES)?;$RE_NUMBER;$RE_NUMBER" -../common/check_all_lines_matched.pl "$REGEX_LINE" "$RE_LINE_EMPTY" "$RE_LINE_COMMENT" < $LOGS_DIR/adding_kernel_using_two.log +"$DIR_PATH/../common/check_all_lines_matched.pl" \ + "$REGEX_LINE" "$RE_LINE_EMPTY" "$RE_LINE_COMMENT" \ + < $LOGS_DIR/adding_kernel_using_two.log CHECK_EXIT_CODE=$? VALUE_1=`grep "$TEST_PROBE;" $LOGS_DIR/adding_kernel_using_two.log | awk -F';' '{print $1}'` @@ -205,7 +230,9 @@ print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "using doubled probe" $CMD_PERF probe --del \* 2> $LOGS_DIR/adding_kernel_removing_wildcard.err PERF_EXIT_CODE=$? -../common/check_all_patterns_found.pl "Removed event: probe:$TEST_PROBE" "Removed event: probe:${TEST_PROBE}_1" < $LOGS_DIR/adding_kernel_removing_wildcard.err +"$DIR_PATH/../common/check_all_lines_matched.pl" \ + "Removed event: probe:$TEST_PROBE" \ + "Removed event: probe:${TEST_PROBE}_1" < $LOGS_DIR/adding_kernel_removing_wildcard.err CHECK_EXIT_CODE=$? print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "removing multiple probes" @@ -217,7 +244,9 @@ print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "removing multiple probes" $CMD_PERF probe -nf --max-probes=512 -a 'vfs_* $params' 2> $LOGS_DIR/adding_kernel_adding_wildcard.err PERF_EXIT_CODE=$? -../common/check_all_patterns_found.pl "probe:vfs_mknod" "probe:vfs_create" "probe:vfs_rmdir" "probe:vfs_link" "probe:vfs_write" < $LOGS_DIR/adding_kernel_adding_wildcard.err +"$DIR_PATH/../common/check_all_patterns_found.pl" \ + "probe:vfs_mknod" "probe:vfs_create" "probe:vfs_rmdir" \ + "probe:vfs_link" "probe:vfs_write" < $LOGS_DIR/adding_kernel_adding_wildcard.err CHECK_EXIT_CODE=$? if [ $NO_DEBUGINFO ] ; then @@ -240,13 +269,22 @@ test $PERF_EXIT_CODE -ne 139 -a $PERF_EXIT_CODE -ne 0 PERF_EXIT_CODE=$? # check that the error message is reasonable -../common/check_all_patterns_found.pl "Failed to find" "somenonexistingrandomstuffwhichisalsoprettylongorevenlongertoexceed64" < $LOGS_DIR/adding_kernel_nonexisting.err +"$DIR_PATH/../common/check_all_patterns_found.pl" \ + "Failed to find" \ + "somenonexistingrandomstuffwhichisalsoprettylongorevenlongertoexceed64" \ + < $LOGS_DIR/adding_kernel_nonexisting.err CHECK_EXIT_CODE=$? -../common/check_all_patterns_found.pl "in this function|at this address" "Error" "Failed to add events" < $LOGS_DIR/adding_kernel_nonexisting.err +"$DIR_PATH/../common/check_all_patterns_found.pl" \ + "in this function|at this address" "Error" "Failed to add events" \ + < $LOGS_DIR/adding_kernel_nonexisting.err (( CHECK_EXIT_CODE += $? )) -../common/check_all_lines_matched.pl "Failed to find" "Error" "Probe point .+ not found" "optimized out" "Use.+\-\-range option to show.+location range" < $LOGS_DIR/adding_kernel_nonexisting.err +"$DIR_PATH/../common/check_all_lines_matched.pl" \ + "Failed to find" "Error" "Probe point .+ not found" "optimized out" \ + "Use.+\-\-range option to show.+location range" \ + < $LOGS_DIR/adding_kernel_nonexisting.err (( CHECK_EXIT_CODE += $? )) -../common/check_no_patterns_found.pl "$RE_SEGFAULT" < $LOGS_DIR/adding_kernel_nonexisting.err +"$DIR_PATH/../common/check_no_patterns_found.pl" \ + "$RE_SEGFAULT" < $LOGS_DIR/adding_kernel_nonexisting.err (( CHECK_EXIT_CODE += $? )) if [ $NO_DEBUGINFO ]; then @@ -264,7 +302,10 @@ fi $CMD_PERF probe --add "$TEST_PROBE%return \$retval" 2> $LOGS_DIR/adding_kernel_func_retval_add.err PERF_EXIT_CODE=$? -../common/check_all_patterns_found.pl "Added new events?:" "probe:$TEST_PROBE" "on $TEST_PROBE%return with \\\$retval" < $LOGS_DIR/adding_kernel_func_retval_add.err +"$DIR_PATH/../common/check_all_patterns_found.pl" \ + "Added new events?:" "probe:$TEST_PROBE" \ + "on $TEST_PROBE%return with \\\$retval" \ + < $LOGS_DIR/adding_kernel_func_retval_add.err CHECK_EXIT_CODE=$? print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "function with retval :: add" @@ -274,7 +315,9 @@ print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "function with retval :: add" $CMD_PERF record -e probe:$TEST_PROBE\* -o $CURRENT_TEST_DIR/perf.data -- cat /proc/cpuinfo > /dev/null 2> $LOGS_DIR/adding_kernel_func_retval_record.err PERF_EXIT_CODE=$? -../common/check_all_patterns_found.pl "$RE_LINE_RECORD1" "$RE_LINE_RECORD2" < $LOGS_DIR/adding_kernel_func_retval_record.err +"$DIR_PATH/../common/check_all_patterns_found.pl" \ + "$RE_LINE_RECORD1" "$RE_LINE_RECORD2" \ + < $LOGS_DIR/adding_kernel_func_retval_record.err CHECK_EXIT_CODE=$? print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "function with retval :: record" @@ -285,9 +328,11 @@ $CMD_PERF script -i $CURRENT_TEST_DIR/perf.data > $LOGS_DIR/adding_kernel_func_r PERF_EXIT_CODE=$? REGEX_SCRIPT_LINE="\s*cat\s+$RE_NUMBER\s+\[$RE_NUMBER\]\s+$RE_NUMBER:\s+probe:$TEST_PROBE\w*:\s+\($RE_NUMBER_HEX\s+<\-\s+$RE_NUMBER_HEX\)\s+arg1=$RE_NUMBER_HEX" -../common/check_all_lines_matched.pl "$REGEX_SCRIPT_LINE" < $LOGS_DIR/adding_kernel_func_retval_script.log +"$DIR_PATH/../common/check_all_lines_matched.pl" \ + "$REGEX_SCRIPT_LINE" < $LOGS_DIR/adding_kernel_func_retval_script.log CHECK_EXIT_CODE=$? -../common/check_all_patterns_found.pl "$REGEX_SCRIPT_LINE" < $LOGS_DIR/adding_kernel_func_retval_script.log +"$DIR_PATH/../common/check_all_patterns_found.pl" \ + "$REGEX_SCRIPT_LINE" < $LOGS_DIR/adding_kernel_func_retval_script.log (( CHECK_EXIT_CODE += $? )) print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "function argument probing :: script" diff --git a/tools/perf/tests/shell/base_probe/test_basic.sh b/tools/perf/tests/shell/base_probe/test_basic.sh index 9d8b5afbeddd..162838ddc974 100755 --- a/tools/perf/tests/shell/base_probe/test_basic.sh +++ b/tools/perf/tests/shell/base_probe/test_basic.sh @@ -12,11 +12,12 @@ # This test tests basic functionality of perf probe command. # -# include working environment -. ../common/init.sh - +DIR_PATH="$(dirname $0)" TEST_RESULT=0 +# include working environment +. "$DIR_PATH/../common/init.sh" + if ! check_kprobes_available; then print_overall_skipped exit 2 @@ -30,15 +31,25 @@ if [ "$PARAM_GENERAL_HELP_TEXT_CHECK" = "y" ]; then $CMD_PERF probe --help > $LOGS_DIR/basic_helpmsg.log 2> $LOGS_DIR/basic_helpmsg.err PERF_EXIT_CODE=$? - ../common/check_all_patterns_found.pl "PERF-PROBE" "NAME" "SYNOPSIS" "DESCRIPTION" "OPTIONS" "PROBE\s+SYNTAX" "PROBE\s+ARGUMENT" "LINE\s+SYNTAX" < $LOGS_DIR/basic_helpmsg.log + "$DIR_PATH/../common/check_all_patterns_found.pl" \ + "PERF-PROBE" "NAME" "SYNOPSIS" "DESCRIPTION" "OPTIONS" \ + "PROBE\s+SYNTAX" "PROBE\s+ARGUMENT" "LINE\s+SYNTAX" \ + < $LOGS_DIR/basic_helpmsg.log CHECK_EXIT_CODE=$? - ../common/check_all_patterns_found.pl "LAZY\s+MATCHING" "FILTER\s+PATTERN" "EXAMPLES" "SEE\s+ALSO" < $LOGS_DIR/basic_helpmsg.log + "$DIR_PATH/../common/check_all_patterns_found.pl" \ + "LAZY\s+MATCHING" "FILTER\s+PATTERN" "EXAMPLES" "SEE\s+ALSO" \ + < $LOGS_DIR/basic_helpmsg.log (( CHECK_EXIT_CODE += $? )) - ../common/check_all_patterns_found.pl "vmlinux" "module=" "source=" "verbose" "quiet" "add=" "del=" "list.*EVENT" "line=" "vars=" "externs" < $LOGS_DIR/basic_helpmsg.log + "$DIR_PATH/../common/check_all_patterns_found.pl" \ + "vmlinux" "module=" "source=" "verbose" "quiet" "add=" "del=" \ + "list.*EVENT" "line=" "vars=" "externs" < $LOGS_DIR/basic_helpmsg.log (( CHECK_EXIT_CODE += $? )) - ../common/check_all_patterns_found.pl "no-inlines" "funcs.*FILTER" "filter=FILTER" "force" "dry-run" "max-probes" "exec=" "demangle-kernel" < $LOGS_DIR/basic_helpmsg.log + "$DIR_PATH/../common/check_all_patterns_found.pl" \ + "no-inlines" "funcs.*FILTER" "filter=FILTER" "force" "dry-run" \ + "max-probes" "exec=" "demangle-kernel" < $LOGS_DIR/basic_helpmsg.log (( CHECK_EXIT_CODE += $? )) - ../common/check_no_patterns_found.pl "No manual entry for" < $LOGS_DIR/basic_helpmsg.err + "$DIR_PATH/../common/check_no_patterns_found.pl" \ + "No manual entry for" < $LOGS_DIR/basic_helpmsg.err (( CHECK_EXIT_CODE += $? )) print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "help message" @@ -53,7 +64,9 @@ fi # without any args perf-probe should print usage $CMD_PERF probe 2> $LOGS_DIR/basic_usage.log > /dev/null -../common/check_all_patterns_found.pl "[Uu]sage" "perf probe" "verbose" "quiet" "add" "del" "force" "line" "vars" "externs" "range" < $LOGS_DIR/basic_usage.log +"$DIR_PATH/../common/check_all_patterns_found.pl" \ + "[Uu]sage" "perf probe" "verbose" "quiet" "add" "del" "force" \ + "line" "vars" "externs" "range" < $LOGS_DIR/basic_usage.log CHECK_EXIT_CODE=$? print_results 0 $CHECK_EXIT_CODE "usage message" diff --git a/tools/perf/tests/shell/base_probe/test_invalid_options.sh b/tools/perf/tests/shell/base_probe/test_invalid_options.sh index 92f7254eb32a..44a3ae014bfa 100755 --- a/tools/perf/tests/shell/base_probe/test_invalid_options.sh +++ b/tools/perf/tests/shell/base_probe/test_invalid_options.sh @@ -12,11 +12,12 @@ # This test checks whether the invalid and incompatible options are reported # -# include working environment -. ../common/init.sh - +DIR_PATH="$(dirname $0)" TEST_RESULT=0 +# include working environment +. "$DIR_PATH/../common/init.sh" + if ! check_kprobes_available; then print_overall_skipped exit 2 @@ -33,7 +34,9 @@ for opt in '-a' '-d' '-L' '-V'; do ! $CMD_PERF probe $opt 2> $LOGS_DIR/invalid_options_missing_argument$opt.err PERF_EXIT_CODE=$? - ../common/check_all_patterns_found.pl "Error: switch .* requires a value" < $LOGS_DIR/invalid_options_missing_argument$opt.err + "$DIR_PATH/../common/check_all_patterns_found.pl" \ + "Error: switch .* requires a value" \ + < $LOGS_DIR/invalid_options_missing_argument$opt.err CHECK_EXIT_CODE=$? print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "missing argument for $opt" @@ -66,7 +69,8 @@ for opt in '-a xxx -d xxx' '-a xxx -L foo' '-a xxx -V foo' '-a xxx -l' '-a xxx - ! $CMD_PERF probe $opt > /dev/null 2> $LOGS_DIR/aux.log PERF_EXIT_CODE=$? - ../common/check_all_patterns_found.pl "Error: switch .+ cannot be used with switch .+" < $LOGS_DIR/aux.log + "$DIR_PATH/../common/check_all_patterns_found.pl" \ + "Error: switch .+ cannot be used with switch .+" < $LOGS_DIR/aux.log CHECK_EXIT_CODE=$? print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "mutually exclusive options :: $opt" diff --git a/tools/perf/tests/shell/base_probe/test_line_semantics.sh b/tools/perf/tests/shell/base_probe/test_line_semantics.sh index 20435b6bf6bc..576442d87a44 100755 --- a/tools/perf/tests/shell/base_probe/test_line_semantics.sh +++ b/tools/perf/tests/shell/base_probe/test_line_semantics.sh @@ -13,11 +13,12 @@ # arguments are properly reported. # -# include working environment -. ../common/init.sh - +DIR_PATH="$(dirname $0)" TEST_RESULT=0 +# include working environment +. "$DIR_PATH/../common/init.sh" + if ! check_kprobes_available; then print_overall_skipped exit 2 diff --git a/tools/perf/tests/shell/base_report/setup.sh b/tools/perf/tests/shell/base_report/setup.sh index 8634e7e0dda6..bb49b0fabb11 100755 --- a/tools/perf/tests/shell/base_report/setup.sh +++ b/tools/perf/tests/shell/base_report/setup.sh @@ -12,8 +12,10 @@ # # +DIR_PATH="$(dirname $0)" + # include working environment -. ../common/init.sh +. "$DIR_PATH/../common/init.sh" TEST_RESULT=0 @@ -24,7 +26,8 @@ SW_EVENT="cpu-clock" $CMD_PERF record -asdg -e $SW_EVENT -o $CURRENT_TEST_DIR/perf.data -- $CMD_LONGER_SLEEP 2> $LOGS_DIR/setup.log PERF_EXIT_CODE=$? -../common/check_all_patterns_found.pl "$RE_LINE_RECORD1" "$RE_LINE_RECORD2" < $LOGS_DIR/setup.log +"$DIR_PATH/../common/check_all_patterns_found.pl" \ + "$RE_LINE_RECORD1" "$RE_LINE_RECORD2" < $LOGS_DIR/setup.log CHECK_EXIT_CODE=$? print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "prepare the perf.data file" @@ -38,7 +41,8 @@ echo ================== cat $LOGS_DIR/setup-latency.log echo ================== -../common/check_all_patterns_found.pl "$RE_LINE_RECORD1" "$RE_LINE_RECORD2" < $LOGS_DIR/setup-latency.log +"$DIR_PATH/../common/check_all_patterns_found.pl" \ + "$RE_LINE_RECORD1" "$RE_LINE_RECORD2" < $LOGS_DIR/setup-latency.log CHECK_EXIT_CODE=$? print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "prepare the perf.data.1 file" diff --git a/tools/perf/tests/shell/base_report/test_basic.sh b/tools/perf/tests/shell/base_report/test_basic.sh index adfd8713b8f8..0dfe7e5fd1ca 100755 --- a/tools/perf/tests/shell/base_report/test_basic.sh +++ b/tools/perf/tests/shell/base_report/test_basic.sh @@ -12,11 +12,12 @@ # # -# include working environment -. ../common/init.sh - +DIR_PATH="$(dirname $0)" TEST_RESULT=0 +# include working environment +. "$DIR_PATH/../common/init.sh" + ### help message @@ -25,19 +26,37 @@ if [ "$PARAM_GENERAL_HELP_TEXT_CHECK" = "y" ]; then $CMD_PERF report --help > $LOGS_DIR/basic_helpmsg.log 2> $LOGS_DIR/basic_helpmsg.err PERF_EXIT_CODE=$? - ../common/check_all_patterns_found.pl "PERF-REPORT" "NAME" "SYNOPSIS" "DESCRIPTION" "OPTIONS" "OVERHEAD\s+CALCULATION" "SEE ALSO" < $LOGS_DIR/basic_helpmsg.log + "$DIR_PATH/../common/check_all_patterns_found.pl" \ + "PERF-REPORT" "NAME" "SYNOPSIS" "DESCRIPTION" "OPTIONS" \ + "OVERHEAD\s+CALCULATION" "SEE ALSO" < $LOGS_DIR/basic_helpmsg.log CHECK_EXIT_CODE=$? - ../common/check_all_patterns_found.pl "input" "verbose" "show-nr-samples" "show-cpu-utilization" "threads" "comms" "pid" "tid" "dsos" "symbols" "symbol-filter" < $LOGS_DIR/basic_helpmsg.log + "$DIR_PATH/../common/check_all_patterns_found.pl" \ + "input" "verbose" "show-nr-samples" "show-cpu-utilization" \ + "threads" "comms" "pid" "tid" "dsos" "symbols" "symbol-filter" \ + < $LOGS_DIR/basic_helpmsg.log (( CHECK_EXIT_CODE += $? )) - ../common/check_all_patterns_found.pl "hide-unresolved" "sort" "fields" "parent" "exclude-other" "column-widths" "field-separator" "dump-raw-trace" "children" < $LOGS_DIR/basic_helpmsg.log + "$DIR_PATH/../common/check_all_patterns_found.pl" \ + "hide-unresolved" "sort" "fields" "parent" "exclude-other" \ + "column-widths" "field-separator" "dump-raw-trace" "children" \ + < $LOGS_DIR/basic_helpmsg.log (( CHECK_EXIT_CODE += $? )) - ../common/check_all_patterns_found.pl "call-graph" "max-stack" "inverted" "ignore-callees" "pretty" "stdio" "tui" "gtk" "vmlinux" "kallsyms" "modules" < $LOGS_DIR/basic_helpmsg.log + "$DIR_PATH/../common/check_all_patterns_found.pl" \ + "call-graph" "max-stack" "inverted" "ignore-callees" "pretty" \ + "stdio" "tui" "gtk" "vmlinux" "kallsyms" "modules" \ + < $LOGS_DIR/basic_helpmsg.log (( CHECK_EXIT_CODE += $? )) - ../common/check_all_patterns_found.pl "force" "symfs" "cpu" "disassembler-style" "source" "asm-raw" "show-total-period" "show-info" "branch-stack" "group" < $LOGS_DIR/basic_helpmsg.log + "$DIR_PATH/../common/check_all_patterns_found.pl" \ + "force" "symfs" "cpu" "disassembler-style" "source" "asm-raw" \ + "show-total-period" "show-info" "branch-stack" "group" \ + < $LOGS_DIR/basic_helpmsg.log (( CHECK_EXIT_CODE += $? )) - ../common/check_all_patterns_found.pl "branch-history" "objdump" "demangle" "percent-limit" "percentage" "header" "itrace" "full-source-path" "show-ref-call-graph" < $LOGS_DIR/basic_helpmsg.log + "$DIR_PATH/../common/check_all_patterns_found.pl" \ + "branch-history" "objdump" "demangle" "percent-limit" "percentage" \ + "header" "itrace" "full-source-path" "show-ref-call-graph" \ + < $LOGS_DIR/basic_helpmsg.log (( CHECK_EXIT_CODE += $? )) - ../common/check_no_patterns_found.pl "No manual entry for" < $LOGS_DIR/basic_helpmsg.err + "$DIR_PATH/../common/check_no_patterns_found.pl" \ + "No manual entry for" < $LOGS_DIR/basic_helpmsg.err (( CHECK_EXIT_CODE += $? )) print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "help message" @@ -57,9 +76,12 @@ REGEX_LOST_SAMPLES_INFO="#\s*Total Lost Samples:\s+$RE_NUMBER" REGEX_SAMPLES_INFO="#\s*Samples:\s+(?:$RE_NUMBER)\w?\s+of\s+event\s+'$RE_EVENT_ANY'" REGEX_LINES_HEADER="#\s*Children\s+Self\s+Command\s+Shared Object\s+Symbol" REGEX_LINES="\s*$RE_NUMBER%\s+$RE_NUMBER%\s+\S+\s+\[kernel\.(?:vmlinux)|(?:kallsyms)\]\s+\[[k\.]\]\s+\w+" -../common/check_all_patterns_found.pl "$REGEX_LOST_SAMPLES_INFO" "$REGEX_SAMPLES_INFO" "$REGEX_LINES_HEADER" "$REGEX_LINES" < $LOGS_DIR/basic_basic.log +"$DIR_PATH/../common/check_all_patterns_found.pl" \ + "$REGEX_LOST_SAMPLES_INFO" "$REGEX_SAMPLES_INFO" \ + "$REGEX_LINES_HEADER" "$REGEX_LINES" < $LOGS_DIR/basic_basic.log CHECK_EXIT_CODE=$? -../common/check_errors_whitelisted.pl "stderr-whitelist.txt" < $LOGS_DIR/basic_basic.err +"$DIR_PATH/../common/check_errors_whitelisted.pl" \ + "$DIR_PATH/stderr-whitelist.txt" < $LOGS_DIR/basic_basic.err (( CHECK_EXIT_CODE += $? )) print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "basic execution" @@ -74,9 +96,11 @@ PERF_EXIT_CODE=$? REGEX_LINES_HEADER="#\s*Children\s+Self\s+Samples\s+Command\s+Shared Object\s+Symbol" REGEX_LINES="\s*$RE_NUMBER%\s+$RE_NUMBER%\s+$RE_NUMBER\s+\S+\s+\[kernel\.(?:vmlinux)|(?:kallsyms)\]\s+\[[k\.]\]\s+\w+" -../common/check_all_patterns_found.pl "$REGEX_LINES_HEADER" "$REGEX_LINES" < $LOGS_DIR/basic_nrsamples.log +"$DIR_PATH/../common/check_all_patterns_found.pl" \ + "$REGEX_LINES_HEADER" "$REGEX_LINES" < $LOGS_DIR/basic_nrsamples.log CHECK_EXIT_CODE=$? -../common/check_errors_whitelisted.pl "stderr-whitelist.txt" < $LOGS_DIR/basic_nrsamples.err +"$DIR_PATH/../common/check_errors_whitelisted.pl" \ + "$DIR_PATH/stderr-whitelist.txt" < $LOGS_DIR/basic_nrsamples.err (( CHECK_EXIT_CODE += $? )) print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "number of samples" @@ -98,7 +122,10 @@ REGEX_LINE_CPUS_ONLINE="#\s+nrcpus online\s*:\s*$MY_CPUS_ONLINE" REGEX_LINE_CPUS_AVAIL="#\s+nrcpus avail\s*:\s*$MY_CPUS_AVAILABLE" # disable precise check for "nrcpus avail" in BASIC runmode test $PERFTOOL_TESTSUITE_RUNMODE -lt $RUNMODE_STANDARD && REGEX_LINE_CPUS_AVAIL="#\s+nrcpus avail\s*:\s*$RE_NUMBER" -../common/check_all_patterns_found.pl "$REGEX_LINE_TIMESTAMP" "$REGEX_LINE_HOSTNAME" "$REGEX_LINE_KERNEL" "$REGEX_LINE_PERF" "$REGEX_LINE_ARCH" "$REGEX_LINE_CPUS_ONLINE" "$REGEX_LINE_CPUS_AVAIL" < $LOGS_DIR/basic_header.log +"$DIR_PATH/../common/check_all_patterns_found.pl" \ + "$REGEX_LINE_TIMESTAMP" "$REGEX_LINE_HOSTNAME" "$REGEX_LINE_KERNEL" \ + "$REGEX_LINE_PERF" "$REGEX_LINE_ARCH" "$REGEX_LINE_CPUS_ONLINE" \ + "$REGEX_LINE_CPUS_AVAIL" < $LOGS_DIR/basic_header.log CHECK_EXIT_CODE=$? print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "header" @@ -129,9 +156,11 @@ PERF_EXIT_CODE=$? REGEX_LINES_HEADER="#\s*Children\s+Self\s+sys\s+usr\s+Command\s+Shared Object\s+Symbol" REGEX_LINES="\s*$RE_NUMBER%\s+$RE_NUMBER%\s+$RE_NUMBER%\s+$RE_NUMBER%\s+\S+\s+\[kernel\.(?:vmlinux)|(?:kallsyms)\]\s+\[[k\.]\]\s+\w+" -../common/check_all_patterns_found.pl "$REGEX_LINES_HEADER" "$REGEX_LINES" < $LOGS_DIR/basic_cpuut.log +"$DIR_PATH/../common/check_all_patterns_found.pl" \ + "$REGEX_LINES_HEADER" "$REGEX_LINES" < $LOGS_DIR/basic_cpuut.log CHECK_EXIT_CODE=$? -../common/check_errors_whitelisted.pl "stderr-whitelist.txt" < $LOGS_DIR/basic_cpuut.err +"$DIR_PATH/../common/check_errors_whitelisted.pl" \ + "$DIR_PATH/stderr-whitelist.txt" < $LOGS_DIR/basic_cpuut.err (( CHECK_EXIT_CODE += $? )) print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "show CPU utilization" @@ -144,9 +173,11 @@ print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "show CPU utilization" $CMD_PERF report --stdio -i $CURRENT_TEST_DIR/perf.data --pid=1 > $LOGS_DIR/basic_pid.log 2> $LOGS_DIR/basic_pid.err PERF_EXIT_CODE=$? -grep -P -v '^#' $LOGS_DIR/basic_pid.log | grep -P '\s+[\d\.]+%' | ../common/check_all_lines_matched.pl "systemd|init" +grep -P -v '^#' $LOGS_DIR/basic_pid.log | grep -P '\s+[\d\.]+%' | \ + "$DIR_PATH/../common/check_all_lines_matched.pl" "systemd|init" CHECK_EXIT_CODE=$? -../common/check_errors_whitelisted.pl "stderr-whitelist.txt" < $LOGS_DIR/basic_pid.err +"$DIR_PATH/../common/check_errors_whitelisted.pl" \ + "$DIR_PATH/stderr-whitelist.txt" < $LOGS_DIR/basic_pid.err (( CHECK_EXIT_CODE += $? )) print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "pid" @@ -159,9 +190,11 @@ print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "pid" $CMD_PERF report --stdio -i $CURRENT_TEST_DIR/perf.data --symbols=dummynonexistingsymbol > $LOGS_DIR/basic_symbols.log 2> $LOGS_DIR/basic_symbols.err PERF_EXIT_CODE=$? -../common/check_all_lines_matched.pl "$RE_LINE_EMPTY" "$RE_LINE_COMMENT" < $LOGS_DIR/basic_symbols.log +"$DIR_PATH/../common/check_all_lines_matched.pl" \ + "$RE_LINE_EMPTY" "$RE_LINE_COMMENT" < $LOGS_DIR/basic_symbols.log CHECK_EXIT_CODE=$? -../common/check_errors_whitelisted.pl "stderr-whitelist.txt" < $LOGS_DIR/basic_symbols.err +"$DIR_PATH/../common/check_errors_whitelisted.pl" \ + "$DIR_PATH/stderr-whitelist.txt" < $LOGS_DIR/basic_symbols.err (( CHECK_EXIT_CODE += $? )) print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "non-existing symbol" @@ -174,9 +207,11 @@ print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "non-existing symbol" $CMD_PERF report --stdio -i $CURRENT_TEST_DIR/perf.data --symbol-filter=map > $LOGS_DIR/basic_symbolfilter.log 2> $LOGS_DIR/basic_symbolfilter.err PERF_EXIT_CODE=$? -grep -P -v '^#' $LOGS_DIR/basic_symbolfilter.log | grep -P '\s+[\d\.]+%' | ../common/check_all_lines_matched.pl "\[[k\.]\]\s+.*map" +grep -P -v '^#' $LOGS_DIR/basic_symbolfilter.log | grep -P '\s+[\d\.]+%' | \ + "$DIR_PATH/../common/check_all_lines_matched.pl" "\[[k\.]\]\s+.*map" CHECK_EXIT_CODE=$? -../common/check_errors_whitelisted.pl "stderr-whitelist.txt" < $LOGS_DIR/basic_symbolfilter.err +"$DIR_PATH/../common/check_errors_whitelisted.pl" \ + "$DIR_PATH/stderr-whitelist.txt" < $LOGS_DIR/basic_symbolfilter.err (( CHECK_EXIT_CODE += $? )) print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "symbol filter" @@ -189,7 +224,8 @@ print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "symbol filter" $CMD_PERF report -i $CURRENT_TEST_DIR/perf.data.1 --stdio --header-only > $LOGS_DIR/latency_header.log PERF_EXIT_CODE=$? -../common/check_all_patterns_found.pl ", context_switch = 1, " < $LOGS_DIR/latency_header.log +"$DIR_PATH/../common/check_all_patterns_found.pl" \ + ", context_switch = 1, " < $LOGS_DIR/latency_header.log CHECK_EXIT_CODE=$? print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "latency header" @@ -200,9 +236,11 @@ print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "latency header" $CMD_PERF report --stdio -i $CURRENT_TEST_DIR/perf.data.1 > $LOGS_DIR/latency_default.log 2> $LOGS_DIR/latency_default.err PERF_EXIT_CODE=$? -../common/check_all_patterns_found.pl "# Overhead Latency Command" < $LOGS_DIR/latency_default.log +"$DIR_PATH/../common/check_all_patterns_found.pl" \ + "# Overhead Latency Command" < $LOGS_DIR/latency_default.log CHECK_EXIT_CODE=$? -../common/check_errors_whitelisted.pl "stderr-whitelist.txt" < $LOGS_DIR/latency_default.err +"$DIR_PATH/../common/check_errors_whitelisted.pl" \ + "stderr-whitelist.txt" < $LOGS_DIR/latency_default.err (( CHECK_EXIT_CODE += $? )) print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "default report for latency profile" @@ -213,9 +251,11 @@ print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "default report for latency profi $CMD_PERF report --latency --stdio -i $CURRENT_TEST_DIR/perf.data.1 > $LOGS_DIR/latency_latency.log 2> $LOGS_DIR/latency_latency.err PERF_EXIT_CODE=$? -../common/check_all_patterns_found.pl "# Latency Overhead Command" < $LOGS_DIR/latency_latency.log +"$DIR_PATH/../common/check_all_patterns_found.pl" \ + "# Latency Overhead Command" < $LOGS_DIR/latency_latency.log CHECK_EXIT_CODE=$? -../common/check_errors_whitelisted.pl "stderr-whitelist.txt" < $LOGS_DIR/latency_latency.err +"$DIR_PATH/../common/check_errors_whitelisted.pl" \ + "stderr-whitelist.txt" < $LOGS_DIR/latency_latency.err (( CHECK_EXIT_CODE += $? )) print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "latency report for latency profile" @@ -226,9 +266,12 @@ print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "latency report for latency profi $CMD_PERF report --hierarchy --sort latency,parallelism,comm,symbol --parallelism=1,2 --stdio -i $CURRENT_TEST_DIR/perf.data.1 > $LOGS_DIR/parallelism_hierarchy.log 2> $LOGS_DIR/parallelism_hierarchy.err PERF_EXIT_CODE=$? -../common/check_all_patterns_found.pl "# Latency Parallelism / Command / Symbol" < $LOGS_DIR/parallelism_hierarchy.log +"$DIR_PATH/../common/check_all_patterns_found.pl" \ + "# Latency Parallelism / Command / Symbol" \ + < $LOGS_DIR/parallelism_hierarchy.log CHECK_EXIT_CODE=$? -../common/check_errors_whitelisted.pl "stderr-whitelist.txt" < $LOGS_DIR/parallelism_hierarchy.err +"$DIR_PATH/../common/check_errors_whitelisted.pl" \ + "stderr-whitelist.txt" < $LOGS_DIR/parallelism_hierarchy.err (( CHECK_EXIT_CODE += $? )) print_results $PERF_EXIT_CODE $CHECK_EXIT_CODE "parallelism histogram" diff --git a/tools/perf/tests/shell/common/init.sh b/tools/perf/tests/shell/common/init.sh index 26c7525651e0..cbfc78bec974 100644 --- a/tools/perf/tests/shell/common/init.sh +++ b/tools/perf/tests/shell/common/init.sh @@ -11,8 +11,8 @@ # -. ../common/settings.sh -. ../common/patterns.sh +. "$(dirname $0)/../common/settings.sh" +. "$(dirname $0)/../common/patterns.sh" THIS_TEST_NAME=`basename $0 .sh` diff --git a/tools/perf/tests/shell/coresight/memcpy_thread/memcpy_thread.c b/tools/perf/tests/shell/coresight/memcpy_thread/memcpy_thread.c index 5f886cd09e6b..7e879217be30 100644 --- a/tools/perf/tests/shell/coresight/memcpy_thread/memcpy_thread.c +++ b/tools/perf/tests/shell/coresight/memcpy_thread/memcpy_thread.c @@ -27,6 +27,8 @@ static void *thrfn(void *arg) } for (i = 0; i < len; i++) memcpy(dst, src, a->size * 1024); + + return NULL; } static pthread_t new_thr(void *(*fn) (void *arg), void *arg) diff --git a/tools/perf/tests/shell/coresight/thread_loop/thread_loop.c b/tools/perf/tests/shell/coresight/thread_loop/thread_loop.c index e05a559253ca..86f3f548b006 100644 --- a/tools/perf/tests/shell/coresight/thread_loop/thread_loop.c +++ b/tools/perf/tests/shell/coresight/thread_loop/thread_loop.c @@ -34,8 +34,8 @@ static void *thrfn(void *arg) } asm volatile( "loop:\n" - "add %[i], %[i], #1\n" - "cmp %[i], %[len]\n" + "add %w[i], %w[i], #1\n" + "cmp %w[i], %w[len]\n" "blt loop\n" : /* out */ : /* in */ [i] "r" (i), [len] "r" (len) diff --git a/tools/perf/tests/shell/coresight/unroll_loop_thread/unroll_loop_thread.c b/tools/perf/tests/shell/coresight/unroll_loop_thread/unroll_loop_thread.c index 0fc7bf1a25af..8f4e1c985ca3 100644 --- a/tools/perf/tests/shell/coresight/unroll_loop_thread/unroll_loop_thread.c +++ b/tools/perf/tests/shell/coresight/unroll_loop_thread/unroll_loop_thread.c @@ -20,7 +20,7 @@ static void *thrfn(void *arg) for (i = 0; i < 10000; i++) { asm volatile ( // force an unroll of thia add instruction so we can test long runs of code -#define SNIP1 "add %[in], %[in], #1\n" +#define SNIP1 "add %w[in], %w[in], #1\n" // 10 #define SNIP2 SNIP1 SNIP1 SNIP1 SNIP1 SNIP1 SNIP1 SNIP1 SNIP1 SNIP1 SNIP1 // 100 @@ -36,6 +36,8 @@ static void *thrfn(void *arg) : /* clobber */ ); } + + return NULL; } static pthread_t new_thr(void *(*fn) (void *arg), void *arg) diff --git a/tools/perf/tests/shell/lock_contention.sh b/tools/perf/tests/shell/lock_contention.sh index d33d9e4392b0..7248a74ca2a3 100755 --- a/tools/perf/tests/shell/lock_contention.sh +++ b/tools/perf/tests/shell/lock_contention.sh @@ -7,14 +7,17 @@ set -e err=0 perfdata=$(mktemp /tmp/__perf_test.perf.data.XXXXX) result=$(mktemp /tmp/__perf_test.result.XXXXX) +errout=$(mktemp /tmp/__perf_test.errout.XXXXX) cleanup() { rm -f ${perfdata} rm -f ${result} + rm -f ${errout} trap - EXIT TERM INT } trap_cleanup() { + echo "Unexpected signal in ${FUNCNAME[1]}" cleanup exit ${err} } @@ -75,10 +78,12 @@ test_bpf() test_record_concurrent() { echo "Testing perf lock record and perf lock contention at the same time" - perf lock record -o- -- perf bench sched messaging -p 2> /dev/null | \ + perf lock record -o- -- perf bench sched messaging -p 2> ${errout} | \ perf lock contention -i- -E 1 -q 2> ${result} if [ "$(cat "${result}" | wc -l)" != "1" ]; then echo "[Fail] Recorded result count is not 1:" "$(cat "${result}" | wc -l)" + cat ${errout} + cat ${result} err=1 exit fi diff --git a/tools/perf/tests/shell/python-use.sh b/tools/perf/tests/shell/python-use.sh new file mode 100755 index 000000000000..fd2ee5390060 --- /dev/null +++ b/tools/perf/tests/shell/python-use.sh @@ -0,0 +1,36 @@ +#!/bin/bash +# 'import perf' in python +# SPDX-License-Identifier: GPL-2.0 +# Just test if we can load the python binding. +set -e + +shelldir=$(dirname "$0") +# shellcheck source=lib/setup_python.sh +. "${shelldir}"/lib/setup_python.sh + +MODULE_DIR=$(dirname "$(which perf)")/python + +if [ -d "$MODULE_DIR" ] +then + CMD=$(cat <<EOF +import sys +sys.path.insert(0, '$MODULE_DIR') +import perf +print('success!') +EOF + ) +else + CMD=$(cat <<EOF +import perf +print('success!') +EOF + ) +fi + +echo -e "Testing 'import perf' with:\n$CMD" + +if ! echo "$CMD" | $PYTHON | grep -q "success!" +then + exit 1 +fi +exit 0 diff --git a/tools/perf/tests/shell/record.sh b/tools/perf/tests/shell/record.sh index b1ad24fb3b33..0f5841c479e7 100755 --- a/tools/perf/tests/shell/record.sh +++ b/tools/perf/tests/shell/record.sh @@ -388,6 +388,45 @@ test_callgraph() { echo "Callgraph test [Success]" } +test_ratio_to_prev() { + echo "ratio-to-prev test" + if ! perf record -o /dev/null -e "{instructions, cycles/period=100000,ratio-to-prev=0.5/}" \ + true 2> /dev/null + then + echo "ratio-to-prev [Skipped not supported]" + return + fi + if ! perf record -o /dev/null -e "instructions, cycles/period=100000,ratio-to-prev=0.5/" \ + true |& grep -q 'Invalid use of ratio-to-prev term without preceding element in group' + then + echo "ratio-to-prev test [Failed elements must be in same group]" + err=1 + return + fi + if ! perf record -o /dev/null -e "{instructions,dummy,cycles/period=100000,ratio-to-prev=0.5/}" \ + true |& grep -q 'must have same PMU' + then + echo "ratio-to-prev test [Failed elements must have same PMU]" + err=1 + return + fi + if ! perf record -o /dev/null -e "{instructions,cycles/ratio-to-prev=0.5/}" \ + true |& grep -q 'Event period term or count (-c) must be set when using ratio-to-prev term.' + then + echo "ratio-to-prev test [Failed period must be set]" + err=1 + return + fi + if ! perf record -o /dev/null -e "{cycles/ratio-to-prev=0.5/}" \ + true |& grep -q 'Invalid use of ratio-to-prev term without preceding element in group' + then + echo "ratio-to-prev test [Failed need 2+ events]" + err=1 + return + fi + echo "Basic ratio-to-prev record test [Success]" +} + # raise the limit of file descriptors to minimum if [[ $default_fd_limit -lt $min_fd_limit ]]; then ulimit -Sn $min_fd_limit @@ -404,6 +443,7 @@ test_leader_sampling test_topdown_leader_sampling test_precise_max test_callgraph +test_ratio_to_prev # restore the default value ulimit -Sn $default_fd_limit diff --git a/tools/perf/tests/shell/record_lbr.sh b/tools/perf/tests/shell/record_lbr.sh index 6fcb5e52b9b4..78a02e90ece1 100755 --- a/tools/perf/tests/shell/record_lbr.sh +++ b/tools/perf/tests/shell/record_lbr.sh @@ -4,6 +4,10 @@ set -e +ParanoidAndNotRoot() { + [ "$(id -u)" != 0 ] && [ "$(cat /proc/sys/kernel/perf_event_paranoid)" -gt $1 ] +} + if [ ! -f /sys/bus/event_source/devices/cpu/caps/branches ] && [ ! -f /sys/bus/event_source/devices/cpu_core/caps/branches ] then @@ -23,6 +27,7 @@ cleanup() { } trap_cleanup() { + echo "Unexpected signal in ${FUNCNAME[1]}" cleanup exit 1 } @@ -123,8 +128,11 @@ lbr_test "-j ind_call" "any indirect call" 2 lbr_test "-j ind_jmp" "any indirect jump" 100 lbr_test "-j call" "direct calls" 2 lbr_test "-j ind_call,u" "any indirect user call" 100 -lbr_test "-a -b" "system wide any branch" 2 -lbr_test "-a -j any_call" "system wide any call" 2 +if ! ParanoidAndNotRoot 1 +then + lbr_test "-a -b" "system wide any branch" 2 + lbr_test "-a -j any_call" "system wide any call" 2 +fi # Parallel parallel_lbr_test "-b" "parallel any branch" 100 & @@ -141,10 +149,16 @@ parallel_lbr_test "-j call" "parallel direct calls" 100 & pid6=$! parallel_lbr_test "-j ind_call,u" "parallel any indirect user call" 100 & pid7=$! -parallel_lbr_test "-a -b" "parallel system wide any branch" 100 & -pid8=$! -parallel_lbr_test "-a -j any_call" "parallel system wide any call" 100 & -pid9=$! +if ParanoidAndNotRoot 1 +then + pid8= + pid9= +else + parallel_lbr_test "-a -b" "parallel system wide any branch" 100 & + pid8=$! + parallel_lbr_test "-a -j any_call" "parallel system wide any call" 100 & + pid9=$! +fi for pid in $pid1 $pid2 $pid3 $pid4 $pid5 $pid6 $pid7 $pid8 $pid9 do diff --git a/tools/perf/tests/shell/stat+event_uniquifying.sh b/tools/perf/tests/shell/stat+event_uniquifying.sh index bf54bd6c3e2e..b5dec6b6da36 100755 --- a/tools/perf/tests/shell/stat+event_uniquifying.sh +++ b/tools/perf/tests/shell/stat+event_uniquifying.sh @@ -4,74 +4,63 @@ set -e -stat_output=$(mktemp /tmp/__perf_test.stat_output.XXXXX) -perf_tool=perf err=0 +stat_output=$(mktemp /tmp/__perf_test.stat_output.XXXXX) -test_event_uniquifying() { - # We use `clockticks` in `uncore_imc` to verify the uniquify behavior. - pmu="uncore_imc" - event="clockticks" - - # If the `-A` option is added, the event should be uniquified. - # - # $perf list -v clockticks - # - # List of pre-defined events (to be used in -e or -M): - # - # uncore_imc_0/clockticks/ [Kernel PMU event] - # uncore_imc_1/clockticks/ [Kernel PMU event] - # uncore_imc_2/clockticks/ [Kernel PMU event] - # uncore_imc_3/clockticks/ [Kernel PMU event] - # uncore_imc_4/clockticks/ [Kernel PMU event] - # uncore_imc_5/clockticks/ [Kernel PMU event] - # - # ... - # - # $perf stat -e clockticks -A -- true - # - # Performance counter stats for 'system wide': - # - # CPU0 3,773,018 uncore_imc_0/clockticks/ - # CPU0 3,609,025 uncore_imc_1/clockticks/ - # CPU0 0 uncore_imc_2/clockticks/ - # CPU0 3,230,009 uncore_imc_3/clockticks/ - # CPU0 3,049,897 uncore_imc_4/clockticks/ - # CPU0 0 uncore_imc_5/clockticks/ - # - # 0.002029828 seconds time elapsed - - echo "stat event uniquifying test" - uniquified_event_array=() +cleanup() { + rm -f "${stat_output}" - # Skip if the machine does not have `uncore_imc` device. - if ! ${perf_tool} list pmu | grep -q ${pmu}; then - echo "Target does not support PMU ${pmu} [Skipped]" - err=2 - return - fi + trap - EXIT TERM INT +} - # Check how many uniquified events. - while IFS= read -r line; do - uniquified_event=$(echo "$line" | awk '{print $1}') - uniquified_event_array+=("${uniquified_event}") - done < <(${perf_tool} list -v ${event} | grep ${pmu}) +trap_cleanup() { + echo "Unexpected signal in ${FUNCNAME[1]}" + cleanup + exit 1 +} +trap trap_cleanup EXIT TERM INT - perf_command="${perf_tool} stat -e $event -A -o ${stat_output} -- true" - $perf_command +test_event_uniquifying() { + echo "Uniquification of PMU sysfs events test" - # Check the output contains all uniquified events. - for uniquified_event in "${uniquified_event_array[@]}"; do - if ! cat "${stat_output}" | grep -q "${uniquified_event}"; then - echo "Event is not uniquified [Failed]" - echo "${perf_command}" - cat "${stat_output}" - err=1 - break - fi + # Read events from perf list with and without -v. With -v the duplicate PMUs + # aren't deduplicated. Note, json events are listed by perf list without a + # PMU. + read -ra pmu_events <<< "$(perf list --raw pmu)" + read -ra pmu_v_events <<< "$(perf list -v --raw pmu)" + # For all non-deduplicated events. + for pmu_v_event in "${pmu_v_events[@]}"; do + # If the event matches an event in the deduplicated events then it musn't + # be an event with duplicate PMUs, continue the outer loop. + for pmu_event in "${pmu_events[@]}"; do + if [[ "$pmu_v_event" == "$pmu_event" ]]; then + continue 2 + fi + done + # Strip the suffix from the non-deduplicated event's PMU. + event=$(echo "$pmu_v_event" | sed -E 's/_[0-9]+//') + for pmu_event in "${pmu_events[@]}"; do + if [[ "$event" == "$pmu_event" ]]; then + echo "Testing event ${event} is uniquified to ${pmu_v_event}" + if ! perf stat -e "$event" -A -o ${stat_output} -- true; then + echo "Error running perf stat for event '$event' [Skip]" + if [ $err = 0 ]; then + err=2 + fi + continue + fi + # Ensure the non-deduplicated event appears in the output. + if ! grep -q "${pmu_v_event}" "${stat_output}"; then + echo "Uniquification of PMU sysfs events test [Failed]" + cat "${stat_output}" + err=1 + fi + break + fi + done done } test_event_uniquifying -rm -f "${stat_output}" +cleanup exit $err diff --git a/tools/perf/tests/shell/stat+std_output.sh b/tools/perf/tests/shell/stat+std_output.sh index 6fee67693ba7..ec41f24299d9 100755 --- a/tools/perf/tests/shell/stat+std_output.sh +++ b/tools/perf/tests/shell/stat+std_output.sh @@ -90,7 +90,11 @@ function commachecker() } done < "${stat_output}" - [ $metric_only -eq 1 ] && exit 1 + if [ $metric_only -ne 1 ] + then + echo "Missing metric only output in:" + cat "${stat_output}" + fi return 0 } diff --git a/tools/perf/tests/shell/test_brstack.sh b/tools/perf/tests/shell/test_brstack.sh index 9138fa83bf36..85233d435be6 100755 --- a/tools/perf/tests/shell/test_brstack.sh +++ b/tools/perf/tests/shell/test_brstack.sh @@ -34,6 +34,17 @@ trap_cleanup() { } trap trap_cleanup EXIT TERM INT +is_arm64() { + [ "$(uname -m)" = "aarch64" ]; +} + +check_branches() { + if ! tr -s ' ' '\n' < "$TMPDIR/perf.script" | grep -E -m1 -q "$1"; then + echo "Branches missing $1" + err=1 + fi +} + test_user_branches() { echo "Testing user branch stack sampling" @@ -55,14 +66,67 @@ test_user_branches() { ) for x in "${expected[@]}" do - if ! tr -s ' ' '\n' < "$TMPDIR/perf.script" | grep -E -m1 -q "$x" - then - echo "Branches missing $x" - err=1 - fi + check_branches "$x" done + + # Dump addresses only this time + perf script -i "$TMPDIR/perf.data" --fields brstack | \ + tr ' ' '\n' > "$TMPDIR/perf.script" + + # There should be no kernel addresses with the u option, in either + # source or target addresses. + if grep -E -m1 "0x[89a-f][0-9a-f]{15}" $TMPDIR/perf.script; then + echo "ERROR: Kernel address found in user mode" + err=1 + fi # some branch types are still not being tested: - # IND COND_CALL COND_RET SYSCALL SYSRET IRQ SERROR NO_TX + # IND COND_CALL COND_RET SYSRET SERROR NO_TX +} + +test_trap_eret_branches() { + echo "Testing trap & eret branches" + if ! is_arm64; then + echo "skip: not arm64" + else + perf record -o $TMPDIR/perf.data --branch-filter any,save_type,u,k -- \ + perf test -w traploop 1000 + perf script -i $TMPDIR/perf.data --fields brstacksym | \ + tr ' ' '\n' > $TMPDIR/perf.script + + # BRBINF<n>.TYPE == TRAP are mapped to PERF_BR_IRQ by the BRBE driver + check_branches "^trap_bench\+[^ ]+/[^ ]/IRQ/" + check_branches "^[^ ]+/trap_bench\+[^ ]+/ERET/" + fi +} + +test_kernel_branches() { + echo "Testing that k option only includes kernel source addresses" + + if ! perf record --branch-filter any,k -o- -- true > /dev/null; then + echo "skip: not enough privileges" + else + perf record -o $TMPDIR/perf.data --branch-filter any,k -- \ + perf bench syscall basic --loop 1000 + perf script -i $TMPDIR/perf.data --fields brstack | \ + tr ' ' '\n' > $TMPDIR/perf.script + + # Example of branch entries: + # "0xffffffff93bda241/0xffffffff93bda20f/M/-/-/..." + # Source addresses come first and target address can be either + # userspace or kernel even with k option, as long as the source + # is in kernel. + + #Look for source addresses with top bit set + if ! grep -E -m1 "^0x[89a-f][0-9a-f]{15}" $TMPDIR/perf.script; then + echo "ERROR: Kernel branches missing" + err=1 + fi + # Look for no source addresses without top bit set + if grep -E -m1 "^0x[0-7][0-9a-f]{0,15}" $TMPDIR/perf.script; then + echo "ERROR: User branches found with kernel filter" + err=1 + fi + fi } # first argument <arg0> is the argument passed to "--branch-stack <arg0>,save_type,u" @@ -97,18 +161,42 @@ test_filter() { fi } +test_syscall() { + echo "Testing syscalls" + # skip if perf doesn't have enough privileges + if ! perf record --branch-filter any,k -o- -- true > /dev/null; then + echo "skip: not enough privileges" + else + perf record -o $TMPDIR/perf.data --branch-filter \ + any_call,save_type,u,k -c 10000 -- \ + perf bench syscall basic --loop 1000 + perf script -i $TMPDIR/perf.data --fields brstacksym | \ + tr ' ' '\n' > $TMPDIR/perf.script + + check_branches "getppid[^ ]*/SYSCALL/" + fi +} set -e test_user_branches +test_syscall +test_kernel_branches +test_trap_eret_branches + +any_call="CALL|IND_CALL|COND_CALL|SYSCALL|IRQ" + +if is_arm64; then + any_call="$any_call|FAULT_DATA|FAULT_INST" +fi -test_filter "any_call" "CALL|IND_CALL|COND_CALL|SYSCALL|IRQ" +test_filter "any_call" "$any_call" test_filter "call" "CALL|SYSCALL" test_filter "cond" "COND" test_filter "any_ret" "RET|COND_RET|SYSRET|ERET" test_filter "call,cond" "CALL|SYSCALL|COND" -test_filter "any_call,cond" "CALL|IND_CALL|COND_CALL|IRQ|SYSCALL|COND" -test_filter "cond,any_call,any_ret" "COND|CALL|IND_CALL|COND_CALL|SYSCALL|IRQ|RET|COND_RET|SYSRET|ERET" +test_filter "any_call,cond" "$any_call|COND" +test_filter "any_call,cond,any_ret" "$any_call|COND|RET|COND_RET" cleanup exit $err diff --git a/tools/perf/tests/shell/trace_btf_enum.sh b/tools/perf/tests/shell/trace_btf_enum.sh index 572001d75d78..03e9f680a4a6 100755 --- a/tools/perf/tests/shell/trace_btf_enum.sh +++ b/tools/perf/tests/shell/trace_btf_enum.sh @@ -23,6 +23,14 @@ check_vmlinux() { fi } +check_permissions() { + if perf trace -e $syscall $TESTPROG 2>&1 | grep -q "Operation not permitted" + then + echo "trace+enum test [Skipped permissions]" + err=2 + fi +} + trace_landlock() { echo "Tracing syscall ${syscall}" @@ -56,6 +64,9 @@ trace_non_syscall() { } check_vmlinux +if [ $err = 0 ]; then + check_permissions +fi if [ $err = 0 ]; then trace_landlock diff --git a/tools/perf/tests/tests.h b/tools/perf/tests/tests.h index 97e62db8764a..33de16dde737 100644 --- a/tools/perf/tests/tests.h +++ b/tools/perf/tests/tests.h @@ -120,7 +120,6 @@ DECLARE_SUITE(dso_data_cache); DECLARE_SUITE(dso_data_reopen); DECLARE_SUITE(parse_events); DECLARE_SUITE(hists_link); -DECLARE_SUITE(python_use); DECLARE_SUITE(bp_signal); DECLARE_SUITE(bp_signal_overflow); DECLARE_SUITE(bp_accounting); @@ -239,6 +238,7 @@ DECLARE_WORKLOAD(sqrtloop); DECLARE_WORKLOAD(brstack); DECLARE_WORKLOAD(datasym); DECLARE_WORKLOAD(landlock); +DECLARE_WORKLOAD(traploop); extern const char *dso_to_test; extern const char *test_objdump_path; diff --git a/tools/perf/tests/workloads/Build b/tools/perf/tests/workloads/Build index 5af17206f04d..fb1012cc4fc3 100644 --- a/tools/perf/tests/workloads/Build +++ b/tools/perf/tests/workloads/Build @@ -7,8 +7,10 @@ perf-test-y += sqrtloop.o perf-test-y += brstack.o perf-test-y += datasym.o perf-test-y += landlock.o +perf-test-y += traploop.o CFLAGS_sqrtloop.o = -g -O0 -fno-inline -U_FORTIFY_SOURCE CFLAGS_leafloop.o = -g -O0 -fno-inline -fno-omit-frame-pointer -U_FORTIFY_SOURCE CFLAGS_brstack.o = -g -O0 -fno-inline -U_FORTIFY_SOURCE CFLAGS_datasym.o = -g -O0 -fno-inline -U_FORTIFY_SOURCE +CFLAGS_traploop.o = -g -O0 -fno-inline -U_FORTIFY_SOURCE diff --git a/tools/perf/tests/workloads/traploop.c b/tools/perf/tests/workloads/traploop.c new file mode 100644 index 000000000000..68dec399a735 --- /dev/null +++ b/tools/perf/tests/workloads/traploop.c @@ -0,0 +1,31 @@ +// SPDX-License-Identifier: GPL-2.0 +#include <stdlib.h> +#include "../tests.h" + +#define BENCH_RUNS 999999 + +#ifdef __aarch64__ +static void trap_bench(void) +{ + unsigned long val; + + asm("mrs %0, ID_AA64ISAR0_EL1" : "=r" (val)); /* TRAP + ERET */ +} +#else +static void trap_bench(void) { } +#endif + +static int traploop(int argc, const char **argv) +{ + int num_loops = BENCH_RUNS; + + if (argc > 0) + num_loops = atoi(argv[0]); + + for (int i = 0; i < num_loops; i++) + trap_bench(); + + return 0; +} + +DEFINE_WORKLOAD(traploop); diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index 183902dac042..8fe699f98542 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -4,7 +4,9 @@ #include "../ui.h" #include "../../util/annotate.h" #include "../../util/debug.h" +#include "../../util/debuginfo.h" #include "../../util/dso.h" +#include "../../util/hashmap.h" #include "../../util/hist.h" #include "../../util/sort.h" #include "../../util/map.h" @@ -12,7 +14,9 @@ #include "../../util/symbol.h" #include "../../util/evsel.h" #include "../../util/evlist.h" +#include "../../util/thread.h" #include <inttypes.h> +#include <linux/err.h> #include <linux/kernel.h> #include <linux/string.h> #include <linux/zalloc.h> @@ -27,10 +31,31 @@ struct annotate_browser { struct rb_node *curr_hot; struct annotation_line *selection; struct arch *arch; + /* + * perf top can delete hist_entry anytime. Callers should make sure + * its lifetime. + */ + struct hist_entry *he; + struct debuginfo *dbg; + struct evsel *evsel; + struct hashmap *type_hash; bool searching_backwards; char search_bf[128]; }; +/* A copy of target hist_entry for perf top. */ +static struct hist_entry annotate_he; + +static size_t type_hash(long key, void *ctx __maybe_unused) +{ + return key; +} + +static bool type_equal(long key1, long key2, void *ctx __maybe_unused) +{ + return key1 == key2; +} + static inline struct annotation *browser__annotation(struct ui_browser *browser) { struct map_symbol *ms = browser->priv; @@ -107,12 +132,21 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int .printf = annotate_browser__printf, .write_graph = annotate_browser__write_graph, }; + struct annotation_print_data apd = { + .he = ab->he, + .arch = ab->arch, + .evsel = ab->evsel, + .dbg = ab->dbg, + }; /* The scroll bar isn't being used */ if (!browser->navkeypressed) ops.width += 1; - annotation_line__write(al, notes, &ops); + if (!IS_ERR_OR_NULL(ab->type_hash)) + apd.type_hash = ab->type_hash; + + annotation_line__write(al, notes, &ops, &apd); if (ops.current_entry) ab->selection = al; @@ -515,9 +549,24 @@ static void ui_browser__init_asm_mode(struct ui_browser *browser) static int sym_title(struct symbol *sym, struct map *map, char *title, size_t sz, int percent_type) { - return snprintf(title, sz, "%s %s [Percent: %s]", sym->name, + return snprintf(title, sz, "%s %s [Percent: %s] %s", sym->name, dso__long_name(map__dso(map)), - percent_type_str(percent_type)); + percent_type_str(percent_type), + annotate_opts.code_with_type ? "[Type]" : ""); +} + +static void annotate_browser__show_function_title(struct annotate_browser *browser) +{ + struct ui_browser *b = &browser->b; + struct map_symbol *ms = b->priv; + struct symbol *sym = ms->sym; + char title[SYM_TITLE_MAX_SIZE]; + + sym_title(sym, ms->map, title, sizeof(title), annotate_opts.percent_type); + + ui_browser__gotorc_title(b, 0, 0); + ui_browser__set_color(b, HE_COLORSET_ROOT); + ui_browser__write_nstring(b, title, b->width + 1); } /* @@ -536,7 +585,6 @@ static bool annotate_browser__callq(struct annotate_browser *browser, struct map_symbol *ms = browser->b.priv, target_ms; struct disasm_line *dl = disasm_line(browser->selection); struct annotation *notes; - char title[SYM_TITLE_MAX_SIZE]; if (!dl->ops.target.sym) { ui_helpline__puts("The called function was not found."); @@ -557,9 +605,14 @@ static bool annotate_browser__callq(struct annotate_browser *browser, target_ms.map = ms->map; target_ms.sym = dl->ops.target.sym; annotation__unlock(notes); - symbol__tui_annotate(&target_ms, evsel, hbt); - sym_title(ms->sym, ms->map, title, sizeof(title), annotate_opts.percent_type); - ui_browser__show_title(&browser->b, title); + __hist_entry__tui_annotate(browser->he, &target_ms, evsel, hbt); + + /* + * The annotate_browser above changed the title with the target function + * and now it's back to the original function. Refresh the header line + * for the original function again. + */ + annotate_browser__show_function_title(browser); return true; } @@ -731,20 +784,12 @@ bool annotate_browser__continue_search_reverse(struct annotate_browser *browser, return __annotate_browser__search_reverse(browser); } -static int annotate_browser__show(struct ui_browser *browser, char *title, const char *help) +static int annotate_browser__show(struct annotate_browser *browser, char *title, const char *help) { - struct map_symbol *ms = browser->priv; - struct symbol *sym = ms->sym; - char symbol_dso[SYM_TITLE_MAX_SIZE]; - - if (ui_browser__show(browser, title, help) < 0) + if (ui_browser__show(&browser->b, title, help) < 0) return -1; - sym_title(sym, ms->map, symbol_dso, sizeof(symbol_dso), annotate_opts.percent_type); - - ui_browser__gotorc_title(browser, 0, 0); - ui_browser__set_color(browser, HE_COLORSET_ROOT); - ui_browser__write_nstring(browser, symbol_dso, browser->width + 1); + annotate_browser__show_function_title(browser); return 0; } @@ -793,6 +838,20 @@ static int annotate__scnprintf_title(struct hists *hists, char *bf, size_t size) return printed; } +static void annotate_browser__debuginfo_warning(struct annotate_browser *browser) +{ + struct map_symbol *ms = browser->b.priv; + struct dso *dso = map__dso(ms->map); + + if (browser->dbg == NULL && annotate_opts.code_with_type && + !dso__debuginfo_warned(dso)) { + ui__warning("DWARF debuginfo not found.\n\n" + "Data-type in this DSO will not be displayed.\n" + "Please make sure to have debug information."); + dso__set_debuginfo_warned(dso); + } +} + static int annotate_browser__run(struct annotate_browser *browser, struct evsel *evsel, struct hist_browser_timer *hbt) @@ -809,7 +868,7 @@ static int annotate_browser__run(struct annotate_browser *browser, int key; annotate__scnprintf_title(hists, title, sizeof(title)); - if (annotate_browser__show(&browser->b, title, help) < 0) + if (annotate_browser__show(browser, title, help) < 0) return -1; annotate_browser__calc_percent(browser, evsel); @@ -823,6 +882,8 @@ static int annotate_browser__run(struct annotate_browser *browser, annotation_br_cntr_abbr_list(&br_cntr_text, evsel, false); + annotate_browser__debuginfo_warning(browser); + while (1) { key = ui_browser__run(&browser->b, delay_secs); @@ -845,7 +906,7 @@ static int annotate_browser__run(struct annotate_browser *browser, if (delay_secs != 0) { symbol__annotate_decay_histogram(sym, evsel); annotate__scnprintf_title(hists, title, sizeof(title)); - annotate_browser__show(&browser->b, title, help); + annotate_browser__show(browser, title, help); } continue; case K_TAB: @@ -891,11 +952,12 @@ static int annotate_browser__run(struct annotate_browser *browser, "b Toggle percent base [period/hits]\n" "B Branch counter abbr list (Optional)\n" "? Search string backwards\n" - "f Toggle showing offsets to full address\n"); + "f Toggle showing offsets to full address\n" + "T Toggle data type display\n"); continue; case 'r': script_browse(NULL, NULL); - annotate_browser__show(&browser->b, title, help); + annotate_browser__show(browser, title, help); continue; case 'k': annotate_opts.show_linenr = !annotate_opts.show_linenr; @@ -910,7 +972,7 @@ static int annotate_browser__run(struct annotate_browser *browser, if (annotate_browser__toggle_source(browser, evsel)) ui_helpline__puts(help); annotate__scnprintf_title(hists, title, sizeof(title)); - annotate_browser__show(&browser->b, title, help); + annotate_browser__show(browser, title, help); continue; case 'o': annotate_opts.use_offset = !annotate_opts.use_offset; @@ -975,7 +1037,7 @@ show_sup_ins: continue; } case 'P': - map_symbol__annotation_dump(ms, evsel); + map_symbol__annotation_dump(ms, evsel, browser->he); continue; case 't': if (symbol_conf.show_total_period) { @@ -998,7 +1060,7 @@ show_sup_ins: case 'b': switch_percent_type(&annotate_opts, key == 'b'); annotate__scnprintf_title(hists, title, sizeof(title)); - annotate_browser__show(&browser->b, title, help); + annotate_browser__show(browser, title, help); continue; case 'B': if (br_cntr_text) @@ -1011,6 +1073,17 @@ show_sup_ins: case 'f': annotation__toggle_full_addr(notes, ms); continue; + case 'T': + annotate_opts.code_with_type ^= 1; + if (browser->dbg == NULL) + browser->dbg = dso__debuginfo(map__dso(ms->map)); + if (browser->type_hash == NULL) { + browser->type_hash = hashmap__new(type_hash, type_equal, + /*ctx=*/NULL); + } + annotate_browser__show(browser, title, help); + annotate_browser__debuginfo_warning(browser); + continue; case K_LEFT: case '<': case '>': @@ -1032,12 +1105,6 @@ out: return key; } -int map_symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, - struct hist_browser_timer *hbt) -{ - return symbol__tui_annotate(ms, evsel, hbt); -} - int hist_entry__tui_annotate(struct hist_entry *he, struct evsel *evsel, struct hist_browser_timer *hbt) { @@ -1046,11 +1113,12 @@ int hist_entry__tui_annotate(struct hist_entry *he, struct evsel *evsel, SLang_init_tty(0, 0, 0); SLtty_set_suspend_state(true); - return map_symbol__tui_annotate(&he->ms, evsel, hbt); + return __hist_entry__tui_annotate(he, &he->ms, evsel, hbt); } -int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, - struct hist_browser_timer *hbt) +int __hist_entry__tui_annotate(struct hist_entry *he, struct map_symbol *ms, + struct evsel *evsel, + struct hist_browser_timer *hbt) { struct symbol *sym = ms->sym; struct annotation *notes = symbol__annotation(sym); @@ -1064,6 +1132,8 @@ int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, .priv = ms, .use_navkeypressed = true, }, + .he = he, + .evsel = evsel, }; struct dso *dso; int ret = -1, err; @@ -1093,8 +1163,23 @@ int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, } } + /* Copy necessary information when it's called from perf top */ + if (hbt != NULL && he != &annotate_he) { + annotate_he.hists = he->hists; + annotate_he.thread = thread__get(he->thread); + annotate_he.cpumode = he->cpumode; + map_symbol__copy(&annotate_he.ms, ms); + + browser.he = &annotate_he; + } + ui_helpline__push("Press ESC to exit"); + if (annotate_opts.code_with_type) { + browser.dbg = dso__debuginfo(dso); + browser.type_hash = hashmap__new(type_hash, type_equal, /*ctx=*/NULL); + } + browser.b.width = notes->src->widths.max_line_len; browser.b.nr_entries = notes->src->nr_entries; browser.b.entries = ¬es->src->source; @@ -1105,8 +1190,24 @@ int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, ret = annotate_browser__run(&browser, evsel, hbt); + debuginfo__delete(browser.dbg); + + if (!IS_ERR_OR_NULL(browser.type_hash)) { + struct hashmap_entry *cur; + size_t bkt; + + hashmap__for_each_entry(browser.type_hash, cur, bkt) + zfree(&cur->pvalue); + hashmap__free(browser.type_hash); + } + if (not_annotated && !notes->src->tried_source) annotated_source__purge(notes->src); + if (hbt != NULL && he != &annotate_he) { + thread__zput(annotate_he.thread); + map_symbol__exit(&annotate_he.ms); + } + return ret; } diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index d9d3fb44477a..487c0b08c003 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -2484,8 +2484,8 @@ do_annotate(struct hist_browser *browser, struct popup_action *act) else evsel = hists_to_evsel(browser->hists); - err = map_symbol__tui_annotate(&act->ms, evsel, browser->hbt); he = hist_browser__selected_entry(browser); + err = __hist_entry__tui_annotate(he, &act->ms, evsel, browser->hbt); /* * offer option to annotate the other branch source or target * (if they exists) when returning from annotate diff --git a/tools/perf/ui/libslang.h b/tools/perf/ui/libslang.h index 1dff3020e9d5..6722561e0458 100644 --- a/tools/perf/ui/libslang.h +++ b/tools/perf/ui/libslang.h @@ -15,11 +15,7 @@ #define ENABLE_SLFUTURE_CONST 1 #define ENABLE_SLFUTURE_VOID 1 -#ifdef HAVE_SLANG_INCLUDE_SUBDIR -#include <slang/slang.h> -#else #include <slang.h> -#endif #define SL_KEY_UNTAB 0x1000 diff --git a/tools/perf/util/Build b/tools/perf/util/Build index 4959e7a990e4..4be313cd115a 100644 --- a/tools/perf/util/Build +++ b/tools/perf/util/Build @@ -2,18 +2,19 @@ include $(srctree)/tools/scripts/Makefile.include include $(srctree)/tools/scripts/utilities.mak perf-util-y += arm64-frame-pointer-unwind-support.o +perf-util-y += addr2line.o perf-util-y += addr_location.o perf-util-y += annotate.o perf-util-y += block-info.o perf-util-y += block-range.o perf-util-y += build-id.o perf-util-y += cacheline.o +perf-util-y += capstone.o perf-util-y += config.o perf-util-y += copyfile.o perf-util-y += ctype.o perf-util-y += db-export.o perf-util-y += disasm.o -perf-util-y += disasm_bpf.o perf-util-y += env.o perf-util-y += event.o perf-util-y += evlist.o @@ -23,8 +24,9 @@ perf-util-y += evsel_fprintf.o perf-util-y += perf_event_attr_fprintf.o perf-util-y += evswitch.o perf-util-y += find_bit.o -perf-util-y += get_current_dir_name.o perf-util-y += levenshtein.o +perf-util-$(CONFIG_LIBBFD) += libbfd.o +perf-util-y += llvm.o perf-util-y += mmap.o perf-util-y += memswap.o perf-util-y += parse-events.o @@ -136,6 +138,7 @@ perf-util-$(CONFIG_AUXTRACE) += arm-spe-decoder/ perf-util-$(CONFIG_AUXTRACE) += hisi-ptt.o perf-util-$(CONFIG_AUXTRACE) += hisi-ptt-decoder/ perf-util-$(CONFIG_AUXTRACE) += s390-cpumsf.o +perf-util-$(CONFIG_AUXTRACE) += powerpc-vpadtl.o ifdef CONFIG_LIBOPENCSD perf-util-$(CONFIG_AUXTRACE) += cs-etm.o diff --git a/tools/perf/util/addr2line.c b/tools/perf/util/addr2line.c new file mode 100644 index 000000000000..f2d94a3272d7 --- /dev/null +++ b/tools/perf/util/addr2line.c @@ -0,0 +1,439 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "addr2line.h" +#include "debug.h" +#include "dso.h" +#include "string2.h" +#include "srcline.h" +#include "symbol.h" +#include "symbol_conf.h" + +#include <api/io.h> +#include <linux/zalloc.h> +#include <subcmd/run-command.h> + +#include <inttypes.h> +#include <signal.h> +#include <stdlib.h> +#include <string.h> + +#define MAX_INLINE_NEST 1024 + +/* If addr2line doesn't return data for 1 second then timeout. */ +int addr2line_timeout_ms = 1 * 1000; + +static int filename_split(char *filename, unsigned int *line_nr) +{ + char *sep; + + sep = strchr(filename, '\n'); + if (sep) + *sep = '\0'; + + if (!strcmp(filename, "??:0")) + return 0; + + sep = strchr(filename, ':'); + if (sep) { + *sep++ = '\0'; + *line_nr = strtoul(sep, NULL, 0); + return 1; + } + pr_debug("addr2line missing ':' in filename split\n"); + return 0; +} + +static void addr2line_subprocess_cleanup(struct child_process *a2l) +{ + if (a2l->pid != -1) { + kill(a2l->pid, SIGKILL); + finish_command(a2l); /* ignore result, we don't care */ + a2l->pid = -1; + close(a2l->in); + close(a2l->out); + } + + free(a2l); +} + +static struct child_process *addr2line_subprocess_init(const char *addr2line_path, + const char *binary_path) +{ + const char *argv[] = { + addr2line_path ?: "addr2line", + "-e", binary_path, + "-a", "-i", "-f", NULL + }; + struct child_process *a2l = zalloc(sizeof(*a2l)); + int start_command_status = 0; + + if (a2l == NULL) { + pr_err("Failed to allocate memory for addr2line"); + return NULL; + } + + a2l->pid = -1; + a2l->in = -1; + a2l->out = -1; + a2l->no_stderr = 1; + + a2l->argv = argv; + start_command_status = start_command(a2l); + a2l->argv = NULL; /* it's not used after start_command; avoid dangling pointers */ + + if (start_command_status != 0) { + pr_warning("could not start addr2line (%s) for %s: start_command return code %d\n", + addr2line_path, binary_path, start_command_status); + addr2line_subprocess_cleanup(a2l); + return NULL; + } + + return a2l; +} + +enum a2l_style { + BROKEN, + GNU_BINUTILS, + LLVM, +}; + +static enum a2l_style addr2line_configure(struct child_process *a2l, const char *dso_name) +{ + static bool cached; + static enum a2l_style style; + + if (!cached) { + char buf[128]; + struct io io; + int ch; + int lines; + + if (write(a2l->in, ",\n", 2) != 2) + return BROKEN; + + io__init(&io, a2l->out, buf, sizeof(buf)); + ch = io__get_char(&io); + if (ch == ',') { + style = LLVM; + cached = true; + lines = 1; + pr_debug3("Detected LLVM addr2line style\n"); + } else if (ch == '0') { + style = GNU_BINUTILS; + cached = true; + lines = 3; + pr_debug3("Detected binutils addr2line style\n"); + } else { + if (!symbol_conf.disable_add2line_warn) { + char *output = NULL; + size_t output_len; + + io__getline(&io, &output, &output_len); + pr_warning("%s %s: addr2line configuration failed\n", + __func__, dso_name); + pr_warning("\t%c%s", ch, output); + } + pr_debug("Unknown/broken addr2line style\n"); + return BROKEN; + } + while (lines) { + ch = io__get_char(&io); + if (ch <= 0) + break; + if (ch == '\n') + lines--; + } + /* Ignore SIGPIPE in the event addr2line exits. */ + signal(SIGPIPE, SIG_IGN); + } + return style; +} + +static int read_addr2line_record(struct io *io, + enum a2l_style style, + const char *dso_name, + u64 addr, + bool first, + char **function, + char **filename, + unsigned int *line_nr) +{ + /* + * Returns: + * -1 ==> error + * 0 ==> sentinel (or other ill-formed) record read + * 1 ==> a genuine record read + */ + char *line = NULL; + size_t line_len = 0; + unsigned int dummy_line_nr = 0; + int ret = -1; + + if (function != NULL) + zfree(function); + + if (filename != NULL) + zfree(filename); + + if (line_nr != NULL) + *line_nr = 0; + + /* + * Read the first line. Without an error this will be: + * - for the first line an address like 0x1234, + * - the binutils sentinel 0x0000000000000000, + * - the llvm-addr2line the sentinel ',' character, + * - the function name line for an inlined function. + */ + if (io__getline(io, &line, &line_len) < 0 || !line_len) + goto error; + + pr_debug3("%s %s: addr2line read address for sentinel: %s", __func__, dso_name, line); + if (style == LLVM && line_len == 2 && line[0] == ',') { + /* Found the llvm-addr2line sentinel character. */ + zfree(&line); + return 0; + } else if (style == GNU_BINUTILS && (!first || addr != 0)) { + int zero_count = 0, non_zero_count = 0; + /* + * Check for binutils sentinel ignoring it for the case the + * requested address is 0. + */ + + /* A given address should always start 0x. */ + if (line_len >= 2 || line[0] != '0' || line[1] != 'x') { + for (size_t i = 2; i < line_len; i++) { + if (line[i] == '0') + zero_count++; + else if (line[i] != '\n') + non_zero_count++; + } + if (!non_zero_count) { + int ch; + + if (first && !zero_count) { + /* Line was erroneous just '0x'. */ + goto error; + } + /* + * Line was 0x0..0, the sentinel for binutils. Remove + * the function and filename lines. + */ + zfree(&line); + do { + ch = io__get_char(io); + } while (ch > 0 && ch != '\n'); + do { + ch = io__get_char(io); + } while (ch > 0 && ch != '\n'); + return 0; + } + } + } + /* Read the second function name line (if inline data then this is the first line). */ + if (first && (io__getline(io, &line, &line_len) < 0 || !line_len)) + goto error; + + pr_debug3("%s %s: addr2line read line: %s", __func__, dso_name, line); + if (function != NULL) + *function = strdup(strim(line)); + + zfree(&line); + line_len = 0; + + /* Read the third filename and line number line. */ + if (io__getline(io, &line, &line_len) < 0 || !line_len) + goto error; + + pr_debug3("%s %s: addr2line filename:number : %s", __func__, dso_name, line); + if (filename_split(line, line_nr == NULL ? &dummy_line_nr : line_nr) == 0 && + style == GNU_BINUTILS) { + ret = 0; + goto error; + } + + if (filename != NULL) + *filename = strdup(line); + + zfree(&line); + line_len = 0; + + return 1; + +error: + free(line); + if (function != NULL) + zfree(function); + if (filename != NULL) + zfree(filename); + return ret; +} + +static int inline_list__append_record(struct dso *dso, + struct inline_node *node, + struct symbol *sym, + const char *function, + const char *filename, + unsigned int line_nr) +{ + struct symbol *inline_sym = new_inline_sym(dso, sym, function); + + return inline_list__append(inline_sym, srcline_from_fileline(filename, line_nr), node); +} + +int cmd__addr2line(const char *dso_name, u64 addr, + char **file, unsigned int *line_nr, + struct dso *dso, + bool unwind_inlines, + struct inline_node *node, + struct symbol *sym __maybe_unused) +{ + struct child_process *a2l = dso__a2l(dso); + char *record_function = NULL; + char *record_filename = NULL; + unsigned int record_line_nr = 0; + int record_status = -1; + int ret = 0; + size_t inline_count = 0; + int len; + char buf[128]; + ssize_t written; + struct io io = { .eof = false }; + enum a2l_style a2l_style; + + if (!a2l) { + if (!filename__has_section(dso_name, ".debug_line")) + goto out; + + dso__set_a2l(dso, + addr2line_subprocess_init(symbol_conf.addr2line_path, dso_name)); + a2l = dso__a2l(dso); + } + + if (a2l == NULL) { + if (!symbol_conf.disable_add2line_warn) + pr_warning("%s %s: addr2line_subprocess_init failed\n", __func__, dso_name); + goto out; + } + a2l_style = addr2line_configure(a2l, dso_name); + if (a2l_style == BROKEN) + goto out; + + /* + * Send our request and then *deliberately* send something that can't be + * interpreted as a valid address to ask addr2line about (namely, + * ","). This causes addr2line to first write out the answer to our + * request, in an unbounded/unknown number of records, and then to write + * out the lines "0x0...0", "??" and "??:0", for GNU binutils, or "," + * for llvm-addr2line, so that we can detect when it has finished giving + * us anything useful. + */ + len = snprintf(buf, sizeof(buf), "%016"PRIx64"\n,\n", addr); + written = len > 0 ? write(a2l->in, buf, len) : -1; + if (written != len) { + if (!symbol_conf.disable_add2line_warn) + pr_warning("%s %s: could not send request\n", __func__, dso_name); + goto out; + } + io__init(&io, a2l->out, buf, sizeof(buf)); + io.timeout_ms = addr2line_timeout_ms; + switch (read_addr2line_record(&io, a2l_style, dso_name, addr, /*first=*/true, + &record_function, &record_filename, &record_line_nr)) { + case -1: + if (!symbol_conf.disable_add2line_warn) + pr_warning("%s %s: could not read first record\n", __func__, dso_name); + goto out; + case 0: + /* + * The first record was invalid, so return failure, but first + * read another record, since we sent a sentinel ',' for the + * sake of detected the last inlined function. Treat this as the + * first of a record as the ',' generates a new start with GNU + * binutils, also force a non-zero address as we're no longer + * reading that record. + */ + switch (read_addr2line_record(&io, a2l_style, dso_name, + /*addr=*/1, /*first=*/true, + NULL, NULL, NULL)) { + case -1: + if (!symbol_conf.disable_add2line_warn) + pr_warning("%s %s: could not read sentinel record\n", + __func__, dso_name); + break; + case 0: + /* The sentinel as expected. */ + break; + default: + if (!symbol_conf.disable_add2line_warn) + pr_warning("%s %s: unexpected record instead of sentinel", + __func__, dso_name); + break; + } + goto out; + default: + /* First record as expected. */ + break; + } + + if (file) { + *file = strdup(record_filename); + ret = 1; + } + if (line_nr) + *line_nr = record_line_nr; + + if (unwind_inlines) { + if (node && inline_list__append_record(dso, node, sym, + record_function, + record_filename, + record_line_nr)) { + ret = 0; + goto out; + } + } + + /* + * We have to read the records even if we don't care about the inline + * info. This isn't the first record and force the address to non-zero + * as we're reading records beyond the first. + */ + while ((record_status = read_addr2line_record(&io, + a2l_style, + dso_name, + /*addr=*/1, + /*first=*/false, + &record_function, + &record_filename, + &record_line_nr)) == 1) { + if (unwind_inlines && node && inline_count++ < MAX_INLINE_NEST) { + if (inline_list__append_record(dso, node, sym, + record_function, + record_filename, + record_line_nr)) { + ret = 0; + goto out; + } + ret = 1; /* found at least one inline frame */ + } + } + +out: + free(record_function); + free(record_filename); + if (io.eof) { + dso__set_a2l(dso, NULL); + addr2line_subprocess_cleanup(a2l); + } + return ret; +} + +void dso__free_a2l(struct dso *dso) +{ + struct child_process *a2l = dso__a2l(dso); + + if (!a2l) + return; + + addr2line_subprocess_cleanup(a2l); + + dso__set_a2l(dso, NULL); +} diff --git a/tools/perf/util/addr2line.h b/tools/perf/util/addr2line.h new file mode 100644 index 000000000000..d35a47ba8dab --- /dev/null +++ b/tools/perf/util/addr2line.h @@ -0,0 +1,20 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PERF_ADDR2LINE_H +#define __PERF_ADDR2LINE_H + +#include <linux/types.h> + +struct dso; +struct inline_node; +struct symbol; + +extern int addr2line_timeout_ms; + +int cmd__addr2line(const char *dso_name, u64 addr, + char **file, unsigned int *line_nr, + struct dso *dso, + bool unwind_inlines, + struct inline_node *node, + struct symbol *sym); + +#endif /* __PERF_ADDR2LINE_H */ diff --git a/tools/perf/util/annotate-data.c b/tools/perf/util/annotate-data.c index 1ef2edbc71d9..903027a6fb7d 100644 --- a/tools/perf/util/annotate-data.c +++ b/tools/perf/util/annotate-data.c @@ -58,7 +58,7 @@ void pr_debug_type_name(Dwarf_Die *die, enum type_state_kind kind) case TSR_KIND_CONST: pr_info(" constant\n"); return; - case TSR_KIND_POINTER: + case TSR_KIND_PERCPU_POINTER: pr_info(" pointer"); /* it also prints the type info */ break; @@ -591,7 +591,7 @@ void set_stack_state(struct type_state_stack *stack, int offset, u8 kind, switch (tag) { case DW_TAG_structure_type: case DW_TAG_union_type: - stack->compound = (kind != TSR_KIND_POINTER); + stack->compound = (kind != TSR_KIND_PERCPU_POINTER); break; default: stack->compound = false; @@ -868,6 +868,11 @@ static void update_var_state(struct type_state *state, struct data_loc_info *dlo int offset = var->offset; struct type_state_stack *stack; + /* If the reg location holds the pointer value, dereference the type */ + if (!var->is_reg_var_addr && is_pointer_type(&mem_die) && + __die_get_real_type(&mem_die, &mem_die) == NULL) + continue; + if (var->reg != DWARF_REG_FB) offset -= fb_offset; @@ -893,6 +898,10 @@ static void update_var_state(struct type_state *state, struct data_loc_info *dlo reg = &state->regs[var->reg]; + /* For gp registers, skip the address registers for now */ + if (var->is_reg_var_addr) + continue; + if (reg->ok && reg->kind == TSR_KIND_TYPE && !is_better_type(®->type, &mem_die)) continue; @@ -1107,7 +1116,7 @@ again: return PERF_TMR_OK; } - if (state->regs[reg].kind == TSR_KIND_POINTER) { + if (state->regs[reg].kind == TSR_KIND_PERCPU_POINTER) { pr_debug_dtp("percpu ptr"); /* diff --git a/tools/perf/util/annotate-data.h b/tools/perf/util/annotate-data.h index 541fee1a5f0a..df52a0a1f496 100644 --- a/tools/perf/util/annotate-data.h +++ b/tools/perf/util/annotate-data.h @@ -34,7 +34,7 @@ enum type_state_kind { TSR_KIND_TYPE, TSR_KIND_PERCPU_BASE, TSR_KIND_CONST, - TSR_KIND_POINTER, + TSR_KIND_PERCPU_POINTER, TSR_KIND_CANARY, }; @@ -189,12 +189,15 @@ struct type_state_stack { u8 kind; }; -/* FIXME: This should be arch-dependent */ -#ifdef __powerpc__ +/* + * Maximum number of registers tracked in type_state. + * + * This limit must cover all supported architectures, since perf + * may analyze perf.data files generated on systems with a different + * register set. Use 32 as a safe upper bound instead of relying on + * build-arch specific values. + */ #define TYPE_STATE_MAX_REGS 32 -#else -#define TYPE_STATE_MAX_REGS 16 -#endif /* * State table to maintain type info in each register and stack location. diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 0dd475a744b6..a2e34f149a07 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -765,14 +765,16 @@ __hist_entry__get_data_type(struct hist_entry *he, struct arch *arch, struct debuginfo *dbg, struct disasm_line *dl, int *type_offset); -struct annotation_print_data { - struct hist_entry *he; - struct evsel *evsel; - struct arch *arch; - struct debuginfo *dbg; - u64 start; - int addr_fmt_width; -}; +static bool needs_type_info(struct annotated_data_type *data_type) +{ + if (data_type == NULL || data_type == NO_TYPE) + return false; + + if (verbose) + return true; + + return (data_type != &stackop_type) && (data_type != &canary_type); +} static int annotation_line__print(struct annotation_line *al, struct annotation_print_data *apd, @@ -845,7 +847,7 @@ annotation_line__print(struct annotation_line *al, struct annotation_print_data printf(" : "); - disasm_line__print(dl, apd->start, apd->addr_fmt_width); + disasm_line__print(dl, notes->src->start, apd->addr_fmt_width); if (opts->code_with_type && apd->dbg) { struct annotated_data_type *data_type; @@ -853,7 +855,7 @@ annotation_line__print(struct annotation_line *al, struct annotation_print_data data_type = __hist_entry__get_data_type(apd->he, apd->arch, apd->dbg, dl, &offset); - if (data_type && data_type != NO_TYPE) { + if (needs_type_info(data_type)) { char buf[4096]; printf("\t\t# data-type: %s", @@ -1013,7 +1015,6 @@ int symbol__annotate(struct map_symbol *ms, struct evsel *evsel, struct symbol *sym = ms->sym; struct annotation *notes = symbol__annotation(sym); struct annotate_args args = { - .evsel = evsel, .options = &annotate_opts, }; struct arch *arch = NULL; @@ -1230,7 +1231,6 @@ int hist_entry__annotate_printf(struct hist_entry *he, struct evsel *evsel) struct annotation_print_data apd = { .he = he, .evsel = evsel, - .start = map__rip_2objdump(map, sym->start), }; int printed = 2, queue_len = 0; int more = 0; @@ -1267,9 +1267,9 @@ int hist_entry__annotate_printf(struct hist_entry *he, struct evsel *evsel) symbol__annotate_hits(sym, evsel); apd.addr_fmt_width = annotated_source__addr_fmt_width(¬es->src->source, - apd.start); + notes->src->start); evsel__get_arch(evsel, &apd.arch); - apd.dbg = debuginfo__new(filename); + apd.dbg = dso__debuginfo(dso); list_for_each_entry(pos, ¬es->src->source, node) { int err; @@ -1357,7 +1357,8 @@ static void FILE__write_graph(void *fp, int graph) fputs(s, fp); } -static int symbol__annotate_fprintf2(struct symbol *sym, FILE *fp) +static int symbol__annotate_fprintf2(struct symbol *sym, FILE *fp, + struct annotation_print_data *apd) { struct annotation *notes = symbol__annotation(sym); struct annotation_write_ops wops = { @@ -1371,24 +1372,37 @@ static int symbol__annotate_fprintf2(struct symbol *sym, FILE *fp) }; struct annotation_line *al; + if (annotate_opts.code_with_type) { + evsel__get_arch(apd->evsel, &apd->arch); + apd->dbg = dso__debuginfo(map__dso(apd->he->ms.map)); + } + list_for_each_entry(al, ¬es->src->source, node) { if (annotation_line__filter(al)) continue; - annotation_line__write(al, notes, &wops); + annotation_line__write(al, notes, &wops, apd); fputc('\n', fp); wops.first_line = false; } + if (annotate_opts.code_with_type) + debuginfo__delete(apd->dbg); + return 0; } -int map_symbol__annotation_dump(struct map_symbol *ms, struct evsel *evsel) +int map_symbol__annotation_dump(struct map_symbol *ms, struct evsel *evsel, + struct hist_entry *he) { const char *ev_name = evsel__name(evsel); char buf[1024]; char *filename; int err = -1; FILE *fp; + struct annotation_print_data apd = { + .he = he, + .evsel = evsel, + }; if (asprintf(&filename, "%s.annotation", ms->sym->name) < 0) return -1; @@ -1404,7 +1418,7 @@ int map_symbol__annotation_dump(struct map_symbol *ms, struct evsel *evsel) fprintf(fp, "%s() %s\nEvent: %s\n\n", ms->sym->name, dso__long_name(map__dso(ms->map)), ev_name); - symbol__annotate_fprintf2(ms->sym, fp); + symbol__annotate_fprintf2(ms->sym, fp, &apd); fclose(fp); err = 0; @@ -1656,6 +1670,10 @@ int hist_entry__tty_annotate2(struct hist_entry *he, struct evsel *evsel) struct symbol *sym = ms->sym; struct rb_root source_line = RB_ROOT; struct hists *hists = evsel__hists(evsel); + struct annotation_print_data apd = { + .he = he, + .evsel = evsel, + }; char buf[1024]; int err; @@ -1678,7 +1696,7 @@ int hist_entry__tty_annotate2(struct hist_entry *he, struct evsel *evsel) hists__scnprintf_title(hists, buf, sizeof(buf)); fprintf(stdout, "%s, [percent: %s]\n%s() %s\n", buf, percent_type_str(annotate_opts.percent_type), sym->name, dso__long_name(dso)); - symbol__annotate_fprintf2(sym, stdout); + symbol__annotate_fprintf2(sym, stdout, &apd); annotated_source__purge(symbol__annotation(sym)->src); @@ -1743,7 +1761,7 @@ static double annotation_line__max_percent(struct annotation_line *al, return percent_max; } -static void disasm_line__write(struct disasm_line *dl, struct annotation *notes, +static int disasm_line__write(struct disasm_line *dl, struct annotation *notes, void *obj, char *bf, size_t size, void (*obj__printf)(void *obj, const char *fmt, ...), void (*obj__write_graph)(void *obj, int graph)) @@ -1771,8 +1789,8 @@ call_like: obj__printf(obj, " "); } - disasm_line__scnprintf(dl, bf, size, !annotate_opts.use_offset, - notes->src->widths.max_ins_name); + return disasm_line__scnprintf(dl, bf, size, !annotate_opts.use_offset, + notes->src->widths.max_ins_name) + 2; } static void ipc_coverage_string(char *bf, int size, struct annotation *notes) @@ -1935,24 +1953,82 @@ err: return -ENOMEM; } -static void __annotation_line__write(struct annotation_line *al, struct annotation *notes, - bool first_line, bool current_entry, bool change_color, int width, - void *obj, unsigned int percent_type, - int (*obj__set_color)(void *obj, int color), - void (*obj__set_percent_color)(void *obj, double percent, bool current), - int (*obj__set_jumps_percent_color)(void *obj, int nr, bool current), - void (*obj__printf)(void *obj, const char *fmt, ...), - void (*obj__write_graph)(void *obj, int graph)) +struct type_hash_entry { + struct annotated_data_type *type; + int offset; +}; +static int disasm_line__snprint_type_info(struct disasm_line *dl, + char *buf, int len, + struct annotation_print_data *apd) { - double percent_max = annotation_line__max_percent(al, percent_type); - int pcnt_width = annotation__pcnt_width(notes), - cycles_width = annotation__cycles_width(notes); + struct annotated_data_type *data_type = NULL; + struct type_hash_entry *entry = NULL; + char member[256]; + int offset = 0; + int printed; + + scnprintf(buf, len, " "); + + if (!annotate_opts.code_with_type || apd->dbg == NULL) + return 1; + + if (apd->type_hash) { + hashmap__find(apd->type_hash, dl->al.offset, &entry); + if (entry != NULL) { + data_type = entry->type; + offset = entry->offset; + } + } + + if (data_type == NULL) + data_type = __hist_entry__get_data_type(apd->he, apd->arch, apd->dbg, dl, &offset); + + if (apd->type_hash && entry == NULL) { + entry = malloc(sizeof(*entry)); + if (entry != NULL) { + entry->type = data_type; + entry->offset = offset; + hashmap__add(apd->type_hash, dl->al.offset, entry); + } + } + + if (!needs_type_info(data_type)) + return 1; + + printed = scnprintf(buf, len, "\t\t# data-type: %s", data_type->self.type_name); + + if (data_type != &stackop_type && data_type != &canary_type && len > printed) + printed += scnprintf(buf + printed, len - printed, " +%#x", offset); + + if (annotated_data_type__get_member_name(data_type, member, sizeof(member), offset) && + len > printed) { + printed += scnprintf(buf + printed, len - printed, " (%s)", member); + } + return printed; +} + +void annotation_line__write(struct annotation_line *al, struct annotation *notes, + const struct annotation_write_ops *wops, + struct annotation_print_data *apd) +{ + bool current_entry = wops->current_entry; + bool change_color = wops->change_color; + double percent_max = annotation_line__max_percent(al, annotate_opts.percent_type); + int width = wops->width; + int pcnt_width = annotation__pcnt_width(notes); + int cycles_width = annotation__cycles_width(notes); bool show_title = false; char bf[256]; int printed; - - if (first_line && (al->offset == -1 || percent_max == 0.0)) { + void *obj = wops->obj; + int (*obj__set_color)(void *obj, int color) = wops->set_color; + void (*obj__set_percent_color)(void *obj, double percent, bool current) = wops->set_percent_color; + int (*obj__set_jumps_percent_color)(void *obj, int nr, bool current) = wops->set_jumps_percent_color; + void (*obj__printf)(void *obj, const char *fmt, ...) = wops->printf; + void (*obj__write_graph)(void *obj, int graph) = wops->write_graph; + + if (wops->first_line && (al->offset == -1 || percent_max == 0.0)) { if (notes->branch && al->cycles) { if (al->cycles->ipc == 0.0 && al->cycles->avg == 0) show_title = true; @@ -1966,7 +2042,8 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati for (i = 0; i < al->data_nr; i++) { double percent; - percent = annotation_data__percent(&al->data[i], percent_type); + percent = annotation_data__percent(&al->data[i], + annotate_opts.percent_type); obj__set_percent_color(obj, percent, current_entry); if (symbol_conf.show_total_period) { @@ -1989,6 +2066,7 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati symbol_conf.show_nr_samples ? "Samples" : "Percent"); } } + width -= pcnt_width; if (notes->branch) { if (al->cycles && al->cycles->ipc) @@ -2052,11 +2130,13 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati obj__printf(obj, "%*s", ANNOTATION__AVG_IPC_WIDTH, bf); } } + width -= cycles_width; obj__printf(obj, " "); + width -= 1; if (!*al->line) - obj__printf(obj, "%-*s", width - pcnt_width - cycles_width, " "); + obj__printf(obj, "%-*s", width, " "); else if (al->offset == -1) { if (al->line_nr && annotate_opts.show_linenr) printed = scnprintf(bf, sizeof(bf), "%-*d ", @@ -2065,7 +2145,8 @@ static void __annotation_line__write(struct annotation_line *al, struct annotati printed = scnprintf(bf, sizeof(bf), "%-*s ", notes->src->widths.addr, " "); obj__printf(obj, bf); - obj__printf(obj, "%-*s", width - printed - pcnt_width - cycles_width + 1, al->line); + width -= printed; + obj__printf(obj, "%-*s", width, al->line); } else { u64 addr = al->offset; int color = -1; @@ -2108,22 +2189,18 @@ print_addr: if (change_color) obj__set_color(obj, color); - disasm_line__write(disasm_line(al), notes, obj, bf, sizeof(bf), obj__printf, obj__write_graph); + width -= printed; - obj__printf(obj, "%-*s", width - pcnt_width - cycles_width - 3 - printed, bf); - } + printed = disasm_line__write(disasm_line(al), notes, obj, bf, sizeof(bf), + obj__printf, obj__write_graph); -} + obj__printf(obj, "%s", bf); + width -= printed; + + disasm_line__snprint_type_info(disasm_line(al), bf, sizeof(bf), apd); + obj__printf(obj, "%-*s", width, bf); + } -void annotation_line__write(struct annotation_line *al, struct annotation *notes, - struct annotation_write_ops *wops) -{ - __annotation_line__write(al, notes, wops->first_line, wops->current_entry, - wops->change_color, wops->width, wops->obj, - annotate_opts.percent_type, - wops->set_color, wops->set_percent_color, - wops->set_jumps_percent_color, wops->printf, - wops->write_graph); } int symbol__annotate2(struct map_symbol *ms, struct evsel *evsel, @@ -2829,7 +2906,7 @@ struct annotated_data_type *hist_entry__get_data_type(struct hist_entry *he) di_cache.dso = dso__get(map__dso(ms->map)); debuginfo__delete(di_cache.dbg); - di_cache.dbg = debuginfo__new(dso__long_name(di_cache.dso)); + di_cache.dbg = dso__debuginfo(di_cache.dso); } if (di_cache.dbg == NULL) { diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 8b5131d257b0..eaf6c8aa7f47 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -199,8 +199,20 @@ struct annotation_write_ops { void (*write_graph)(void *obj, int graph); }; +struct annotation_print_data { + struct hist_entry *he; + struct evsel *evsel; + struct arch *arch; + struct debuginfo *dbg; + /* save data type info keyed by al->offset */ + struct hashmap *type_hash; + /* It'll be set in hist_entry__annotate_printf() */ + int addr_fmt_width; +}; + void annotation_line__write(struct annotation_line *al, struct annotation *notes, - struct annotation_write_ops *ops); + const struct annotation_write_ops *ops, + struct annotation_print_data *apd); int __annotation__scnprintf_samples_period(struct annotation *notes, char *bf, size_t size, @@ -463,7 +475,8 @@ void symbol__annotate_zero_histogram(struct symbol *sym, struct evsel *evsel); void symbol__annotate_decay_histogram(struct symbol *sym, struct evsel *evsel); void annotated_source__purge(struct annotated_source *as); -int map_symbol__annotation_dump(struct map_symbol *ms, struct evsel *evsel); +int map_symbol__annotation_dump(struct map_symbol *ms, struct evsel *evsel, + struct hist_entry *he); bool ui__has_annotation(void); @@ -471,18 +484,6 @@ int hist_entry__annotate_printf(struct hist_entry *he, struct evsel *evsel); int hist_entry__tty_annotate(struct hist_entry *he, struct evsel *evsel); int hist_entry__tty_annotate2(struct hist_entry *he, struct evsel *evsel); -#ifdef HAVE_SLANG_SUPPORT -int symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, - struct hist_browser_timer *hbt); -#else -static inline int symbol__tui_annotate(struct map_symbol *ms __maybe_unused, - struct evsel *evsel __maybe_unused, - struct hist_browser_timer *hbt __maybe_unused) -{ - return 0; -} -#endif - void annotation_options__init(void); void annotation_options__exit(void); diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c index 688fe6d75244..96eb7cced6fd 100644 --- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c +++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.c @@ -229,42 +229,7 @@ static int arm_spe_read_record(struct arm_spe_decoder *decoder) } break; case ARM_SPE_EVENTS: - if (payload & BIT(EV_L1D_REFILL)) - decoder->record.type |= ARM_SPE_L1D_MISS; - - if (payload & BIT(EV_L1D_ACCESS)) - decoder->record.type |= ARM_SPE_L1D_ACCESS; - - if (payload & BIT(EV_TLB_WALK)) - decoder->record.type |= ARM_SPE_TLB_MISS; - - if (payload & BIT(EV_TLB_ACCESS)) - decoder->record.type |= ARM_SPE_TLB_ACCESS; - - if (payload & BIT(EV_LLC_MISS)) - decoder->record.type |= ARM_SPE_LLC_MISS; - - if (payload & BIT(EV_LLC_ACCESS)) - decoder->record.type |= ARM_SPE_LLC_ACCESS; - - if (payload & BIT(EV_REMOTE_ACCESS)) - decoder->record.type |= ARM_SPE_REMOTE_ACCESS; - - if (payload & BIT(EV_MISPRED)) - decoder->record.type |= ARM_SPE_BRANCH_MISS; - - if (payload & BIT(EV_NOT_TAKEN)) - decoder->record.type |= ARM_SPE_BRANCH_NOT_TAKEN; - - if (payload & BIT(EV_TRANSACTIONAL)) - decoder->record.type |= ARM_SPE_IN_TXN; - - if (payload & BIT(EV_PARTIAL_PREDICATE)) - decoder->record.type |= ARM_SPE_SVE_PARTIAL_PRED; - - if (payload & BIT(EV_EMPTY_PREDICATE)) - decoder->record.type |= ARM_SPE_SVE_EMPTY_PRED; - + decoder->record.type = payload; break; case ARM_SPE_DATA_SOURCE: decoder->record.source = payload; diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h index 881d9f29c138..fbb57f805237 100644 --- a/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h +++ b/tools/perf/util/arm-spe-decoder/arm-spe-decoder.h @@ -13,20 +13,23 @@ #include "arm-spe-pkt-decoder.h" -enum arm_spe_sample_type { - ARM_SPE_L1D_ACCESS = 1 << 0, - ARM_SPE_L1D_MISS = 1 << 1, - ARM_SPE_LLC_ACCESS = 1 << 2, - ARM_SPE_LLC_MISS = 1 << 3, - ARM_SPE_TLB_ACCESS = 1 << 4, - ARM_SPE_TLB_MISS = 1 << 5, - ARM_SPE_BRANCH_MISS = 1 << 6, - ARM_SPE_REMOTE_ACCESS = 1 << 7, - ARM_SPE_SVE_PARTIAL_PRED = 1 << 8, - ARM_SPE_SVE_EMPTY_PRED = 1 << 9, - ARM_SPE_BRANCH_NOT_TAKEN = 1 << 10, - ARM_SPE_IN_TXN = 1 << 11, -}; +#define ARM_SPE_L1D_ACCESS BIT(EV_L1D_ACCESS) +#define ARM_SPE_L1D_MISS BIT(EV_L1D_REFILL) +#define ARM_SPE_LLC_ACCESS BIT(EV_LLC_ACCESS) +#define ARM_SPE_LLC_MISS BIT(EV_LLC_MISS) +#define ARM_SPE_TLB_ACCESS BIT(EV_TLB_ACCESS) +#define ARM_SPE_TLB_MISS BIT(EV_TLB_WALK) +#define ARM_SPE_BRANCH_MISS BIT(EV_MISPRED) +#define ARM_SPE_BRANCH_NOT_TAKEN BIT(EV_NOT_TAKEN) +#define ARM_SPE_REMOTE_ACCESS BIT(EV_REMOTE_ACCESS) +#define ARM_SPE_SVE_PARTIAL_PRED BIT(EV_PARTIAL_PREDICATE) +#define ARM_SPE_SVE_EMPTY_PRED BIT(EV_EMPTY_PREDICATE) +#define ARM_SPE_IN_TXN BIT(EV_TRANSACTIONAL) +#define ARM_SPE_L2D_ACCESS BIT(EV_L2D_ACCESS) +#define ARM_SPE_L2D_MISS BIT(EV_L2D_MISS) +#define ARM_SPE_RECENTLY_FETCHED BIT(EV_RECENTLY_FETCHED) +#define ARM_SPE_DATA_SNOOPED BIT(EV_DATA_SNOOPED) +#define ARM_SPE_HITM BIT(EV_CACHE_DATA_MODIFIED) enum arm_spe_op_type { /* First level operation type */ @@ -100,7 +103,7 @@ enum arm_spe_hisi_hip_data_source { }; struct arm_spe_record { - enum arm_spe_sample_type type; + u64 type; int err; u32 op; u32 latency; diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-pkt-decoder.c b/tools/perf/util/arm-spe-decoder/arm-spe-pkt-decoder.c index 13cadb2f1cea..80561630253d 100644 --- a/tools/perf/util/arm-spe-decoder/arm-spe-pkt-decoder.c +++ b/tools/perf/util/arm-spe-decoder/arm-spe-pkt-decoder.c @@ -314,6 +314,20 @@ static int arm_spe_pkt_desc_event(const struct arm_spe_pkt *packet, arm_spe_pkt_out_string(&err, &buf, &buf_len, " SVE-PARTIAL-PRED"); if (payload & BIT(EV_EMPTY_PREDICATE)) arm_spe_pkt_out_string(&err, &buf, &buf_len, " SVE-EMPTY-PRED"); + if (payload & BIT(EV_L2D_ACCESS)) + arm_spe_pkt_out_string(&err, &buf, &buf_len, " L2D-ACCESS"); + if (payload & BIT(EV_L2D_MISS)) + arm_spe_pkt_out_string(&err, &buf, &buf_len, " L2D-MISS"); + if (payload & BIT(EV_CACHE_DATA_MODIFIED)) + arm_spe_pkt_out_string(&err, &buf, &buf_len, " HITM"); + if (payload & BIT(EV_RECENTLY_FETCHED)) + arm_spe_pkt_out_string(&err, &buf, &buf_len, " LFB"); + if (payload & BIT(EV_DATA_SNOOPED)) + arm_spe_pkt_out_string(&err, &buf, &buf_len, " SNOOPED"); + if (payload & BIT(EV_STREAMING_SVE_MODE)) + arm_spe_pkt_out_string(&err, &buf, &buf_len, " STREAMING-SVE"); + if (payload & BIT(EV_SMCU)) + arm_spe_pkt_out_string(&err, &buf, &buf_len, " SMCU"); return err; } diff --git a/tools/perf/util/arm-spe-decoder/arm-spe-pkt-decoder.h b/tools/perf/util/arm-spe-decoder/arm-spe-pkt-decoder.h index 2cdf9f6da268..d00c2481712d 100644 --- a/tools/perf/util/arm-spe-decoder/arm-spe-pkt-decoder.h +++ b/tools/perf/util/arm-spe-decoder/arm-spe-pkt-decoder.h @@ -108,6 +108,13 @@ enum arm_spe_events { EV_TRANSACTIONAL = 16, EV_PARTIAL_PREDICATE = 17, EV_EMPTY_PREDICATE = 18, + EV_L2D_ACCESS = 19, + EV_L2D_MISS = 20, + EV_CACHE_DATA_MODIFIED = 21, + EV_RECENTLY_FETCHED = 22, + EV_DATA_SNOOPED = 23, + EV_STREAMING_SVE_MODE = 24, + EV_SMCU = 25, }; /* Operation packet header */ diff --git a/tools/perf/util/arm-spe.c b/tools/perf/util/arm-spe.c index 8942fa598a84..71be979f5077 100644 --- a/tools/perf/util/arm-spe.c +++ b/tools/perf/util/arm-spe.c @@ -39,6 +39,18 @@ #define is_ldst_op(op) (!!((op) & ARM_SPE_OP_LDST)) +#define ARM_SPE_CACHE_EVENT(lvl) \ + (ARM_SPE_##lvl##_ACCESS | ARM_SPE_##lvl##_MISS) + +#define arm_spe_is_cache_level(type, lvl) \ + ((type) & ARM_SPE_CACHE_EVENT(lvl)) + +#define arm_spe_is_cache_hit(type, lvl) \ + (((type) & ARM_SPE_CACHE_EVENT(lvl)) == ARM_SPE_##lvl##_ACCESS) + +#define arm_spe_is_cache_miss(type, lvl) \ + ((type) & ARM_SPE_##lvl##_MISS) + struct arm_spe { struct auxtrace auxtrace; struct auxtrace_queues queues; @@ -62,7 +74,6 @@ struct arm_spe { u8 sample_remote_access; u8 sample_memory; u8 sample_instructions; - u64 instructions_sample_period; u64 l1d_miss_id; u64 l1d_access_id; @@ -101,7 +112,7 @@ struct arm_spe_queue { u64 time; u64 timestamp; struct thread *thread; - u64 period_instructions; + u64 sample_count; u32 flags; struct branch_stack *last_branch; }; @@ -228,7 +239,6 @@ static struct arm_spe_queue *arm_spe__alloc_queue(struct arm_spe *spe, speq->pid = -1; speq->tid = -1; speq->cpu = -1; - speq->period_instructions = 0; /* params set */ params.get_trace = arm_spe_get_trace; @@ -305,15 +315,28 @@ static int arm_spe_set_tid(struct arm_spe_queue *speq, pid_t tid) return 0; } -static u64 *arm_spe__get_metadata_by_cpu(struct arm_spe *spe, u64 cpu) +static u64 *arm_spe__get_metadata_by_cpu(struct arm_spe *spe, int cpu) { u64 i; if (!spe->metadata) return NULL; + /* CPU ID is -1 for per-thread mode */ + if (cpu < 0) { + /* + * On the heterogeneous system, due to CPU ID is -1, + * cannot confirm the data source packet is supported. + */ + if (!spe->is_homogeneous) + return NULL; + + /* In homogeneous system, simply use CPU0's metadata */ + return spe->metadata[0]; + } + for (i = 0; i < spe->metadata_nr_cpu; i++) - if (spe->metadata[i][ARM_SPE_CPU] == cpu) + if (spe->metadata[i][ARM_SPE_CPU] == (u64)cpu) return spe->metadata[i]; return NULL; @@ -352,7 +375,7 @@ static void arm_spe_prep_sample(struct arm_spe *spe, sample->cpumode = arm_spe_cpumode(spe, sample->ip); sample->pid = speq->pid; sample->tid = speq->tid; - sample->period = 1; + sample->period = spe->synth_opts.period; sample->cpu = speq->cpu; sample->simd_flags = arm_spe__synth_simd_flags(record); @@ -471,7 +494,8 @@ arm_spe_deliver_synth_event(struct arm_spe *spe, } static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq, - u64 spe_events_id, u64 data_src) + u64 spe_events_id, + union perf_mem_data_src data_src) { struct arm_spe *spe = speq->spe; struct arm_spe_record *record = &speq->decoder->record; @@ -486,7 +510,7 @@ static int arm_spe__synth_mem_sample(struct arm_spe_queue *speq, sample.stream_id = spe_events_id; sample.addr = record->virt_addr; sample.phys_addr = record->phys_addr; - sample.data_src = data_src; + sample.data_src = data_src.val; sample.weight = record->latency; ret = arm_spe_deliver_synth_event(spe, speq, event, &sample); @@ -519,7 +543,8 @@ static int arm_spe__synth_branch_sample(struct arm_spe_queue *speq, } static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq, - u64 spe_events_id, u64 data_src) + u64 spe_events_id, + union perf_mem_data_src data_src) { struct arm_spe *spe = speq->spe; struct arm_spe_record *record = &speq->decoder->record; @@ -527,14 +552,6 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq, struct perf_sample sample; int ret; - /* - * Handles perf instruction sampling period. - */ - speq->period_instructions++; - if (speq->period_instructions < spe->instructions_sample_period) - return 0; - speq->period_instructions = 0; - perf_sample__init(&sample, /*all=*/true); arm_spe_prep_sample(spe, speq, event, &sample); @@ -542,8 +559,7 @@ static int arm_spe__synth_instruction_sample(struct arm_spe_queue *speq, sample.stream_id = spe_events_id; sample.addr = record->to_ip; sample.phys_addr = record->phys_addr; - sample.data_src = data_src; - sample.period = spe->instructions_sample_period; + sample.data_src = data_src.val; sample.weight = record->latency; sample.flags = speq->flags; sample.branch_stack = speq->last_branch; @@ -670,8 +686,8 @@ static void arm_spe__synth_data_source_common(const struct arm_spe_record *recor * socket */ case ARM_SPE_COMMON_DS_REMOTE: - data_src->mem_lvl = PERF_MEM_LVL_REM_CCE1; - data_src->mem_lvl_num = PERF_MEM_LVLNUM_ANY_CACHE; + data_src->mem_lvl = PERF_MEM_LVL_NA; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA; data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; data_src->mem_snoopx = PERF_MEM_SNOOPX_PEER; break; @@ -819,30 +835,121 @@ static const struct data_source_handle data_source_handles[] = { DS(hisi_hip_ds_encoding_cpus, data_source_hisi_hip), }; -static void arm_spe__synth_memory_level(const struct arm_spe_record *record, - union perf_mem_data_src *data_src) +static void arm_spe__synth_ld_memory_level(const struct arm_spe_record *record, + union perf_mem_data_src *data_src) { - if (record->type & (ARM_SPE_LLC_ACCESS | ARM_SPE_LLC_MISS)) { - data_src->mem_lvl = PERF_MEM_LVL_L3; + /* + * To find a cache hit, search in ascending order from the lower level + * caches to the higher level caches. This reflects the best scenario + * for a cache hit. + */ + if (arm_spe_is_cache_hit(record->type, L1D)) { + data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; + } else if (record->type & ARM_SPE_RECENTLY_FETCHED) { + data_src->mem_lvl = PERF_MEM_LVL_LFB | PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_LFB; + } else if (arm_spe_is_cache_hit(record->type, L2D)) { + data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; + } else if (arm_spe_is_cache_hit(record->type, LLC)) { + data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; + /* + * To find a cache miss, search in descending order from the higher + * level cache to the lower level cache. This represents the worst + * scenario for a cache miss. + */ + } else if (arm_spe_is_cache_miss(record->type, LLC)) { + data_src->mem_lvl = PERF_MEM_LVL_L3 | PERF_MEM_LVL_MISS; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; + } else if (arm_spe_is_cache_miss(record->type, L2D)) { + data_src->mem_lvl = PERF_MEM_LVL_L2 | PERF_MEM_LVL_MISS; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; + } else if (arm_spe_is_cache_miss(record->type, L1D)) { + data_src->mem_lvl = PERF_MEM_LVL_L1 | PERF_MEM_LVL_MISS; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; + } +} - if (record->type & ARM_SPE_LLC_MISS) - data_src->mem_lvl |= PERF_MEM_LVL_MISS; - else - data_src->mem_lvl |= PERF_MEM_LVL_HIT; - } else if (record->type & (ARM_SPE_L1D_ACCESS | ARM_SPE_L1D_MISS)) { +static void arm_spe__synth_st_memory_level(const struct arm_spe_record *record, + union perf_mem_data_src *data_src) +{ + /* Record the greatest level info for a store operation. */ + if (arm_spe_is_cache_level(record->type, LLC)) { + data_src->mem_lvl = PERF_MEM_LVL_L3; + data_src->mem_lvl |= arm_spe_is_cache_miss(record->type, LLC) ? + PERF_MEM_LVL_MISS : PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L3; + } else if (arm_spe_is_cache_level(record->type, L2D)) { + data_src->mem_lvl = PERF_MEM_LVL_L2; + data_src->mem_lvl |= arm_spe_is_cache_miss(record->type, L2D) ? + PERF_MEM_LVL_MISS : PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L2; + } else if (arm_spe_is_cache_level(record->type, L1D)) { data_src->mem_lvl = PERF_MEM_LVL_L1; + data_src->mem_lvl |= arm_spe_is_cache_miss(record->type, L1D) ? + PERF_MEM_LVL_MISS : PERF_MEM_LVL_HIT; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_L1; + } +} - if (record->type & ARM_SPE_L1D_MISS) - data_src->mem_lvl |= PERF_MEM_LVL_MISS; - else - data_src->mem_lvl |= PERF_MEM_LVL_HIT; +static void arm_spe__synth_memory_level(struct arm_spe_queue *speq, + const struct arm_spe_record *record, + union perf_mem_data_src *data_src) +{ + struct arm_spe *spe = speq->spe; + + /* + * The data source packet contains more info for cache levels for + * peer snooping. So respect the memory level if has been set by + * data source parsing. + */ + if (!data_src->mem_lvl) { + if (data_src->mem_op == PERF_MEM_OP_LOAD) + arm_spe__synth_ld_memory_level(record, data_src); + if (data_src->mem_op == PERF_MEM_OP_STORE) + arm_spe__synth_st_memory_level(record, data_src); + } + + if (!data_src->mem_lvl) { + data_src->mem_lvl = PERF_MEM_LVL_NA; + data_src->mem_lvl_num = PERF_MEM_LVLNUM_NA; + } + + /* + * If 'mem_snoop' has been set by data source packet, skip to set + * it at here. + */ + if (!data_src->mem_snoop) { + if (record->type & ARM_SPE_DATA_SNOOPED) { + if (record->type & ARM_SPE_HITM) + data_src->mem_snoop = PERF_MEM_SNOOP_HITM; + else + data_src->mem_snoop = PERF_MEM_SNOOP_HIT; + } else { + u64 *metadata = + arm_spe__get_metadata_by_cpu(spe, speq->cpu); + + /* + * Set NA ("Not available") mode if no meta data or the + * SNOOPED event is not supported. + */ + if (!metadata || + !(metadata[ARM_SPE_CAP_EVENT_FILTER] & ARM_SPE_DATA_SNOOPED)) + data_src->mem_snoop = PERF_MEM_SNOOP_NA; + else + data_src->mem_snoop = PERF_MEM_SNOOP_NONE; + } } - if (record->type & ARM_SPE_REMOTE_ACCESS) - data_src->mem_lvl |= PERF_MEM_LVL_REM_CCE1; + if (!data_src->mem_remote) { + if (record->type & ARM_SPE_REMOTE_ACCESS) + data_src->mem_remote = PERF_MEM_REMOTE_REMOTE; + } } -static bool arm_spe__synth_ds(struct arm_spe_queue *speq, +static void arm_spe__synth_ds(struct arm_spe_queue *speq, const struct arm_spe_record *record, union perf_mem_data_src *data_src) { @@ -859,56 +966,41 @@ static bool arm_spe__synth_ds(struct arm_spe_queue *speq, cpuid = perf_env__cpuid(perf_session__env(spe->session)); midr = strtol(cpuid, NULL, 16); } else { - /* CPU ID is -1 for per-thread mode */ - if (speq->cpu < 0) { - /* - * On the heterogeneous system, due to CPU ID is -1, - * cannot confirm the data source packet is supported. - */ - if (!spe->is_homogeneous) - return false; - - /* In homogeneous system, simply use CPU0's metadata */ - if (spe->metadata) - metadata = spe->metadata[0]; - } else { - metadata = arm_spe__get_metadata_by_cpu(spe, speq->cpu); - } - + metadata = arm_spe__get_metadata_by_cpu(spe, speq->cpu); if (!metadata) - return false; + return; midr = metadata[ARM_SPE_CPU_MIDR]; } for (i = 0; i < ARRAY_SIZE(data_source_handles); i++) { if (is_midr_in_range_list(midr, data_source_handles[i].midr_ranges)) { - data_source_handles[i].ds_synth(record, data_src); - return true; + return data_source_handles[i].ds_synth(record, data_src); } } - return false; + return; } -static u64 arm_spe__synth_data_source(struct arm_spe_queue *speq, - const struct arm_spe_record *record) +static union perf_mem_data_src +arm_spe__synth_data_source(struct arm_spe_queue *speq, + const struct arm_spe_record *record) { - union perf_mem_data_src data_src = { .mem_op = PERF_MEM_OP_NA }; + union perf_mem_data_src data_src = {}; /* Only synthesize data source for LDST operations */ if (!is_ldst_op(record->op)) - return 0; + return data_src; if (record->op & ARM_SPE_OP_LD) data_src.mem_op = PERF_MEM_OP_LOAD; else if (record->op & ARM_SPE_OP_ST) data_src.mem_op = PERF_MEM_OP_STORE; else - return 0; + return data_src; - if (!arm_spe__synth_ds(speq, record, &data_src)) - arm_spe__synth_memory_level(record, &data_src); + arm_spe__synth_ds(speq, record, &data_src); + arm_spe__synth_memory_level(speq, record, &data_src); if (record->type & (ARM_SPE_TLB_ACCESS | ARM_SPE_TLB_MISS)) { data_src.mem_dtlb = PERF_MEM_TLB_WK; @@ -919,16 +1011,24 @@ static u64 arm_spe__synth_data_source(struct arm_spe_queue *speq, data_src.mem_dtlb |= PERF_MEM_TLB_HIT; } - return data_src.val; + return data_src; } static int arm_spe_sample(struct arm_spe_queue *speq) { const struct arm_spe_record *record = &speq->decoder->record; struct arm_spe *spe = speq->spe; - u64 data_src; + union perf_mem_data_src data_src; int err; + /* + * Discard all samples until period is reached + */ + speq->sample_count++; + if (speq->sample_count < spe->synth_opts.period) + return 0; + speq->sample_count = 0; + arm_spe__sample_flags(speq); data_src = arm_spe__synth_data_source(speq, record); @@ -1532,6 +1632,7 @@ static const char * const metadata_per_cpu_fmts[] = { [ARM_SPE_CPU_MIDR] = " MIDR :0x%"PRIx64"\n", [ARM_SPE_CPU_PMU_TYPE] = " PMU Type :%"PRId64"\n", [ARM_SPE_CAP_MIN_IVAL] = " Min Interval :%"PRId64"\n", + [ARM_SPE_CAP_EVENT_FILTER] = " Event Filter :0x%"PRIx64"\n", }; static void arm_spe_print_info(struct arm_spe *spe, __u64 *arr) @@ -1628,6 +1729,7 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) attr.exclude_guest = evsel->core.attr.exclude_guest; attr.sample_id_all = evsel->core.attr.sample_id_all; attr.read_format = evsel->core.attr.read_format; + attr.sample_period = spe->synth_opts.period; /* create new id val to be a fixed offset from evsel id */ id = evsel->core.id[0] + 1000000000; @@ -1744,25 +1846,15 @@ arm_spe_synth_events(struct arm_spe *spe, struct perf_session *session) } if (spe->synth_opts.instructions) { - if (spe->synth_opts.period_type != PERF_ITRACE_PERIOD_INSTRUCTIONS) { - pr_warning("Only instruction-based sampling period is currently supported by Arm SPE.\n"); - goto synth_instructions_out; - } - if (spe->synth_opts.period > 1) - pr_warning("Arm SPE has a hardware-based sample period.\n" - "Additional instruction events will be discarded by --itrace\n"); - spe->sample_instructions = true; attr.config = PERF_COUNT_HW_INSTRUCTIONS; - attr.sample_period = spe->synth_opts.period; - spe->instructions_sample_period = attr.sample_period; + err = perf_session__deliver_synth_attr_event(session, &attr, id); if (err) return err; spe->instructions_id = id; arm_spe_set_event_name(evlist, id, "instructions"); } -synth_instructions_out: return 0; } @@ -1871,10 +1963,23 @@ int arm_spe_process_auxtrace_info(union perf_event *event, if (dump_trace) return 0; - if (session->itrace_synth_opts && session->itrace_synth_opts->set) + if (session->itrace_synth_opts && session->itrace_synth_opts->set) { spe->synth_opts = *session->itrace_synth_opts; - else + } else { itrace_synth_opts__set_default(&spe->synth_opts, false); + /* Default nanoseconds period not supported */ + spe->synth_opts.period_type = PERF_ITRACE_PERIOD_INSTRUCTIONS; + spe->synth_opts.period = 1; + } + + if (spe->synth_opts.period_type != PERF_ITRACE_PERIOD_INSTRUCTIONS) { + ui__error("You must only use i (instructions) --itrace period with Arm SPE. e.g --itrace=i1i\n"); + err = -EINVAL; + goto err_free_queues; + } + if (spe->synth_opts.period > 1) + ui__warning("Arm SPE has a hardware-based sampling period.\n\n" + "--itrace periods > 1i downsample by an interval of n SPE samples rather than n instructions.\n"); err = arm_spe_synth_events(spe, session); if (err) diff --git a/tools/perf/util/arm-spe.h b/tools/perf/util/arm-spe.h index 390679a4af2f..3966df1856d8 100644 --- a/tools/perf/util/arm-spe.h +++ b/tools/perf/util/arm-spe.h @@ -47,6 +47,8 @@ enum { ARM_SPE_CPU_PMU_TYPE, /* Minimal interval */ ARM_SPE_CAP_MIN_IVAL, + /* Event filter */ + ARM_SPE_CAP_EVENT_FILTER, ARM_SPE_CPU_PRIV_MAX, }; diff --git a/tools/perf/util/auxtrace.c b/tools/perf/util/auxtrace.c index ebd32f1b8f12..1539c1dc823c 100644 --- a/tools/perf/util/auxtrace.c +++ b/tools/perf/util/auxtrace.c @@ -55,6 +55,7 @@ #include "hisi-ptt.h" #include "s390-cpumsf.h" #include "util/mmap.h" +#include "powerpc-vpadtl.h" #include <linux/ctype.h> #include "symbol/kallsyms.h" @@ -185,10 +186,7 @@ void auxtrace_mmap_params__set_idx(struct auxtrace_mmap_params *mp, if (per_cpu) { mp->cpu = perf_cpu_map__cpu(evlist->core.all_cpus, idx); - if (evlist->core.threads) - mp->tid = perf_thread_map__pid(evlist->core.threads, 0); - else - mp->tid = -1; + mp->tid = perf_thread_map__pid(evlist->core.threads, 0); } else { mp->cpu.cpu = -1; mp->tid = perf_thread_map__pid(evlist->core.threads, idx); @@ -1393,6 +1391,9 @@ int perf_event__process_auxtrace_info(struct perf_session *session, case PERF_AUXTRACE_HISI_PTT: err = hisi_ptt_process_auxtrace_info(event, session); break; + case PERF_AUXTRACE_VPA_DTL: + err = powerpc_vpadtl_process_auxtrace_info(event, session); + break; case PERF_AUXTRACE_UNKNOWN: default: return -EINVAL; diff --git a/tools/perf/util/auxtrace.h b/tools/perf/util/auxtrace.h index f001cbb68f8e..e0a5b39fed12 100644 --- a/tools/perf/util/auxtrace.h +++ b/tools/perf/util/auxtrace.h @@ -50,6 +50,7 @@ enum auxtrace_type { PERF_AUXTRACE_ARM_SPE, PERF_AUXTRACE_S390_CPUMSF, PERF_AUXTRACE_HISI_PTT, + PERF_AUXTRACE_VPA_DTL, }; enum itrace_period_type { diff --git a/tools/perf/util/bpf-filter.c b/tools/perf/util/bpf-filter.c index a0b11f35395f..1a2e7b388d57 100644 --- a/tools/perf/util/bpf-filter.c +++ b/tools/perf/util/bpf-filter.c @@ -56,6 +56,7 @@ #include "util/debug.h" #include "util/evsel.h" #include "util/target.h" +#include "util/bpf-utils.h" #include "util/bpf-filter.h" #include <util/bpf-filter-flex.h> @@ -451,8 +452,12 @@ int perf_bpf_filter__prepare(struct evsel *evsel, struct target *target) struct bpf_link *link; struct perf_bpf_filter_entry *entry; bool needs_idx_hash = !target__has_cpu(target); +#if LIBBPF_CURRENT_VERSION_GEQ(1, 7) DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts, .dont_enable = true); +#else + DECLARE_LIBBPF_OPTS(bpf_perf_event_opts, pe_opts); +#endif entry = calloc(MAX_FILTERS, sizeof(*entry)); if (entry == NULL) diff --git a/tools/perf/util/bpf-trace-summary.c b/tools/perf/util/bpf-trace-summary.c index 69fb165da206..8dfe7e678941 100644 --- a/tools/perf/util/bpf-trace-summary.c +++ b/tools/perf/util/bpf-trace-summary.c @@ -138,11 +138,14 @@ static bool sc_node_equal(long key1, long key2, void *ctx __maybe_unused) return key1 == key2; } -static int print_common_stats(struct syscall_data *data, FILE *fp) +static int print_common_stats(struct syscall_data *data, int max_summary, FILE *fp) { int printed = 0; - for (int i = 0; i < data->nr_nodes; i++) { + if (max_summary == 0 || max_summary > data->nr_nodes) + max_summary = data->nr_nodes; + + for (int i = 0; i < max_summary; i++) { struct syscall_node *node = &data->nodes[i]; struct syscall_stats *stat = &node->stats; double total = (double)(stat->total_time) / NSEC_PER_MSEC; @@ -200,7 +203,7 @@ static int update_thread_stats(struct hashmap *hash, struct syscall_key *map_key return 0; } -static int print_thread_stat(struct syscall_data *data, FILE *fp) +static int print_thread_stat(struct syscall_data *data, int max_summary, FILE *fp) { int printed = 0; @@ -213,18 +216,18 @@ static int print_thread_stat(struct syscall_data *data, FILE *fp) printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n"); printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n"); - printed += print_common_stats(data, fp); + printed += print_common_stats(data, max_summary, fp); printed += fprintf(fp, "\n\n"); return printed; } -static int print_thread_stats(struct syscall_data **data, int nr_data, FILE *fp) +static int print_thread_stats(struct syscall_data **data, int nr_data, int max_summary, FILE *fp) { int printed = 0; for (int i = 0; i < nr_data; i++) - printed += print_thread_stat(data[i], fp); + printed += print_thread_stat(data[i], max_summary, fp); return printed; } @@ -277,7 +280,7 @@ static int update_total_stats(struct hashmap *hash, struct syscall_key *map_key, return 0; } -static int print_total_stats(struct syscall_data **data, int nr_data, FILE *fp) +static int print_total_stats(struct syscall_data **data, int nr_data, int max_summary, FILE *fp) { int printed = 0; int nr_events = 0; @@ -291,8 +294,11 @@ static int print_total_stats(struct syscall_data **data, int nr_data, FILE *fp) printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n"); printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n"); - for (int i = 0; i < nr_data; i++) - printed += print_common_stats(data[i], fp); + if (max_summary == 0 || max_summary > nr_data) + max_summary = nr_data; + + for (int i = 0; i < max_summary; i++) + printed += print_common_stats(data[i], max_summary, fp); printed += fprintf(fp, "\n\n"); return printed; @@ -333,7 +339,7 @@ static int update_cgroup_stats(struct hashmap *hash, struct syscall_key *map_key return 0; } -static int print_cgroup_stat(struct syscall_data *data, FILE *fp) +static int print_cgroup_stat(struct syscall_data *data, int max_summary, FILE *fp) { int printed = 0; struct cgroup *cgrp = __cgroup__find(&cgroups, data->key); @@ -351,23 +357,23 @@ static int print_cgroup_stat(struct syscall_data *data, FILE *fp) printed += fprintf(fp, " (msec) (msec) (msec) (msec) (%%)\n"); printed += fprintf(fp, " --------------- -------- ------ -------- --------- --------- --------- ------\n"); - printed += print_common_stats(data, fp); + printed += print_common_stats(data, max_summary, fp); printed += fprintf(fp, "\n\n"); return printed; } -static int print_cgroup_stats(struct syscall_data **data, int nr_data, FILE *fp) +static int print_cgroup_stats(struct syscall_data **data, int nr_data, int max_summary, FILE *fp) { int printed = 0; for (int i = 0; i < nr_data; i++) - printed += print_cgroup_stat(data[i], fp); + printed += print_cgroup_stat(data[i], max_summary, fp); return printed; } -int trace_print_bpf_summary(FILE *fp) +int trace_print_bpf_summary(FILE *fp, int max_summary) { struct bpf_map *map = skel->maps.syscall_stats_map; struct syscall_key *prev_key, key; @@ -420,13 +426,13 @@ int trace_print_bpf_summary(FILE *fp) switch (skel->rodata->aggr_mode) { case SYSCALL_AGGR_THREAD: - printed += print_thread_stats(data, nr_data, fp); + printed += print_thread_stats(data, nr_data, max_summary, fp); break; case SYSCALL_AGGR_CPU: - printed += print_total_stats(data, nr_data, fp); + printed += print_total_stats(data, nr_data, max_summary, fp); break; case SYSCALL_AGGR_CGROUP: - printed += print_cgroup_stats(data, nr_data, fp); + printed += print_cgroup_stats(data, nr_data, max_summary, fp); break; default: break; diff --git a/tools/perf/util/bpf-utils.h b/tools/perf/util/bpf-utils.h index 86a5055cdfad..a8bc1a232968 100644 --- a/tools/perf/util/bpf-utils.h +++ b/tools/perf/util/bpf-utils.h @@ -8,6 +8,16 @@ #ifdef HAVE_LIBBPF_SUPPORT #include <bpf/libbpf.h> +#include <bpf/libbpf_version.h> + +#define LIBBPF_CURRENT_VERSION_GEQ(major, minor) \ + (LIBBPF_MAJOR_VERSION > (major) || \ + (LIBBPF_MAJOR_VERSION == (major) && LIBBPF_MINOR_VERSION >= (minor))) + +#if LIBBPF_CURRENT_VERSION_GEQ(1, 7) +// libbpf 1.7+ support the btf_dump_type_data_opts.emit_strings option. +#define HAVE_LIBBPF_STRINGS_SUPPORT 1 +#endif /* * Get bpf_prog_info in continuous memory diff --git a/tools/perf/util/bpf_counter.c b/tools/perf/util/bpf_counter.c index 73fcafbffc6a..ca5d01b9017d 100644 --- a/tools/perf/util/bpf_counter.c +++ b/tools/perf/util/bpf_counter.c @@ -6,10 +6,14 @@ #include <limits.h> #include <unistd.h> #include <sys/file.h> +#include <sys/resource.h> #include <sys/time.h> #include <linux/err.h> +#include <linux/list.h> #include <linux/zalloc.h> #include <api/fs/fs.h> +#include <bpf/bpf.h> +#include <bpf/btf.h> #include <perf/bpf_perf.h> #include "bpf_counter.h" @@ -28,13 +32,67 @@ #include "bpf_skel/bperf_leader.skel.h" #include "bpf_skel/bperf_follower.skel.h" +struct bpf_counter { + void *skel; + struct list_head list; +}; + #define ATTR_MAP_SIZE 16 -static inline void *u64_to_ptr(__u64 ptr) +static void *u64_to_ptr(__u64 ptr) { return (void *)(unsigned long)ptr; } + +void set_max_rlimit(void) +{ + struct rlimit rinf = { RLIM_INFINITY, RLIM_INFINITY }; + + setrlimit(RLIMIT_MEMLOCK, &rinf); +} + +static __u32 bpf_link_get_id(int fd) +{ + struct bpf_link_info link_info = { .id = 0, }; + __u32 link_info_len = sizeof(link_info); + + bpf_obj_get_info_by_fd(fd, &link_info, &link_info_len); + return link_info.id; +} + +static __u32 bpf_link_get_prog_id(int fd) +{ + struct bpf_link_info link_info = { .id = 0, }; + __u32 link_info_len = sizeof(link_info); + + bpf_obj_get_info_by_fd(fd, &link_info, &link_info_len); + return link_info.prog_id; +} + +static __u32 bpf_map_get_id(int fd) +{ + struct bpf_map_info map_info = { .id = 0, }; + __u32 map_info_len = sizeof(map_info); + + bpf_obj_get_info_by_fd(fd, &map_info, &map_info_len); + return map_info.id; +} + +/* trigger the leader program on a cpu */ +int bperf_trigger_reading(int prog_fd, int cpu) +{ + DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts, + .ctx_in = NULL, + .ctx_size_in = 0, + .flags = BPF_F_TEST_RUN_ON_CPU, + .cpu = cpu, + .retval = 0, + ); + + return bpf_prog_test_run_opts(prog_fd, &opts); +} + static struct bpf_counter *bpf_counter_alloc(void) { struct bpf_counter *counter; @@ -278,6 +336,7 @@ static int bpf_program_profiler__install_pe(struct evsel *evsel, int cpu_map_idx { struct bpf_prog_profiler_bpf *skel; struct bpf_counter *counter; + int cpu = perf_cpu_map__cpu(evsel->core.cpus, cpu_map_idx).cpu; int ret; list_for_each_entry(counter, &evsel->bpf_counter_list, list) { @@ -285,7 +344,7 @@ static int bpf_program_profiler__install_pe(struct evsel *evsel, int cpu_map_idx assert(skel != NULL); ret = bpf_map_update_elem(bpf_map__fd(skel->maps.events), - &cpu_map_idx, &fd, BPF_ANY); + &cpu, &fd, BPF_ANY); if (ret) return ret; } @@ -393,7 +452,6 @@ static int bperf_check_target(struct evsel *evsel, return 0; } -static struct perf_cpu_map *all_cpu_map; static __u32 filter_entry_cnt; static int bperf_reload_leader_program(struct evsel *evsel, int attr_map_fd, @@ -437,7 +495,7 @@ static int bperf_reload_leader_program(struct evsel *evsel, int attr_map_fd, * following evsel__open_per_cpu call */ evsel->leader_skel = skel; - evsel__open_per_cpu(evsel, all_cpu_map, -1); + evsel__open(evsel, evsel->core.cpus, evsel->core.threads); out: bperf_leader_bpf__destroy(skel); @@ -475,12 +533,6 @@ static int bperf__load(struct evsel *evsel, struct target *target) if (bperf_check_target(evsel, target, &filter_type, &filter_entry_cnt)) return -1; - if (!all_cpu_map) { - all_cpu_map = perf_cpu_map__new_online_cpus(); - if (!all_cpu_map) - return -1; - } - evsel->bperf_leader_prog_fd = -1; evsel->bperf_leader_link_fd = -1; @@ -598,9 +650,10 @@ out: static int bperf__install_pe(struct evsel *evsel, int cpu_map_idx, int fd) { struct bperf_leader_bpf *skel = evsel->leader_skel; + int cpu = perf_cpu_map__cpu(evsel->core.cpus, cpu_map_idx).cpu; return bpf_map_update_elem(bpf_map__fd(skel->maps.events), - &cpu_map_idx, &fd, BPF_ANY); + &cpu, &fd, BPF_ANY); } /* @@ -609,13 +662,12 @@ static int bperf__install_pe(struct evsel *evsel, int cpu_map_idx, int fd) */ static int bperf_sync_counters(struct evsel *evsel) { - int num_cpu, i, cpu; + struct perf_cpu cpu; + int idx; + + perf_cpu_map__for_each_cpu(cpu, idx, evsel->core.cpus) + bperf_trigger_reading(evsel->bperf_leader_prog_fd, cpu.cpu); - num_cpu = perf_cpu_map__nr(all_cpu_map); - for (i = 0; i < num_cpu; i++) { - cpu = perf_cpu_map__cpu(all_cpu_map, i).cpu; - bperf_trigger_reading(evsel->bperf_leader_prog_fd, cpu); - } return 0; } @@ -785,7 +837,7 @@ struct bpf_counter_ops bperf_ops = { extern struct bpf_counter_ops bperf_cgrp_ops; -static inline bool bpf_counter_skip(struct evsel *evsel) +static bool bpf_counter_skip(struct evsel *evsel) { return evsel->bpf_counter_ops == NULL; } diff --git a/tools/perf/util/bpf_counter.h b/tools/perf/util/bpf_counter.h index c6d21c07b14c..658d8e7d507e 100644 --- a/tools/perf/util/bpf_counter.h +++ b/tools/perf/util/bpf_counter.h @@ -2,18 +2,10 @@ #ifndef __PERF_BPF_COUNTER_H #define __PERF_BPF_COUNTER_H 1 -#include <linux/list.h> -#include <sys/resource.h> - -#ifdef HAVE_LIBBPF_SUPPORT -#include <bpf/bpf.h> -#include <bpf/btf.h> -#include <bpf/libbpf.h> -#endif - struct evsel; struct target; -struct bpf_counter; + +#ifdef HAVE_BPF_SKEL typedef int (*bpf_counter_evsel_op)(struct evsel *evsel); typedef int (*bpf_counter_evsel_target_op)(struct evsel *evsel, @@ -22,6 +14,7 @@ typedef int (*bpf_counter_evsel_install_pe_op)(struct evsel *evsel, int cpu_map_idx, int fd); +/* Shared ops between bpf_counter, bpf_counter_cgroup, etc. */ struct bpf_counter_ops { bpf_counter_evsel_target_op load; bpf_counter_evsel_op enable; @@ -31,13 +24,6 @@ struct bpf_counter_ops { bpf_counter_evsel_install_pe_op install_pe; }; -struct bpf_counter { - void *skel; - struct list_head list; -}; - -#ifdef HAVE_BPF_SKEL - int bpf_counter__load(struct evsel *evsel, struct target *target); int bpf_counter__enable(struct evsel *evsel); int bpf_counter__disable(struct evsel *evsel); @@ -45,6 +31,9 @@ int bpf_counter__read(struct evsel *evsel); void bpf_counter__destroy(struct evsel *evsel); int bpf_counter__install_pe(struct evsel *evsel, int cpu_map_idx, int fd); +int bperf_trigger_reading(int prog_fd, int cpu); +void set_max_rlimit(void); + #else /* HAVE_BPF_SKEL */ #include <linux/err.h> @@ -83,55 +72,4 @@ static inline int bpf_counter__install_pe(struct evsel *evsel __maybe_unused, #endif /* HAVE_BPF_SKEL */ -static inline void set_max_rlimit(void) -{ - struct rlimit rinf = { RLIM_INFINITY, RLIM_INFINITY }; - - setrlimit(RLIMIT_MEMLOCK, &rinf); -} - -#ifdef HAVE_BPF_SKEL - -static inline __u32 bpf_link_get_id(int fd) -{ - struct bpf_link_info link_info = { .id = 0, }; - __u32 link_info_len = sizeof(link_info); - - bpf_obj_get_info_by_fd(fd, &link_info, &link_info_len); - return link_info.id; -} - -static inline __u32 bpf_link_get_prog_id(int fd) -{ - struct bpf_link_info link_info = { .id = 0, }; - __u32 link_info_len = sizeof(link_info); - - bpf_obj_get_info_by_fd(fd, &link_info, &link_info_len); - return link_info.prog_id; -} - -static inline __u32 bpf_map_get_id(int fd) -{ - struct bpf_map_info map_info = { .id = 0, }; - __u32 map_info_len = sizeof(map_info); - - bpf_obj_get_info_by_fd(fd, &map_info, &map_info_len); - return map_info.id; -} - -/* trigger the leader program on a cpu */ -static inline int bperf_trigger_reading(int prog_fd, int cpu) -{ - DECLARE_LIBBPF_OPTS(bpf_test_run_opts, opts, - .ctx_in = NULL, - .ctx_size_in = 0, - .flags = BPF_F_TEST_RUN_ON_CPU, - .cpu = cpu, - .retval = 0, - ); - - return bpf_prog_test_run_opts(prog_fd, &opts); -} -#endif /* HAVE_BPF_SKEL */ - #endif /* __PERF_BPF_COUNTER_H */ diff --git a/tools/perf/util/bpf_counter_cgroup.c b/tools/perf/util/bpf_counter_cgroup.c index 6ff42619de12..690be3ce3e11 100644 --- a/tools/perf/util/bpf_counter_cgroup.c +++ b/tools/perf/util/bpf_counter_cgroup.c @@ -13,6 +13,7 @@ #include <linux/zalloc.h> #include <linux/perf_event.h> #include <api/fs/fs.h> +#include <bpf/bpf.h> #include <perf/bpf_perf.h> #include "affinity.h" @@ -185,7 +186,8 @@ static int bperf_cgrp__load(struct evsel *evsel, } static int bperf_cgrp__install_pe(struct evsel *evsel __maybe_unused, - int cpu __maybe_unused, int fd __maybe_unused) + int cpu_map_idx __maybe_unused, + int fd __maybe_unused) { /* nothing to do */ return 0; diff --git a/tools/perf/util/bpf_ftrace.c b/tools/perf/util/bpf_ftrace.c index 0cb02412043c..e61a3b20be0a 100644 --- a/tools/perf/util/bpf_ftrace.c +++ b/tools/perf/util/bpf_ftrace.c @@ -3,6 +3,7 @@ #include <stdint.h> #include <stdlib.h> +#include <bpf/bpf.h> #include <linux/err.h> #include "util/ftrace.h" diff --git a/tools/perf/util/bpf_off_cpu.c b/tools/perf/util/bpf_off_cpu.c index c367fefe6ecb..88e0660c4bff 100644 --- a/tools/perf/util/bpf_off_cpu.c +++ b/tools/perf/util/bpf_off_cpu.c @@ -13,6 +13,7 @@ #include "util/cgroup.h" #include "util/strlist.h" #include <bpf/bpf.h> +#include <bpf/btf.h> #include <internal/xyarray.h> #include <linux/time64.h> diff --git a/tools/perf/util/bpf_skel/kwork_top.bpf.c b/tools/perf/util/bpf_skel/kwork_top.bpf.c index 73e32e063030..6673386302e2 100644 --- a/tools/perf/util/bpf_skel/kwork_top.bpf.c +++ b/tools/perf/util/bpf_skel/kwork_top.bpf.c @@ -18,9 +18,7 @@ enum kwork_class_type { }; #define MAX_ENTRIES 102400 -#ifndef MAX_NR_CPUS #define MAX_NR_CPUS 4096 -#endif #define PF_KTHREAD 0x00200000 #define MAX_COMMAND_LEN 16 diff --git a/tools/perf/util/bpf_skel/sample_filter.bpf.c b/tools/perf/util/bpf_skel/sample_filter.bpf.c index b195e6efeb8b..e5666d4c1722 100644 --- a/tools/perf/util/bpf_skel/sample_filter.bpf.c +++ b/tools/perf/util/bpf_skel/sample_filter.bpf.c @@ -164,7 +164,7 @@ static inline __u64 perf_get_sample(struct bpf_perf_event_data_kern *kctx, if (entry->part == 8) { union perf_mem_data_src___new *data = (void *)&kctx->data->data_src; - if (bpf_core_field_exists(data->mem_hops)) + if (__builtin_preserve_field_info(data->mem_hops, BPF_FIELD_EXISTS)) return data->mem_hops; return 0; diff --git a/tools/perf/util/build-id.c b/tools/perf/util/build-id.c index bf7f3268b9a2..35505a1ffd11 100644 --- a/tools/perf/util/build-id.c +++ b/tools/perf/util/build-id.c @@ -86,6 +86,13 @@ int build_id__snprintf(const struct build_id *build_id, char *bf, size_t bf_size { size_t offs = 0; + if (build_id->size == 0) { + /* Ensure bf is always \0 terminated. */ + if (bf_size > 0) + bf[0] = '\0'; + return 0; + } + for (size_t i = 0; i < build_id->size && offs < bf_size; ++i) offs += snprintf(bf + offs, bf_size - offs, "%02x", build_id->data[i]); diff --git a/tools/perf/util/capstone.c b/tools/perf/util/capstone.c new file mode 100644 index 000000000000..be5fd44b1f9d --- /dev/null +++ b/tools/perf/util/capstone.c @@ -0,0 +1,471 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "capstone.h" +#include "annotate.h" +#include "addr_location.h" +#include "debug.h" +#include "disasm.h" +#include "dso.h" +#include "machine.h" +#include "map.h" +#include "namespaces.h" +#include "print_insn.h" +#include "symbol.h" +#include "thread.h" +#include <errno.h> +#include <fcntl.h> +#include <string.h> + +#ifdef HAVE_LIBCAPSTONE_SUPPORT +#include <capstone/capstone.h> +#endif + +#ifdef HAVE_LIBCAPSTONE_SUPPORT +static int capstone_init(struct machine *machine, csh *cs_handle, bool is64, + bool disassembler_style) +{ + cs_arch arch; + cs_mode mode; + + if (machine__is(machine, "x86_64") && is64) { + arch = CS_ARCH_X86; + mode = CS_MODE_64; + } else if (machine__normalized_is(machine, "x86")) { + arch = CS_ARCH_X86; + mode = CS_MODE_32; + } else if (machine__normalized_is(machine, "arm64")) { + arch = CS_ARCH_ARM64; + mode = CS_MODE_ARM; + } else if (machine__normalized_is(machine, "arm")) { + arch = CS_ARCH_ARM; + mode = CS_MODE_ARM + CS_MODE_V8; + } else if (machine__normalized_is(machine, "s390")) { + arch = CS_ARCH_SYSZ; + mode = CS_MODE_BIG_ENDIAN; + } else { + return -1; + } + + if (cs_open(arch, mode, cs_handle) != CS_ERR_OK) { + pr_warning_once("cs_open failed\n"); + return -1; + } + + if (machine__normalized_is(machine, "x86")) { + /* + * In case of using capstone_init while symbol__disassemble + * setting CS_OPT_SYNTAX_ATT depends if disassembler_style opts + * is set via annotation args + */ + if (disassembler_style) + cs_option(*cs_handle, CS_OPT_SYNTAX, CS_OPT_SYNTAX_ATT); + /* + * Resolving address operands to symbols is implemented + * on x86 by investigating instruction details. + */ + cs_option(*cs_handle, CS_OPT_DETAIL, CS_OPT_ON); + } + + return 0; +} +#endif + +#ifdef HAVE_LIBCAPSTONE_SUPPORT +static size_t print_insn_x86(struct thread *thread, u8 cpumode, cs_insn *insn, + int print_opts, FILE *fp) +{ + struct addr_location al; + size_t printed = 0; + + if (insn->detail && insn->detail->x86.op_count == 1) { + cs_x86_op *op = &insn->detail->x86.operands[0]; + + addr_location__init(&al); + if (op->type == X86_OP_IMM && + thread__find_symbol(thread, cpumode, op->imm, &al)) { + printed += fprintf(fp, "%s ", insn[0].mnemonic); + printed += symbol__fprintf_symname_offs(al.sym, &al, fp); + if (print_opts & PRINT_INSN_IMM_HEX) + printed += fprintf(fp, " [%#" PRIx64 "]", op->imm); + addr_location__exit(&al); + return printed; + } + addr_location__exit(&al); + } + + printed += fprintf(fp, "%s %s", insn[0].mnemonic, insn[0].op_str); + return printed; +} +#endif + + +ssize_t capstone__fprintf_insn_asm(struct machine *machine __maybe_unused, + struct thread *thread __maybe_unused, + u8 cpumode __maybe_unused, bool is64bit __maybe_unused, + const uint8_t *code __maybe_unused, + size_t code_size __maybe_unused, + uint64_t ip __maybe_unused, int *lenp __maybe_unused, + int print_opts __maybe_unused, FILE *fp __maybe_unused) +{ +#ifdef HAVE_LIBCAPSTONE_SUPPORT + size_t printed; + cs_insn *insn; + csh cs_handle; + size_t count; + int ret; + + /* TODO: Try to initiate capstone only once but need a proper place. */ + ret = capstone_init(machine, &cs_handle, is64bit, true); + if (ret < 0) + return ret; + + count = cs_disasm(cs_handle, code, code_size, ip, 1, &insn); + if (count > 0) { + if (machine__normalized_is(machine, "x86")) + printed = print_insn_x86(thread, cpumode, &insn[0], print_opts, fp); + else + printed = fprintf(fp, "%s %s", insn[0].mnemonic, insn[0].op_str); + if (lenp) + *lenp = insn->size; + cs_free(insn, count); + } else { + printed = -1; + } + + cs_close(&cs_handle); + return printed; +#else + return -1; +#endif +} + +#ifdef HAVE_LIBCAPSTONE_SUPPORT +static void print_capstone_detail(cs_insn *insn, char *buf, size_t len, + struct annotate_args *args, u64 addr) +{ + int i; + struct map *map = args->ms.map; + struct symbol *sym; + + /* TODO: support more architectures */ + if (!arch__is(args->arch, "x86")) + return; + + if (insn->detail == NULL) + return; + + for (i = 0; i < insn->detail->x86.op_count; i++) { + cs_x86_op *op = &insn->detail->x86.operands[i]; + u64 orig_addr; + + if (op->type != X86_OP_MEM) + continue; + + /* only print RIP-based global symbols for now */ + if (op->mem.base != X86_REG_RIP) + continue; + + /* get the target address */ + orig_addr = addr + insn->size + op->mem.disp; + addr = map__objdump_2mem(map, orig_addr); + + if (dso__kernel(map__dso(map))) { + /* + * The kernel maps can be split into sections, let's + * find the map first and the search the symbol. + */ + map = maps__find(map__kmaps(map), addr); + if (map == NULL) + continue; + } + + /* convert it to map-relative address for search */ + addr = map__map_ip(map, addr); + + sym = map__find_symbol(map, addr); + if (sym == NULL) + continue; + + if (addr == sym->start) { + scnprintf(buf, len, "\t# %"PRIx64" <%s>", + orig_addr, sym->name); + } else { + scnprintf(buf, len, "\t# %"PRIx64" <%s+%#"PRIx64">", + orig_addr, sym->name, addr - sym->start); + } + break; + } +} +#endif + +#ifdef HAVE_LIBCAPSTONE_SUPPORT +struct find_file_offset_data { + u64 ip; + u64 offset; +}; + +/* This will be called for each PHDR in an ELF binary */ +static int find_file_offset(u64 start, u64 len, u64 pgoff, void *arg) +{ + struct find_file_offset_data *data = arg; + + if (start <= data->ip && data->ip < start + len) { + data->offset = pgoff + data->ip - start; + return 1; + } + return 0; +} +#endif + +int symbol__disassemble_capstone(const char *filename __maybe_unused, + struct symbol *sym __maybe_unused, + struct annotate_args *args __maybe_unused) +{ +#ifdef HAVE_LIBCAPSTONE_SUPPORT + struct annotation *notes = symbol__annotation(sym); + struct map *map = args->ms.map; + struct dso *dso = map__dso(map); + u64 start = map__rip_2objdump(map, sym->start); + u64 offset; + int i, count, free_count; + bool is_64bit = false; + bool needs_cs_close = false; + /* Malloc-ed buffer containing instructions read from disk. */ + u8 *code_buf = NULL; + /* Pointer to code to be disassembled. */ + const u8 *buf; + u64 buf_len; + csh handle; + cs_insn *insn = NULL; + char disasm_buf[512]; + struct disasm_line *dl; + bool disassembler_style = false; + + if (args->options->objdump_path) + return -1; + + buf = dso__read_symbol(dso, filename, map, sym, + &code_buf, &buf_len, &is_64bit); + if (buf == NULL) + return errno; + + /* add the function address and name */ + scnprintf(disasm_buf, sizeof(disasm_buf), "%#"PRIx64" <%s>:", + start, sym->name); + + args->offset = -1; + args->line = disasm_buf; + args->line_nr = 0; + args->fileloc = NULL; + args->ms.sym = sym; + + dl = disasm_line__new(args); + if (dl == NULL) + goto err; + + annotation_line__add(&dl->al, ¬es->src->source); + + if (!args->options->disassembler_style || + !strcmp(args->options->disassembler_style, "att")) + disassembler_style = true; + + if (capstone_init(maps__machine(args->ms.maps), &handle, is_64bit, disassembler_style) < 0) + goto err; + + needs_cs_close = true; + + free_count = count = cs_disasm(handle, buf, buf_len, start, buf_len, &insn); + for (i = 0, offset = 0; i < count; i++) { + int printed; + + printed = scnprintf(disasm_buf, sizeof(disasm_buf), + " %-7s %s", + insn[i].mnemonic, insn[i].op_str); + print_capstone_detail(&insn[i], disasm_buf + printed, + sizeof(disasm_buf) - printed, args, + start + offset); + + args->offset = offset; + args->line = disasm_buf; + + dl = disasm_line__new(args); + if (dl == NULL) + goto err; + + annotation_line__add(&dl->al, ¬es->src->source); + + offset += insn[i].size; + } + + /* It failed in the middle: probably due to unknown instructions */ + if (offset != buf_len) { + struct list_head *list = ¬es->src->source; + + /* Discard all lines and fallback to objdump */ + while (!list_empty(list)) { + dl = list_first_entry(list, struct disasm_line, al.node); + + list_del_init(&dl->al.node); + disasm_line__free(dl); + } + count = -1; + } + +out: + if (needs_cs_close) { + cs_close(&handle); + if (free_count > 0) + cs_free(insn, free_count); + } + free(code_buf); + return count < 0 ? count : 0; + +err: + if (needs_cs_close) { + struct disasm_line *tmp; + + /* + * It probably failed in the middle of the above loop. + * Release any resources it might add. + */ + list_for_each_entry_safe(dl, tmp, ¬es->src->source, al.node) { + list_del(&dl->al.node); + disasm_line__free(dl); + } + } + count = -1; + goto out; +#else + return -1; +#endif +} + +int symbol__disassemble_capstone_powerpc(const char *filename __maybe_unused, + struct symbol *sym __maybe_unused, + struct annotate_args *args __maybe_unused) +{ +#ifdef HAVE_LIBCAPSTONE_SUPPORT + struct annotation *notes = symbol__annotation(sym); + struct map *map = args->ms.map; + struct dso *dso = map__dso(map); + struct nscookie nsc; + u64 start = map__rip_2objdump(map, sym->start); + u64 end = map__rip_2objdump(map, sym->end); + u64 len = end - start; + u64 offset; + int i, fd, count; + bool is_64bit = false; + bool needs_cs_close = false; + u8 *buf = NULL; + struct find_file_offset_data data = { + .ip = start, + }; + csh handle; + char disasm_buf[512]; + struct disasm_line *dl; + u32 *line; + bool disassembler_style = false; + + if (args->options->objdump_path) + return -1; + + nsinfo__mountns_enter(dso__nsinfo(dso), &nsc); + fd = open(filename, O_RDONLY); + nsinfo__mountns_exit(&nsc); + if (fd < 0) + return -1; + + if (file__read_maps(fd, /*exe=*/true, find_file_offset, &data, + &is_64bit) == 0) + goto err; + + if (!args->options->disassembler_style || + !strcmp(args->options->disassembler_style, "att")) + disassembler_style = true; + + if (capstone_init(maps__machine(args->ms.maps), &handle, is_64bit, disassembler_style) < 0) + goto err; + + needs_cs_close = true; + + buf = malloc(len); + if (buf == NULL) + goto err; + + count = pread(fd, buf, len, data.offset); + close(fd); + fd = -1; + + if ((u64)count != len) + goto err; + + line = (u32 *)buf; + + /* add the function address and name */ + scnprintf(disasm_buf, sizeof(disasm_buf), "%#"PRIx64" <%s>:", + start, sym->name); + + args->offset = -1; + args->line = disasm_buf; + args->line_nr = 0; + args->fileloc = NULL; + args->ms.sym = sym; + + dl = disasm_line__new(args); + if (dl == NULL) + goto err; + + annotation_line__add(&dl->al, ¬es->src->source); + + /* + * TODO: enable disassm for powerpc + * count = cs_disasm(handle, buf, len, start, len, &insn); + * + * For now, only binary code is saved in disassembled line + * to be used in "type" and "typeoff" sort keys. Each raw code + * is 32 bit instruction. So use "len/4" to get the number of + * entries. + */ + count = len/4; + + for (i = 0, offset = 0; i < count; i++) { + args->offset = offset; + sprintf(args->line, "%x", line[i]); + + dl = disasm_line__new(args); + if (dl == NULL) + break; + + annotation_line__add(&dl->al, ¬es->src->source); + + offset += 4; + } + + /* It failed in the middle */ + if (offset != len) { + struct list_head *list = ¬es->src->source; + + /* Discard all lines and fallback to objdump */ + while (!list_empty(list)) { + dl = list_first_entry(list, struct disasm_line, al.node); + + list_del_init(&dl->al.node); + disasm_line__free(dl); + } + count = -1; + } + +out: + if (needs_cs_close) + cs_close(&handle); + free(buf); + return count < 0 ? count : 0; + +err: + if (fd >= 0) + close(fd); + count = -1; + goto out; +#else + return -1; +#endif +} diff --git a/tools/perf/util/capstone.h b/tools/perf/util/capstone.h new file mode 100644 index 000000000000..0f030ea034b6 --- /dev/null +++ b/tools/perf/util/capstone.h @@ -0,0 +1,24 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PERF_CAPSTONE_H +#define __PERF_CAPSTONE_H + +#include <stdbool.h> +#include <stdint.h> +#include <stdio.h> +#include <stdlib.h> +#include <linux/types.h> + +struct annotate_args; +struct machine; +struct symbol; +struct thread; + +ssize_t capstone__fprintf_insn_asm(struct machine *machine, struct thread *thread, u8 cpumode, + bool is64bit, const uint8_t *code, size_t code_size, + uint64_t ip, int *lenp, int print_opts, FILE *fp); +int symbol__disassemble_capstone(const char *filename, struct symbol *sym, + struct annotate_args *args); +int symbol__disassemble_capstone_powerpc(const char *filename, struct symbol *sym, + struct annotate_args *args); + +#endif /* __PERF_CAPSTONE_H */ diff --git a/tools/perf/util/config.c b/tools/perf/util/config.c index ae72b66b6ded..6f914620c6ff 100644 --- a/tools/perf/util/config.c +++ b/tools/perf/util/config.c @@ -19,7 +19,7 @@ #include "util/hist.h" /* perf_hist_config */ #include "util/stat.h" /* perf_stat__set_big_num */ #include "util/evsel.h" /* evsel__hw_names, evsel__use_bpf_counters */ -#include "util/srcline.h" /* addr2line_timeout_ms */ +#include "util/addr2line.h" /* addr2line_timeout_ms */ #include "build-id.h" #include "debug.h" #include "config.h" diff --git a/tools/perf/util/disasm.c b/tools/perf/util/disasm.c index b1e4919d016f..50b9433f3f8e 100644 --- a/tools/perf/util/disasm.c +++ b/tools/perf/util/disasm.c @@ -14,13 +14,15 @@ #include "annotate.h" #include "annotate-data.h" #include "build-id.h" +#include "capstone.h" #include "debug.h" #include "disasm.h" -#include "disasm_bpf.h" #include "dso.h" #include "dwarf-regs.h" #include "env.h" #include "evsel.h" +#include "libbfd.h" +#include "llvm.h" #include "map.h" #include "maps.h" #include "namespaces.h" @@ -49,7 +51,6 @@ static int call__scnprintf(struct ins *ins, char *bf, size_t size, static void ins__sort(struct arch *arch); static int disasm_line__parse(char *line, const char **namep, char **rawp); static int disasm_line__parse_powerpc(struct disasm_line *dl, struct annotate_args *args); -static char *expand_tabs(char *line, char **storage, size_t *storage_len); static __attribute__((constructor)) void symbol__init_regexpr(void) { @@ -246,8 +247,8 @@ static int ins__raw_scnprintf(struct ins *ins, char *bf, size_t size, return scnprintf(bf, size, "%-*s %s", max_ins_name, ins->name, ops->raw); } -int ins__scnprintf(struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name) +static int ins__scnprintf(struct ins *ins, char *bf, size_t size, + struct ins_operands *ops, int max_ins_name) { if (ins->ops->scnprintf) return ins->ops->scnprintf(ins, bf, size, ops, max_ins_name); @@ -390,13 +391,16 @@ static int jump__parse(struct arch *arch, struct ins_operands *ops, struct map_s * skip over possible up to 2 operands to get to address, e.g.: * tbnz w0, #26, ffff0000083cd190 <security_file_permission+0xd0> */ - if (c++ != NULL) { + if (c != NULL) { + c++; ops->target.addr = strtoull(c, NULL, 16); if (!ops->target.addr) { c = strchr(c, ','); c = validate_comma(c, ops); - if (c++ != NULL) + if (c != NULL) { + c++; ops->target.addr = strtoull(c, NULL, 16); + } } } else { ops->target.addr = strtoull(ops->raw, NULL, 16); @@ -824,7 +828,7 @@ static struct ins_ops ret_ops = { .scnprintf = ins__raw_scnprintf, }; -bool ins__is_nop(const struct ins *ins) +static bool ins__is_nop(const struct ins *ins) { return ins->ops == &nop_ops; } @@ -1330,420 +1334,6 @@ fallback: return 0; } -#ifdef HAVE_LIBCAPSTONE_SUPPORT -#include <capstone/capstone.h> - -int capstone_init(struct machine *machine, csh *cs_handle, bool is64, bool disassembler_style); - -static int open_capstone_handle(struct annotate_args *args, bool is_64bit, - csh *handle) -{ - struct annotation_options *opt = args->options; - cs_mode mode = is_64bit ? CS_MODE_64 : CS_MODE_32; - - /* TODO: support more architectures */ - if (!arch__is(args->arch, "x86")) - return -1; - - if (cs_open(CS_ARCH_X86, mode, handle) != CS_ERR_OK) - return -1; - - if (!opt->disassembler_style || - !strcmp(opt->disassembler_style, "att")) - cs_option(*handle, CS_OPT_SYNTAX, CS_OPT_SYNTAX_ATT); - - /* - * Resolving address operands to symbols is implemented - * on x86 by investigating instruction details. - */ - cs_option(*handle, CS_OPT_DETAIL, CS_OPT_ON); - - return 0; -} -#endif - -#if defined(HAVE_LIBCAPSTONE_SUPPORT) || defined(HAVE_LIBLLVM_SUPPORT) -struct find_file_offset_data { - u64 ip; - u64 offset; -}; - -/* This will be called for each PHDR in an ELF binary */ -static int find_file_offset(u64 start, u64 len, u64 pgoff, void *arg) -{ - struct find_file_offset_data *data = arg; - - if (start <= data->ip && data->ip < start + len) { - data->offset = pgoff + data->ip - start; - return 1; - } - return 0; -} - -static u8 * -read_symbol(const char *filename, struct map *map, struct symbol *sym, - u64 *len, bool *is_64bit) -{ - struct dso *dso = map__dso(map); - struct nscookie nsc; - u64 start = map__rip_2objdump(map, sym->start); - u64 end = map__rip_2objdump(map, sym->end); - int fd, count; - u8 *buf = NULL; - struct find_file_offset_data data = { - .ip = start, - }; - - *is_64bit = false; - - nsinfo__mountns_enter(dso__nsinfo(dso), &nsc); - fd = open(filename, O_RDONLY); - nsinfo__mountns_exit(&nsc); - if (fd < 0) - return NULL; - - if (file__read_maps(fd, /*exe=*/true, find_file_offset, &data, - is_64bit) == 0) - goto err; - - *len = end - start; - buf = malloc(*len); - if (buf == NULL) - goto err; - - count = pread(fd, buf, *len, data.offset); - close(fd); - fd = -1; - - if ((u64)count != *len) - goto err; - - return buf; - -err: - if (fd >= 0) - close(fd); - free(buf); - return NULL; -} -#endif - -#if !defined(HAVE_LIBCAPSTONE_SUPPORT) || !defined(HAVE_LIBLLVM_SUPPORT) -static void symbol__disassembler_missing(const char *disassembler, const char *filename, - struct symbol *sym) -{ - pr_debug("The %s disassembler isn't linked in for %s in %s\n", - disassembler, sym->name, filename); -} -#endif - -#ifdef HAVE_LIBCAPSTONE_SUPPORT -static void print_capstone_detail(cs_insn *insn, char *buf, size_t len, - struct annotate_args *args, u64 addr) -{ - int i; - struct map *map = args->ms.map; - struct symbol *sym; - - /* TODO: support more architectures */ - if (!arch__is(args->arch, "x86")) - return; - - if (insn->detail == NULL) - return; - - for (i = 0; i < insn->detail->x86.op_count; i++) { - cs_x86_op *op = &insn->detail->x86.operands[i]; - u64 orig_addr; - - if (op->type != X86_OP_MEM) - continue; - - /* only print RIP-based global symbols for now */ - if (op->mem.base != X86_REG_RIP) - continue; - - /* get the target address */ - orig_addr = addr + insn->size + op->mem.disp; - addr = map__objdump_2mem(map, orig_addr); - - if (dso__kernel(map__dso(map))) { - /* - * The kernel maps can be splitted into sections, - * let's find the map first and the search the symbol. - */ - map = maps__find(map__kmaps(map), addr); - if (map == NULL) - continue; - } - - /* convert it to map-relative address for search */ - addr = map__map_ip(map, addr); - - sym = map__find_symbol(map, addr); - if (sym == NULL) - continue; - - if (addr == sym->start) { - scnprintf(buf, len, "\t# %"PRIx64" <%s>", - orig_addr, sym->name); - } else { - scnprintf(buf, len, "\t# %"PRIx64" <%s+%#"PRIx64">", - orig_addr, sym->name, addr - sym->start); - } - break; - } -} - -static int symbol__disassemble_capstone_powerpc(char *filename, struct symbol *sym, - struct annotate_args *args) -{ - struct annotation *notes = symbol__annotation(sym); - struct map *map = args->ms.map; - struct dso *dso = map__dso(map); - struct nscookie nsc; - u64 start = map__rip_2objdump(map, sym->start); - u64 end = map__rip_2objdump(map, sym->end); - u64 len = end - start; - u64 offset; - int i, fd, count; - bool is_64bit = false; - bool needs_cs_close = false; - u8 *buf = NULL; - struct find_file_offset_data data = { - .ip = start, - }; - csh handle; - char disasm_buf[512]; - struct disasm_line *dl; - u32 *line; - bool disassembler_style = false; - - if (args->options->objdump_path) - return -1; - - nsinfo__mountns_enter(dso__nsinfo(dso), &nsc); - fd = open(filename, O_RDONLY); - nsinfo__mountns_exit(&nsc); - if (fd < 0) - return -1; - - if (file__read_maps(fd, /*exe=*/true, find_file_offset, &data, - &is_64bit) == 0) - goto err; - - if (!args->options->disassembler_style || - !strcmp(args->options->disassembler_style, "att")) - disassembler_style = true; - - if (capstone_init(maps__machine(args->ms.maps), &handle, is_64bit, disassembler_style) < 0) - goto err; - - needs_cs_close = true; - - buf = malloc(len); - if (buf == NULL) - goto err; - - count = pread(fd, buf, len, data.offset); - close(fd); - fd = -1; - - if ((u64)count != len) - goto err; - - line = (u32 *)buf; - - /* add the function address and name */ - scnprintf(disasm_buf, sizeof(disasm_buf), "%#"PRIx64" <%s>:", - start, sym->name); - - args->offset = -1; - args->line = disasm_buf; - args->line_nr = 0; - args->fileloc = NULL; - args->ms.sym = sym; - - dl = disasm_line__new(args); - if (dl == NULL) - goto err; - - annotation_line__add(&dl->al, ¬es->src->source); - - /* - * TODO: enable disassm for powerpc - * count = cs_disasm(handle, buf, len, start, len, &insn); - * - * For now, only binary code is saved in disassembled line - * to be used in "type" and "typeoff" sort keys. Each raw code - * is 32 bit instruction. So use "len/4" to get the number of - * entries. - */ - count = len/4; - - for (i = 0, offset = 0; i < count; i++) { - args->offset = offset; - sprintf(args->line, "%x", line[i]); - - dl = disasm_line__new(args); - if (dl == NULL) - break; - - annotation_line__add(&dl->al, ¬es->src->source); - - offset += 4; - } - - /* It failed in the middle */ - if (offset != len) { - struct list_head *list = ¬es->src->source; - - /* Discard all lines and fallback to objdump */ - while (!list_empty(list)) { - dl = list_first_entry(list, struct disasm_line, al.node); - - list_del_init(&dl->al.node); - disasm_line__free(dl); - } - count = -1; - } - -out: - if (needs_cs_close) - cs_close(&handle); - free(buf); - return count < 0 ? count : 0; - -err: - if (fd >= 0) - close(fd); - count = -1; - goto out; -} - -static int symbol__disassemble_capstone(char *filename, struct symbol *sym, - struct annotate_args *args) -{ - struct annotation *notes = symbol__annotation(sym); - struct map *map = args->ms.map; - u64 start = map__rip_2objdump(map, sym->start); - u64 len; - u64 offset; - int i, count, free_count; - bool is_64bit = false; - bool needs_cs_close = false; - u8 *buf = NULL; - csh handle; - cs_insn *insn = NULL; - char disasm_buf[512]; - struct disasm_line *dl; - - if (args->options->objdump_path) - return -1; - - buf = read_symbol(filename, map, sym, &len, &is_64bit); - if (buf == NULL) - return -1; - - /* add the function address and name */ - scnprintf(disasm_buf, sizeof(disasm_buf), "%#"PRIx64" <%s>:", - start, sym->name); - - args->offset = -1; - args->line = disasm_buf; - args->line_nr = 0; - args->fileloc = NULL; - args->ms.sym = sym; - - dl = disasm_line__new(args); - if (dl == NULL) - goto err; - - annotation_line__add(&dl->al, ¬es->src->source); - - if (open_capstone_handle(args, is_64bit, &handle) < 0) - goto err; - - needs_cs_close = true; - - free_count = count = cs_disasm(handle, buf, len, start, len, &insn); - for (i = 0, offset = 0; i < count; i++) { - int printed; - - printed = scnprintf(disasm_buf, sizeof(disasm_buf), - " %-7s %s", - insn[i].mnemonic, insn[i].op_str); - print_capstone_detail(&insn[i], disasm_buf + printed, - sizeof(disasm_buf) - printed, args, - start + offset); - - args->offset = offset; - args->line = disasm_buf; - - dl = disasm_line__new(args); - if (dl == NULL) - goto err; - - annotation_line__add(&dl->al, ¬es->src->source); - - offset += insn[i].size; - } - - /* It failed in the middle: probably due to unknown instructions */ - if (offset != len) { - struct list_head *list = ¬es->src->source; - - /* Discard all lines and fallback to objdump */ - while (!list_empty(list)) { - dl = list_first_entry(list, struct disasm_line, al.node); - - list_del_init(&dl->al.node); - disasm_line__free(dl); - } - count = -1; - } - -out: - if (needs_cs_close) { - cs_close(&handle); - if (free_count > 0) - cs_free(insn, free_count); - } - free(buf); - return count < 0 ? count : 0; - -err: - if (needs_cs_close) { - struct disasm_line *tmp; - - /* - * It probably failed in the middle of the above loop. - * Release any resources it might add. - */ - list_for_each_entry_safe(dl, tmp, ¬es->src->source, al.node) { - list_del(&dl->al.node); - disasm_line__free(dl); - } - } - count = -1; - goto out; -} -#else // HAVE_LIBCAPSTONE_SUPPORT -static int symbol__disassemble_capstone(char *filename, struct symbol *sym, - struct annotate_args *args __maybe_unused) -{ - symbol__disassembler_missing("capstone", filename, sym); - return -1; -} - -static int symbol__disassemble_capstone_powerpc(char *filename, struct symbol *sym, - struct annotate_args *args __maybe_unused) -{ - symbol__disassembler_missing("capstone powerpc", filename, sym); - return -1; -} -#endif // HAVE_LIBCAPSTONE_SUPPORT - static int symbol__disassemble_raw(char *filename, struct symbol *sym, struct annotate_args *args) { @@ -1830,201 +1420,12 @@ err: goto out; } -#ifdef HAVE_LIBLLVM_SUPPORT -#include <llvm-c/Disassembler.h> -#include <llvm-c/Target.h> -#include "util/llvm-c-helpers.h" - -struct symbol_lookup_storage { - u64 branch_addr; - u64 pcrel_load_addr; -}; - -/* - * Whenever LLVM wants to resolve an address into a symbol, it calls this - * callback. We don't ever actually _return_ anything (in particular, because - * it puts quotation marks around what we return), but we use this as a hint - * that there is a branch or PC-relative address in the expression that we - * should add some textual annotation for after the instruction. The caller - * will use this information to add the actual annotation. - */ -static const char * -symbol_lookup_callback(void *disinfo, uint64_t value, - uint64_t *ref_type, - uint64_t address __maybe_unused, - const char **ref __maybe_unused) -{ - struct symbol_lookup_storage *storage = disinfo; - - if (*ref_type == LLVMDisassembler_ReferenceType_In_Branch) - storage->branch_addr = value; - else if (*ref_type == LLVMDisassembler_ReferenceType_In_PCrel_Load) - storage->pcrel_load_addr = value; - *ref_type = LLVMDisassembler_ReferenceType_InOut_None; - return NULL; -} - -static int symbol__disassemble_llvm(char *filename, struct symbol *sym, - struct annotate_args *args) -{ - struct annotation *notes = symbol__annotation(sym); - struct map *map = args->ms.map; - struct dso *dso = map__dso(map); - u64 start = map__rip_2objdump(map, sym->start); - u8 *buf; - u64 len; - u64 pc; - bool is_64bit; - char triplet[64]; - char disasm_buf[2048]; - size_t disasm_len; - struct disasm_line *dl; - LLVMDisasmContextRef disasm = NULL; - struct symbol_lookup_storage storage; - char *line_storage = NULL; - size_t line_storage_len = 0; - int ret = -1; - - if (args->options->objdump_path) - return -1; - - LLVMInitializeAllTargetInfos(); - LLVMInitializeAllTargetMCs(); - LLVMInitializeAllDisassemblers(); - - buf = read_symbol(filename, map, sym, &len, &is_64bit); - if (buf == NULL) - return -1; - - if (arch__is(args->arch, "x86")) { - if (is_64bit) - scnprintf(triplet, sizeof(triplet), "x86_64-pc-linux"); - else - scnprintf(triplet, sizeof(triplet), "i686-pc-linux"); - } else { - scnprintf(triplet, sizeof(triplet), "%s-linux-gnu", - args->arch->name); - } - - disasm = LLVMCreateDisasm(triplet, &storage, 0, NULL, - symbol_lookup_callback); - if (disasm == NULL) - goto err; - - if (args->options->disassembler_style && - !strcmp(args->options->disassembler_style, "intel")) - LLVMSetDisasmOptions(disasm, - LLVMDisassembler_Option_AsmPrinterVariant); - - /* - * This needs to be set after AsmPrinterVariant, due to a bug in LLVM; - * setting AsmPrinterVariant makes a new instruction printer, making it - * forget about the PrintImmHex flag (which is applied before if both - * are given to the same call). - */ - LLVMSetDisasmOptions(disasm, LLVMDisassembler_Option_PrintImmHex); - - /* add the function address and name */ - scnprintf(disasm_buf, sizeof(disasm_buf), "%#"PRIx64" <%s>:", - start, sym->name); - - args->offset = -1; - args->line = disasm_buf; - args->line_nr = 0; - args->fileloc = NULL; - args->ms.sym = sym; - - dl = disasm_line__new(args); - if (dl == NULL) - goto err; - - annotation_line__add(&dl->al, ¬es->src->source); - - pc = start; - for (u64 offset = 0; offset < len; ) { - unsigned int ins_len; - - storage.branch_addr = 0; - storage.pcrel_load_addr = 0; - - ins_len = LLVMDisasmInstruction(disasm, buf + offset, - len - offset, pc, - disasm_buf, sizeof(disasm_buf)); - if (ins_len == 0) - goto err; - disasm_len = strlen(disasm_buf); - - if (storage.branch_addr != 0) { - char *name = llvm_name_for_code(dso, filename, - storage.branch_addr); - if (name != NULL) { - disasm_len += scnprintf(disasm_buf + disasm_len, - sizeof(disasm_buf) - - disasm_len, - " <%s>", name); - free(name); - } - } - if (storage.pcrel_load_addr != 0) { - char *name = llvm_name_for_data(dso, filename, - storage.pcrel_load_addr); - disasm_len += scnprintf(disasm_buf + disasm_len, - sizeof(disasm_buf) - disasm_len, - " # %#"PRIx64, - storage.pcrel_load_addr); - if (name) { - disasm_len += scnprintf(disasm_buf + disasm_len, - sizeof(disasm_buf) - - disasm_len, - " <%s>", name); - free(name); - } - } - - args->offset = offset; - args->line = expand_tabs(disasm_buf, &line_storage, - &line_storage_len); - args->line_nr = 0; - args->fileloc = NULL; - args->ms.sym = sym; - - llvm_addr2line(filename, pc, &args->fileloc, - (unsigned int *)&args->line_nr, false, NULL); - - dl = disasm_line__new(args); - if (dl == NULL) - goto err; - - annotation_line__add(&dl->al, ¬es->src->source); - - free(args->fileloc); - pc += ins_len; - offset += ins_len; - } - - ret = 0; - -err: - LLVMDisasmDispose(disasm); - free(buf); - free(line_storage); - return ret; -} -#else // HAVE_LIBLLVM_SUPPORT -static int symbol__disassemble_llvm(char *filename, struct symbol *sym, - struct annotate_args *args __maybe_unused) -{ - symbol__disassembler_missing("LLVM", filename, sym); - return -1; -} -#endif // HAVE_LIBLLVM_SUPPORT - /* * Possibly create a new version of line with tabs expanded. Returns the * existing or new line, storage is updated if a new line is allocated. If * allocation fails then NULL is returned. */ -static char *expand_tabs(char *line, char **storage, size_t *storage_len) +char *expand_tabs(char *line, char **storage, size_t *storage_len) { size_t i, src, dst, len, new_storage_len, num_tabs; char *new_line; @@ -2079,6 +1480,23 @@ static char *expand_tabs(char *line, char **storage, size_t *storage_len) return new_line; } +static int symbol__disassemble_bpf_image(struct symbol *sym, struct annotate_args *args) +{ + struct annotation *notes = symbol__annotation(sym); + struct disasm_line *dl; + + args->offset = -1; + args->line = strdup("to be implemented"); + args->line_nr = 0; + args->fileloc = NULL; + dl = disasm_line__new(args); + if (dl) + annotation_line__add(&dl->al, ¬es->src->source); + + zfree(&args->line); + return 0; +} + static int symbol__disassemble_objdump(const char *filename, struct symbol *sym, struct annotate_args *args) { @@ -2103,6 +1521,12 @@ static int symbol__disassemble_objdump(const char *filename, struct symbol *sym, struct child_process objdump_process; int err; + if (dso__binary_type(dso) == DSO_BINARY_TYPE__BPF_PROG_INFO) + return symbol__disassemble_bpf_libbfd(sym, args); + + if (dso__binary_type(dso) == DSO_BINARY_TYPE__BPF_IMAGE) + return symbol__disassemble_bpf_image(sym, args); + err = asprintf(&command, "%s %s%s --start-address=0x%016" PRIx64 " --stop-address=0x%016" PRIx64 @@ -2237,11 +1661,7 @@ int symbol__disassemble(struct symbol *sym, struct annotate_args *args) pr_debug("annotating [%p] %30s : [%p] %30s\n", dso, dso__long_name(dso), sym, sym->name); - if (dso__binary_type(dso) == DSO_BINARY_TYPE__BPF_PROG_INFO) { - return symbol__disassemble_bpf(sym, args); - } else if (dso__binary_type(dso) == DSO_BINARY_TYPE__BPF_IMAGE) { - return symbol__disassemble_bpf_image(sym, args); - } else if (dso__binary_type(dso) == DSO_BINARY_TYPE__NOT_FOUND) { + if (dso__binary_type(dso) == DSO_BINARY_TYPE__NOT_FOUND) { return SYMBOL_ANNOTATE_ERRNO__COULDNT_DETERMINE_FILE_TYPE; } else if (dso__is_kcore(dso)) { kce.addr = map__rip_2objdump(map, sym->start); diff --git a/tools/perf/util/disasm.h b/tools/perf/util/disasm.h index c135db2416b5..d2cb555e4a3b 100644 --- a/tools/perf/util/disasm.h +++ b/tools/perf/util/disasm.h @@ -98,7 +98,6 @@ struct ins_ops { struct annotate_args { struct arch *arch; struct map_symbol ms; - struct evsel *evsel; struct annotation_options *options; s64 offset; char *line; @@ -110,13 +109,10 @@ struct arch *arch__find(const char *name); bool arch__is(struct arch *arch, const char *name); struct ins_ops *ins__find(struct arch *arch, const char *name, struct disasm_line *dl); -int ins__scnprintf(struct ins *ins, char *bf, size_t size, - struct ins_operands *ops, int max_ins_name); bool ins__is_call(const struct ins *ins); bool ins__is_jump(const struct ins *ins); bool ins__is_fused(struct arch *arch, const char *ins1, const char *ins2); -bool ins__is_nop(const struct ins *ins); bool ins__is_ret(const struct ins *ins); bool ins__is_lock(const struct ins *ins); @@ -128,4 +124,6 @@ int disasm_line__scnprintf(struct disasm_line *dl, char *bf, size_t size, int symbol__disassemble(struct symbol *sym, struct annotate_args *args); +char *expand_tabs(char *line, char **storage, size_t *storage_len); + #endif /* __PERF_UTIL_DISASM_H */ diff --git a/tools/perf/util/disasm_bpf.c b/tools/perf/util/disasm_bpf.c deleted file mode 100644 index 1fee71c79b62..000000000000 --- a/tools/perf/util/disasm_bpf.c +++ /dev/null @@ -1,195 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only - -#include "util/annotate.h" -#include "util/disasm_bpf.h" -#include "util/symbol.h" -#include <linux/zalloc.h> -#include <string.h> - -#if defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) -#define PACKAGE "perf" -#include <bfd.h> -#include <bpf/bpf.h> -#include <bpf/btf.h> -#include <bpf/libbpf.h> -#include <dis-asm.h> -#include <errno.h> -#include <linux/btf.h> -#include <tools/dis-asm-compat.h> - -#include "util/bpf-event.h" -#include "util/bpf-utils.h" -#include "util/debug.h" -#include "util/dso.h" -#include "util/map.h" -#include "util/env.h" -#include "util/util.h" - -int symbol__disassemble_bpf(struct symbol *sym, struct annotate_args *args) -{ - struct annotation *notes = symbol__annotation(sym); - struct bpf_prog_linfo *prog_linfo = NULL; - struct bpf_prog_info_node *info_node; - int len = sym->end - sym->start; - disassembler_ftype disassemble; - struct map *map = args->ms.map; - struct perf_bpil *info_linear; - struct disassemble_info info; - struct dso *dso = map__dso(map); - int pc = 0, count, sub_id; - struct btf *btf = NULL; - char tpath[PATH_MAX]; - size_t buf_size; - int nr_skip = 0; - char *buf; - bfd *bfdf; - int ret; - FILE *s; - - if (dso__binary_type(dso) != DSO_BINARY_TYPE__BPF_PROG_INFO) - return SYMBOL_ANNOTATE_ERRNO__BPF_INVALID_FILE; - - pr_debug("%s: handling sym %s addr %" PRIx64 " len %" PRIx64 "\n", __func__, - sym->name, sym->start, sym->end - sym->start); - - memset(tpath, 0, sizeof(tpath)); - perf_exe(tpath, sizeof(tpath)); - - bfdf = bfd_openr(tpath, NULL); - if (bfdf == NULL) - abort(); - - if (!bfd_check_format(bfdf, bfd_object)) - abort(); - - s = open_memstream(&buf, &buf_size); - if (!s) { - ret = errno; - goto out; - } - init_disassemble_info_compat(&info, s, - (fprintf_ftype) fprintf, - fprintf_styled); - info.arch = bfd_get_arch(bfdf); - info.mach = bfd_get_mach(bfdf); - - info_node = perf_env__find_bpf_prog_info(dso__bpf_prog(dso)->env, - dso__bpf_prog(dso)->id); - if (!info_node) { - ret = SYMBOL_ANNOTATE_ERRNO__BPF_MISSING_BTF; - goto out; - } - info_linear = info_node->info_linear; - sub_id = dso__bpf_prog(dso)->sub_id; - - info.buffer = (void *)(uintptr_t)(info_linear->info.jited_prog_insns); - info.buffer_length = info_linear->info.jited_prog_len; - - if (info_linear->info.nr_line_info) - prog_linfo = bpf_prog_linfo__new(&info_linear->info); - - if (info_linear->info.btf_id) { - struct btf_node *node; - - node = perf_env__find_btf(dso__bpf_prog(dso)->env, - info_linear->info.btf_id); - if (node) - btf = btf__new((__u8 *)(node->data), - node->data_size); - } - - disassemble_init_for_target(&info); - -#ifdef DISASM_FOUR_ARGS_SIGNATURE - disassemble = disassembler(info.arch, - bfd_big_endian(bfdf), - info.mach, - bfdf); -#else - disassemble = disassembler(bfdf); -#endif - if (disassemble == NULL) - abort(); - - fflush(s); - do { - const struct bpf_line_info *linfo = NULL; - struct disasm_line *dl; - size_t prev_buf_size; - const char *srcline; - u64 addr; - - addr = pc + ((u64 *)(uintptr_t)(info_linear->info.jited_ksyms))[sub_id]; - count = disassemble(pc, &info); - - if (prog_linfo) - linfo = bpf_prog_linfo__lfind_addr_func(prog_linfo, - addr, sub_id, - nr_skip); - - if (linfo && btf) { - srcline = btf__name_by_offset(btf, linfo->line_off); - nr_skip++; - } else - srcline = NULL; - - fprintf(s, "\n"); - prev_buf_size = buf_size; - fflush(s); - - if (!annotate_opts.hide_src_code && srcline) { - args->offset = -1; - args->line = strdup(srcline); - args->line_nr = 0; - args->fileloc = NULL; - args->ms.sym = sym; - dl = disasm_line__new(args); - if (dl) { - annotation_line__add(&dl->al, - ¬es->src->source); - } - } - - args->offset = pc; - args->line = buf + prev_buf_size; - args->line_nr = 0; - args->fileloc = NULL; - args->ms.sym = sym; - dl = disasm_line__new(args); - if (dl) - annotation_line__add(&dl->al, ¬es->src->source); - - pc += count; - } while (count > 0 && pc < len); - - ret = 0; -out: - free(prog_linfo); - btf__free(btf); - fclose(s); - bfd_close(bfdf); - return ret; -} -#else // defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) -int symbol__disassemble_bpf(struct symbol *sym __maybe_unused, struct annotate_args *args __maybe_unused) -{ - return SYMBOL_ANNOTATE_ERRNO__NO_LIBOPCODES_FOR_BPF; -} -#endif // defined(HAVE_LIBBFD_SUPPORT) && defined(HAVE_LIBBPF_SUPPORT) - -int symbol__disassemble_bpf_image(struct symbol *sym, struct annotate_args *args) -{ - struct annotation *notes = symbol__annotation(sym); - struct disasm_line *dl; - - args->offset = -1; - args->line = strdup("to be implemented"); - args->line_nr = 0; - args->fileloc = NULL; - dl = disasm_line__new(args); - if (dl) - annotation_line__add(&dl->al, ¬es->src->source); - - zfree(&args->line); - return 0; -} diff --git a/tools/perf/util/disasm_bpf.h b/tools/perf/util/disasm_bpf.h deleted file mode 100644 index 2ecb19545388..000000000000 --- a/tools/perf/util/disasm_bpf.h +++ /dev/null @@ -1,12 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only - -#ifndef __PERF_DISASM_BPF_H -#define __PERF_DISASM_BPF_H - -struct symbol; -struct annotate_args; - -int symbol__disassemble_bpf(struct symbol *sym, struct annotate_args *args); -int symbol__disassemble_bpf_image(struct symbol *sym, struct annotate_args *args); - -#endif /* __PERF_DISASM_BPF_H */ diff --git a/tools/perf/util/drm_pmu.c b/tools/perf/util/drm_pmu.c index 988890f37ba7..98d4d2b556d4 100644 --- a/tools/perf/util/drm_pmu.c +++ b/tools/perf/util/drm_pmu.c @@ -458,8 +458,10 @@ static int for_each_drm_fdinfo_in_dir(int (*cb)(void *args, int fdinfo_dir_fd, c } ret = cb(args, fdinfo_dir_fd, fd_entry->d_name); if (ret) - return ret; + goto close_fdinfo; } + +close_fdinfo: if (fdinfo_dir_fd != -1) close(fdinfo_dir_fd); closedir(fd_dir); diff --git a/tools/perf/util/dso.c b/tools/perf/util/dso.c index 282e3af85d5a..344e689567ee 100644 --- a/tools/perf/util/dso.c +++ b/tools/perf/util/dso.c @@ -1798,3 +1798,115 @@ bool is_perf_pid_map_name(const char *dso_name) return perf_pid_map_tid(dso_name, &tid); } + +struct find_file_offset_data { + u64 ip; + u64 offset; +}; + +/* This will be called for each PHDR in an ELF binary */ +static int find_file_offset(u64 start, u64 len, u64 pgoff, void *arg) +{ + struct find_file_offset_data *data = arg; + + if (start <= data->ip && data->ip < start + len) { + data->offset = pgoff + data->ip - start; + return 1; + } + return 0; +} + +static const u8 *__dso__read_symbol(struct dso *dso, const char *symfs_filename, + u64 start, size_t len, + u8 **out_buf, u64 *out_buf_len, bool *is_64bit) +{ + struct nscookie nsc; + int fd; + ssize_t count; + struct find_file_offset_data data = { + .ip = start, + }; + u8 *code_buf = NULL; + int saved_errno; + + nsinfo__mountns_enter(dso__nsinfo(dso), &nsc); + fd = open(symfs_filename, O_RDONLY); + saved_errno = errno; + nsinfo__mountns_exit(&nsc); + if (fd < 0) { + errno = saved_errno; + return NULL; + } + if (file__read_maps(fd, /*exe=*/true, find_file_offset, &data, is_64bit) <= 0) { + close(fd); + errno = ENOENT; + return NULL; + } + code_buf = malloc(len); + if (code_buf == NULL) { + close(fd); + errno = ENOMEM; + return NULL; + } + count = pread(fd, code_buf, len, data.offset); + saved_errno = errno; + close(fd); + if ((u64)count != len) { + free(code_buf); + errno = saved_errno; + return NULL; + } + *out_buf = code_buf; + *out_buf_len = len; + return code_buf; +} + +/* + * Read a symbol into memory for disassembly by a library like capstone of + * libLLVM. If memory is allocated out_buf holds it. + */ +const u8 *dso__read_symbol(struct dso *dso, const char *symfs_filename, + const struct map *map, const struct symbol *sym, + u8 **out_buf, u64 *out_buf_len, bool *is_64bit) +{ + u64 start = map__rip_2objdump(map, sym->start); + u64 end = map__rip_2objdump(map, sym->end); + size_t len = end - start; + + *out_buf = NULL; + *out_buf_len = 0; + *is_64bit = false; + + if (dso__binary_type(dso) == DSO_BINARY_TYPE__BPF_IMAGE) { + /* + * Note, there is fallback BPF image disassembly in the objdump + * version but it currently does nothing. + */ + errno = EOPNOTSUPP; + return NULL; + } + if (dso__binary_type(dso) == DSO_BINARY_TYPE__BPF_PROG_INFO) { +#ifdef HAVE_LIBBPF_SUPPORT + struct bpf_prog_info_node *info_node; + struct perf_bpil *info_linear; + + *is_64bit = sizeof(void *) == sizeof(u64); + info_node = perf_env__find_bpf_prog_info(dso__bpf_prog(dso)->env, + dso__bpf_prog(dso)->id); + if (!info_node) { + errno = SYMBOL_ANNOTATE_ERRNO__BPF_MISSING_BTF; + return NULL; + } + info_linear = info_node->info_linear; + assert(len <= info_linear->info.jited_prog_len); + *out_buf_len = len; + return (const u8 *)(uintptr_t)(info_linear->info.jited_prog_insns); +#else + pr_debug("No BPF program disassembly support\n"); + errno = EOPNOTSUPP; + return NULL; +#endif + } + return __dso__read_symbol(dso, symfs_filename, start, len, + out_buf, out_buf_len, is_64bit); +} diff --git a/tools/perf/util/dso.h b/tools/perf/util/dso.h index 3457d713d3c5..f8ccb9816b89 100644 --- a/tools/perf/util/dso.h +++ b/tools/perf/util/dso.h @@ -10,6 +10,7 @@ #include <stdio.h> #include <linux/bitops.h> #include "build-id.h" +#include "debuginfo.h" #include "mutex.h" #include <internal/rc_check.h> @@ -299,6 +300,7 @@ DECLARE_RC_STRUCT(dso) { u8 hit:1; u8 annotate_warned:1; u8 auxtrace_warned:1; + u8 debuginfo_warned:1; u8 short_name_allocated:1; u8 long_name_allocated:1; u8 is_64_bit:1; @@ -362,6 +364,16 @@ static inline void dso__set_annotate_warned(struct dso *dso) RC_CHK_ACCESS(dso)->annotate_warned = 1; } +static inline bool dso__debuginfo_warned(const struct dso *dso) +{ + return RC_CHK_ACCESS(dso)->debuginfo_warned; +} + +static inline void dso__set_debuginfo_warned(struct dso *dso) +{ + RC_CHK_ACCESS(dso)->debuginfo_warned = 1; +} + static inline bool dso__auxtrace_warned(const struct dso *dso) { return RC_CHK_ACCESS(dso)->auxtrace_warned; @@ -903,4 +915,17 @@ u64 dso__findnew_global_type(struct dso *dso, u64 addr, u64 offset); bool perf_pid_map_tid(const char *dso_name, int *tid); bool is_perf_pid_map_name(const char *dso_name); +/* + * In the future, we may get debuginfo using build-ID (w/o path). + * Add this helper is for the smooth conversion. + */ +static inline struct debuginfo *dso__debuginfo(struct dso *dso) +{ + return debuginfo__new(dso__long_name(dso)); +} + +const u8 *dso__read_symbol(struct dso *dso, const char *symfs_filename, + const struct map *map, const struct symbol *sym, + u8 **out_buf, u64 *out_buf_len, bool *is_64bit); + #endif /* __PERF_DSO */ diff --git a/tools/perf/util/dwarf-aux.c b/tools/perf/util/dwarf-aux.c index 559c953ca172..9267af204c7d 100644 --- a/tools/perf/util/dwarf-aux.c +++ b/tools/perf/util/dwarf-aux.c @@ -1388,18 +1388,19 @@ struct find_var_data { #define DWARF_OP_DIRECT_REGS 32 static bool match_var_offset(Dwarf_Die *die_mem, struct find_var_data *data, - u64 addr_offset, u64 addr_type, bool is_pointer) + s64 addr_offset, s64 addr_type, bool is_pointer) { Dwarf_Die type_die; Dwarf_Word size; + s64 offset = addr_offset - addr_type; - if (addr_offset == addr_type) { + if (offset == 0) { /* Update offset relative to the start of the variable */ data->offset = 0; return true; } - if (addr_offset < addr_type) + if (offset < 0) return false; if (die_get_real_type(die_mem, &type_die) == NULL) @@ -1414,14 +1415,42 @@ static bool match_var_offset(Dwarf_Die *die_mem, struct find_var_data *data, if (dwarf_aggregate_size(&type_die, &size) < 0) return false; - if (addr_offset >= addr_type + size) + if ((u64)offset >= size) return false; /* Update offset relative to the start of the variable */ - data->offset = addr_offset - addr_type; + data->offset = offset; return true; } +/** + * is_breg_access_indirect - Check if breg based access implies type + * dereference + * @ops: DWARF operations array + * @nops: Number of operations in @ops + * + * Returns true if the DWARF expression evaluates to the variable's + * value, so the memory access on that register needs type dereference. + * Returns false if the expression evaluates to the variable's address. + * This is called after check_allowed_ops. + */ +static bool is_breg_access_indirect(Dwarf_Op *ops, size_t nops) +{ + /* only the base register */ + if (nops == 1) + return false; + + if (nops == 2 && ops[1].atom == DW_OP_stack_value) + return true; + + if (nops == 3 && (ops[1].atom == DW_OP_deref || + ops[1].atom == DW_OP_deref_size) && + ops[2].atom == DW_OP_stack_value) + return false; + /* unreachable, OP not supported */ + return false; +} + /* Only checks direct child DIEs in the given scope. */ static int __die_find_var_reg_cb(Dwarf_Die *die_mem, void *arg) { @@ -1450,7 +1479,7 @@ static int __die_find_var_reg_cb(Dwarf_Die *die_mem, void *arg) if (data->is_fbreg && ops->atom == DW_OP_fbreg && check_allowed_ops(ops, nops) && match_var_offset(die_mem, data, data->offset, ops->number, - /*is_pointer=*/false)) + is_breg_access_indirect(ops, nops))) return DIE_FIND_CB_END; /* Only match with a simple case */ @@ -1462,11 +1491,11 @@ static int __die_find_var_reg_cb(Dwarf_Die *die_mem, void *arg) /*is_pointer=*/true)) return DIE_FIND_CB_END; - /* Local variables accessed by a register + offset */ + /* variables accessed by a register + offset */ if (ops->atom == (DW_OP_breg0 + data->reg) && check_allowed_ops(ops, nops) && match_var_offset(die_mem, data, data->offset, ops->number, - /*is_pointer=*/false)) + is_breg_access_indirect(ops, nops))) return DIE_FIND_CB_END; } else { /* pointer variables saved in a register 32 or above */ @@ -1476,11 +1505,11 @@ static int __die_find_var_reg_cb(Dwarf_Die *die_mem, void *arg) /*is_pointer=*/true)) return DIE_FIND_CB_END; - /* Local variables accessed by a register + offset */ + /* variables accessed by a register + offset */ if (ops->atom == DW_OP_bregx && data->reg == ops->number && check_allowed_ops(ops, nops) && match_var_offset(die_mem, data, data->offset, ops->number2, - /*is_poitner=*/false)) + is_breg_access_indirect(ops, nops))) return DIE_FIND_CB_END; } } @@ -1598,13 +1627,22 @@ static int __die_collect_vars_cb(Dwarf_Die *die_mem, void *arg) if (!check_allowed_ops(ops, nops)) return DIE_FIND_CB_SIBLING; - if (die_get_real_type(die_mem, &type_die) == NULL) + if (__die_get_real_type(die_mem, &type_die) == NULL) return DIE_FIND_CB_SIBLING; vt = malloc(sizeof(*vt)); if (vt == NULL) return DIE_FIND_CB_END; + /* Usually a register holds the value of a variable */ + vt->is_reg_var_addr = false; + + if (((ops->atom >= DW_OP_breg0 && ops->atom <= DW_OP_breg31) || + ops->atom == DW_OP_bregx || ops->atom == DW_OP_fbreg) && + !is_breg_access_indirect(ops, nops)) + /* The register contains an address of the variable. */ + vt->is_reg_var_addr = true; + vt->die_off = dwarf_dieoffset(&type_die); vt->addr = start; vt->reg = reg_from_dwarf_op(ops); @@ -1920,6 +1958,7 @@ struct find_scope_data { static int __die_find_scope_cb(Dwarf_Die *die_mem, void *arg) { struct find_scope_data *data = arg; + int tag = dwarf_tag(die_mem); if (dwarf_haspc(die_mem, data->pc)) { Dwarf_Die *tmp; @@ -1933,6 +1972,14 @@ static int __die_find_scope_cb(Dwarf_Die *die_mem, void *arg) data->nr++; return DIE_FIND_CB_CHILD; } + + /* + * If the DIE doesn't have the PC, we still need to check its children + * and siblings if it's a container like a namespace. + */ + if (tag == DW_TAG_namespace) + return DIE_FIND_CB_CONTINUE; + return DIE_FIND_CB_SIBLING; } diff --git a/tools/perf/util/dwarf-aux.h b/tools/perf/util/dwarf-aux.h index 892c8c5c23fc..cd481ec9c5a1 100644 --- a/tools/perf/util/dwarf-aux.h +++ b/tools/perf/util/dwarf-aux.h @@ -148,6 +148,8 @@ struct die_var_type { u64 addr; int reg; int offset; + /* Whether the register holds a address to the type */ + bool is_reg_var_addr; }; /* Return type info of a member at offset */ diff --git a/tools/perf/util/env.c b/tools/perf/util/env.c index c8c248754621..f1626d2032cd 100644 --- a/tools/perf/util/env.c +++ b/tools/perf/util/env.c @@ -802,3 +802,25 @@ bool x86__is_amd_cpu(void) return is_amd; } + +bool perf_env__is_x86_intel_cpu(struct perf_env *env) +{ + static int is_intel; /* 0: Uninitialized, 1: Yes, -1: No */ + + if (is_intel == 0) + is_intel = env->cpuid && strstarts(env->cpuid, "GenuineIntel") ? 1 : -1; + + return is_intel >= 1 ? true : false; +} + +bool x86__is_intel_cpu(void) +{ + struct perf_env env = { .total_mem = 0, }; + bool is_intel; + + perf_env__cpuid(&env); + is_intel = perf_env__is_x86_intel_cpu(&env); + perf_env__exit(&env); + + return is_intel; +} diff --git a/tools/perf/util/env.h b/tools/perf/util/env.h index e00179787a34..9977b85523a8 100644 --- a/tools/perf/util/env.h +++ b/tools/perf/util/env.h @@ -201,5 +201,7 @@ void perf_env__find_br_cntr_info(struct perf_env *env, bool x86__is_amd_cpu(void); bool perf_env__is_x86_amd_cpu(struct perf_env *env); +bool x86__is_intel_cpu(void); +bool perf_env__is_x86_intel_cpu(struct perf_env *env); #endif /* __PERF_ENV_H */ diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index e40d16d3246c..64c63b59d617 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -117,6 +117,7 @@ enum perf_synth_id { PERF_SYNTH_INTEL_PSB, PERF_SYNTH_INTEL_EVT, PERF_SYNTH_INTEL_IFLAG_CHG, + PERF_SYNTH_POWERPC_VPA_DTL, }; /* @@ -254,6 +255,25 @@ struct perf_synth_intel_iflag_chg { u64 branch_ip; /* If via_branch */ }; +/* + * The powerpc VPA DTL entries are of below format + */ +struct powerpc_vpadtl_entry { + u8 dispatch_reason; + u8 preempt_reason; + u16 processor_id; + u32 enqueue_to_dispatch_time; + u32 ready_to_enqueue_time; + u32 waiting_to_ready_time; + u64 timebase; + u64 fault_addr; + u64 srr0; + u64 srr1; +}; + +extern const char *dispatch_reasons[11]; +extern const char *preempt_reasons[10]; + static inline void *perf_synth__raw_data(void *p) { return p + 4; diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index d264c143b592..56ebefd075f2 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -407,6 +407,7 @@ void evsel__init(struct evsel *evsel, evsel->collect_stat = false; evsel->group_pmu_name = NULL; evsel->skippable = false; + evsel->supported = true; evsel->alternate_hw_config = PERF_COUNT_HW_MAX; evsel->script_output_type = -1; // FIXME: OUTPUT_TYPE_UNSET, see builtin-script.c } @@ -1091,6 +1092,71 @@ static void evsel__reset_callgraph(struct evsel *evsel, struct callchain_param * } } +static void evsel__apply_ratio_to_prev(struct evsel *evsel, + struct perf_event_attr *attr, + struct record_opts *opts, + const char *buf) +{ + struct perf_event_attr *prev_attr = NULL; + struct evsel *evsel_prev = NULL; + u64 type = evsel->core.attr.sample_type; + u64 prev_type = 0; + double rtp; + + rtp = strtod(buf, NULL); + if (rtp <= 0) { + pr_err("Invalid ratio-to-prev value %lf\n", rtp); + return; + } + if (evsel == evsel__leader(evsel)) { + pr_err("Invalid use of ratio-to-prev term without preceding element in group\n"); + return; + } + if (!evsel->pmu->is_core) { + pr_err("Event using ratio-to-prev term must have a core PMU\n"); + return; + } + + evsel_prev = evsel__prev(evsel); + if (!evsel_prev) { + pr_err("Previous event does not exist.\n"); + return; + } + + if (evsel_prev->pmu->type != evsel->pmu->type) { + pr_err("Compared events (\"%s\", \"%s\") must have same PMU\n", + evsel->name, evsel_prev->name); + return; + } + + prev_attr = &evsel_prev->core.attr; + prev_type = evsel_prev->core.attr.sample_type; + + if (!(prev_type & PERF_SAMPLE_PERIOD)) { + attr->sample_period = prev_attr->sample_period * rtp; + attr->freq = 0; + evsel__reset_sample_bit(evsel, PERIOD); + } else if (!(type & PERF_SAMPLE_PERIOD)) { + prev_attr->sample_period = attr->sample_period / rtp; + prev_attr->freq = 0; + evsel__reset_sample_bit(evsel_prev, PERIOD); + } else { + if (opts->user_interval != ULLONG_MAX) { + prev_attr->sample_period = opts->user_interval; + attr->sample_period = prev_attr->sample_period * rtp; + prev_attr->freq = 0; + attr->freq = 0; + evsel__reset_sample_bit(evsel_prev, PERIOD); + evsel__reset_sample_bit(evsel, PERIOD); + } else { + pr_err("Event period term or count (-c) must be set when using ratio-to-prev term.\n"); + return; + } + } + + arch_evsel__apply_ratio_to_prev(evsel, attr); +} + static void evsel__apply_config_terms(struct evsel *evsel, struct record_opts *opts, bool track) { @@ -1104,6 +1170,7 @@ static void evsel__apply_config_terms(struct evsel *evsel, u32 dump_size = 0; int max_stack = 0; const char *callgraph_buf = NULL; + const char *rtp_buf = NULL; list_for_each_entry(term, config_terms, list) { switch (term->type) { @@ -1174,6 +1241,9 @@ static void evsel__apply_config_terms(struct evsel *evsel, break; case EVSEL__CONFIG_TERM_CFG_CHG: break; + case EVSEL__CONFIG_TERM_RATIO_TO_PREV: + rtp_buf = term->val.str; + break; default: break; } @@ -1225,6 +1295,8 @@ static void evsel__apply_config_terms(struct evsel *evsel, evsel__config_callchain(evsel, opts, ¶m); } } + if (rtp_buf) + evsel__apply_ratio_to_prev(evsel, attr, opts, rtp_buf); } struct evsel_config_term *__evsel__get_config_term(struct evsel *evsel, enum evsel_term_type type) @@ -1249,6 +1321,11 @@ void __weak arch__post_evsel_config(struct evsel *evsel __maybe_unused, { } +void __weak arch_evsel__apply_ratio_to_prev(struct evsel *evsel __maybe_unused, + struct perf_event_attr *attr __maybe_unused) +{ +} + static void evsel__set_default_freq_period(struct record_opts *opts, struct perf_event_attr *attr) { @@ -1941,7 +2018,7 @@ static int get_group_fd(struct evsel *evsel, int cpu_map_idx, int thread) struct evsel *leader = evsel__leader(evsel); int fd; - if (evsel__is_group_leader(evsel)) + if (!evsel->supported || evsel__is_group_leader(evsel)) return -1; /* @@ -1955,7 +2032,7 @@ static int get_group_fd(struct evsel *evsel, int cpu_map_idx, int thread) return -1; fd = FD(leader, cpu_map_idx, thread); - BUG_ON(fd == -1 && !leader->skippable); + BUG_ON(fd == -1 && leader->supported); /* * When the leader has been skipped, return -2 to distinguish from no @@ -2573,12 +2650,14 @@ static int evsel__open_cpu(struct evsel *evsel, struct perf_cpu_map *cpus, enum rlimit_action set_rlimit = NO_CHANGE; struct perf_cpu cpu; - if (evsel__is_retire_lat(evsel)) - return evsel__tpebs_open(evsel); + if (evsel__is_retire_lat(evsel)) { + err = evsel__tpebs_open(evsel); + goto out; + } err = __evsel__prepare_open(evsel, cpus, threads); if (err) - return err; + goto out; if (cpus == NULL) cpus = empty_cpu_map; @@ -2598,19 +2677,22 @@ fallback_missing_features: display_attr(&evsel->core.attr); if (evsel__is_tool(evsel)) { - return evsel__tool_pmu_open(evsel, threads, - start_cpu_map_idx, - end_cpu_map_idx); + err = evsel__tool_pmu_open(evsel, threads, + start_cpu_map_idx, + end_cpu_map_idx); + goto out; } if (evsel__is_hwmon(evsel)) { - return evsel__hwmon_pmu_open(evsel, threads, - start_cpu_map_idx, - end_cpu_map_idx); + err = evsel__hwmon_pmu_open(evsel, threads, + start_cpu_map_idx, + end_cpu_map_idx); + goto out; } if (evsel__is_drm(evsel)) { - return evsel__drm_pmu_open(evsel, threads, - start_cpu_map_idx, - end_cpu_map_idx); + err = evsel__drm_pmu_open(evsel, threads, + start_cpu_map_idx, + end_cpu_map_idx); + goto out; } for (idx = start_cpu_map_idx; idx < end_cpu_map_idx; idx++) { @@ -2689,7 +2771,8 @@ retry_open: } } - return 0; + err = 0; + goto out; try_fallback: if (evsel__ignore_missing_thread(evsel, perf_cpu_map__nr(cpus), @@ -2728,6 +2811,9 @@ out_close: thread = nthreads; } while (--idx >= 0); errno = old_errno; +out: + if (err) + evsel->supported = false; return err; } @@ -3562,7 +3648,7 @@ bool evsel__fallback(struct evsel *evsel, struct target *target, int err, /* If event has exclude user then don't exclude kernel. */ if (evsel->core.attr.exclude_user) - return false; + goto no_fallback; /* Is there already the separator in the name. */ if (strchr(name, '/') || @@ -3570,7 +3656,7 @@ bool evsel__fallback(struct evsel *evsel, struct target *target, int err, sep = ""; if (asprintf(&new_name, "%s%su", name, sep) < 0) - return false; + goto no_fallback; free(evsel->name); evsel->name = new_name; @@ -3593,17 +3679,19 @@ bool evsel__fallback(struct evsel *evsel, struct target *target, int err, sep = ""; if (asprintf(&new_name, "%s%sH", name, sep) < 0) - return false; + goto no_fallback; free(evsel->name); evsel->name = new_name; /* Apple M1 requires exclude_guest */ - scnprintf(msg, msgsize, "trying to fall back to excluding guest samples"); + scnprintf(msg, msgsize, "Trying to fall back to excluding guest samples"); evsel->core.attr.exclude_guest = 1; return true; } - +no_fallback: + scnprintf(msg, msgsize, "No fallback found for '%s' for error %d", + evsel__name(evsel), err); return false; } @@ -3716,6 +3804,7 @@ static int dump_perf_event_processes(char *msg, size_t size) } int __weak arch_evsel__open_strerror(struct evsel *evsel __maybe_unused, + int err __maybe_unused, char *msg __maybe_unused, size_t size __maybe_unused) { @@ -3725,6 +3814,7 @@ int __weak arch_evsel__open_strerror(struct evsel *evsel __maybe_unused, int evsel__open_strerror(struct evsel *evsel, struct target *target, int err, char *msg, size_t size) { + struct perf_pmu *pmu; char sbuf[STRERR_BUFSIZE]; int printed = 0, enforced = 0; int ret; @@ -3840,7 +3930,8 @@ int evsel__open_strerror(struct evsel *evsel, struct target *target, return scnprintf(msg, size, "The 'aux_action' feature is not supported, update the kernel."); if (perf_missing_features.aux_output) return scnprintf(msg, size, "The 'aux_output' feature is not supported, update the kernel."); - if (!target__has_cpu(target)) + pmu = evsel__find_pmu(evsel); + if (!pmu->is_core && !target__has_cpu(target)) return scnprintf(msg, size, "Invalid event (%s) in per-thread mode, enable system wide with '-a'.", evsel__name(evsel)); @@ -3853,7 +3944,7 @@ int evsel__open_strerror(struct evsel *evsel, struct target *target, break; } - ret = arch_evsel__open_strerror(evsel, msg, size); + ret = arch_evsel__open_strerror(evsel, err, msg, size); if (ret) return ret; @@ -3935,6 +4026,8 @@ bool evsel__is_hybrid(const struct evsel *evsel) struct evsel *evsel__leader(const struct evsel *evsel) { + if (evsel->core.leader == NULL) + return NULL; return container_of(evsel->core.leader, struct evsel, core); } @@ -4048,9 +4141,9 @@ bool evsel__set_needs_uniquify(struct evsel *counter, const struct perf_stat_con void evsel__uniquify_counter(struct evsel *counter) { - const char *name, *pmu_name; - char *new_name, *config; - int ret; + const char *name, *pmu_name, *config; + char *new_name; + int len, ret; /* No uniquification necessary. */ if (!counter->needs_uniquify) @@ -4064,15 +4157,23 @@ void evsel__uniquify_counter(struct evsel *counter) counter->uniquified_name = true; name = evsel__name(counter); + config = strchr(name, '/'); pmu_name = counter->pmu->name; - /* Already prefixed by the PMU name. */ - if (!strncmp(name, pmu_name, strlen(pmu_name))) - return; - config = strchr(name, '/'); - if (config) { - int len = config - name; + /* Already prefixed by the PMU name? */ + len = pmu_name_len_no_suffix(pmu_name); + + if (!strncmp(name, pmu_name, len)) { + /* + * If the PMU name is there, then there is no sense in not + * having a slash. Do this for robustness. + */ + if (config == NULL) + config = name - 1; + ret = asprintf(&new_name, "%s/%s", pmu_name, config + 1); + } else if (config) { + len = config - name; if (config[1] == '/') { /* case: event// */ ret = asprintf(&new_name, "%s/%.*s/%s", pmu_name, len, name, config + 2); @@ -4084,7 +4185,7 @@ void evsel__uniquify_counter(struct evsel *counter) config = strchr(name, ':'); if (config) { /* case: event:.. */ - int len = config - name; + len = config - name; ret = asprintf(&new_name, "%s/%.*s/%s", pmu_name, len, name, config + 1); } else { diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 5797a02e5d6a..f8de0f9a719b 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -89,6 +89,7 @@ struct evsel { bool use_config_name; bool skippable; bool retire_lat; + bool dont_regroup; int bpf_fd; struct bpf_object *bpf_obj; struct list_head config_terms; @@ -120,7 +121,6 @@ struct evsel { bool forced_leader; bool cmdline_group_boundary; bool reset_group; - bool errored; bool needs_auxtrace_mmap; bool default_metricgroup; /* A member of the Default metricgroup */ bool needs_uniquify; @@ -341,7 +341,8 @@ void evsel__set_sample_id(struct evsel *evsel, bool use_sample_identifier); void arch_evsel__set_sample_weight(struct evsel *evsel); void arch__post_evsel_config(struct evsel *evsel, struct perf_event_attr *attr); -int arch_evsel__open_strerror(struct evsel *evsel, char *msg, size_t size); +int arch_evsel__open_strerror(struct evsel *evsel, int err, char *msg, size_t size); +void arch_evsel__apply_ratio_to_prev(struct evsel *evsel, struct perf_event_attr *attr); int evsel__set_filter(struct evsel *evsel, const char *filter); int evsel__append_tp_filter(struct evsel *evsel, const char *filter); diff --git a/tools/perf/util/evsel_config.h b/tools/perf/util/evsel_config.h index 94a1e9cf73d6..bcd3a978f0c4 100644 --- a/tools/perf/util/evsel_config.h +++ b/tools/perf/util/evsel_config.h @@ -28,6 +28,7 @@ enum evsel_term_type { EVSEL__CONFIG_TERM_AUX_ACTION, EVSEL__CONFIG_TERM_AUX_SAMPLE_SIZE, EVSEL__CONFIG_TERM_CFG_CHG, + EVSEL__CONFIG_TERM_RATIO_TO_PREV, }; struct evsel_config_term { diff --git a/tools/perf/util/get_current_dir_name.c b/tools/perf/util/get_current_dir_name.c deleted file mode 100644 index e68935e9ac8c..000000000000 --- a/tools/perf/util/get_current_dir_name.c +++ /dev/null @@ -1,18 +0,0 @@ -// SPDX-License-Identifier: LGPL-2.1 -// Copyright (C) 2018, 2019 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com> -// -#ifndef HAVE_GET_CURRENT_DIR_NAME -#include "get_current_dir_name.h" -#include <limits.h> -#include <string.h> -#include <unistd.h> - -/* Android's 'bionic' library, for one, doesn't have this */ - -char *get_current_dir_name(void) -{ - char pwd[PATH_MAX]; - - return getcwd(pwd, sizeof(pwd)) == NULL ? NULL : strdup(pwd); -} -#endif // HAVE_GET_CURRENT_DIR_NAME diff --git a/tools/perf/util/get_current_dir_name.h b/tools/perf/util/get_current_dir_name.h deleted file mode 100644 index 69f7d5537d32..000000000000 --- a/tools/perf/util/get_current_dir_name.h +++ /dev/null @@ -1,8 +0,0 @@ -// SPDX-License-Identifier: LGPL-2.1 -// Copyright (C) 2018, 2019 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com> -// -#ifndef __PERF_GET_CURRENT_DIR_NAME_H -#ifndef HAVE_GET_CURRENT_DIR_NAME -char *get_current_dir_name(void); -#endif // HAVE_GET_CURRENT_DIR_NAME -#endif // __PERF_GET_CURRENT_DIR_NAME_H diff --git a/tools/perf/util/hist.h b/tools/perf/util/hist.h index 70438d03ca9c..c64005278687 100644 --- a/tools/perf/util/hist.h +++ b/tools/perf/util/hist.h @@ -713,8 +713,9 @@ struct block_hist { #include "../ui/keysyms.h" void attr_to_script(char *buf, struct perf_event_attr *attr); -int map_symbol__tui_annotate(struct map_symbol *ms, struct evsel *evsel, - struct hist_browser_timer *hbt); +int __hist_entry__tui_annotate(struct hist_entry *he, struct map_symbol *ms, + struct evsel *evsel, + struct hist_browser_timer *hbt); int hist_entry__tui_annotate(struct hist_entry *he, struct evsel *evsel, struct hist_browser_timer *hbt); @@ -742,9 +743,10 @@ int evlist__tui_browse_hists(struct evlist *evlist __maybe_unused, { return 0; } -static inline int map_symbol__tui_annotate(struct map_symbol *ms __maybe_unused, - struct evsel *evsel __maybe_unused, - struct hist_browser_timer *hbt __maybe_unused) +static inline int __hist_entry__tui_annotate(struct hist_entry *he __maybe_unused, + struct map_symbol *ms __maybe_unused, + struct evsel *evsel __maybe_unused, + struct hist_browser_timer *hbt __maybe_unused) { return 0; } diff --git a/tools/perf/util/hwmon_pmu.h b/tools/perf/util/hwmon_pmu.h index dc711b289ff5..d1e403c8b70b 100644 --- a/tools/perf/util/hwmon_pmu.h +++ b/tools/perf/util/hwmon_pmu.h @@ -37,7 +37,7 @@ enum hwmon_type { /** * enum hwmon_item: * - * Similar to enum hwmon_type but describes the item part of a a sysfs filename. + * Similar to enum hwmon_type but describes the item part of a sysfs filename. * * This enum is exposed for testing. */ diff --git a/tools/perf/util/kvm-stat.h b/tools/perf/util/kvm-stat.h index 4249542544bb..53db3d56108b 100644 --- a/tools/perf/util/kvm-stat.h +++ b/tools/perf/util/kvm-stat.h @@ -190,5 +190,15 @@ static inline struct kvm_info *kvm_info__new(void) #define kvm_info__zput(ki) do { } while (0) #endif /* HAVE_KVM_STAT_SUPPORT */ +#define STRDUP_FAIL_EXIT(s) \ + ({ char *_p; \ + _p = strdup(s); \ + if (!_p) { \ + ret = -ENOMEM; \ + goto EXIT; \ + } \ + _p; \ + }) + extern int kvm_add_default_arch_event(int *argc, const char **argv); #endif /* __PERF_KVM_STAT_H */ diff --git a/tools/perf/util/libbfd.c b/tools/perf/util/libbfd.c new file mode 100644 index 000000000000..01147fbf73b3 --- /dev/null +++ b/tools/perf/util/libbfd.c @@ -0,0 +1,600 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "libbfd.h" +#include "annotate.h" +#include "bpf-event.h" +#include "bpf-utils.h" +#include "debug.h" +#include "dso.h" +#include "env.h" +#include "map.h" +#include "srcline.h" +#include "symbol.h" +#include "symbol_conf.h" +#include "util.h" +#include <tools/dis-asm-compat.h> +#ifdef HAVE_LIBBPF_SUPPORT +#include <bpf/bpf.h> +#include <bpf/btf.h> +#endif +#include <fcntl.h> +#include <stdio.h> +#include <stdlib.h> +#define PACKAGE "perf" +#include <bfd.h> + +/* + * Implement addr2line using libbfd. + */ +struct a2l_data { + const char *input; + u64 addr; + + bool found; + const char *filename; + const char *funcname; + unsigned int line; + + bfd *abfd; + asymbol **syms; +}; + +static int bfd_error(const char *string) +{ + const char *errmsg; + + errmsg = bfd_errmsg(bfd_get_error()); + fflush(stdout); + + if (string) + pr_debug("%s: %s\n", string, errmsg); + else + pr_debug("%s\n", errmsg); + + return -1; +} + +static int slurp_symtab(bfd *abfd, struct a2l_data *a2l) +{ + long storage; + long symcount; + asymbol **syms; + bfd_boolean dynamic = FALSE; + + if ((bfd_get_file_flags(abfd) & HAS_SYMS) == 0) + return bfd_error(bfd_get_filename(abfd)); + + storage = bfd_get_symtab_upper_bound(abfd); + if (storage == 0L) { + storage = bfd_get_dynamic_symtab_upper_bound(abfd); + dynamic = TRUE; + } + if (storage < 0L) + return bfd_error(bfd_get_filename(abfd)); + + syms = malloc(storage); + if (dynamic) + symcount = bfd_canonicalize_dynamic_symtab(abfd, syms); + else + symcount = bfd_canonicalize_symtab(abfd, syms); + + if (symcount < 0) { + free(syms); + return bfd_error(bfd_get_filename(abfd)); + } + + a2l->syms = syms; + return 0; +} + +static void find_address_in_section(bfd *abfd, asection *section, void *data) +{ + bfd_vma pc, vma; + bfd_size_type size; + struct a2l_data *a2l = data; + flagword flags; + + if (a2l->found) + return; + +#ifdef bfd_get_section_flags + flags = bfd_get_section_flags(abfd, section); +#else + flags = bfd_section_flags(section); +#endif + if ((flags & SEC_ALLOC) == 0) + return; + + pc = a2l->addr; +#ifdef bfd_get_section_vma + vma = bfd_get_section_vma(abfd, section); +#else + vma = bfd_section_vma(section); +#endif +#ifdef bfd_get_section_size + size = bfd_get_section_size(section); +#else + size = bfd_section_size(section); +#endif + + if (pc < vma || pc >= vma + size) + return; + + a2l->found = bfd_find_nearest_line(abfd, section, a2l->syms, pc - vma, + &a2l->filename, &a2l->funcname, + &a2l->line); + + if (a2l->filename && !strlen(a2l->filename)) + a2l->filename = NULL; +} + +static struct a2l_data *addr2line_init(const char *path) +{ + bfd *abfd; + struct a2l_data *a2l = NULL; + + abfd = bfd_openr(path, NULL); + if (abfd == NULL) + return NULL; + + if (!bfd_check_format(abfd, bfd_object)) + goto out; + + a2l = zalloc(sizeof(*a2l)); + if (a2l == NULL) + goto out; + + a2l->abfd = abfd; + a2l->input = strdup(path); + if (a2l->input == NULL) + goto out; + + if (slurp_symtab(abfd, a2l)) + goto out; + + return a2l; + +out: + if (a2l) { + zfree((char **)&a2l->input); + free(a2l); + } + bfd_close(abfd); + return NULL; +} + +static void addr2line_cleanup(struct a2l_data *a2l) +{ + if (a2l->abfd) + bfd_close(a2l->abfd); + zfree((char **)&a2l->input); + zfree(&a2l->syms); + free(a2l); +} + +static int inline_list__append_dso_a2l(struct dso *dso, + struct inline_node *node, + struct symbol *sym) +{ + struct a2l_data *a2l = dso__a2l(dso); + struct symbol *inline_sym = new_inline_sym(dso, sym, a2l->funcname); + char *srcline = NULL; + + if (a2l->filename) + srcline = srcline_from_fileline(a2l->filename, a2l->line); + + return inline_list__append(inline_sym, srcline, node); +} + +int libbfd__addr2line(const char *dso_name, u64 addr, + char **file, unsigned int *line, struct dso *dso, + bool unwind_inlines, struct inline_node *node, + struct symbol *sym) +{ + int ret = 0; + struct a2l_data *a2l = dso__a2l(dso); + + if (!a2l) { + a2l = addr2line_init(dso_name); + dso__set_a2l(dso, a2l); + } + + if (a2l == NULL) { + if (!symbol_conf.disable_add2line_warn) + pr_warning("addr2line_init failed for %s\n", dso_name); + return 0; + } + + a2l->addr = addr; + a2l->found = false; + + bfd_map_over_sections(a2l->abfd, find_address_in_section, a2l); + + if (!a2l->found) + return 0; + + if (unwind_inlines) { + int cnt = 0; + + if (node && inline_list__append_dso_a2l(dso, node, sym)) + return 0; + + while (bfd_find_inliner_info(a2l->abfd, &a2l->filename, + &a2l->funcname, &a2l->line) && + cnt++ < MAX_INLINE_NEST) { + + if (a2l->filename && !strlen(a2l->filename)) + a2l->filename = NULL; + + if (node != NULL) { + if (inline_list__append_dso_a2l(dso, node, sym)) + return 0; + // found at least one inline frame + ret = 1; + } + } + } + + if (file) { + *file = a2l->filename ? strdup(a2l->filename) : NULL; + ret = *file ? 1 : 0; + } + + if (line) + *line = a2l->line; + + return ret; +} + +void dso__free_a2l_libbfd(struct dso *dso) +{ + struct a2l_data *a2l = dso__a2l(dso); + + if (!a2l) + return; + + addr2line_cleanup(a2l); + + dso__set_a2l(dso, NULL); +} + +static int bfd_symbols__cmpvalue(const void *a, const void *b) +{ + const asymbol *as = *(const asymbol **)a, *bs = *(const asymbol **)b; + + if (bfd_asymbol_value(as) != bfd_asymbol_value(bs)) + return bfd_asymbol_value(as) - bfd_asymbol_value(bs); + + return bfd_asymbol_name(as)[0] - bfd_asymbol_name(bs)[0]; +} + +static int bfd2elf_binding(asymbol *symbol) +{ + if (symbol->flags & BSF_WEAK) + return STB_WEAK; + if (symbol->flags & BSF_GLOBAL) + return STB_GLOBAL; + if (symbol->flags & BSF_LOCAL) + return STB_LOCAL; + return -1; +} + +int dso__load_bfd_symbols(struct dso *dso, const char *debugfile) +{ + int err = -1; + long symbols_size, symbols_count, i; + asection *section; + asymbol **symbols, *sym; + struct symbol *symbol; + bfd *abfd; + u64 start, len; + + abfd = bfd_openr(debugfile, NULL); + if (!abfd) + return -1; + + if (!bfd_check_format(abfd, bfd_object)) { + pr_debug2("%s: cannot read %s bfd file.\n", __func__, + dso__long_name(dso)); + goto out_close; + } + + if (bfd_get_flavour(abfd) == bfd_target_elf_flavour) + goto out_close; + + symbols_size = bfd_get_symtab_upper_bound(abfd); + if (symbols_size == 0) { + bfd_close(abfd); + return 0; + } + + if (symbols_size < 0) + goto out_close; + + symbols = malloc(symbols_size); + if (!symbols) + goto out_close; + + symbols_count = bfd_canonicalize_symtab(abfd, symbols); + if (symbols_count < 0) + goto out_free; + + section = bfd_get_section_by_name(abfd, ".text"); + if (section) { + for (i = 0; i < symbols_count; ++i) { + if (!strcmp(bfd_asymbol_name(symbols[i]), "__ImageBase") || + !strcmp(bfd_asymbol_name(symbols[i]), "__image_base__")) + break; + } + if (i < symbols_count) { + /* PE symbols can only have 4 bytes, so use .text high bits */ + u64 text_offset = (section->vma - (u32)section->vma) + + (u32)bfd_asymbol_value(symbols[i]); + dso__set_text_offset(dso, text_offset); + dso__set_text_end(dso, (section->vma - text_offset) + section->size); + } else { + dso__set_text_offset(dso, section->vma - section->filepos); + dso__set_text_end(dso, section->filepos + section->size); + } + } + + qsort(symbols, symbols_count, sizeof(asymbol *), bfd_symbols__cmpvalue); + +#ifdef bfd_get_section +#define bfd_asymbol_section bfd_get_section +#endif + for (i = 0; i < symbols_count; ++i) { + sym = symbols[i]; + section = bfd_asymbol_section(sym); + if (bfd2elf_binding(sym) < 0) + continue; + + while (i + 1 < symbols_count && + bfd_asymbol_section(symbols[i + 1]) == section && + bfd2elf_binding(symbols[i + 1]) < 0) + i++; + + if (i + 1 < symbols_count && + bfd_asymbol_section(symbols[i + 1]) == section) + len = symbols[i + 1]->value - sym->value; + else + len = section->size - sym->value; + + start = bfd_asymbol_value(sym) - dso__text_offset(dso); + symbol = symbol__new(start, len, bfd2elf_binding(sym), STT_FUNC, + bfd_asymbol_name(sym)); + if (!symbol) + goto out_free; + + symbols__insert(dso__symbols(dso), symbol); + } +#ifdef bfd_get_section +#undef bfd_asymbol_section +#endif + + symbols__fixup_end(dso__symbols(dso), false); + symbols__fixup_duplicate(dso__symbols(dso)); + dso__set_adjust_symbols(dso, true); + + err = 0; +out_free: + free(symbols); +out_close: + bfd_close(abfd); + return err; +} + +int libbfd__read_build_id(const char *filename, struct build_id *bid, bool block) +{ + size_t size = sizeof(bid->data); + int err = -1, fd; + bfd *abfd; + + fd = open(filename, block ? O_RDONLY : (O_RDONLY | O_NONBLOCK)); + if (fd < 0) + return -1; + + abfd = bfd_fdopenr(filename, /*target=*/NULL, fd); + if (!abfd) + return -1; + + if (!bfd_check_format(abfd, bfd_object)) { + pr_debug2("%s: cannot read %s bfd file.\n", __func__, filename); + goto out_close; + } + + if (!abfd->build_id || abfd->build_id->size > size) + goto out_close; + + memcpy(bid->data, abfd->build_id->data, abfd->build_id->size); + memset(bid->data + abfd->build_id->size, 0, size - abfd->build_id->size); + err = bid->size = abfd->build_id->size; + +out_close: + bfd_close(abfd); + return err; +} + +int libbfd_filename__read_debuglink(const char *filename, char *debuglink, + size_t size) +{ + int err = -1; + asection *section; + bfd *abfd; + + abfd = bfd_openr(filename, NULL); + if (!abfd) + return -1; + + if (!bfd_check_format(abfd, bfd_object)) { + pr_debug2("%s: cannot read %s bfd file.\n", __func__, filename); + goto out_close; + } + + section = bfd_get_section_by_name(abfd, ".gnu_debuglink"); + if (!section) + goto out_close; + + if (section->size > size) + goto out_close; + + if (!bfd_get_section_contents(abfd, section, debuglink, 0, + section->size)) + goto out_close; + + err = 0; + +out_close: + bfd_close(abfd); + return err; +} + +int symbol__disassemble_bpf_libbfd(struct symbol *sym __maybe_unused, + struct annotate_args *args __maybe_unused) +{ +#ifdef HAVE_LIBBPF_SUPPORT + struct annotation *notes = symbol__annotation(sym); + struct bpf_prog_linfo *prog_linfo = NULL; + struct bpf_prog_info_node *info_node; + int len = sym->end - sym->start; + disassembler_ftype disassemble; + struct map *map = args->ms.map; + struct perf_bpil *info_linear; + struct disassemble_info info; + struct dso *dso = map__dso(map); + int pc = 0, count, sub_id; + struct btf *btf = NULL; + char tpath[PATH_MAX]; + size_t buf_size; + int nr_skip = 0; + char *buf; + bfd *bfdf; + int ret; + FILE *s; + + if (dso__binary_type(dso) != DSO_BINARY_TYPE__BPF_PROG_INFO) + return SYMBOL_ANNOTATE_ERRNO__BPF_INVALID_FILE; + + pr_debug("%s: handling sym %s addr %" PRIx64 " len %" PRIx64 "\n", __func__, + sym->name, sym->start, sym->end - sym->start); + + memset(tpath, 0, sizeof(tpath)); + perf_exe(tpath, sizeof(tpath)); + + bfdf = bfd_openr(tpath, NULL); + if (bfdf == NULL) + abort(); + + if (!bfd_check_format(bfdf, bfd_object)) + abort(); + + s = open_memstream(&buf, &buf_size); + if (!s) { + ret = errno; + goto out; + } + init_disassemble_info_compat(&info, s, + (fprintf_ftype) fprintf, + fprintf_styled); + info.arch = bfd_get_arch(bfdf); + info.mach = bfd_get_mach(bfdf); + + info_node = perf_env__find_bpf_prog_info(dso__bpf_prog(dso)->env, + dso__bpf_prog(dso)->id); + if (!info_node) { + ret = SYMBOL_ANNOTATE_ERRNO__BPF_MISSING_BTF; + goto out; + } + info_linear = info_node->info_linear; + sub_id = dso__bpf_prog(dso)->sub_id; + + info.buffer = (void *)(uintptr_t)(info_linear->info.jited_prog_insns); + info.buffer_length = info_linear->info.jited_prog_len; + + if (info_linear->info.nr_line_info) + prog_linfo = bpf_prog_linfo__new(&info_linear->info); + + if (info_linear->info.btf_id) { + struct btf_node *node; + + node = perf_env__find_btf(dso__bpf_prog(dso)->env, + info_linear->info.btf_id); + if (node) + btf = btf__new((__u8 *)(node->data), + node->data_size); + } + + disassemble_init_for_target(&info); + +#ifdef DISASM_FOUR_ARGS_SIGNATURE + disassemble = disassembler(info.arch, + bfd_big_endian(bfdf), + info.mach, + bfdf); +#else + disassemble = disassembler(bfdf); +#endif + if (disassemble == NULL) + abort(); + + fflush(s); + do { + const struct bpf_line_info *linfo = NULL; + struct disasm_line *dl; + size_t prev_buf_size; + const char *srcline; + u64 addr; + + addr = pc + ((u64 *)(uintptr_t)(info_linear->info.jited_ksyms))[sub_id]; + count = disassemble(pc, &info); + + if (prog_linfo) + linfo = bpf_prog_linfo__lfind_addr_func(prog_linfo, + addr, sub_id, + nr_skip); + + if (linfo && btf) { + srcline = btf__name_by_offset(btf, linfo->line_off); + nr_skip++; + } else + srcline = NULL; + + fprintf(s, "\n"); + prev_buf_size = buf_size; + fflush(s); + + if (!annotate_opts.hide_src_code && srcline) { + args->offset = -1; + args->line = strdup(srcline); + args->line_nr = 0; + args->fileloc = NULL; + args->ms.sym = sym; + dl = disasm_line__new(args); + if (dl) { + annotation_line__add(&dl->al, + ¬es->src->source); + } + } + + args->offset = pc; + args->line = buf + prev_buf_size; + args->line_nr = 0; + args->fileloc = NULL; + args->ms.sym = sym; + dl = disasm_line__new(args); + if (dl) + annotation_line__add(&dl->al, ¬es->src->source); + + pc += count; + } while (count > 0 && pc < len); + + ret = 0; +out: + free(prog_linfo); + btf__free(btf); + fclose(s); + bfd_close(bfdf); + return ret; +#else + return SYMBOL_ANNOTATE_ERRNO__NO_LIBOPCODES_FOR_BPF; +#endif +} diff --git a/tools/perf/util/libbfd.h b/tools/perf/util/libbfd.h new file mode 100644 index 000000000000..e300f171d1bd --- /dev/null +++ b/tools/perf/util/libbfd.h @@ -0,0 +1,83 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PERF_LIBBFD_H +#define __PERF_LIBBFD_H + +#include <linux/compiler.h> +#include <linux/types.h> +#include <stdbool.h> +#include <stddef.h> + +struct annotate_args; +struct build_id; +struct dso; +struct inline_node; +struct symbol; + +#ifdef HAVE_LIBBFD_SUPPORT +int libbfd__addr2line(const char *dso_name, u64 addr, + char **file, unsigned int *line, struct dso *dso, + bool unwind_inlines, struct inline_node *node, + struct symbol *sym); + + +void dso__free_a2l_libbfd(struct dso *dso); + +int symbol__disassemble_libbfd(const char *filename, struct symbol *sym, + struct annotate_args *args); + +int libbfd__read_build_id(const char *filename, struct build_id *bid, bool block); + +int libbfd_filename__read_debuglink(const char *filename, char *debuglink, size_t size); + +int symbol__disassemble_bpf_libbfd(struct symbol *sym, struct annotate_args *args); + +#else // !defined(HAVE_LIBBFD_SUPPORT) +#include "annotate.h" + +static inline int libbfd__addr2line(const char *dso_name __always_unused, + u64 addr __always_unused, + char **file __always_unused, + unsigned int *line __always_unused, + struct dso *dso __always_unused, + bool unwind_inlines __always_unused, + struct inline_node *node __always_unused, + struct symbol *sym __always_unused) +{ + return -1; +} + + +static inline void dso__free_a2l_libbfd(struct dso *dso __always_unused) +{ +} + +static inline int symbol__disassemble_libbfd(const char *filename __always_unused, + struct symbol *sym __always_unused, + struct annotate_args *args __always_unused) +{ + return -1; +} + +static inline int libbfd__read_build_id(const char *filename __always_unused, + struct build_id *bid __always_unused, + bool block __always_unused) +{ + return -1; +} + +static inline int libbfd_filename__read_debuglink(const char *filename __always_unused, + char *debuglink __always_unused, + size_t size __always_unused) +{ + return -1; +} + +static inline int symbol__disassemble_bpf_libbfd(struct symbol *sym __always_unused, + struct annotate_args *args __always_unused) +{ + return SYMBOL_ANNOTATE_ERRNO__NO_LIBOPCODES_FOR_BPF; +} + +#endif // defined(HAVE_LIBBFD_SUPPORT) + +#endif /* __PERF_LIBBFD_H */ diff --git a/tools/perf/util/llvm.c b/tools/perf/util/llvm.c new file mode 100644 index 000000000000..2ebf1f5f65bf --- /dev/null +++ b/tools/perf/util/llvm.c @@ -0,0 +1,273 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "llvm.h" +#include "annotate.h" +#include "debug.h" +#include "dso.h" +#include "map.h" +#include "namespaces.h" +#include "srcline.h" +#include "symbol.h" +#include <errno.h> +#include <fcntl.h> +#include <unistd.h> +#include <linux/zalloc.h> + +#ifdef HAVE_LIBLLVM_SUPPORT +#include "llvm-c-helpers.h" +#include <llvm-c/Disassembler.h> +#include <llvm-c/Target.h> +#endif + +#ifdef HAVE_LIBLLVM_SUPPORT +static void free_llvm_inline_frames(struct llvm_a2l_frame *inline_frames, + int num_frames) +{ + if (inline_frames != NULL) { + for (int i = 0; i < num_frames; ++i) { + zfree(&inline_frames[i].filename); + zfree(&inline_frames[i].funcname); + } + zfree(&inline_frames); + } +} +#endif + +int llvm__addr2line(const char *dso_name __maybe_unused, u64 addr __maybe_unused, + char **file __maybe_unused, unsigned int *line __maybe_unused, + struct dso *dso __maybe_unused, bool unwind_inlines __maybe_unused, + struct inline_node *node __maybe_unused, struct symbol *sym __maybe_unused) +{ +#ifdef HAVE_LIBLLVM_SUPPORT + struct llvm_a2l_frame *inline_frames = NULL; + int num_frames = llvm_addr2line(dso_name, addr, file, line, + node && unwind_inlines, &inline_frames); + + if (num_frames == 0 || !inline_frames) { + /* Error, or we didn't want inlines. */ + return num_frames; + } + + for (int i = 0; i < num_frames; ++i) { + struct symbol *inline_sym = + new_inline_sym(dso, sym, inline_frames[i].funcname); + char *srcline = NULL; + + if (inline_frames[i].filename) { + srcline = + srcline_from_fileline(inline_frames[i].filename, + inline_frames[i].line); + } + if (inline_list__append(inline_sym, srcline, node) != 0) { + free_llvm_inline_frames(inline_frames, num_frames); + return 0; + } + } + free_llvm_inline_frames(inline_frames, num_frames); + + return num_frames; +#else + return -1; +#endif +} + +#ifdef HAVE_LIBLLVM_SUPPORT +static void init_llvm(void) +{ + static bool init; + + if (!init) { + LLVMInitializeAllTargetInfos(); + LLVMInitializeAllTargetMCs(); + LLVMInitializeAllDisassemblers(); + init = true; + } +} + +/* + * Whenever LLVM wants to resolve an address into a symbol, it calls this + * callback. We don't ever actually _return_ anything (in particular, because + * it puts quotation marks around what we return), but we use this as a hint + * that there is a branch or PC-relative address in the expression that we + * should add some textual annotation for after the instruction. The caller + * will use this information to add the actual annotation. + */ +struct symbol_lookup_storage { + u64 branch_addr; + u64 pcrel_load_addr; +}; + +static const char * +symbol_lookup_callback(void *disinfo, uint64_t value, + uint64_t *ref_type, + uint64_t address __maybe_unused, + const char **ref __maybe_unused) +{ + struct symbol_lookup_storage *storage = disinfo; + + if (*ref_type == LLVMDisassembler_ReferenceType_In_Branch) + storage->branch_addr = value; + else if (*ref_type == LLVMDisassembler_ReferenceType_In_PCrel_Load) + storage->pcrel_load_addr = value; + *ref_type = LLVMDisassembler_ReferenceType_InOut_None; + return NULL; +} +#endif + +int symbol__disassemble_llvm(const char *filename, struct symbol *sym, + struct annotate_args *args __maybe_unused) +{ +#ifdef HAVE_LIBLLVM_SUPPORT + struct annotation *notes = symbol__annotation(sym); + struct map *map = args->ms.map; + struct dso *dso = map__dso(map); + u64 start = map__rip_2objdump(map, sym->start); + /* Malloc-ed buffer containing instructions read from disk. */ + u8 *code_buf = NULL; + /* Pointer to code to be disassembled. */ + const u8 *buf; + u64 buf_len; + u64 pc; + bool is_64bit; + char disasm_buf[2048]; + size_t disasm_len; + struct disasm_line *dl; + LLVMDisasmContextRef disasm = NULL; + struct symbol_lookup_storage storage; + char *line_storage = NULL; + size_t line_storage_len = 0; + int ret = -1; + + if (args->options->objdump_path) + return -1; + + buf = dso__read_symbol(dso, filename, map, sym, + &code_buf, &buf_len, &is_64bit); + if (buf == NULL) + return errno; + + init_llvm(); + if (arch__is(args->arch, "x86")) { + const char *triplet = is_64bit ? "x86_64-pc-linux" : "i686-pc-linux"; + + disasm = LLVMCreateDisasm(triplet, &storage, /*tag_type=*/0, + /*get_op_info=*/NULL, symbol_lookup_callback); + } else { + char triplet[64]; + + scnprintf(triplet, sizeof(triplet), "%s-linux-gnu", + args->arch->name); + disasm = LLVMCreateDisasm(triplet, &storage, /*tag_type=*/0, + /*get_op_info=*/NULL, symbol_lookup_callback); + } + + if (disasm == NULL) + goto err; + + if (args->options->disassembler_style && + !strcmp(args->options->disassembler_style, "intel")) + LLVMSetDisasmOptions(disasm, + LLVMDisassembler_Option_AsmPrinterVariant); + + /* + * This needs to be set after AsmPrinterVariant, due to a bug in LLVM; + * setting AsmPrinterVariant makes a new instruction printer, making it + * forget about the PrintImmHex flag (which is applied before if both + * are given to the same call). + */ + LLVMSetDisasmOptions(disasm, LLVMDisassembler_Option_PrintImmHex); + + /* add the function address and name */ + scnprintf(disasm_buf, sizeof(disasm_buf), "%#"PRIx64" <%s>:", + start, sym->name); + + args->offset = -1; + args->line = disasm_buf; + args->line_nr = 0; + args->fileloc = NULL; + args->ms.sym = sym; + + dl = disasm_line__new(args); + if (dl == NULL) + goto err; + + annotation_line__add(&dl->al, ¬es->src->source); + + pc = start; + for (u64 offset = 0; offset < buf_len; ) { + unsigned int ins_len; + + storage.branch_addr = 0; + storage.pcrel_load_addr = 0; + + /* + * LLVM's API has the code be disassembled as non-const, cast + * here as we may be disassembling from mapped read-only memory. + */ + ins_len = LLVMDisasmInstruction(disasm, (u8 *)(buf + offset), + buf_len - offset, pc, + disasm_buf, sizeof(disasm_buf)); + if (ins_len == 0) + goto err; + disasm_len = strlen(disasm_buf); + + if (storage.branch_addr != 0) { + char *name = llvm_name_for_code(dso, filename, + storage.branch_addr); + if (name != NULL) { + disasm_len += scnprintf(disasm_buf + disasm_len, + sizeof(disasm_buf) - + disasm_len, + " <%s>", name); + free(name); + } + } + if (storage.pcrel_load_addr != 0) { + char *name = llvm_name_for_data(dso, filename, + storage.pcrel_load_addr); + disasm_len += scnprintf(disasm_buf + disasm_len, + sizeof(disasm_buf) - disasm_len, + " # %#"PRIx64, + storage.pcrel_load_addr); + if (name) { + disasm_len += scnprintf(disasm_buf + disasm_len, + sizeof(disasm_buf) - + disasm_len, + " <%s>", name); + free(name); + } + } + + args->offset = offset; + args->line = expand_tabs(disasm_buf, &line_storage, + &line_storage_len); + args->line_nr = 0; + args->fileloc = NULL; + args->ms.sym = sym; + + llvm_addr2line(filename, pc, &args->fileloc, + (unsigned int *)&args->line_nr, false, NULL); + + dl = disasm_line__new(args); + if (dl == NULL) + goto err; + + annotation_line__add(&dl->al, ¬es->src->source); + + free(args->fileloc); + pc += ins_len; + offset += ins_len; + } + + ret = 0; + +err: + LLVMDisasmDispose(disasm); + free(code_buf); + free(line_storage); + return ret; +#else // HAVE_LIBLLVM_SUPPORT + pr_debug("The LLVM disassembler isn't linked in for %s in %s\n", + sym->name, filename); + return -1; +#endif +} diff --git a/tools/perf/util/llvm.h b/tools/perf/util/llvm.h new file mode 100644 index 000000000000..57f6bafb24bb --- /dev/null +++ b/tools/perf/util/llvm.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __PERF_LLVM_H +#define __PERF_LLVM_H + +#include <stdbool.h> +#include <linux/types.h> + +struct annotate_args; +struct dso; +struct inline_node; +struct symbol; + +int llvm__addr2line(const char *dso_name, u64 addr, + char **file, unsigned int *line, struct dso *dso, + bool unwind_inlines, struct inline_node *node, + struct symbol *sym); + +int symbol__disassemble_llvm(const char *filename, struct symbol *sym, + struct annotate_args *args); + +#endif /* __PERF_LLVM_H */ diff --git a/tools/perf/util/lzma.c b/tools/perf/util/lzma.c index bbcd2ffcf4bd..c355757ed391 100644 --- a/tools/perf/util/lzma.c +++ b/tools/perf/util/lzma.c @@ -120,7 +120,7 @@ bool lzma_is_compressed(const char *input) ssize_t rc; if (fd < 0) - return -1; + return false; rc = read(fd, buf, sizeof(buf)); close(fd); diff --git a/tools/perf/util/map.c b/tools/perf/util/map.c index b46c68c24d1c..41cdddc987ee 100644 --- a/tools/perf/util/map.c +++ b/tools/perf/util/map.c @@ -513,6 +513,8 @@ void srccode_state_free(struct srccode_state *state) state->line = 0; } +static const struct kmap *__map__const_kmap(const struct map *map); + /** * map__rip_2objdump - convert symbol start address to objdump address. * @map: memory map @@ -524,9 +526,9 @@ void srccode_state_free(struct srccode_state *state) * * Return: Address suitable for passing to "objdump --start-address=" */ -u64 map__rip_2objdump(struct map *map, u64 rip) +u64 map__rip_2objdump(const struct map *map, u64 rip) { - struct kmap *kmap = __map__kmap(map); + const struct kmap *kmap = __map__const_kmap(map); const struct dso *dso = map__dso(map); /* @@ -569,7 +571,7 @@ u64 map__rip_2objdump(struct map *map, u64 rip) * * Return: Memory address. */ -u64 map__objdump_2mem(struct map *map, u64 ip) +u64 map__objdump_2mem(const struct map *map, u64 ip) { const struct dso *dso = map__dso(map); @@ -586,7 +588,7 @@ u64 map__objdump_2mem(struct map *map, u64 ip) } /* convert objdump address to relative address. (To be removed) */ -u64 map__objdump_2rip(struct map *map, u64 ip) +u64 map__objdump_2rip(const struct map *map, u64 ip) { const struct dso *dso = map__dso(map); @@ -618,6 +620,15 @@ struct kmap *__map__kmap(struct map *map) return (struct kmap *)(&RC_CHK_ACCESS(map)[1]); } +static const struct kmap *__map__const_kmap(const struct map *map) +{ + const struct dso *dso = map__dso(map); + + if (!dso || !dso__kernel(dso)) + return NULL; + return (struct kmap *)(&RC_CHK_ACCESS(map)[1]); +} + struct kmap *map__kmap(struct map *map) { struct kmap *kmap = __map__kmap(map); diff --git a/tools/perf/util/map.h b/tools/perf/util/map.h index 9cadf533a561..979b3e11b9bc 100644 --- a/tools/perf/util/map.h +++ b/tools/perf/util/map.h @@ -133,13 +133,13 @@ static inline u64 map__unmap_ip(const struct map *map, u64 ip_or_rip) } /* rip/ip <-> addr suitable for passing to `objdump --start-address=` */ -u64 map__rip_2objdump(struct map *map, u64 rip); +u64 map__rip_2objdump(const struct map *map, u64 rip); /* objdump address -> memory address */ -u64 map__objdump_2mem(struct map *map, u64 ip); +u64 map__objdump_2mem(const struct map *map, u64 ip); /* objdump address -> rip */ -u64 map__objdump_2rip(struct map *map, u64 ip); +u64 map__objdump_2rip(const struct map *map, u64 ip); struct symbol; struct thread; diff --git a/tools/perf/util/namespaces.c b/tools/perf/util/namespaces.c index 68f5de2d79c7..01502570b32d 100644 --- a/tools/perf/util/namespaces.c +++ b/tools/perf/util/namespaces.c @@ -6,7 +6,6 @@ #include "namespaces.h" #include "event.h" -#include "get_current_dir_name.h" #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> @@ -293,14 +292,14 @@ void nsinfo__mountns_enter(struct nsinfo *nsi, if (!nsi || !nsinfo__need_setns(nsi)) return; - if (snprintf(curpath, PATH_MAX, "/proc/self/ns/mnt") >= PATH_MAX) + if (!getcwd(curpath, sizeof(curpath))) return; - oldcwd = get_current_dir_name(); + oldcwd = strdup(curpath); if (!oldcwd) return; - oldns = open(curpath, O_RDONLY); + oldns = open("/proc/self/ns/mnt", O_RDONLY); if (oldns < 0) goto errout; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 8282ddf68b98..da73d686f6b9 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -126,7 +126,8 @@ static char *get_config_name(const struct parse_events_terms *head_terms) return get_config_str(head_terms, PARSE_EVENTS__TERM_TYPE_NAME); } -static struct perf_cpu_map *get_config_cpu(const struct parse_events_terms *head_terms) +static struct perf_cpu_map *get_config_cpu(const struct parse_events_terms *head_terms, + bool fake_pmu) { struct parse_events_term *term; struct perf_cpu_map *cpus = NULL; @@ -135,24 +136,33 @@ static struct perf_cpu_map *get_config_cpu(const struct parse_events_terms *head return NULL; list_for_each_entry(term, &head_terms->terms, list) { - if (term->type_term == PARSE_EVENTS__TERM_TYPE_CPU) { - struct perf_cpu_map *term_cpus; + struct perf_cpu_map *term_cpus; - if (term->type_val == PARSE_EVENTS__TERM_TYPE_NUM) { - term_cpus = perf_cpu_map__new_int(term->val.num); + if (term->type_term != PARSE_EVENTS__TERM_TYPE_CPU) + continue; + + if (term->type_val == PARSE_EVENTS__TERM_TYPE_NUM) { + term_cpus = perf_cpu_map__new_int(term->val.num); + } else { + struct perf_pmu *pmu = perf_pmus__find(term->val.str); + + if (pmu) { + term_cpus = pmu->is_core && perf_cpu_map__is_empty(pmu->cpus) + ? cpu_map__online() + : perf_cpu_map__get(pmu->cpus); } else { - struct perf_pmu *pmu = perf_pmus__find(term->val.str); - - if (pmu && perf_cpu_map__is_empty(pmu->cpus)) - term_cpus = pmu->is_core ? cpu_map__online() : NULL; - else if (pmu) - term_cpus = perf_cpu_map__get(pmu->cpus); - else - term_cpus = perf_cpu_map__new(term->val.str); + term_cpus = perf_cpu_map__new(term->val.str); + if (!term_cpus && fake_pmu) { + /* + * Assume the PMU string makes sense on a different + * machine and fake a value with all online CPUs. + */ + term_cpus = cpu_map__online(); + } } - perf_cpu_map__merge(&cpus, term_cpus); - perf_cpu_map__put(term_cpus); } + perf_cpu_map__merge(&cpus, term_cpus); + perf_cpu_map__put(term_cpus); } return cpus; @@ -369,13 +379,13 @@ static int parse_aliases(const char *str, const char *const names[][EVSEL__MAX_A typedef int config_term_func_t(struct perf_event_attr *attr, struct parse_events_term *term, - struct parse_events_error *err); + struct parse_events_state *parse_state); static int config_term_common(struct perf_event_attr *attr, struct parse_events_term *term, - struct parse_events_error *err); + struct parse_events_state *parse_state); static int config_attr(struct perf_event_attr *attr, const struct parse_events_terms *head, - struct parse_events_error *err, + struct parse_events_state *parse_state, config_term_func_t config_term); /** @@ -471,7 +481,7 @@ int parse_events_add_cache(struct list_head *list, int *idx, const char *name, bool found_supported = false; const char *config_name = get_config_name(parsed_terms); const char *metric_id = get_config_metric_id(parsed_terms); - struct perf_cpu_map *cpus = get_config_cpu(parsed_terms); + struct perf_cpu_map *cpus = get_config_cpu(parsed_terms, parse_state->fake_pmu); int ret = 0; struct evsel *first_wildcard_match = NULL; @@ -514,8 +524,7 @@ int parse_events_add_cache(struct list_head *list, int *idx, const char *name, found_supported = true; if (parsed_terms) { - if (config_attr(&attr, parsed_terms, parse_state->error, - config_term_common)) { + if (config_attr(&attr, parsed_terms, parse_state, config_term_common)) { ret = -EINVAL; goto out_err; } @@ -767,8 +776,7 @@ int parse_events_add_breakpoint(struct parse_events_state *parse_state, attr.sample_period = 1; if (head_config) { - if (config_attr(&attr, head_config, parse_state->error, - config_term_common)) + if (config_attr(&attr, head_config, parse_state, config_term_common)) return -EINVAL; if (get_config_terms(head_config, &config_terms)) @@ -834,6 +842,7 @@ const char *parse_events__term_type_str(enum parse_events__term_type term_type) [PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE] = "legacy-cache", [PARSE_EVENTS__TERM_TYPE_HARDWARE] = "hardware", [PARSE_EVENTS__TERM_TYPE_CPU] = "cpu", + [PARSE_EVENTS__TERM_TYPE_RATIO_TO_PREV] = "ratio-to-prev", }; if ((unsigned int)term_type >= __PARSE_EVENTS__TERM_TYPE_NR) return "unknown term"; @@ -884,6 +893,7 @@ config_term_avail(enum parse_events__term_type term_type, struct parse_events_er case PARSE_EVENTS__TERM_TYPE_RAW: case PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE: case PARSE_EVENTS__TERM_TYPE_HARDWARE: + case PARSE_EVENTS__TERM_TYPE_RATIO_TO_PREV: default: if (!err) return false; @@ -903,12 +913,12 @@ void parse_events__shrink_config_terms(void) static int config_term_common(struct perf_event_attr *attr, struct parse_events_term *term, - struct parse_events_error *err) + struct parse_events_state *parse_state) { -#define CHECK_TYPE_VAL(type) \ -do { \ - if (check_type_val(term, err, PARSE_EVENTS__TERM_TYPE_ ## type)) \ - return -EINVAL; \ +#define CHECK_TYPE_VAL(type) \ +do { \ + if (check_type_val(term, parse_state->error, PARSE_EVENTS__TERM_TYPE_ ## type)) \ + return -EINVAL; \ } while (0) switch (term->type_term) { @@ -939,7 +949,7 @@ do { \ if (strcmp(term->val.str, "no") && parse_branch_str(term->val.str, &attr->branch_sample_type)) { - parse_events_error__handle(err, term->err_val, + parse_events_error__handle(parse_state->error, term->err_val, strdup("invalid branch sample type"), NULL); return -EINVAL; @@ -948,7 +958,7 @@ do { \ case PARSE_EVENTS__TERM_TYPE_TIME: CHECK_TYPE_VAL(NUM); if (term->val.num > 1) { - parse_events_error__handle(err, term->err_val, + parse_events_error__handle(parse_state->error, term->err_val, strdup("expected 0 or 1"), NULL); return -EINVAL; @@ -990,7 +1000,7 @@ do { \ case PARSE_EVENTS__TERM_TYPE_PERCORE: CHECK_TYPE_VAL(NUM); if ((unsigned int)term->val.num > 1) { - parse_events_error__handle(err, term->err_val, + parse_events_error__handle(parse_state->error, term->err_val, strdup("expected 0 or 1"), NULL); return -EINVAL; @@ -1005,7 +1015,7 @@ do { \ case PARSE_EVENTS__TERM_TYPE_AUX_SAMPLE_SIZE: CHECK_TYPE_VAL(NUM); if (term->val.num > UINT_MAX) { - parse_events_error__handle(err, term->err_val, + parse_events_error__handle(parse_state->error, term->err_val, strdup("too big"), NULL); return -EINVAL; @@ -1016,7 +1026,7 @@ do { \ if (term->type_val == PARSE_EVENTS__TERM_TYPE_NUM) { if (term->val.num >= (u64)cpu__max_present_cpu().cpu) { - parse_events_error__handle(err, term->err_val, + parse_events_error__handle(parse_state->error, term->err_val, strdup("too big"), /*help=*/NULL); return -EINVAL; @@ -1028,8 +1038,8 @@ do { \ break; map = perf_cpu_map__new(term->val.str); - if (!map) { - parse_events_error__handle(err, term->err_val, + if (!map && !parse_state->fake_pmu) { + parse_events_error__handle(parse_state->error, term->err_val, strdup("not a valid PMU or CPU number"), /*help=*/NULL); return -EINVAL; @@ -1037,12 +1047,27 @@ do { \ perf_cpu_map__put(map); break; } + case PARSE_EVENTS__TERM_TYPE_RATIO_TO_PREV: + CHECK_TYPE_VAL(STR); + if (strtod(term->val.str, NULL) <= 0) { + parse_events_error__handle(parse_state->error, term->err_val, + strdup("zero or negative"), + NULL); + return -EINVAL; + } + if (errno == ERANGE) { + parse_events_error__handle(parse_state->error, term->err_val, + strdup("too big"), + NULL); + return -EINVAL; + } + break; case PARSE_EVENTS__TERM_TYPE_DRV_CFG: case PARSE_EVENTS__TERM_TYPE_USER: case PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE: case PARSE_EVENTS__TERM_TYPE_HARDWARE: default: - parse_events_error__handle(err, term->err_term, + parse_events_error__handle(parse_state->error, term->err_term, strdup(parse_events__term_type_str(term->type_term)), parse_events_formats_error_string(NULL)); return -EINVAL; @@ -1057,7 +1082,7 @@ do { \ * if an invalid config term is provided for legacy events * (for example, instructions/badterm/...), which is confusing. */ - if (!config_term_avail(term->type_term, err)) + if (!config_term_avail(term->type_term, parse_state->error)) return -EINVAL; return 0; #undef CHECK_TYPE_VAL @@ -1065,7 +1090,7 @@ do { \ static int config_term_pmu(struct perf_event_attr *attr, struct parse_events_term *term, - struct parse_events_error *err) + struct parse_events_state *parse_state) { if (term->type_term == PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE) { struct perf_pmu *pmu = perf_pmus__find_by_type(attr->type); @@ -1074,7 +1099,7 @@ static int config_term_pmu(struct perf_event_attr *attr, char *err_str; if (asprintf(&err_str, "Failed to find PMU for type %d", attr->type) >= 0) - parse_events_error__handle(err, term->err_term, + parse_events_error__handle(parse_state->error, term->err_term, err_str, /*help=*/NULL); return -EINVAL; } @@ -1100,7 +1125,7 @@ static int config_term_pmu(struct perf_event_attr *attr, char *err_str; if (asprintf(&err_str, "Failed to find PMU for type %d", attr->type) >= 0) - parse_events_error__handle(err, term->err_term, + parse_events_error__handle(parse_state->error, term->err_term, err_str, /*help=*/NULL); return -EINVAL; } @@ -1128,12 +1153,12 @@ static int config_term_pmu(struct perf_event_attr *attr, */ return 0; } - return config_term_common(attr, term, err); + return config_term_common(attr, term, parse_state); } static int config_term_tracepoint(struct perf_event_attr *attr, struct parse_events_term *term, - struct parse_events_error *err) + struct parse_events_state *parse_state) { switch (term->type_term) { case PARSE_EVENTS__TERM_TYPE_CALLGRAPH: @@ -1147,7 +1172,7 @@ static int config_term_tracepoint(struct perf_event_attr *attr, case PARSE_EVENTS__TERM_TYPE_AUX_OUTPUT: case PARSE_EVENTS__TERM_TYPE_AUX_ACTION: case PARSE_EVENTS__TERM_TYPE_AUX_SAMPLE_SIZE: - return config_term_common(attr, term, err); + return config_term_common(attr, term, parse_state); case PARSE_EVENTS__TERM_TYPE_USER: case PARSE_EVENTS__TERM_TYPE_CONFIG: case PARSE_EVENTS__TERM_TYPE_CONFIG1: @@ -1165,13 +1190,12 @@ static int config_term_tracepoint(struct perf_event_attr *attr, case PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE: case PARSE_EVENTS__TERM_TYPE_HARDWARE: case PARSE_EVENTS__TERM_TYPE_CPU: + case PARSE_EVENTS__TERM_TYPE_RATIO_TO_PREV: default: - if (err) { - parse_events_error__handle(err, term->err_term, + parse_events_error__handle(parse_state->error, term->err_term, strdup(parse_events__term_type_str(term->type_term)), strdup("valid terms: call-graph,stack-size\n") ); - } return -EINVAL; } @@ -1180,13 +1204,13 @@ static int config_term_tracepoint(struct perf_event_attr *attr, static int config_attr(struct perf_event_attr *attr, const struct parse_events_terms *head, - struct parse_events_error *err, + struct parse_events_state *parse_state, config_term_func_t config_term) { struct parse_events_term *term; list_for_each_entry(term, &head->terms, list) - if (config_term(attr, term, err)) + if (config_term(attr, term, parse_state)) return -EINVAL; return 0; @@ -1289,6 +1313,9 @@ do { \ ADD_CONFIG_TERM_VAL(AUX_SAMPLE_SIZE, aux_sample_size, term->val.num, term->weak); break; + case PARSE_EVENTS__TERM_TYPE_RATIO_TO_PREV: + ADD_CONFIG_TERM_STR(RATIO_TO_PREV, term->val.str, term->weak); + break; case PARSE_EVENTS__TERM_TYPE_USER: case PARSE_EVENTS__TERM_TYPE_CONFIG: case PARSE_EVENTS__TERM_TYPE_CONFIG1: @@ -1355,6 +1382,7 @@ static int get_config_chgs(struct perf_pmu *pmu, struct parse_events_terms *head case PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE: case PARSE_EVENTS__TERM_TYPE_HARDWARE: case PARSE_EVENTS__TERM_TYPE_CPU: + case PARSE_EVENTS__TERM_TYPE_RATIO_TO_PREV: default: break; } @@ -1378,8 +1406,7 @@ int parse_events_add_tracepoint(struct parse_events_state *parse_state, if (head_config) { struct perf_event_attr attr; - if (config_attr(&attr, head_config, err, - config_term_tracepoint)) + if (config_attr(&attr, head_config, parse_state, config_term_tracepoint)) return -EINVAL; } @@ -1408,8 +1435,7 @@ static int __parse_events_add_numeric(struct parse_events_state *parse_state, } if (head_config) { - if (config_attr(&attr, head_config, parse_state->error, - config_term_common)) + if (config_attr(&attr, head_config, parse_state, config_term_common)) return -EINVAL; if (get_config_terms(head_config, &config_terms)) @@ -1418,7 +1444,7 @@ static int __parse_events_add_numeric(struct parse_events_state *parse_state, name = get_config_name(head_config); metric_id = get_config_metric_id(head_config); - cpus = get_config_cpu(head_config); + cpus = get_config_cpu(head_config, parse_state->fake_pmu); ret = __add_event(list, &parse_state->idx, &attr, /*init_attr*/true, name, metric_id, pmu, &config_terms, first_wildcard_match, cpus, /*alternate_hw_config=*/PERF_COUNT_HW_MAX) ? 0 : -ENOMEM; @@ -1531,7 +1557,7 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state, fix_raw(&parsed_terms, pmu); /* Configure attr/terms with a known PMU, this will set hardcoded terms. */ - if (config_attr(&attr, &parsed_terms, parse_state->error, config_term_pmu)) { + if (config_attr(&attr, &parsed_terms, parse_state, config_term_pmu)) { parse_events_terms__exit(&parsed_terms); return -EINVAL; } @@ -1555,7 +1581,7 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state, /* Configure attr/terms again if an alias was expanded. */ if (alias_rewrote_terms && - config_attr(&attr, &parsed_terms, parse_state->error, config_term_pmu)) { + config_attr(&attr, &parsed_terms, parse_state, config_term_pmu)) { parse_events_terms__exit(&parsed_terms); return -EINVAL; } @@ -1583,7 +1609,7 @@ static int parse_events_add_pmu(struct parse_events_state *parse_state, return -EINVAL; } - term_cpu = get_config_cpu(&parsed_terms); + term_cpu = get_config_cpu(&parsed_terms, parse_state->fake_pmu); evsel = __add_event(list, &parse_state->idx, &attr, /*init_attr=*/true, get_config_name(&parsed_terms), get_config_metric_id(&parsed_terms), pmu, @@ -1892,6 +1918,8 @@ static int parse_events__modifier_list(struct parse_events_state *parse_state, evsel->bpf_counter = true; if (mod.retire_lat) evsel->retire_lat = true; + if (mod.dont_regroup) + evsel->dont_regroup = true; } return 0; } @@ -2188,13 +2216,12 @@ static int parse_events__sort_events_and_fix_groups(struct list_head *list) * Set the group leader respecting the given groupings and that * groups can't span PMUs. */ - if (!cur_leader) { + if (!cur_leader || pos->dont_regroup) { cur_leader = pos; cur_leaders_grp = &pos->core; if (pos_force_grouped) force_grouped_leader = pos; } - cur_leader_pmu_name = cur_leader->group_pmu_name; if (strcmp(cur_leader_pmu_name, pos_pmu_name)) { /* PMU changed so the group/leader must change. */ diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 62dc7202e3ba..8f8c8e7fbcf1 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -83,7 +83,8 @@ enum parse_events__term_type { PARSE_EVENTS__TERM_TYPE_LEGACY_CACHE, PARSE_EVENTS__TERM_TYPE_HARDWARE, PARSE_EVENTS__TERM_TYPE_CPU, -#define __PARSE_EVENTS__TERM_TYPE_NR (PARSE_EVENTS__TERM_TYPE_CPU + 1) + PARSE_EVENTS__TERM_TYPE_RATIO_TO_PREV, +#define __PARSE_EVENTS__TERM_TYPE_NR (PARSE_EVENTS__TERM_TYPE_RATIO_TO_PREV + 1) }; struct parse_events_term { @@ -216,6 +217,7 @@ struct parse_events_modifier { bool guest : 1; /* 'G' */ bool host : 1; /* 'H' */ bool retire_lat : 1; /* 'R' */ + bool dont_regroup : 1; /* 'X' */ }; int parse_events__modifier_event(struct parse_events_state *parse_state, void *loc, diff --git a/tools/perf/util/parse-events.l b/tools/perf/util/parse-events.l index 2034590eb789..d65eb32124c8 100644 --- a/tools/perf/util/parse-events.l +++ b/tools/perf/util/parse-events.l @@ -5,16 +5,14 @@ %option stack %option bison-locations %option yylineno -%option reject +%option noyywrap %{ #include <errno.h> -#include <sys/types.h> -#include <sys/stat.h> -#include <unistd.h> +#include <stdlib.h> +#include <stdio.h> #include "parse-events.h" #include "parse-events-bison.h" -#include "evsel.h" char *parse_events_get_text(yyscan_t yyscanner); YYSTYPE *parse_events_get_lval(yyscan_t yyscanner); @@ -206,6 +204,7 @@ static int modifiers(struct parse_events_state *parse_state, yyscan_t scanner) CASE('e', exclusive); CASE('b', bpf); CASE('R', retire_lat); + CASE('X', dont_regroup); default: return PE_ERROR; } @@ -222,10 +221,6 @@ do { \ yycolumn += yyleng; \ } while (0); -#define USER_REJECT \ - yycolumn -= yyleng; \ - REJECT - %} %x mem @@ -251,10 +246,10 @@ term_name {name_start}[a-zA-Z0-9_*?.\[\]!\-:]* quoted_name [\']{name_start}[a-zA-Z0-9_*?.\[\]!\-:,\.=]*[\'] drv_cfg_term [a-zA-Z0-9_\.]+(=[a-zA-Z0-9_*?\.:]+)? /* - * If you add a modifier you need to update check_modifier(). + * If you add a modifier you need to update modifiers(). * Also, the letters in modifier_event must not be in modifier_bp. */ -modifier_event [ukhpPGHSDIWebR]{1,16} +modifier_event [ukhpPGHSDIWebRX]{1,17} modifier_bp [rwx]{1,3} lc_type (L1-dcache|l1-d|l1d|L1-data|L1-icache|l1-i|l1i|L1-instruction|LLC|L2|dTLB|d-tlb|Data-TLB|iTLB|i-tlb|Instruction-TLB|branch|branches|bpu|btb|bpc|node) lc_op_result (load|loads|read|store|stores|write|prefetch|prefetches|speculative-read|speculative-load|refs|Reference|ops|access|misses|miss) @@ -336,6 +331,7 @@ aux-action { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_AUX_ACTION); } aux-sample-size { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_AUX_SAMPLE_SIZE); } metric-id { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_METRIC_ID); } cpu { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_CPU); } +ratio-to-prev { return term(yyscanner, PARSE_EVENTS__TERM_TYPE_RATIO_TO_PREV); } cpu-cycles|cycles { return hw_term(yyscanner, PERF_COUNT_HW_CPU_CYCLES); } stalled-cycles-frontend|idle-cycles-frontend { return hw_term(yyscanner, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND); } stalled-cycles-backend|idle-cycles-backend { return hw_term(yyscanner, PERF_COUNT_HW_STALLED_CYCLES_BACKEND); } @@ -423,8 +419,3 @@ r{num_raw_hex} { return str(yyscanner, PE_RAW); } . { } %% - -int parse_events_wrap(void *scanner __maybe_unused) -{ - return 1; -} diff --git a/tools/perf/util/pmu.c b/tools/perf/util/pmu.c index 5a291f1380ed..3d1f975e8db9 100644 --- a/tools/perf/util/pmu.c +++ b/tools/perf/util/pmu.c @@ -1541,7 +1541,7 @@ static int pmu_config_term(const struct perf_pmu *pmu, break; case PARSE_EVENTS__TERM_TYPE_USER: /* Not hardcoded. */ return -EINVAL; - case PARSE_EVENTS__TERM_TYPE_NAME ... PARSE_EVENTS__TERM_TYPE_CPU: + case PARSE_EVENTS__TERM_TYPE_NAME ... PARSE_EVENTS__TERM_TYPE_RATIO_TO_PREV: /* Skip non-config terms. */ break; default: @@ -1930,6 +1930,7 @@ int perf_pmu__for_each_format(struct perf_pmu *pmu, void *state, pmu_format_call "aux-action=(pause|resume|start-paused)", "aux-sample-size=number", "cpu=number", + "ratio-to-prev=string", }; struct perf_pmu_format *format; int ret; diff --git a/tools/perf/util/powerpc-vpadtl.c b/tools/perf/util/powerpc-vpadtl.c new file mode 100644 index 000000000000..39a3fb3f1330 --- /dev/null +++ b/tools/perf/util/powerpc-vpadtl.c @@ -0,0 +1,734 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * VPA DTL PMU support + */ + +#include <linux/string.h> +#include <inttypes.h> +#include "color.h" +#include "evlist.h" +#include "session.h" +#include "auxtrace.h" +#include "data.h" +#include "machine.h" +#include "debug.h" +#include "powerpc-vpadtl.h" +#include "sample.h" +#include "tool.h" + +/* + * Structure to save the auxtrace queue + */ +struct powerpc_vpadtl { + struct auxtrace auxtrace; + struct auxtrace_queues queues; + struct auxtrace_heap heap; + u32 auxtrace_type; + struct perf_session *session; + struct machine *machine; + u32 pmu_type; + u64 sample_id; +}; + +struct boottb_freq { + u64 boot_tb; + u64 tb_freq; + u64 timebase; + u64 padded[3]; +}; + +struct powerpc_vpadtl_queue { + struct powerpc_vpadtl *vpa; + unsigned int queue_nr; + struct auxtrace_buffer *buffer; + struct thread *thread; + bool on_heap; + struct powerpc_vpadtl_entry *dtl; + u64 timestamp; + unsigned long pkt_len; + unsigned long buf_len; + u64 boot_tb; + u64 tb_freq; + unsigned int tb_buffer; + unsigned int size; + bool done; + pid_t pid; + pid_t tid; + int cpu; +}; + +const char *dispatch_reasons[11] = { + "external_interrupt", + "firmware_internal_event", + "H_PROD", + "decrementer_interrupt", + "system_reset", + "firmware_internal_event", + "conferred_cycles", + "time_slice", + "virtual_memory_page_fault", + "expropriated_adjunct", + "priv_doorbell"}; + +const char *preempt_reasons[10] = { + "unused", + "firmware_internal_event", + "H_CEDE", + "H_CONFER", + "time_slice", + "migration_hibernation_page_fault", + "virtual_memory_page_fault", + "H_CONFER_ADJUNCT", + "hcall_adjunct", + "HDEC_adjunct"}; + +#define dtl_entry_size sizeof(struct powerpc_vpadtl_entry) + +/* + * Function to dump the dispatch trace data when perf report + * is invoked with -D + */ +static void powerpc_vpadtl_dump(struct powerpc_vpadtl *vpa __maybe_unused, + unsigned char *buf, size_t len) +{ + struct powerpc_vpadtl_entry *dtl; + int pkt_len, pos = 0; + const char *color = PERF_COLOR_BLUE; + + color_fprintf(stdout, color, + ". ... VPA DTL PMU data: size %zu bytes, entries is %zu\n", + len, len/dtl_entry_size); + + if (len % dtl_entry_size) + len = len - (len % dtl_entry_size); + + while (len) { + pkt_len = dtl_entry_size; + printf("."); + color_fprintf(stdout, color, " %08x: ", pos); + dtl = (struct powerpc_vpadtl_entry *)buf; + if (dtl->timebase != 0) { + printf("dispatch_reason:%s, preempt_reason:%s, " + "enqueue_to_dispatch_time:%d, ready_to_enqueue_time:%d, " + "waiting_to_ready_time:%d\n", + dispatch_reasons[dtl->dispatch_reason], + preempt_reasons[dtl->preempt_reason], + be32_to_cpu(dtl->enqueue_to_dispatch_time), + be32_to_cpu(dtl->ready_to_enqueue_time), + be32_to_cpu(dtl->waiting_to_ready_time)); + } else { + struct boottb_freq *boot_tb = (struct boottb_freq *)buf; + + printf("boot_tb: %" PRIu64 ", tb_freq: %" PRIu64 "\n", + boot_tb->boot_tb, boot_tb->tb_freq); + } + + pos += pkt_len; + buf += pkt_len; + len -= pkt_len; + } +} + +static unsigned long long powerpc_vpadtl_timestamp(struct powerpc_vpadtl_queue *vpaq) +{ + struct powerpc_vpadtl_entry *record = vpaq->dtl; + unsigned long long timestamp = 0; + unsigned long long boot_tb; + unsigned long long diff; + double result, div; + double boot_freq; + /* + * Formula used to get timestamp that can be co-related with + * other perf events: + * ((timbase from DTL entry - boot time) / frequency) * 1000000000 + */ + if (record->timebase) { + boot_tb = vpaq->boot_tb; + boot_freq = vpaq->tb_freq; + diff = be64_to_cpu(record->timebase) - boot_tb; + div = diff / boot_freq; + result = div; + result = result * 1000000000; + timestamp = result; + } + + return timestamp; +} + +static struct powerpc_vpadtl *session_to_vpa(struct perf_session *session) +{ + return container_of(session->auxtrace, struct powerpc_vpadtl, auxtrace); +} + +static void powerpc_vpadtl_dump_event(struct powerpc_vpadtl *vpa, unsigned char *buf, + size_t len) +{ + printf(".\n"); + powerpc_vpadtl_dump(vpa, buf, len); +} + +/* + * Generate perf sample for each entry in the dispatch trace log. + * - sample ip is picked from srr0 field of powerpc_vpadtl_entry + * - sample cpu is logical cpu. + * - cpumode is set to PERF_RECORD_MISC_KERNEL + * - Additionally save the details in raw_data of sample. This + * is to print the relevant fields in perf_sample__fprintf_synth() + * when called from builtin-script + */ +static int powerpc_vpadtl_sample(struct powerpc_vpadtl_entry *record, + struct powerpc_vpadtl *vpa, u64 save, int cpu) +{ + struct perf_sample sample; + union perf_event event; + + sample.ip = be64_to_cpu(record->srr0); + sample.period = 1; + sample.cpu = cpu; + sample.id = vpa->sample_id; + sample.callchain = NULL; + sample.branch_stack = NULL; + memset(&event, 0, sizeof(event)); + sample.cpumode = PERF_RECORD_MISC_KERNEL; + sample.time = save; + sample.raw_data = record; + sample.raw_size = sizeof(record); + event.sample.header.type = PERF_RECORD_SAMPLE; + event.sample.header.misc = sample.cpumode; + event.sample.header.size = sizeof(struct perf_event_header); + + if (perf_session__deliver_synth_event(vpa->session, &event, &sample)) { + pr_debug("Failed to create sample for dtl entry\n"); + return -1; + } + + return 0; +} + +static int powerpc_vpadtl_get_buffer(struct powerpc_vpadtl_queue *vpaq) +{ + struct auxtrace_buffer *buffer = vpaq->buffer; + struct auxtrace_queues *queues = &vpaq->vpa->queues; + struct auxtrace_queue *queue; + + queue = &queues->queue_array[vpaq->queue_nr]; + buffer = auxtrace_buffer__next(queue, buffer); + + if (!buffer) + return 0; + + vpaq->buffer = buffer; + vpaq->size = buffer->size; + + /* If the aux_buffer doesn't have data associated, try to load it */ + if (!buffer->data) { + /* get the file desc associated with the perf data file */ + int fd = perf_data__fd(vpaq->vpa->session->data); + + buffer->data = auxtrace_buffer__get_data(buffer, fd); + if (!buffer->data) + return -ENOMEM; + } + + vpaq->buf_len = buffer->size; + + if (buffer->size % dtl_entry_size) + vpaq->buf_len = buffer->size - (buffer->size % dtl_entry_size); + + if (vpaq->tb_buffer != buffer->buffer_nr) { + vpaq->pkt_len = 0; + vpaq->tb_buffer = 0; + } + + return 1; +} + +/* + * The first entry in the queue for VPA DTL PMU has the boot timebase, + * frequency details which are needed to get timestamp which is required to + * correlate with other events. Save the boot_tb and tb_freq as part of + * powerpc_vpadtl_queue. The very next entry is the actual trace data to + * be returned. + */ +static int powerpc_vpadtl_decode(struct powerpc_vpadtl_queue *vpaq) +{ + int ret; + char *buf; + struct boottb_freq *boottb; + + ret = powerpc_vpadtl_get_buffer(vpaq); + if (ret <= 0) + return ret; + + boottb = (struct boottb_freq *)vpaq->buffer->data; + if (boottb->timebase == 0) { + vpaq->boot_tb = boottb->boot_tb; + vpaq->tb_freq = boottb->tb_freq; + vpaq->pkt_len += dtl_entry_size; + } + + buf = vpaq->buffer->data; + buf += vpaq->pkt_len; + vpaq->dtl = (struct powerpc_vpadtl_entry *)buf; + + vpaq->tb_buffer = vpaq->buffer->buffer_nr; + vpaq->buffer = NULL; + vpaq->buf_len = 0; + + return 1; +} + +static int powerpc_vpadtl_decode_all(struct powerpc_vpadtl_queue *vpaq) +{ + int ret; + unsigned char *buf; + + if (!vpaq->buf_len || vpaq->pkt_len == vpaq->size) { + ret = powerpc_vpadtl_get_buffer(vpaq); + if (ret <= 0) + return ret; + } + + if (vpaq->buffer) { + buf = vpaq->buffer->data; + buf += vpaq->pkt_len; + vpaq->dtl = (struct powerpc_vpadtl_entry *)buf; + if ((long long)be64_to_cpu(vpaq->dtl->timebase) <= 0) { + if (vpaq->pkt_len != dtl_entry_size && vpaq->buf_len) { + vpaq->pkt_len += dtl_entry_size; + vpaq->buf_len -= dtl_entry_size; + } + return -1; + } + vpaq->pkt_len += dtl_entry_size; + vpaq->buf_len -= dtl_entry_size; + } else { + return 0; + } + + return 1; +} + +static int powerpc_vpadtl_run_decoder(struct powerpc_vpadtl_queue *vpaq, u64 *timestamp) +{ + struct powerpc_vpadtl *vpa = vpaq->vpa; + struct powerpc_vpadtl_entry *record; + int ret; + unsigned long long vpaq_timestamp; + + while (1) { + ret = powerpc_vpadtl_decode_all(vpaq); + if (!ret) { + pr_debug("All data in the queue has been processed.\n"); + return 1; + } + + /* + * Error is detected when decoding VPA PMU trace. Continue to + * the next trace data and find out more dtl entries. + */ + if (ret < 0) + continue; + + record = vpaq->dtl; + + vpaq_timestamp = powerpc_vpadtl_timestamp(vpaq); + + /* Update timestamp for the last record */ + if (vpaq_timestamp > vpaq->timestamp) + vpaq->timestamp = vpaq_timestamp; + + /* + * If the timestamp of the queue is later than timestamp of the + * coming perf event, bail out so can allow the perf event to + * be processed ahead. + */ + if (vpaq->timestamp >= *timestamp) { + *timestamp = vpaq->timestamp; + vpaq->pkt_len -= dtl_entry_size; + vpaq->buf_len += dtl_entry_size; + return 0; + } + + ret = powerpc_vpadtl_sample(record, vpa, vpaq_timestamp, vpaq->cpu); + if (ret) + continue; + } + return 0; +} + +/* + * For each of the PERF_RECORD_XX record, compare the timestamp + * of perf record with timestamp of top element in the auxtrace heap. + * Process the auxtrace queue if the timestamp of element from heap is + * lower than timestamp from entry in perf record. + * + * Update the timestamp of the auxtrace heap with the timestamp + * of last processed entry from the auxtrace buffer. + */ +static int powerpc_vpadtl_process_queues(struct powerpc_vpadtl *vpa, u64 timestamp) +{ + unsigned int queue_nr; + u64 ts; + int ret; + + while (1) { + struct auxtrace_queue *queue; + struct powerpc_vpadtl_queue *vpaq; + + if (!vpa->heap.heap_cnt) + return 0; + + if (vpa->heap.heap_array[0].ordinal >= timestamp) + return 0; + + queue_nr = vpa->heap.heap_array[0].queue_nr; + queue = &vpa->queues.queue_array[queue_nr]; + vpaq = queue->priv; + + auxtrace_heap__pop(&vpa->heap); + + if (vpa->heap.heap_cnt) { + ts = vpa->heap.heap_array[0].ordinal + 1; + if (ts > timestamp) + ts = timestamp; + } else { + ts = timestamp; + } + + ret = powerpc_vpadtl_run_decoder(vpaq, &ts); + if (ret < 0) { + auxtrace_heap__add(&vpa->heap, queue_nr, ts); + return ret; + } + + if (!ret) { + ret = auxtrace_heap__add(&vpa->heap, queue_nr, ts); + if (ret < 0) + return ret; + } else { + vpaq->on_heap = false; + } + } + return 0; +} + +static struct powerpc_vpadtl_queue *powerpc_vpadtl__alloc_queue(struct powerpc_vpadtl *vpa, + unsigned int queue_nr) +{ + struct powerpc_vpadtl_queue *vpaq; + + vpaq = zalloc(sizeof(*vpaq)); + if (!vpaq) + return NULL; + + vpaq->vpa = vpa; + vpaq->queue_nr = queue_nr; + + return vpaq; +} + +/* + * When the Dispatch Trace Log data is collected along with other events + * like sched tracepoint events, it needs to be correlated and present + * interleaved along with these events. Perf events can be collected + * parallely across the CPUs. + * + * An auxtrace_queue is created for each CPU. Data within each queue is in + * increasing order of timestamp. Allocate and setup auxtrace queues here. + * All auxtrace queues is maintained in auxtrace heap in the increasing order + * of timestamp. So always the lowest timestamp (entries to be processed first) + * is on top of the heap. + * + * To add to auxtrace heap, fetch the timestamp from first DTL entry + * for each of the queue. + */ +static int powerpc_vpadtl__setup_queue(struct powerpc_vpadtl *vpa, + struct auxtrace_queue *queue, + unsigned int queue_nr) +{ + struct powerpc_vpadtl_queue *vpaq = queue->priv; + + if (list_empty(&queue->head) || vpaq) + return 0; + + vpaq = powerpc_vpadtl__alloc_queue(vpa, queue_nr); + if (!vpaq) + return -ENOMEM; + + queue->priv = vpaq; + + if (queue->cpu != -1) + vpaq->cpu = queue->cpu; + + if (!vpaq->on_heap) { + int ret; +retry: + ret = powerpc_vpadtl_decode(vpaq); + if (!ret) + return 0; + + if (ret < 0) + goto retry; + + vpaq->timestamp = powerpc_vpadtl_timestamp(vpaq); + + ret = auxtrace_heap__add(&vpa->heap, queue_nr, vpaq->timestamp); + if (ret) + return ret; + vpaq->on_heap = true; + } + + return 0; +} + +static int powerpc_vpadtl__setup_queues(struct powerpc_vpadtl *vpa) +{ + unsigned int i; + int ret; + + for (i = 0; i < vpa->queues.nr_queues; i++) { + ret = powerpc_vpadtl__setup_queue(vpa, &vpa->queues.queue_array[i], i); + if (ret) + return ret; + } + + return 0; +} + +static int powerpc_vpadtl__update_queues(struct powerpc_vpadtl *vpa) +{ + if (vpa->queues.new_data) { + vpa->queues.new_data = false; + return powerpc_vpadtl__setup_queues(vpa); + } + + return 0; +} + +static int powerpc_vpadtl_process_event(struct perf_session *session, + union perf_event *event __maybe_unused, + struct perf_sample *sample, + const struct perf_tool *tool) +{ + struct powerpc_vpadtl *vpa = session_to_vpa(session); + int err = 0; + + if (dump_trace) + return 0; + + if (!tool->ordered_events) { + pr_err("VPA requires ordered events\n"); + return -EINVAL; + } + + if (sample->time) { + err = powerpc_vpadtl__update_queues(vpa); + if (err) + return err; + + err = powerpc_vpadtl_process_queues(vpa, sample->time); + } + + return err; +} + +/* + * Process PERF_RECORD_AUXTRACE records + */ +static int powerpc_vpadtl_process_auxtrace_event(struct perf_session *session, + union perf_event *event, + const struct perf_tool *tool __maybe_unused) +{ + struct powerpc_vpadtl *vpa = session_to_vpa(session); + struct auxtrace_buffer *buffer; + int fd = perf_data__fd(session->data); + off_t data_offset; + int err; + + if (!dump_trace) + return 0; + + if (perf_data__is_pipe(session->data)) { + data_offset = 0; + } else { + data_offset = lseek(fd, 0, SEEK_CUR); + if (data_offset == -1) + return -errno; + } + + err = auxtrace_queues__add_event(&vpa->queues, session, event, + data_offset, &buffer); + + if (err) + return err; + + /* Dump here now we have copied a piped trace out of the pipe */ + if (auxtrace_buffer__get_data(buffer, fd)) { + powerpc_vpadtl_dump_event(vpa, buffer->data, buffer->size); + auxtrace_buffer__put_data(buffer); + } + + return 0; +} + +static int powerpc_vpadtl_flush(struct perf_session *session __maybe_unused, + const struct perf_tool *tool __maybe_unused) +{ + return 0; +} + +static void powerpc_vpadtl_free_events(struct perf_session *session) +{ + struct powerpc_vpadtl *vpa = session_to_vpa(session); + struct auxtrace_queues *queues = &vpa->queues; + + for (unsigned int i = 0; i < queues->nr_queues; i++) + zfree(&queues->queue_array[i].priv); + + auxtrace_queues__free(queues); +} + +static void powerpc_vpadtl_free(struct perf_session *session) +{ + struct powerpc_vpadtl *vpa = session_to_vpa(session); + + auxtrace_heap__free(&vpa->heap); + powerpc_vpadtl_free_events(session); + session->auxtrace = NULL; + free(vpa); +} + +static const char * const powerpc_vpadtl_info_fmts[] = { + [POWERPC_VPADTL_TYPE] = " PMU Type %"PRId64"\n", +}; + +static void powerpc_vpadtl_print_info(__u64 *arr) +{ + if (!dump_trace) + return; + + fprintf(stdout, powerpc_vpadtl_info_fmts[POWERPC_VPADTL_TYPE], arr[POWERPC_VPADTL_TYPE]); +} + +static void set_event_name(struct evlist *evlist, u64 id, + const char *name) +{ + struct evsel *evsel; + + evlist__for_each_entry(evlist, evsel) { + if (evsel->core.id && evsel->core.id[0] == id) { + if (evsel->name) + zfree(&evsel->name); + evsel->name = strdup(name); + break; + } + } +} + +static int +powerpc_vpadtl_synth_events(struct powerpc_vpadtl *vpa, struct perf_session *session) +{ + struct evlist *evlist = session->evlist; + struct evsel *evsel; + struct perf_event_attr attr; + bool found = false; + u64 id; + int err; + + evlist__for_each_entry(evlist, evsel) { + if (strstarts(evsel->name, "vpa_dtl")) { + found = true; + break; + } + } + + if (!found) { + pr_debug("No selected events with VPA trace data\n"); + return 0; + } + + memset(&attr, 0, sizeof(struct perf_event_attr)); + attr.size = sizeof(struct perf_event_attr); + attr.sample_type = evsel->core.attr.sample_type; + attr.sample_id_all = evsel->core.attr.sample_id_all; + attr.type = PERF_TYPE_SYNTH; + attr.config = PERF_SYNTH_POWERPC_VPA_DTL; + + /* create new id val to be a fixed offset from evsel id */ + id = evsel->core.id[0] + 1000000000; + if (!id) + id = 1; + + err = perf_session__deliver_synth_attr_event(session, &attr, id); + if (err) + return err; + + vpa->sample_id = id; + set_event_name(evlist, id, "vpa-dtl"); + + return 0; +} + +/* + * Process the PERF_RECORD_AUXTRACE_INFO records and setup + * the infrastructure to process auxtrace events. PERF_RECORD_AUXTRACE_INFO + * is processed first since it is of type perf_user_event_type. + * Initialise the aux buffer queues using auxtrace_queues__init(). + * auxtrace_queue is created for each CPU. + */ +int powerpc_vpadtl_process_auxtrace_info(union perf_event *event, + struct perf_session *session) +{ + struct perf_record_auxtrace_info *auxtrace_info = &event->auxtrace_info; + size_t min_sz = sizeof(u64) * POWERPC_VPADTL_TYPE; + struct powerpc_vpadtl *vpa; + int err; + + if (auxtrace_info->header.size < sizeof(struct perf_record_auxtrace_info) + + min_sz) + return -EINVAL; + + vpa = zalloc(sizeof(struct powerpc_vpadtl)); + if (!vpa) + return -ENOMEM; + + err = auxtrace_queues__init(&vpa->queues); + if (err) + goto err_free; + + vpa->session = session; + vpa->machine = &session->machines.host; /* No kvm support */ + vpa->auxtrace_type = auxtrace_info->type; + vpa->pmu_type = auxtrace_info->priv[POWERPC_VPADTL_TYPE]; + + vpa->auxtrace.process_event = powerpc_vpadtl_process_event; + vpa->auxtrace.process_auxtrace_event = powerpc_vpadtl_process_auxtrace_event; + vpa->auxtrace.flush_events = powerpc_vpadtl_flush; + vpa->auxtrace.free_events = powerpc_vpadtl_free_events; + vpa->auxtrace.free = powerpc_vpadtl_free; + session->auxtrace = &vpa->auxtrace; + + powerpc_vpadtl_print_info(&auxtrace_info->priv[0]); + + if (dump_trace) + return 0; + + err = powerpc_vpadtl_synth_events(vpa, session); + if (err) + goto err_free_queues; + + err = auxtrace_queues__process_index(&vpa->queues, session); + if (err) + goto err_free_queues; + + return 0; + +err_free_queues: + auxtrace_queues__free(&vpa->queues); + session->auxtrace = NULL; + +err_free: + free(vpa); + return err; +} diff --git a/tools/perf/util/powerpc-vpadtl.h b/tools/perf/util/powerpc-vpadtl.h new file mode 100644 index 000000000000..ca809660b9bb --- /dev/null +++ b/tools/perf/util/powerpc-vpadtl.h @@ -0,0 +1,23 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * VPA DTL PMU Support + */ + +#ifndef INCLUDE__PERF_POWERPC_VPADTL_H__ +#define INCLUDE__PERF_POWERPC_VPADTL_H__ + +enum { + POWERPC_VPADTL_TYPE, + VPADTL_AUXTRACE_PRIV_MAX, +}; + +#define VPADTL_AUXTRACE_PRIV_SIZE (VPADTL_AUXTRACE_PRIV_MAX * sizeof(u64)) + +union perf_event; +struct perf_session; +struct perf_pmu; + +int powerpc_vpadtl_process_auxtrace_info(union perf_event *event, + struct perf_session *session); + +#endif diff --git a/tools/perf/util/print_insn.c b/tools/perf/util/print_insn.c index a33a7726422d..02e6fbb8ca04 100644 --- a/tools/perf/util/print_insn.c +++ b/tools/perf/util/print_insn.c @@ -7,6 +7,7 @@ #include <inttypes.h> #include <string.h> #include <stdbool.h> +#include "capstone.h" #include "debug.h" #include "sample.h" #include "symbol.h" @@ -29,84 +30,6 @@ size_t sample__fprintf_insn_raw(struct perf_sample *sample, FILE *fp) return printed; } -#ifdef HAVE_LIBCAPSTONE_SUPPORT -#include <capstone/capstone.h> - -int capstone_init(struct machine *machine, csh *cs_handle, bool is64, bool disassembler_style); - -int capstone_init(struct machine *machine, csh *cs_handle, bool is64, bool disassembler_style) -{ - cs_arch arch; - cs_mode mode; - - if (machine__is(machine, "x86_64") && is64) { - arch = CS_ARCH_X86; - mode = CS_MODE_64; - } else if (machine__normalized_is(machine, "x86")) { - arch = CS_ARCH_X86; - mode = CS_MODE_32; - } else if (machine__normalized_is(machine, "arm64")) { - arch = CS_ARCH_ARM64; - mode = CS_MODE_ARM; - } else if (machine__normalized_is(machine, "arm")) { - arch = CS_ARCH_ARM; - mode = CS_MODE_ARM + CS_MODE_V8; - } else if (machine__normalized_is(machine, "s390")) { - arch = CS_ARCH_SYSZ; - mode = CS_MODE_BIG_ENDIAN; - } else { - return -1; - } - - if (cs_open(arch, mode, cs_handle) != CS_ERR_OK) { - pr_warning_once("cs_open failed\n"); - return -1; - } - - if (machine__normalized_is(machine, "x86")) { - /* - * In case of using capstone_init while symbol__disassemble - * setting CS_OPT_SYNTAX_ATT depends if disassembler_style opts - * is set via annotation args - */ - if (disassembler_style) - cs_option(*cs_handle, CS_OPT_SYNTAX, CS_OPT_SYNTAX_ATT); - /* - * Resolving address operands to symbols is implemented - * on x86 by investigating instruction details. - */ - cs_option(*cs_handle, CS_OPT_DETAIL, CS_OPT_ON); - } - - return 0; -} - -static size_t print_insn_x86(struct thread *thread, u8 cpumode, cs_insn *insn, - int print_opts, FILE *fp) -{ - struct addr_location al; - size_t printed = 0; - - if (insn->detail && insn->detail->x86.op_count == 1) { - cs_x86_op *op = &insn->detail->x86.operands[0]; - - addr_location__init(&al); - if (op->type == X86_OP_IMM && - thread__find_symbol(thread, cpumode, op->imm, &al)) { - printed += fprintf(fp, "%s ", insn[0].mnemonic); - printed += symbol__fprintf_symname_offs(al.sym, &al, fp); - if (print_opts & PRINT_INSN_IMM_HEX) - printed += fprintf(fp, " [%#" PRIx64 "]", op->imm); - addr_location__exit(&al); - return printed; - } - addr_location__exit(&al); - } - - printed += fprintf(fp, "%s %s", insn[0].mnemonic, insn[0].op_str); - return printed; -} - static bool is64bitip(struct machine *machine, struct addr_location *al) { const struct dso *dso = al->map ? map__dso(al->map) : NULL; @@ -123,32 +46,8 @@ ssize_t fprintf_insn_asm(struct machine *machine, struct thread *thread, u8 cpum bool is64bit, const uint8_t *code, size_t code_size, uint64_t ip, int *lenp, int print_opts, FILE *fp) { - size_t printed; - cs_insn *insn; - csh cs_handle; - size_t count; - int ret; - - /* TODO: Try to initiate capstone only once but need a proper place. */ - ret = capstone_init(machine, &cs_handle, is64bit, true); - if (ret < 0) - return ret; - - count = cs_disasm(cs_handle, code, code_size, ip, 1, &insn); - if (count > 0) { - if (machine__normalized_is(machine, "x86")) - printed = print_insn_x86(thread, cpumode, &insn[0], print_opts, fp); - else - printed = fprintf(fp, "%s %s", insn[0].mnemonic, insn[0].op_str); - if (lenp) - *lenp = insn->size; - cs_free(insn, count); - } else { - printed = -1; - } - - cs_close(&cs_handle); - return printed; + return capstone__fprintf_insn_asm(machine, thread, cpumode, is64bit, code, code_size, + ip, lenp, print_opts, fp); } size_t sample__fprintf_insn_asm(struct perf_sample *sample, struct thread *thread, @@ -166,13 +65,3 @@ size_t sample__fprintf_insn_asm(struct perf_sample *sample, struct thread *threa return printed; } -#else -size_t sample__fprintf_insn_asm(struct perf_sample *sample __maybe_unused, - struct thread *thread __maybe_unused, - struct machine *machine __maybe_unused, - FILE *fp __maybe_unused, - struct addr_location *al __maybe_unused) -{ - return 0; -} -#endif diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index ea77bea0306f..779fe1280a56 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -14,10 +14,12 @@ #include "evlist.h" #include "evsel.h" #include "event.h" +#include "expr.h" #include "print_binary.h" #include "record.h" #include "strbuf.h" #include "thread_map.h" +#include "tp_pmu.h" #include "trace-event.h" #include "metricgroup.h" #include "mmap.h" @@ -485,13 +487,19 @@ static PyObject *pyrf_event__new(const union perf_event *event) if ((event->header.type < PERF_RECORD_MMAP || event->header.type > PERF_RECORD_SAMPLE) && !(event->header.type == PERF_RECORD_SWITCH || - event->header.type == PERF_RECORD_SWITCH_CPU_WIDE)) + event->header.type == PERF_RECORD_SWITCH_CPU_WIDE)) { + PyErr_Format(PyExc_TypeError, "Unexpected header type %u", + event->header.type); return NULL; + } // FIXME this better be dynamic or we need to parse everything // before calling perf_mmap__consume(), including tracepoint fields. - if (sizeof(pevent->event) < event->header.size) + if (sizeof(pevent->event) < event->header.size) { + PyErr_Format(PyExc_TypeError, "Unexpected event size: %zd < %u", + sizeof(pevent->event), event->header.size); return NULL; + } ptype = pyrf_event__type[event->header.type]; pevent = PyObject_New(struct pyrf_event, ptype); @@ -642,6 +650,209 @@ static int pyrf_thread_map__setup_types(void) return PyType_Ready(&pyrf_thread_map__type); } +/** + * A python wrapper for perf_pmus that are globally owned by the pmus.c code. + */ +struct pyrf_pmu { + PyObject_HEAD + + struct perf_pmu *pmu; +}; + +static void pyrf_pmu__delete(struct pyrf_pmu *ppmu) +{ + Py_TYPE(ppmu)->tp_free((PyObject *)ppmu); +} + +static PyObject *pyrf_pmu__name(PyObject *self) +{ + struct pyrf_pmu *ppmu = (void *)self; + + return PyUnicode_FromString(ppmu->pmu->name); +} + +static bool add_to_dict(PyObject *dict, const char *key, const char *value) +{ + PyObject *pkey, *pvalue; + bool ret; + + if (value == NULL) + return true; + + pkey = PyUnicode_FromString(key); + pvalue = PyUnicode_FromString(value); + + ret = pkey && pvalue && PyDict_SetItem(dict, pkey, pvalue) == 0; + Py_XDECREF(pkey); + Py_XDECREF(pvalue); + return ret; +} + +static int pyrf_pmu__events_cb(void *state, struct pmu_event_info *info) +{ + PyObject *py_list = state; + PyObject *dict = PyDict_New(); + + if (!dict) + return -ENOMEM; + + if (!add_to_dict(dict, "name", info->name) || + !add_to_dict(dict, "alias", info->alias) || + !add_to_dict(dict, "scale_unit", info->scale_unit) || + !add_to_dict(dict, "desc", info->desc) || + !add_to_dict(dict, "long_desc", info->long_desc) || + !add_to_dict(dict, "encoding_desc", info->encoding_desc) || + !add_to_dict(dict, "topic", info->topic) || + !add_to_dict(dict, "event_type_desc", info->event_type_desc) || + !add_to_dict(dict, "str", info->str) || + !add_to_dict(dict, "deprecated", info->deprecated ? "deprecated" : NULL) || + PyList_Append(py_list, dict) != 0) { + Py_DECREF(dict); + return -ENOMEM; + } + Py_DECREF(dict); + return 0; +} + +static PyObject *pyrf_pmu__events(PyObject *self) +{ + struct pyrf_pmu *ppmu = (void *)self; + PyObject *py_list = PyList_New(0); + int ret; + + if (!py_list) + return NULL; + + ret = perf_pmu__for_each_event(ppmu->pmu, + /*skip_duplicate_pmus=*/false, + py_list, + pyrf_pmu__events_cb); + if (ret) { + Py_DECREF(py_list); + errno = -ret; + PyErr_SetFromErrno(PyExc_OSError); + return NULL; + } + return py_list; +} + +static PyObject *pyrf_pmu__repr(PyObject *self) +{ + struct pyrf_pmu *ppmu = (void *)self; + + return PyUnicode_FromFormat("pmu(%s)", ppmu->pmu->name); +} + +static const char pyrf_pmu__doc[] = PyDoc_STR("perf Performance Monitoring Unit (PMU) object."); + +static PyMethodDef pyrf_pmu__methods[] = { + { + .ml_name = "events", + .ml_meth = (PyCFunction)pyrf_pmu__events, + .ml_flags = METH_NOARGS, + .ml_doc = PyDoc_STR("Returns a sequence of events encoded as a dictionaries.") + }, + { + .ml_name = "name", + .ml_meth = (PyCFunction)pyrf_pmu__name, + .ml_flags = METH_NOARGS, + .ml_doc = PyDoc_STR("Name of the PMU including suffixes.") + }, + { .ml_name = NULL, } +}; + +/** The python type for a perf.pmu. */ +static PyTypeObject pyrf_pmu__type = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_name = "perf.pmu", + .tp_basicsize = sizeof(struct pyrf_pmu), + .tp_dealloc = (destructor)pyrf_pmu__delete, + .tp_flags = Py_TPFLAGS_DEFAULT|Py_TPFLAGS_BASETYPE, + .tp_doc = pyrf_pmu__doc, + .tp_methods = pyrf_pmu__methods, + .tp_str = pyrf_pmu__name, + .tp_repr = pyrf_pmu__repr, +}; + +static int pyrf_pmu__setup_types(void) +{ + pyrf_pmu__type.tp_new = PyType_GenericNew; + return PyType_Ready(&pyrf_pmu__type); +} + + +/** A python iterator for pmus that has no equivalent in the C code. */ +struct pyrf_pmu_iterator { + PyObject_HEAD + struct perf_pmu *pmu; +}; + +static void pyrf_pmu_iterator__dealloc(struct pyrf_pmu_iterator *self) +{ + Py_TYPE(self)->tp_free((PyObject *) self); +} + +static PyObject *pyrf_pmu_iterator__new(PyTypeObject *type, PyObject *args __maybe_unused, + PyObject *kwds __maybe_unused) +{ + struct pyrf_pmu_iterator *itr = (void *)type->tp_alloc(type, 0); + + if (itr != NULL) + itr->pmu = perf_pmus__scan(/*pmu=*/NULL); + + return (PyObject *) itr; +} + +static PyObject *pyrf_pmu_iterator__iter(PyObject *self) +{ + Py_INCREF(self); + return self; +} + +static PyObject *pyrf_pmu_iterator__iternext(PyObject *self) +{ + struct pyrf_pmu_iterator *itr = (void *)self; + struct pyrf_pmu *ppmu; + + if (itr->pmu == NULL) { + PyErr_SetNone(PyExc_StopIteration); + return NULL; + } + // Create object to return. + ppmu = PyObject_New(struct pyrf_pmu, &pyrf_pmu__type); + if (ppmu) { + ppmu->pmu = itr->pmu; + // Advance iterator. + itr->pmu = perf_pmus__scan(itr->pmu); + } + return (PyObject *)ppmu; +} + +/** The python type for the PMU iterator. */ +static PyTypeObject pyrf_pmu_iterator__type = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_name = "pmus.iterator", + .tp_doc = "Iterator for the pmus string sequence.", + .tp_basicsize = sizeof(struct pyrf_pmu_iterator), + .tp_itemsize = 0, + .tp_flags = Py_TPFLAGS_DEFAULT, + .tp_new = pyrf_pmu_iterator__new, + .tp_dealloc = (destructor) pyrf_pmu_iterator__dealloc, + .tp_iter = pyrf_pmu_iterator__iter, + .tp_iternext = pyrf_pmu_iterator__iternext, +}; + +static int pyrf_pmu_iterator__setup_types(void) +{ + return PyType_Ready(&pyrf_pmu_iterator__type); +} + +static PyObject *pyrf__pmus(PyObject *self, PyObject *args) +{ + // Calling the class creates an instance of the iterator. + return PyObject_CallObject((PyObject *) &pyrf_pmu_iterator__type, /*args=*/NULL); +} + struct pyrf_counts_values { PyObject_HEAD @@ -1093,6 +1304,151 @@ static PyObject *pyrf_evlist__all_cpus(struct pyrf_evlist *pevlist) return (PyObject *)pcpu_map; } +static PyObject *pyrf_evlist__metrics(struct pyrf_evlist *pevlist) +{ + PyObject *list = PyList_New(/*len=*/0); + struct rb_node *node; + + if (!list) + return NULL; + + for (node = rb_first_cached(&pevlist->evlist.metric_events.entries); node; + node = rb_next(node)) { + struct metric_event *me = container_of(node, struct metric_event, nd); + struct list_head *pos; + + list_for_each(pos, &me->head) { + struct metric_expr *expr = container_of(pos, struct metric_expr, nd); + PyObject *str = PyUnicode_FromString(expr->metric_name); + + if (!str || PyList_Append(list, str) != 0) { + Py_DECREF(list); + return NULL; + } + Py_DECREF(str); + } + } + return list; +} + +static int prepare_metric(const struct metric_expr *mexp, + const struct evsel *evsel, + struct expr_parse_ctx *pctx, + int cpu_idx, int thread_idx) +{ + struct evsel * const *metric_events = mexp->metric_events; + struct metric_ref *metric_refs = mexp->metric_refs; + + for (int i = 0; metric_events[i]; i++) { + char *n = strdup(evsel__metric_id(metric_events[i])); + double val, ena, run; + int source_count = evsel__source_count(metric_events[i]); + int ret; + struct perf_counts_values *old_count, *new_count; + + if (!n) + return -ENOMEM; + + if (source_count == 0) + source_count = 1; + + ret = evsel__ensure_counts(metric_events[i]); + if (ret) + return ret; + + /* Set up pointers to the old and newly read counter values. */ + old_count = perf_counts(metric_events[i]->prev_raw_counts, cpu_idx, thread_idx); + new_count = perf_counts(metric_events[i]->counts, cpu_idx, thread_idx); + /* Update the value in metric_events[i]->counts. */ + evsel__read_counter(metric_events[i], cpu_idx, thread_idx); + + val = new_count->val - old_count->val; + ena = new_count->ena - old_count->ena; + run = new_count->run - old_count->run; + + if (ena != run && run != 0) + val = val * ena / run; + ret = expr__add_id_val_source_count(pctx, n, val, source_count); + if (ret) + return ret; + } + + for (int i = 0; metric_refs && metric_refs[i].metric_name; i++) { + int ret = expr__add_ref(pctx, &metric_refs[i]); + + if (ret) + return ret; + } + + return 0; +} + +static PyObject *pyrf_evlist__compute_metric(struct pyrf_evlist *pevlist, + PyObject *args, PyObject *kwargs) +{ + int ret, cpu = 0, cpu_idx = 0, thread = 0, thread_idx = 0; + const char *metric; + struct rb_node *node; + struct metric_expr *mexp = NULL; + struct expr_parse_ctx *pctx; + double result = 0; + + if (!PyArg_ParseTuple(args, "sii", &metric, &cpu, &thread)) + return NULL; + + for (node = rb_first_cached(&pevlist->evlist.metric_events.entries); + mexp == NULL && node; + node = rb_next(node)) { + struct metric_event *me = container_of(node, struct metric_event, nd); + struct list_head *pos; + + list_for_each(pos, &me->head) { + struct metric_expr *e = container_of(pos, struct metric_expr, nd); + + if (strcmp(e->metric_name, metric)) + continue; + + if (e->metric_events[0] == NULL) + continue; + + cpu_idx = perf_cpu_map__idx(e->metric_events[0]->core.cpus, + (struct perf_cpu){.cpu = cpu}); + if (cpu_idx < 0) + continue; + + thread_idx = perf_thread_map__idx(e->metric_events[0]->core.threads, + thread); + if (thread_idx < 0) + continue; + + mexp = e; + break; + } + } + if (!mexp) { + PyErr_Format(PyExc_TypeError, "Unknown metric '%s' for CPU '%d' and thread '%d'", + metric, cpu, thread); + return NULL; + } + + pctx = expr__ctx_new(); + if (!pctx) + return PyErr_NoMemory(); + + ret = prepare_metric(mexp, mexp->metric_events[0], pctx, cpu_idx, thread_idx); + if (ret) { + expr__ctx_free(pctx); + errno = -ret; + PyErr_SetFromErrno(PyExc_OSError); + return NULL; + } + if (expr__parse(&result, pctx, mexp->metric_expr)) + result = 0.0; + + expr__ctx_free(pctx); + return PyFloat_FromDouble(result); +} + static PyObject *pyrf_evlist__mmap(struct pyrf_evlist *pevlist, PyObject *args, PyObject *kwargs) { @@ -1209,8 +1565,10 @@ static PyObject *pyrf_evlist__read_on_cpu(struct pyrf_evlist *pevlist, return NULL; md = get_md(evlist, cpu); - if (!md) + if (!md) { + PyErr_Format(PyExc_TypeError, "Unknown CPU '%d'", cpu); return NULL; + } if (perf_mmap__read_init(&md->core) < 0) goto end; @@ -1320,6 +1678,18 @@ static PyMethodDef pyrf_evlist__methods[] = { .ml_doc = PyDoc_STR("CPU map union of all evsel CPU maps.") }, { + .ml_name = "metrics", + .ml_meth = (PyCFunction)pyrf_evlist__metrics, + .ml_flags = METH_NOARGS, + .ml_doc = PyDoc_STR("List of metric names within the evlist.") + }, + { + .ml_name = "compute_metric", + .ml_meth = (PyCFunction)pyrf_evlist__compute_metric, + .ml_flags = METH_VARARGS | METH_KEYWORDS, + .ml_doc = PyDoc_STR("compute metric for given name, cpu and thread") + }, + { .ml_name = "mmap", .ml_meth = (PyCFunction)pyrf_evlist__mmap, .ml_flags = METH_VARARGS | METH_KEYWORDS, @@ -1546,10 +1916,6 @@ static const struct perf_constant perf__constants[] = { static PyObject *pyrf__tracepoint(struct pyrf_evsel *pevsel, PyObject *args, PyObject *kwargs) { -#ifndef HAVE_LIBTRACEEVENT - return NULL; -#else - struct tep_event *tp_format; static char *kwlist[] = { "sys", "name", NULL }; char *sys = NULL; char *name = NULL; @@ -1558,12 +1924,7 @@ static PyObject *pyrf__tracepoint(struct pyrf_evsel *pevsel, &sys, &name)) return NULL; - tp_format = trace_event__tp_format(sys, name); - if (IS_ERR(tp_format)) - return PyLong_FromLong(-1); - - return PyLong_FromLong(tp_format->id); -#endif // HAVE_LIBTRACEEVENT + return PyLong_FromLong(tp_pmu__id(sys, name)); } static PyObject *pyrf_evsel__from_evsel(struct evsel *evsel) @@ -1688,8 +2049,128 @@ static PyObject *pyrf__parse_events(PyObject *self, PyObject *args) return result; } +static PyObject *pyrf__parse_metrics(PyObject *self, PyObject *args) +{ + const char *input; + struct evlist evlist = {}; + PyObject *result; + PyObject *pcpus = NULL, *pthreads = NULL; + struct perf_cpu_map *cpus; + struct perf_thread_map *threads; + int ret; + + if (!PyArg_ParseTuple(args, "s|OO", &input, &pcpus, &pthreads)) + return NULL; + + threads = pthreads ? ((struct pyrf_thread_map *)pthreads)->threads : NULL; + cpus = pcpus ? ((struct pyrf_cpu_map *)pcpus)->cpus : NULL; + + evlist__init(&evlist, cpus, threads); + ret = metricgroup__parse_groups(&evlist, /*pmu=*/"all", input, + /*metric_no_group=*/ false, + /*metric_no_merge=*/ false, + /*metric_no_threshold=*/ true, + /*user_requested_cpu_list=*/ NULL, + /*system_wide=*/true, + /*hardware_aware_grouping=*/ false); + if (ret) { + errno = -ret; + PyErr_SetFromErrno(PyExc_OSError); + return NULL; + } + result = pyrf_evlist__from_evlist(&evlist); + evlist__exit(&evlist); + return result; +} + +static PyObject *pyrf__metrics_groups(const struct pmu_metric *pm) +{ + PyObject *groups = PyList_New(/*len=*/0); + const char *mg = pm->metric_group; + + if (!groups) + return NULL; + + while (mg) { + PyObject *val = NULL; + const char *sep = strchr(mg, ';'); + size_t len = sep ? (size_t)(sep - mg) : strlen(mg); + + if (len > 0) { + val = PyUnicode_FromStringAndSize(mg, len); + if (val) + PyList_Append(groups, val); + + Py_XDECREF(val); + } + mg = sep ? sep + 1 : NULL; + } + return groups; +} + +static int pyrf__metrics_cb(const struct pmu_metric *pm, + const struct pmu_metrics_table *table __maybe_unused, + void *vdata) +{ + PyObject *py_list = vdata; + PyObject *dict = PyDict_New(); + PyObject *key = dict ? PyUnicode_FromString("MetricGroup") : NULL; + PyObject *value = key ? pyrf__metrics_groups(pm) : NULL; + + if (!value || PyDict_SetItem(dict, key, value) != 0) { + Py_XDECREF(key); + Py_XDECREF(value); + Py_XDECREF(dict); + return -ENOMEM; + } + + if (!add_to_dict(dict, "MetricName", pm->metric_name) || + !add_to_dict(dict, "PMU", pm->pmu) || + !add_to_dict(dict, "MetricExpr", pm->metric_expr) || + !add_to_dict(dict, "MetricThreshold", pm->metric_threshold) || + !add_to_dict(dict, "ScaleUnit", pm->unit) || + !add_to_dict(dict, "Compat", pm->compat) || + !add_to_dict(dict, "BriefDescription", pm->desc) || + !add_to_dict(dict, "PublicDescription", pm->long_desc) || + PyList_Append(py_list, dict) != 0) { + Py_DECREF(dict); + return -ENOMEM; + } + Py_DECREF(dict); + return 0; +} + +static PyObject *pyrf__metrics(PyObject *self, PyObject *args) +{ + const struct pmu_metrics_table *table = pmu_metrics_table__find(); + PyObject *list = PyList_New(/*len=*/0); + int ret; + + if (!list) + return NULL; + + ret = pmu_metrics_table__for_each_metric(table, pyrf__metrics_cb, list); + if (!ret) + ret = pmu_for_each_sys_metric(pyrf__metrics_cb, list); + + if (ret) { + Py_DECREF(list); + errno = -ret; + PyErr_SetFromErrno(PyExc_OSError); + return NULL; + } + return list; +} + static PyMethodDef perf__methods[] = { { + .ml_name = "metrics", + .ml_meth = (PyCFunction) pyrf__metrics, + .ml_flags = METH_NOARGS, + .ml_doc = PyDoc_STR( + "Returns a list of metrics represented as string values in dictionaries.") + }, + { .ml_name = "tracepoint", .ml_meth = (PyCFunction) pyrf__tracepoint, .ml_flags = METH_VARARGS | METH_KEYWORDS, @@ -1701,6 +2182,19 @@ static PyMethodDef perf__methods[] = { .ml_flags = METH_VARARGS, .ml_doc = PyDoc_STR("Parse a string of events and return an evlist.") }, + { + .ml_name = "parse_metrics", + .ml_meth = (PyCFunction) pyrf__parse_metrics, + .ml_flags = METH_VARARGS, + .ml_doc = PyDoc_STR( + "Parse a string of metrics or metric groups and return an evlist.") + }, + { + .ml_name = "pmus", + .ml_meth = (PyCFunction) pyrf__pmus, + .ml_flags = METH_NOARGS, + .ml_doc = PyDoc_STR("Returns a sequence of pmus.") + }, { .ml_name = NULL, } }; @@ -1728,6 +2222,8 @@ PyMODINIT_FUNC PyInit_perf(void) pyrf_evsel__setup_types() < 0 || pyrf_thread_map__setup_types() < 0 || pyrf_cpu_map__setup_types() < 0 || + pyrf_pmu_iterator__setup_types() < 0 || + pyrf_pmu__setup_types() < 0 || pyrf_counts_values__setup_types() < 0) return module; diff --git a/tools/perf/util/scripting-engines/Build b/tools/perf/util/scripting-engines/Build index 2282fe3772f3..24f087b0cd11 100644 --- a/tools/perf/util/scripting-engines/Build +++ b/tools/perf/util/scripting-engines/Build @@ -3,7 +3,7 @@ ifeq ($(CONFIG_LIBTRACEEVENT),y) endif perf-util-$(CONFIG_LIBPYTHON) += trace-event-python.o -CFLAGS_trace-event-perl.o += $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow -Wno-nested-externs -Wno-undef -Wno-switch-default -Wno-bad-function-cast -Wno-declaration-after-statement -Wno-switch-enum +CFLAGS_trace-event-perl.o += $(PERL_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow -Wno-nested-externs -Wno-undef -Wno-switch-default -Wno-bad-function-cast -Wno-declaration-after-statement -Wno-switch-enum -Wno-thread-safety-analysis # -Wno-declaration-after-statement: The python headers have mixed code with declarations (decls after asserts, for instance) CFLAGS_trace-event-python.o += $(PYTHON_EMBED_CCOPTS) -Wno-redundant-decls -Wno-strict-prototypes -Wno-unused-parameter -Wno-shadow -Wno-deprecated-declarations -Wno-switch-enum -Wno-declaration-after-statement diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 26ae078278cd..09af486c83e4 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -1402,7 +1402,7 @@ static s64 perf_session__process_user_event(struct perf_session *session, const struct perf_tool *tool = session->tool; struct perf_sample sample; int fd = perf_data__fd(session->data); - int err; + s64 err; perf_sample__init(&sample, /*all=*/true); if ((event->header.type != PERF_RECORD_COMPRESSED && diff --git a/tools/perf/util/setup.py b/tools/perf/util/setup.py index dd289d15acfd..9cae2c472f4a 100644 --- a/tools/perf/util/setup.py +++ b/tools/perf/util/setup.py @@ -1,6 +1,7 @@ from os import getenv, path from subprocess import Popen, PIPE from re import sub +import shlex cc = getenv("CC") assert cc, "Environment variable CC not set" @@ -22,7 +23,9 @@ assert srctree, "Environment variable srctree, for the Linux sources, not set" src_feature_tests = f'{srctree}/tools/build/feature' def clang_has_option(option): - cc_output = Popen([cc, cc_options + option, path.join(src_feature_tests, "test-hello.c") ], stderr=PIPE).stderr.readlines() + cmd = shlex.split(f"{cc} {cc_options} {option}") + cmd.append(path.join(src_feature_tests, "test-hello.c")) + cc_output = Popen(cmd, stderr=PIPE).stderr.readlines() return [o for o in cc_output if ((b"unknown argument" in o) or (b"is not supported" in o) or (b"unknown warning option" in o))] == [ ] if cc_is_clang: diff --git a/tools/perf/util/srcline.c b/tools/perf/util/srcline.c index 3e3449e35dd4..27c0966611ab 100644 --- a/tools/perf/util/srcline.c +++ b/tools/perf/util/srcline.c @@ -1,32 +1,15 @@ // SPDX-License-Identifier: GPL-2.0 -#include <inttypes.h> -#include <signal.h> -#include <stdio.h> -#include <stdlib.h> -#include <string.h> -#include <sys/types.h> - -#include <linux/compiler.h> -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/zalloc.h> - -#include <api/io.h> - -#include "util/dso.h" -#include "util/debug.h" -#include "util/callchain.h" -#include "util/symbol_conf.h" -#ifdef HAVE_LIBLLVM_SUPPORT -#include "util/llvm-c-helpers.h" -#endif #include "srcline.h" -#include "string2.h" +#include "addr2line.h" +#include "dso.h" +#include "callchain.h" +#include "libbfd.h" +#include "llvm.h" #include "symbol.h" -#include "subcmd/run-command.h" -/* If addr2line doesn't return data for 1 second then timeout. */ -int addr2line_timeout_ms = 1 * 1000; +#include <inttypes.h> +#include <string.h> + bool srcline_full_filename; char *srcline__unknown = (char *)"??:0"; @@ -49,8 +32,7 @@ static const char *srcline_dso_name(struct dso *dso) return dso_name; } -static int inline_list__append(struct symbol *symbol, char *srcline, - struct inline_node *node) +int inline_list__append(struct symbol *symbol, char *srcline, struct inline_node *node) { struct inline_list *ilist; @@ -77,7 +59,7 @@ static const char *gnu_basename(const char *path) return base ? base + 1 : path; } -static char *srcline_from_fileline(const char *file, unsigned int line) +char *srcline_from_fileline(const char *file, unsigned int line) { char *srcline; @@ -93,9 +75,9 @@ static char *srcline_from_fileline(const char *file, unsigned int line) return srcline; } -static struct symbol *new_inline_sym(struct dso *dso, - struct symbol *base_sym, - const char *funcname) +struct symbol *new_inline_sym(struct dso *dso, + struct symbol *base_sym, + const char *funcname) { struct symbol *inline_sym; char *demangled = NULL; @@ -132,722 +114,23 @@ static struct symbol *new_inline_sym(struct dso *dso, return inline_sym; } -#define MAX_INLINE_NEST 1024 - -#ifdef HAVE_LIBLLVM_SUPPORT - -static void free_llvm_inline_frames(struct llvm_a2l_frame *inline_frames, - int num_frames) -{ - if (inline_frames != NULL) { - for (int i = 0; i < num_frames; ++i) { - zfree(&inline_frames[i].filename); - zfree(&inline_frames[i].funcname); - } - zfree(&inline_frames); - } -} - -static int addr2line(const char *dso_name, u64 addr, - char **file, unsigned int *line, struct dso *dso, - bool unwind_inlines, struct inline_node *node, +static int addr2line(const char *dso_name, u64 addr, char **file, unsigned int *line_nr, + struct dso *dso, bool unwind_inlines, struct inline_node *node, struct symbol *sym) { - struct llvm_a2l_frame *inline_frames = NULL; - int num_frames = llvm_addr2line(dso_name, addr, file, line, - node && unwind_inlines, &inline_frames); - - if (num_frames == 0 || !inline_frames) { - /* Error, or we didn't want inlines. */ - return num_frames; - } - - for (int i = 0; i < num_frames; ++i) { - struct symbol *inline_sym = - new_inline_sym(dso, sym, inline_frames[i].funcname); - char *srcline = NULL; - - if (inline_frames[i].filename) { - srcline = - srcline_from_fileline(inline_frames[i].filename, - inline_frames[i].line); - } - if (inline_list__append(inline_sym, srcline, node) != 0) { - free_llvm_inline_frames(inline_frames, num_frames); - return 0; - } - } - free_llvm_inline_frames(inline_frames, num_frames); - - return num_frames; -} - -void dso__free_a2l(struct dso *dso __maybe_unused) -{ - /* Nothing to free. */ -} - -#elif defined(HAVE_LIBBFD_SUPPORT) - -/* - * Implement addr2line using libbfd. - */ -#define PACKAGE "perf" -#include <bfd.h> - -struct a2l_data { - const char *input; - u64 addr; - - bool found; - const char *filename; - const char *funcname; - unsigned line; - - bfd *abfd; - asymbol **syms; -}; - -static int bfd_error(const char *string) -{ - const char *errmsg; - - errmsg = bfd_errmsg(bfd_get_error()); - fflush(stdout); - - if (string) - pr_debug("%s: %s\n", string, errmsg); - else - pr_debug("%s\n", errmsg); - - return -1; -} - -static int slurp_symtab(bfd *abfd, struct a2l_data *a2l) -{ - long storage; - long symcount; - asymbol **syms; - bfd_boolean dynamic = FALSE; - - if ((bfd_get_file_flags(abfd) & HAS_SYMS) == 0) - return bfd_error(bfd_get_filename(abfd)); - - storage = bfd_get_symtab_upper_bound(abfd); - if (storage == 0L) { - storage = bfd_get_dynamic_symtab_upper_bound(abfd); - dynamic = TRUE; - } - if (storage < 0L) - return bfd_error(bfd_get_filename(abfd)); - - syms = malloc(storage); - if (dynamic) - symcount = bfd_canonicalize_dynamic_symtab(abfd, syms); - else - symcount = bfd_canonicalize_symtab(abfd, syms); - - if (symcount < 0) { - free(syms); - return bfd_error(bfd_get_filename(abfd)); - } - - a2l->syms = syms; - return 0; -} - -static void find_address_in_section(bfd *abfd, asection *section, void *data) -{ - bfd_vma pc, vma; - bfd_size_type size; - struct a2l_data *a2l = data; - flagword flags; - - if (a2l->found) - return; + int ret; -#ifdef bfd_get_section_flags - flags = bfd_get_section_flags(abfd, section); -#else - flags = bfd_section_flags(section); -#endif - if ((flags & SEC_ALLOC) == 0) - return; - - pc = a2l->addr; -#ifdef bfd_get_section_vma - vma = bfd_get_section_vma(abfd, section); -#else - vma = bfd_section_vma(section); -#endif -#ifdef bfd_get_section_size - size = bfd_get_section_size(section); -#else - size = bfd_section_size(section); -#endif - - if (pc < vma || pc >= vma + size) - return; + ret = llvm__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines, node, sym); + if (ret > 0) + return ret; - a2l->found = bfd_find_nearest_line(abfd, section, a2l->syms, pc - vma, - &a2l->filename, &a2l->funcname, - &a2l->line); - - if (a2l->filename && !strlen(a2l->filename)) - a2l->filename = NULL; -} - -static struct a2l_data *addr2line_init(const char *path) -{ - bfd *abfd; - struct a2l_data *a2l = NULL; - - abfd = bfd_openr(path, NULL); - if (abfd == NULL) - return NULL; + ret = libbfd__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines, node, sym); + if (ret > 0) + return ret; - if (!bfd_check_format(abfd, bfd_object)) - goto out; - - a2l = zalloc(sizeof(*a2l)); - if (a2l == NULL) - goto out; - - a2l->abfd = abfd; - a2l->input = strdup(path); - if (a2l->input == NULL) - goto out; - - if (slurp_symtab(abfd, a2l)) - goto out; - - return a2l; - -out: - if (a2l) { - zfree((char **)&a2l->input); - free(a2l); - } - bfd_close(abfd); - return NULL; + return cmd__addr2line(dso_name, addr, file, line_nr, dso, unwind_inlines, node, sym); } -static void addr2line_cleanup(struct a2l_data *a2l) -{ - if (a2l->abfd) - bfd_close(a2l->abfd); - zfree((char **)&a2l->input); - zfree(&a2l->syms); - free(a2l); -} - -static int inline_list__append_dso_a2l(struct dso *dso, - struct inline_node *node, - struct symbol *sym) -{ - struct a2l_data *a2l = dso__a2l(dso); - struct symbol *inline_sym = new_inline_sym(dso, sym, a2l->funcname); - char *srcline = NULL; - - if (a2l->filename) - srcline = srcline_from_fileline(a2l->filename, a2l->line); - - return inline_list__append(inline_sym, srcline, node); -} - -static int addr2line(const char *dso_name, u64 addr, - char **file, unsigned int *line, struct dso *dso, - bool unwind_inlines, struct inline_node *node, - struct symbol *sym) -{ - int ret = 0; - struct a2l_data *a2l = dso__a2l(dso); - - if (!a2l) { - a2l = addr2line_init(dso_name); - dso__set_a2l(dso, a2l); - } - - if (a2l == NULL) { - if (!symbol_conf.disable_add2line_warn) - pr_warning("addr2line_init failed for %s\n", dso_name); - return 0; - } - - a2l->addr = addr; - a2l->found = false; - - bfd_map_over_sections(a2l->abfd, find_address_in_section, a2l); - - if (!a2l->found) - return 0; - - if (unwind_inlines) { - int cnt = 0; - - if (node && inline_list__append_dso_a2l(dso, node, sym)) - return 0; - - while (bfd_find_inliner_info(a2l->abfd, &a2l->filename, - &a2l->funcname, &a2l->line) && - cnt++ < MAX_INLINE_NEST) { - - if (a2l->filename && !strlen(a2l->filename)) - a2l->filename = NULL; - - if (node != NULL) { - if (inline_list__append_dso_a2l(dso, node, sym)) - return 0; - // found at least one inline frame - ret = 1; - } - } - } - - if (file) { - *file = a2l->filename ? strdup(a2l->filename) : NULL; - ret = *file ? 1 : 0; - } - - if (line) - *line = a2l->line; - - return ret; -} - -void dso__free_a2l(struct dso *dso) -{ - struct a2l_data *a2l = dso__a2l(dso); - - if (!a2l) - return; - - addr2line_cleanup(a2l); - - dso__set_a2l(dso, NULL); -} - -#else /* HAVE_LIBBFD_SUPPORT */ - -static int filename_split(char *filename, unsigned int *line_nr) -{ - char *sep; - - sep = strchr(filename, '\n'); - if (sep) - *sep = '\0'; - - if (!strcmp(filename, "??:0")) - return 0; - - sep = strchr(filename, ':'); - if (sep) { - *sep++ = '\0'; - *line_nr = strtoul(sep, NULL, 0); - return 1; - } - pr_debug("addr2line missing ':' in filename split\n"); - return 0; -} - -static void addr2line_subprocess_cleanup(struct child_process *a2l) -{ - if (a2l->pid != -1) { - kill(a2l->pid, SIGKILL); - finish_command(a2l); /* ignore result, we don't care */ - a2l->pid = -1; - close(a2l->in); - close(a2l->out); - } - - free(a2l); -} - -static struct child_process *addr2line_subprocess_init(const char *addr2line_path, - const char *binary_path) -{ - const char *argv[] = { - addr2line_path ?: "addr2line", - "-e", binary_path, - "-a", "-i", "-f", NULL - }; - struct child_process *a2l = zalloc(sizeof(*a2l)); - int start_command_status = 0; - - if (a2l == NULL) { - pr_err("Failed to allocate memory for addr2line"); - return NULL; - } - - a2l->pid = -1; - a2l->in = -1; - a2l->out = -1; - a2l->no_stderr = 1; - - a2l->argv = argv; - start_command_status = start_command(a2l); - a2l->argv = NULL; /* it's not used after start_command; avoid dangling pointers */ - - if (start_command_status != 0) { - pr_warning("could not start addr2line (%s) for %s: start_command return code %d\n", - addr2line_path, binary_path, start_command_status); - addr2line_subprocess_cleanup(a2l); - return NULL; - } - - return a2l; -} - -enum a2l_style { - BROKEN, - GNU_BINUTILS, - LLVM, -}; - -static enum a2l_style addr2line_configure(struct child_process *a2l, const char *dso_name) -{ - static bool cached; - static enum a2l_style style; - - if (!cached) { - char buf[128]; - struct io io; - int ch; - int lines; - - if (write(a2l->in, ",\n", 2) != 2) - return BROKEN; - - io__init(&io, a2l->out, buf, sizeof(buf)); - ch = io__get_char(&io); - if (ch == ',') { - style = LLVM; - cached = true; - lines = 1; - pr_debug3("Detected LLVM addr2line style\n"); - } else if (ch == '0') { - style = GNU_BINUTILS; - cached = true; - lines = 3; - pr_debug3("Detected binutils addr2line style\n"); - } else { - if (!symbol_conf.disable_add2line_warn) { - char *output = NULL; - size_t output_len; - - io__getline(&io, &output, &output_len); - pr_warning("%s %s: addr2line configuration failed\n", - __func__, dso_name); - pr_warning("\t%c%s", ch, output); - } - pr_debug("Unknown/broken addr2line style\n"); - return BROKEN; - } - while (lines) { - ch = io__get_char(&io); - if (ch <= 0) - break; - if (ch == '\n') - lines--; - } - /* Ignore SIGPIPE in the event addr2line exits. */ - signal(SIGPIPE, SIG_IGN); - } - return style; -} - -static int read_addr2line_record(struct io *io, - enum a2l_style style, - const char *dso_name, - u64 addr, - bool first, - char **function, - char **filename, - unsigned int *line_nr) -{ - /* - * Returns: - * -1 ==> error - * 0 ==> sentinel (or other ill-formed) record read - * 1 ==> a genuine record read - */ - char *line = NULL; - size_t line_len = 0; - unsigned int dummy_line_nr = 0; - int ret = -1; - - if (function != NULL) - zfree(function); - - if (filename != NULL) - zfree(filename); - - if (line_nr != NULL) - *line_nr = 0; - - /* - * Read the first line. Without an error this will be: - * - for the first line an address like 0x1234, - * - the binutils sentinel 0x0000000000000000, - * - the llvm-addr2line the sentinel ',' character, - * - the function name line for an inlined function. - */ - if (io__getline(io, &line, &line_len) < 0 || !line_len) - goto error; - - pr_debug3("%s %s: addr2line read address for sentinel: %s", __func__, dso_name, line); - if (style == LLVM && line_len == 2 && line[0] == ',') { - /* Found the llvm-addr2line sentinel character. */ - zfree(&line); - return 0; - } else if (style == GNU_BINUTILS && (!first || addr != 0)) { - int zero_count = 0, non_zero_count = 0; - /* - * Check for binutils sentinel ignoring it for the case the - * requested address is 0. - */ - - /* A given address should always start 0x. */ - if (line_len >= 2 || line[0] != '0' || line[1] != 'x') { - for (size_t i = 2; i < line_len; i++) { - if (line[i] == '0') - zero_count++; - else if (line[i] != '\n') - non_zero_count++; - } - if (!non_zero_count) { - int ch; - - if (first && !zero_count) { - /* Line was erroneous just '0x'. */ - goto error; - } - /* - * Line was 0x0..0, the sentinel for binutils. Remove - * the function and filename lines. - */ - zfree(&line); - do { - ch = io__get_char(io); - } while (ch > 0 && ch != '\n'); - do { - ch = io__get_char(io); - } while (ch > 0 && ch != '\n'); - return 0; - } - } - } - /* Read the second function name line (if inline data then this is the first line). */ - if (first && (io__getline(io, &line, &line_len) < 0 || !line_len)) - goto error; - - pr_debug3("%s %s: addr2line read line: %s", __func__, dso_name, line); - if (function != NULL) - *function = strdup(strim(line)); - - zfree(&line); - line_len = 0; - - /* Read the third filename and line number line. */ - if (io__getline(io, &line, &line_len) < 0 || !line_len) - goto error; - - pr_debug3("%s %s: addr2line filename:number : %s", __func__, dso_name, line); - if (filename_split(line, line_nr == NULL ? &dummy_line_nr : line_nr) == 0 && - style == GNU_BINUTILS) { - ret = 0; - goto error; - } - - if (filename != NULL) - *filename = strdup(line); - - zfree(&line); - line_len = 0; - - return 1; - -error: - free(line); - if (function != NULL) - zfree(function); - if (filename != NULL) - zfree(filename); - return ret; -} - -static int inline_list__append_record(struct dso *dso, - struct inline_node *node, - struct symbol *sym, - const char *function, - const char *filename, - unsigned int line_nr) -{ - struct symbol *inline_sym = new_inline_sym(dso, sym, function); - - return inline_list__append(inline_sym, srcline_from_fileline(filename, line_nr), node); -} - -static int addr2line(const char *dso_name, u64 addr, - char **file, unsigned int *line_nr, - struct dso *dso, - bool unwind_inlines, - struct inline_node *node, - struct symbol *sym __maybe_unused) -{ - struct child_process *a2l = dso__a2l(dso); - char *record_function = NULL; - char *record_filename = NULL; - unsigned int record_line_nr = 0; - int record_status = -1; - int ret = 0; - size_t inline_count = 0; - int len; - char buf[128]; - ssize_t written; - struct io io = { .eof = false }; - enum a2l_style a2l_style; - - if (!a2l) { - if (!filename__has_section(dso_name, ".debug_line")) - goto out; - - dso__set_a2l(dso, - addr2line_subprocess_init(symbol_conf.addr2line_path, dso_name)); - a2l = dso__a2l(dso); - } - - if (a2l == NULL) { - if (!symbol_conf.disable_add2line_warn) - pr_warning("%s %s: addr2line_subprocess_init failed\n", __func__, dso_name); - goto out; - } - a2l_style = addr2line_configure(a2l, dso_name); - if (a2l_style == BROKEN) - goto out; - - /* - * Send our request and then *deliberately* send something that can't be - * interpreted as a valid address to ask addr2line about (namely, - * ","). This causes addr2line to first write out the answer to our - * request, in an unbounded/unknown number of records, and then to write - * out the lines "0x0...0", "??" and "??:0", for GNU binutils, or "," - * for llvm-addr2line, so that we can detect when it has finished giving - * us anything useful. - */ - len = snprintf(buf, sizeof(buf), "%016"PRIx64"\n,\n", addr); - written = len > 0 ? write(a2l->in, buf, len) : -1; - if (written != len) { - if (!symbol_conf.disable_add2line_warn) - pr_warning("%s %s: could not send request\n", __func__, dso_name); - goto out; - } - io__init(&io, a2l->out, buf, sizeof(buf)); - io.timeout_ms = addr2line_timeout_ms; - switch (read_addr2line_record(&io, a2l_style, dso_name, addr, /*first=*/true, - &record_function, &record_filename, &record_line_nr)) { - case -1: - if (!symbol_conf.disable_add2line_warn) - pr_warning("%s %s: could not read first record\n", __func__, dso_name); - goto out; - case 0: - /* - * The first record was invalid, so return failure, but first - * read another record, since we sent a sentinel ',' for the - * sake of detected the last inlined function. Treat this as the - * first of a record as the ',' generates a new start with GNU - * binutils, also force a non-zero address as we're no longer - * reading that record. - */ - switch (read_addr2line_record(&io, a2l_style, dso_name, - /*addr=*/1, /*first=*/true, - NULL, NULL, NULL)) { - case -1: - if (!symbol_conf.disable_add2line_warn) - pr_warning("%s %s: could not read sentinel record\n", - __func__, dso_name); - break; - case 0: - /* The sentinel as expected. */ - break; - default: - if (!symbol_conf.disable_add2line_warn) - pr_warning("%s %s: unexpected record instead of sentinel", - __func__, dso_name); - break; - } - goto out; - default: - /* First record as expected. */ - break; - } - - if (file) { - *file = strdup(record_filename); - ret = 1; - } - if (line_nr) - *line_nr = record_line_nr; - - if (unwind_inlines) { - if (node && inline_list__append_record(dso, node, sym, - record_function, - record_filename, - record_line_nr)) { - ret = 0; - goto out; - } - } - - /* - * We have to read the records even if we don't care about the inline - * info. This isn't the first record and force the address to non-zero - * as we're reading records beyond the first. - */ - while ((record_status = read_addr2line_record(&io, - a2l_style, - dso_name, - /*addr=*/1, - /*first=*/false, - &record_function, - &record_filename, - &record_line_nr)) == 1) { - if (unwind_inlines && node && inline_count++ < MAX_INLINE_NEST) { - if (inline_list__append_record(dso, node, sym, - record_function, - record_filename, - record_line_nr)) { - ret = 0; - goto out; - } - ret = 1; /* found at least one inline frame */ - } - } - -out: - free(record_function); - free(record_filename); - if (io.eof) { - dso__set_a2l(dso, NULL); - addr2line_subprocess_cleanup(a2l); - } - return ret; -} - -void dso__free_a2l(struct dso *dso) -{ - struct child_process *a2l = dso__a2l(dso); - - if (!a2l) - return; - - addr2line_subprocess_cleanup(a2l); - - dso__set_a2l(dso, NULL); -} - -#endif /* HAVE_LIBBFD_SUPPORT */ - static struct inline_node *addr2inlines(const char *dso_name, u64 addr, struct dso *dso, struct symbol *sym) { @@ -862,7 +145,9 @@ static struct inline_node *addr2inlines(const char *dso_name, u64 addr, INIT_LIST_HEAD(&node->val); node->addr = addr; - addr2line(dso_name, addr, NULL, NULL, dso, true, node, sym); + addr2line(dso_name, addr, /*file=*/NULL, /*line_nr=*/NULL, dso, + /*unwind_inlines=*/true, node, sym); + return node; } @@ -889,7 +174,7 @@ char *__get_srcline(struct dso *dso, u64 addr, struct symbol *sym, goto out_err; if (!addr2line(dso_name, addr, &file, &line, dso, - unwind_inlines, NULL, sym)) + unwind_inlines, /*node=*/NULL, sym)) goto out_err; srcline = srcline_from_fileline(file, line); @@ -935,7 +220,8 @@ char *get_srcline_split(struct dso *dso, u64 addr, unsigned *line) if (dso_name == NULL) goto out_err; - if (!addr2line(dso_name, addr, &file, line, dso, true, NULL, NULL)) + if (!addr2line(dso_name, addr, &file, line, dso, /*unwind_inlines=*/true, + /*node=*/NULL, /*sym=*/NULL)) goto out_err; dso__set_a2l_fails(dso, 0); diff --git a/tools/perf/util/srcline.h b/tools/perf/util/srcline.h index 75010d39ea28..c36f573cd339 100644 --- a/tools/perf/util/srcline.h +++ b/tools/perf/util/srcline.h @@ -9,7 +9,6 @@ struct dso; struct symbol; -extern int addr2line_timeout_ms; extern bool srcline_full_filename; char *get_srcline(struct dso *dso, u64 addr, struct symbol *sym, bool show_sym, bool show_addr, u64 ip); @@ -29,6 +28,8 @@ void srcline__tree_delete(struct rb_root_cached *tree); extern char *srcline__unknown; #define SRCLINE_UNKNOWN srcline__unknown +#define MAX_INLINE_NEST 1024 + struct inline_list { struct symbol *symbol; char *srcline; @@ -55,4 +56,10 @@ struct inline_node *inlines__tree_find(struct rb_root_cached *tree, u64 addr); /* delete all nodes within the tree of inline_node s */ void inlines__tree_delete(struct rb_root_cached *tree); +int inline_list__append(struct symbol *symbol, char *srcline, struct inline_node *node); +char *srcline_from_fileline(const char *file, unsigned int line); +struct symbol *new_inline_sym(struct dso *dso, + struct symbol *base_sym, + const char *funcname); + #endif /* PERF_SRCLINE_H */ diff --git a/tools/perf/util/stat.c b/tools/perf/util/stat.c index 50b1a92d16df..101ed6c497bc 100644 --- a/tools/perf/util/stat.c +++ b/tools/perf/util/stat.c @@ -716,59 +716,3 @@ size_t perf_event__fprintf_stat_config(union perf_event *event, FILE *fp) return ret; } - -int create_perf_stat_counter(struct evsel *evsel, - struct perf_stat_config *config, - struct target *target, - int cpu_map_idx) -{ - struct perf_event_attr *attr = &evsel->core.attr; - struct evsel *leader = evsel__leader(evsel); - - attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | - PERF_FORMAT_TOTAL_TIME_RUNNING; - - /* - * The event is part of non trivial group, let's enable - * the group read (for leader) and ID retrieval for all - * members. - */ - if (leader->core.nr_members > 1) - attr->read_format |= PERF_FORMAT_ID|PERF_FORMAT_GROUP; - - attr->inherit = !config->no_inherit && list_empty(&evsel->bpf_counter_list); - - /* - * Some events get initialized with sample_(period/type) set, - * like tracepoints. Clear it up for counting. - */ - attr->sample_period = 0; - - if (config->identifier) - attr->sample_type = PERF_SAMPLE_IDENTIFIER; - - if (config->all_user) { - attr->exclude_kernel = 1; - attr->exclude_user = 0; - } - - if (config->all_kernel) { - attr->exclude_kernel = 0; - attr->exclude_user = 1; - } - - /* - * Disabling all counters initially, they will be enabled - * either manually by us or by kernel via enable_on_exec - * set later. - */ - if (evsel__is_group_leader(evsel)) { - attr->disabled = 1; - - if (target__enable_on_exec(target)) - attr->enable_on_exec = 1; - } - - return evsel__open_per_cpu_and_thread(evsel, evsel__cpus(evsel), cpu_map_idx, - evsel->core.threads); -} diff --git a/tools/perf/util/stat.h b/tools/perf/util/stat.h index 4b0f14ae4e5f..34f30a295f89 100644 --- a/tools/perf/util/stat.h +++ b/tools/perf/util/stat.h @@ -223,10 +223,6 @@ size_t perf_event__fprintf_stat(union perf_event *event, FILE *fp); size_t perf_event__fprintf_stat_round(union perf_event *event, FILE *fp); size_t perf_event__fprintf_stat_config(union perf_event *event, FILE *fp); -int create_perf_stat_counter(struct evsel *evsel, - struct perf_stat_config *config, - struct target *target, - int cpu_map_idx); void evlist__print_counters(struct evlist *evlist, struct perf_stat_config *config, struct target *_target, struct timespec *ts, int argc, const char **argv); diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c index 1346fd180653..9e820599bab3 100644 --- a/tools/perf/util/symbol-elf.c +++ b/tools/perf/util/symbol-elf.c @@ -9,6 +9,7 @@ #include "compress.h" #include "dso.h" +#include "libbfd.h" #include "map.h" #include "maps.h" #include "symbol.h" @@ -24,18 +25,6 @@ #include <symbol/kallsyms.h> #include <internal/lib.h> -#ifdef HAVE_LIBBFD_SUPPORT -#define PACKAGE 'perf' -#include <bfd.h> -#endif - -#if defined(HAVE_LIBBFD_SUPPORT) || defined(HAVE_CPLUS_DEMANGLE_SUPPORT) -#ifndef DMGL_PARAMS -#define DMGL_PARAMS (1 << 0) /* Include function args */ -#define DMGL_ANSI (1 << 1) /* Include const, volatile, etc */ -#endif -#endif - #ifndef EM_AARCH64 #define EM_AARCH64 183 /* ARM 64 bit */ #endif @@ -871,47 +860,16 @@ out: return err; } -#ifdef HAVE_LIBBFD_BUILDID_SUPPORT - -static int read_build_id(const char *filename, struct build_id *bid, bool block) -{ - size_t size = sizeof(bid->data); - int err = -1, fd; - bfd *abfd; - - fd = open(filename, block ? O_RDONLY : (O_RDONLY | O_NONBLOCK)); - if (fd < 0) - return -1; - - abfd = bfd_fdopenr(filename, /*target=*/NULL, fd); - if (!abfd) - return -1; - - if (!bfd_check_format(abfd, bfd_object)) { - pr_debug2("%s: cannot read %s bfd file.\n", __func__, filename); - goto out_close; - } - - if (!abfd->build_id || abfd->build_id->size > size) - goto out_close; - - memcpy(bid->data, abfd->build_id->data, abfd->build_id->size); - memset(bid->data + abfd->build_id->size, 0, size - abfd->build_id->size); - err = bid->size = abfd->build_id->size; - -out_close: - bfd_close(abfd); - return err; -} - -#else // HAVE_LIBBFD_BUILDID_SUPPORT - static int read_build_id(const char *filename, struct build_id *bid, bool block) { size_t size = sizeof(bid->data); - int fd, err = -1; + int fd, err; Elf *elf; + err = libbfd__read_build_id(filename, bid, block); + if (err >= 0) + goto out; + if (size < BUILD_ID_SIZE) goto out; @@ -936,8 +894,6 @@ out: return err; } -#endif // HAVE_LIBBFD_BUILDID_SUPPORT - int filename__read_build_id(const char *filename, struct build_id *bid, bool block) { struct kmod_path m = { .name = NULL, }; @@ -1022,44 +978,6 @@ out: return err; } -#ifdef HAVE_LIBBFD_SUPPORT - -int filename__read_debuglink(const char *filename, char *debuglink, - size_t size) -{ - int err = -1; - asection *section; - bfd *abfd; - - abfd = bfd_openr(filename, NULL); - if (!abfd) - return -1; - - if (!bfd_check_format(abfd, bfd_object)) { - pr_debug2("%s: cannot read %s bfd file.\n", __func__, filename); - goto out_close; - } - - section = bfd_get_section_by_name(abfd, ".gnu_debuglink"); - if (!section) - goto out_close; - - if (section->size > size) - goto out_close; - - if (!bfd_get_section_contents(abfd, section, debuglink, 0, - section->size)) - goto out_close; - - err = 0; - -out_close: - bfd_close(abfd); - return err; -} - -#else - int filename__read_debuglink(const char *filename, char *debuglink, size_t size) { @@ -1071,6 +989,10 @@ int filename__read_debuglink(const char *filename, char *debuglink, Elf_Scn *sec; Elf_Kind ek; + err = libbfd_filename__read_debuglink(filename, debuglink, size); + if (err >= 0) + goto out; + fd = open(filename, O_RDONLY); if (fd < 0) goto out; @@ -1112,8 +1034,6 @@ out: return err; } -#endif - bool symsrc__possibly_runtime(struct symsrc *ss) { return ss->dynsym || ss->opdsec; diff --git a/tools/perf/util/symbol-minimal.c b/tools/perf/util/symbol-minimal.c index 41e4ebe5eac5..aeb253248895 100644 --- a/tools/perf/util/symbol-minimal.c +++ b/tools/perf/util/symbol-minimal.c @@ -42,7 +42,7 @@ static int read_build_id(void *note_data, size_t note_len, struct build_id *bid, void *ptr; ptr = note_data; - while (ptr < (note_data + note_len)) { + while ((ptr + sizeof(*nhdr)) < (note_data + note_len)) { const char *name; size_t namesz, descsz; diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 3fed54de5401..cc26b7bf302b 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -107,9 +107,14 @@ static enum dso_binary_type binary_type_symtab[] = { static bool symbol_type__filter(char __symbol_type) { // Since 'U' == undefined and 'u' == unique global symbol, we can't use toupper there + // 'N' is for debugging symbols, 'n' is a non-data, non-code, non-debug read-only section. + // According to 'man nm'. + // 'N' first seen in: + // ffffffff9b35d130 N __pfx__RNCINvNtNtNtCsbDUBuN8AbD4_4core4iter8adapters3map12map_try_foldjNtCs6vVzKs5jPr6_12drm_panic_qr7VersionuINtNtNtBa_3ops12control_flow11ControlFlowB10_ENcB10_0NCINvNvNtNtNtB8_6traits8iterator8Iterator4find5checkB10_NCNvMB12_B10_13from_segments0E0E0B12_ + // a seemingly Rust mangled name char symbol_type = toupper(__symbol_type); return symbol_type == 'T' || symbol_type == 'W' || symbol_type == 'D' || symbol_type == 'B' || - __symbol_type == 'u' || __symbol_type == 'l'; + __symbol_type == 'u' || __symbol_type == 'l' || __symbol_type == 'N'; } static int prefix_underscores_count(const char *str) @@ -1584,137 +1589,6 @@ out_failure: return -1; } -#ifdef HAVE_LIBBFD_SUPPORT -#define PACKAGE 'perf' -#include <bfd.h> - -static int bfd_symbols__cmpvalue(const void *a, const void *b) -{ - const asymbol *as = *(const asymbol **)a, *bs = *(const asymbol **)b; - - if (bfd_asymbol_value(as) != bfd_asymbol_value(bs)) - return bfd_asymbol_value(as) - bfd_asymbol_value(bs); - - return bfd_asymbol_name(as)[0] - bfd_asymbol_name(bs)[0]; -} - -static int bfd2elf_binding(asymbol *symbol) -{ - if (symbol->flags & BSF_WEAK) - return STB_WEAK; - if (symbol->flags & BSF_GLOBAL) - return STB_GLOBAL; - if (symbol->flags & BSF_LOCAL) - return STB_LOCAL; - return -1; -} - -int dso__load_bfd_symbols(struct dso *dso, const char *debugfile) -{ - int err = -1; - long symbols_size, symbols_count, i; - asection *section; - asymbol **symbols, *sym; - struct symbol *symbol; - bfd *abfd; - u64 start, len; - - abfd = bfd_openr(debugfile, NULL); - if (!abfd) - return -1; - - if (!bfd_check_format(abfd, bfd_object)) { - pr_debug2("%s: cannot read %s bfd file.\n", __func__, - dso__long_name(dso)); - goto out_close; - } - - if (bfd_get_flavour(abfd) == bfd_target_elf_flavour) - goto out_close; - - symbols_size = bfd_get_symtab_upper_bound(abfd); - if (symbols_size == 0) { - bfd_close(abfd); - return 0; - } - - if (symbols_size < 0) - goto out_close; - - symbols = malloc(symbols_size); - if (!symbols) - goto out_close; - - symbols_count = bfd_canonicalize_symtab(abfd, symbols); - if (symbols_count < 0) - goto out_free; - - section = bfd_get_section_by_name(abfd, ".text"); - if (section) { - for (i = 0; i < symbols_count; ++i) { - if (!strcmp(bfd_asymbol_name(symbols[i]), "__ImageBase") || - !strcmp(bfd_asymbol_name(symbols[i]), "__image_base__")) - break; - } - if (i < symbols_count) { - /* PE symbols can only have 4 bytes, so use .text high bits */ - u64 text_offset = (section->vma - (u32)section->vma) - + (u32)bfd_asymbol_value(symbols[i]); - dso__set_text_offset(dso, text_offset); - dso__set_text_end(dso, (section->vma - text_offset) + section->size); - } else { - dso__set_text_offset(dso, section->vma - section->filepos); - dso__set_text_end(dso, section->filepos + section->size); - } - } - - qsort(symbols, symbols_count, sizeof(asymbol *), bfd_symbols__cmpvalue); - -#ifdef bfd_get_section -#define bfd_asymbol_section bfd_get_section -#endif - for (i = 0; i < symbols_count; ++i) { - sym = symbols[i]; - section = bfd_asymbol_section(sym); - if (bfd2elf_binding(sym) < 0) - continue; - - while (i + 1 < symbols_count && - bfd_asymbol_section(symbols[i + 1]) == section && - bfd2elf_binding(symbols[i + 1]) < 0) - i++; - - if (i + 1 < symbols_count && - bfd_asymbol_section(symbols[i + 1]) == section) - len = symbols[i + 1]->value - sym->value; - else - len = section->size - sym->value; - - start = bfd_asymbol_value(sym) - dso__text_offset(dso); - symbol = symbol__new(start, len, bfd2elf_binding(sym), STT_FUNC, - bfd_asymbol_name(sym)); - if (!symbol) - goto out_free; - - symbols__insert(dso__symbols(dso), symbol); - } -#ifdef bfd_get_section -#undef bfd_asymbol_section -#endif - - symbols__fixup_end(dso__symbols(dso), false); - symbols__fixup_duplicate(dso__symbols(dso)); - dso__set_adjust_symbols(dso, true); - - err = 0; -out_free: - free(symbols); -out_close: - bfd_close(abfd); - return err; -} -#endif - static bool dso__is_compatible_symtab_type(struct dso *dso, bool kmod, enum dso_binary_type type) { diff --git a/tools/perf/util/tool_pmu.c b/tools/perf/util/tool_pmu.c index d99e699e646d..f075098488ba 100644 --- a/tools/perf/util/tool_pmu.c +++ b/tools/perf/util/tool_pmu.c @@ -239,9 +239,6 @@ int evsel__tool_pmu_open(struct evsel *evsel, nthreads = perf_thread_map__nr(threads); for (idx = start_cpu_map_idx; idx < end_cpu_map_idx; idx++) { for (thread = 0; thread < nthreads; thread++) { - if (thread >= nthreads) - break; - if (!evsel->cgrp && !evsel->core.system_wide) pid = perf_thread_map__pid(threads, thread); diff --git a/tools/perf/util/tp_pmu.c b/tools/perf/util/tp_pmu.c index e7534a973247..eddb9807131a 100644 --- a/tools/perf/util/tp_pmu.c +++ b/tools/perf/util/tp_pmu.c @@ -88,8 +88,6 @@ int tp_pmu__for_each_tp_sys(void *state, tp_sys_callback cb) continue; ret = cb(state, events_ent->d_name); - if (ret) - break; } close(events_dir.dirfd); return ret; diff --git a/tools/perf/util/trace.h b/tools/perf/util/trace.h index fa8d480527a2..fbbcfe6f44fe 100644 --- a/tools/perf/util/trace.h +++ b/tools/perf/util/trace.h @@ -16,7 +16,7 @@ enum trace_summary_mode { int trace_prepare_bpf_summary(enum trace_summary_mode mode); void trace_start_bpf_summary(void); void trace_end_bpf_summary(void); -int trace_print_bpf_summary(FILE *fp); +int trace_print_bpf_summary(FILE *fp, int max_summary); void trace_cleanup_bpf_summary(void); #else /* !HAVE_BPF_SKEL */ @@ -27,7 +27,7 @@ static inline int trace_prepare_bpf_summary(enum trace_summary_mode mode __maybe } static inline void trace_start_bpf_summary(void) {} static inline void trace_end_bpf_summary(void) {} -static inline int trace_print_bpf_summary(FILE *fp __maybe_unused) +static inline int trace_print_bpf_summary(FILE *fp __maybe_unused, int max_summary __maybe_unused) { return 0; } diff --git a/tools/perf/util/zlib.c b/tools/perf/util/zlib.c index 78d2297c1b67..1f7c06523059 100644 --- a/tools/perf/util/zlib.c +++ b/tools/perf/util/zlib.c @@ -88,7 +88,7 @@ bool gzip_is_compressed(const char *input) ssize_t rc; if (fd < 0) - return -1; + return false; rc = read(fd, buf, sizeof(buf)); close(fd); |