From 411ad22ecf0281d666a82aa7f4de90c70365da7d Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:36 -0700 Subject: perf parse-events: Add pmu filter To support the cputype argument added to "perf stat" for hybrid, it is necessary to filter events during wildcard matching. Add a scanner argument for the filter and check it during wildcard matching. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-30-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index b9ad32f21e57..de2e915427c9 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -101,6 +101,10 @@ static void print_counters(struct timespec *ts, int argc, const char **argv); static struct evlist *evsel_list; +static struct parse_events_option_args parse_events_option_args = { + .evlistp = &evsel_list, +}; + static bool all_counters_use_bpf = true; static struct target target = { @@ -1096,8 +1100,8 @@ static int parse_hybrid_type(const struct option *opt, return -1; } - evlist->hybrid_pmu_name = perf_pmu__hybrid_type_to_pmu(str); - if (!evlist->hybrid_pmu_name) { + parse_events_option_args.pmu_filter = perf_pmu__hybrid_type_to_pmu(str); + if (!parse_events_option_args.pmu_filter) { fprintf(stderr, "--cputype %s is not supported!\n", str); return -1; } @@ -1108,7 +1112,7 @@ static int parse_hybrid_type(const struct option *opt, static struct option stat_options[] = { OPT_BOOLEAN('T', "transaction", &transaction_run, "hardware transaction statistics"), - OPT_CALLBACK('e', "event", &evsel_list, "event", + OPT_CALLBACK('e', "event", &parse_events_option_args, "event", "event selector. use 'perf list' to list available events", parse_events_option), OPT_CALLBACK(0, "filter", &evsel_list, "filter", -- cgit From 003be8c4f71753092bbb86fa9d7ad26dd9fb98db Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:37 -0700 Subject: perf stat: Make cputype filter generic Rather than limit the --cputype argument for "perf list" and "perf stat" to hybrid PMUs of just cpu_atom and cpu_core, allow any PMU. Note that if cpu_atom isn't mounted but a filter of cpu_atom is requested, this will now fail. As such a filter could never succeed (no events can come from an unmounted PMU), this behavior could never have been useful, and failing is clearer.
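For example (an illustrative invocation, assuming a hybrid system that exposes cpu_core and cpu_atom PMUs):

  $ perf stat --cputype atom -e cycles -a sleep 1

With the filter in place, wildcard expansion of "cycles" only considers the cpu_atom PMU rather than creating one event per core PMU.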
Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-31-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index de2e915427c9..fe9c6fa3f14a 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -44,6 +44,7 @@ #include "util/cgroup.h" #include <subcmd/parse-options.h> #include "util/parse-events.h" +#include "util/pmus.h" #include "util/pmu.h" #include "util/event.h" #include "util/evlist.h" @@ -69,7 +70,6 @@ #include "util/pfm.h" #include "util/bpf_counter.h" #include "util/iostat.h" -#include "util/pmu-hybrid.h" #include "util/util.h" #include "asm/bug.h" @@ -1089,10 +1089,11 @@ static int parse_stat_cgroups(const struct option *opt, return parse_cgroups(opt, str, unset); } -static int parse_hybrid_type(const struct option *opt, +static int parse_cputype(const struct option *opt, const char *str, int unset __maybe_unused) { + const struct perf_pmu *pmu; struct evlist *evlist = *(struct evlist **)opt->value; if (!list_empty(&evlist->core.entries)) { @@ -1100,11 +1101,12 @@ static int parse_hybrid_type(const struct option *opt, return -1; } - parse_events_option_args.pmu_filter = perf_pmu__hybrid_type_to_pmu(str); - if (!parse_events_option_args.pmu_filter) { + pmu = perf_pmus__pmu_for_pmu_filter(str); + if (!pmu) { fprintf(stderr, "--cputype %s is not supported!\n", str); return -1; } + parse_events_option_args.pmu_filter = pmu->name; return 0; } @@ -1230,7 +1232,7 @@ static struct option stat_options[] = { OPT_CALLBACK(0, "cputype", &evsel_list, "hybrid cpu type", "Only enable events on applying cpu with this type " "for hybrid platform (e.g. core or atom)", - parse_hybrid_type), + parse_cputype), #ifdef HAVE_LIBPFM OPT_CALLBACK(0, "pfm-events", &evsel_list, "event", "libpfm4 event selector. use 'perf list' to list available events", -- cgit From bd3846d0fea2e8e3375fc54a6556561726f466cf Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:45 -0700 Subject: perf metrics: Be PMU specific for referenced metrics. Hybrid systems may define the same metric for different PMUs, which can cause confusion over which events are used. To avoid this, make referenced metric searches PMU-specific, matching the PMU recorded in the metric table.
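The idea, as a minimal self-contained sketch (struct metric_entry and find_metric() are hypothetical names for illustration, not the real metricgroup code):

#include <stddef.h>
#include <string.h>

/* Sketch: a referenced metric is looked up by (pmu, name), not name alone,
 * so the cpu_core and cpu_atom copies of a hybrid metric cannot mix. */
struct metric_entry {
	const char *pmu;	/* e.g. "cpu_core", "cpu_atom", or "all" */
	const char *name;	/* e.g. "tma_retiring" */
	const char *expr;
};

static const struct metric_entry *find_metric(const struct metric_entry *tbl,
					      size_t len, const char *pmu,
					      const char *name)
{
	for (size_t i = 0; i < len; i++) {
		if (strcmp(tbl[i].name, name))
			continue;
		/* "all" matches any PMU; otherwise the PMUs must agree. */
		if (!strcmp(pmu, "all") || !strcmp(tbl[i].pmu, pmu))
			return &tbl[i];
	}
	return NULL;
}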
Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-39-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index fe9c6fa3f14a..8161f922715c 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1794,7 +1794,7 @@ static int add_default_attributes(void) * will use this approach. To determine transaction support * on an architecture test for such a metric name. */ - if (!metricgroup__has_metric("transaction")) { + if (!metricgroup__has_metric("all", "transaction")) { pr_err("Missing transaction metrics"); return -1; } @@ -1823,7 +1823,7 @@ static int add_default_attributes(void) smi_reset = true; } - if (!metricgroup__has_metric("smi")) { + if (!metricgroup__has_metric("all", "smi")) { pr_err("Missing smi metrics"); return -1; } @@ -1903,7 +1903,7 @@ static int add_default_attributes(void) * caused by exposing latent bugs. This is fixed properly in: * https://lore.kernel.org/lkml/bff481ba-e60a-763f-0aa0-3ee53302c480@linux.intel.com/ */ - if (metricgroup__has_metric("TopdownL1") && !perf_pmu__has_hybrid()) { + if (metricgroup__has_metric("all", "TopdownL1") && !perf_pmu__has_hybrid()) { struct evlist *metric_evlist = evlist__new(); struct evsel *metric_evsel; -- cgit From dae47d3940a77e1639edb0c5f0596f43bcff8bf8 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:46 -0700 Subject: perf stat: Command line PMU metric filtering Wire up the --cputype value to limit which metrics are parsed. 
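For example (illustrative, on a hybrid system):

  $ perf stat --cputype core -M TopdownL1 -a sleep 1

now only parses the TopdownL1 metrics defined for the cpu_core PMU; without --cputype, the filter falls back to "all" and metrics for every PMU are considered.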
Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-40-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 8161f922715c..e18b3239d42a 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1783,6 +1783,7 @@ static int add_default_attributes(void) }; struct perf_event_attr default_null_attrs[] = {}; + const char *pmu = parse_events_option_args.pmu_filter ?: "all"; /* Set attrs if no event is selected and !null_run: */ if (stat_config.null_run) @@ -1794,11 +1795,11 @@ static int add_default_attributes(void) * will use this approach. To determine transaction support * on an architecture test for such a metric name. */ - if (!metricgroup__has_metric("all", "transaction")) { + if (!metricgroup__has_metric(pmu, "transaction")) { pr_err("Missing transaction metrics"); return -1; } - return metricgroup__parse_groups(evsel_list, "transaction", + return metricgroup__parse_groups(evsel_list, pmu, "transaction", stat_config.metric_no_group, stat_config.metric_no_merge, stat_config.metric_no_threshold, @@ -1823,7 +1824,7 @@ static int add_default_attributes(void) smi_reset = true; } - if (!metricgroup__has_metric("all", "smi")) { + if (!metricgroup__has_metric(pmu, "smi")) { pr_err("Missing smi metrics"); return -1; } @@ -1831,7 +1832,7 @@ static int add_default_attributes(void) if (!force_metric_only) stat_config.metric_only = true; - return metricgroup__parse_groups(evsel_list, "smi", + return metricgroup__parse_groups(evsel_list, pmu, "smi", stat_config.metric_no_group, stat_config.metric_no_merge, stat_config.metric_no_threshold, @@ -1864,7 +1865,8 @@ static int add_default_attributes(void) "Please print the result regularly, e.g. -I1000\n"); } str[8] = stat_config.topdown_level + '0'; - if (metricgroup__parse_groups(evsel_list, str, + if (metricgroup__parse_groups(evsel_list, + pmu, str, /*metric_no_group=*/false, /*metric_no_merge=*/false, /*metric_no_threshold=*/true, @@ -1903,14 +1905,14 @@ static int add_default_attributes(void) * caused by exposing latent bugs. This is fixed properly in: * https://lore.kernel.org/lkml/bff481ba-e60a-763f-0aa0-3ee53302c480@linux.intel.com/ */ - if (metricgroup__has_metric("all", "TopdownL1") && !perf_pmu__has_hybrid()) { + if (metricgroup__has_metric(pmu, "TopdownL1") && !perf_pmu__has_hybrid()) { struct evlist *metric_evlist = evlist__new(); struct evsel *metric_evsel; if (!metric_evlist) return -1; - if (metricgroup__parse_groups(metric_evlist, "TopdownL1", + if (metricgroup__parse_groups(metric_evlist, pmu, "TopdownL1", /*metric_no_group=*/false, /*metric_no_merge=*/false, /*metric_no_threshold=*/true, @@ -2434,7 +2436,9 @@ int cmd_stat(int argc, const char **argv) * knowing the target is system-wide. 
*/ if (metrics) { - metricgroup__parse_groups(evsel_list, metrics, + const char *pmu = parse_events_option_args.pmu_filter ?: "all"; + + metricgroup__parse_groups(evsel_list, pmu, metrics, stat_config.metric_no_group, stat_config.metric_no_merge, stat_config.metric_no_threshold, -- cgit From 718eabe1f329acedf1470aed67632d65dca5088c Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 2 May 2023 15:38:50 -0700 Subject: perf stat: Don't disable TopdownL1 metric on hybrid Now that hybrid bugs are fixed sufficient to run TopdownL1 metrics, don't implicitly disable them for hybrid. Signed-off-by: Ian Rogers Tested-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Alexander Shishkin Cc: Andi Kleen Cc: Athira Rajeev Cc: Caleb Biggers Cc: Edward Baker Cc: Florian Fischer Cc: Ingo Molnar Cc: James Clark Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Mark Rutland Cc: Namhyung Kim Cc: Perry Taylor Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Samantha Alt Cc: Stephane Eranian Cc: Sumanth Korikkar Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Tiezhu Yang Cc: Weilin Wang Cc: Xing Zhengjun Cc: Yang Jihong Link: https://lore.kernel.org/r/20230502223851.2234828-44-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index e18b3239d42a..bc45cee3f77c 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1900,12 +1900,7 @@ static int add_default_attributes(void) * Add TopdownL1 metrics if they exist. To minimize * multiplexing, don't request threshold computation. */ - /* - * TODO: TopdownL1 is disabled on hybrid CPUs to avoid a crashes - * caused by exposing latent bugs. This is fixed properly in: - * https://lore.kernel.org/lkml/bff481ba-e60a-763f-0aa0-3ee53302c480@linux.intel.com/ - */ - if (metricgroup__has_metric(pmu, "TopdownL1") && !perf_pmu__has_hybrid()) { + if (metricgroup__has_metric(pmu, "TopdownL1")) { struct evlist *metric_evlist = evlist__new(); struct evsel *metric_evsel; -- cgit From 995ed074b829f293586028560f2f27f47889df64 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Wed, 17 May 2023 22:57:42 +0530 Subject: perf stat: Setup the foundation to allow aggregation based on cache topology Processors based on chiplet architecture, such as AMD EPYC and Hygon do not expose the chiplet details in the sysfs CPU topology information. However, this information can be derived from the per CPU cache level information from the sysfs. 'perf stat' has already supported aggregation based on topology information using core ID, socket ID, etc. It'll be useful to aggregate based on the cache topology to detect problems like imbalance and cache-to-cache sharing at various cache levels. This patch lays the foundation for aggregating data in 'perf stat' based on the processor's cache topology. The cmdline option to aggregate data based on the cache topology is added in Patch 4 of the series while this patch sets up all the necessary functions and variables required to support the new aggregation option. The patch also adds support to display per-cache aggregation, or save it as a JSON or CSV, as splitting it into a separate patch would break builds when compiling with "-Werror=switch-enum" where the compiler will complain about the lack of handling for the AGGR_CACHE case in the output functions. 
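As a minimal standalone sketch of the derivation described above (illustrative code only; the patch itself goes through build_caches_for_cpu(), and error handling is trimmed here):

#include <stdio.h>

/* Sketch: the cache instance ID for a (cpu, index) pair is the first CPU in
 * /sys/devices/system/cpu/cpuN/cache/indexM/shared_cpu_list, with the level
 * read from the sibling "level" file. */
static int cache_instance_id(int cpu, int index)
{
	char path[128];
	int first_cpu = -1;
	FILE *f;

	snprintf(path, sizeof(path),
		 "/sys/devices/system/cpu/cpu%d/cache/index%d/shared_cpu_list",
		 cpu, index);
	f = fopen(path, "r");
	if (!f)
		return -1;
	/* shared_cpu_list reads like "0-7" or "0,64"; the leading integer is
	 * the lowest-numbered CPU sharing this cache instance, which serves
	 * as the instance ID. */
	if (fscanf(f, "%d", &first_cpu) != 1)
		first_cpu = -1;
	fclose(f);
	return first_cpu;
}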
Committer notes: Don't use perf_stat_config in tools/perf/util/cpumap.c, this would make code that is in util/, thus not really specific to a single builtin, use a specific builtin config structure. Move the functions introduced in this patch from tools/perf/util/cpumap.c since it needs access to builtin specific and is not strictly needed to live in the util/ directory. With this 'perf test python' is back building. Suggested-by: Gautham Shenoy Signed-off-by: K Prateek Nayak Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Stephane Eranian Cc: Wen Pu Link: https://lore.kernel.org/r/20230517172745.5833-3-kprateek.nayak@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 209 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 208 insertions(+), 1 deletion(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index bc45cee3f77c..0528d1bc15d2 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -142,6 +142,7 @@ struct perf_stat { struct perf_cpu_map *cpus; struct perf_thread_map *threads; enum aggr_mode aggr_mode; + u32 aggr_level; }; static struct perf_stat perf_stat; @@ -151,6 +152,7 @@ static volatile sig_atomic_t done = 0; static struct perf_stat_config stat_config = { .aggr_mode = AGGR_GLOBAL, + .aggr_level = MAX_CACHE_LVL + 1, .scale = true, .unit_width = 4, /* strlen("unit") */ .run_count = 1, @@ -1249,8 +1251,132 @@ static struct option stat_options[] = { OPT_END() }; +/** + * Calculate the cache instance ID from the map in + * /sys/devices/system/cpu/cpuX/cache/indexY/shared_cpu_list + * Cache instance ID is the first CPU reported in the shared_cpu_list file. + */ +static int cpu__get_cache_id_from_map(struct perf_cpu cpu, char *map) +{ + int id; + struct perf_cpu_map *cpu_map = perf_cpu_map__new(map); + + /* + * If the map contains no CPU, consider the current CPU to + * be the first online CPU in the cache domain else use the + * first online CPU of the cache domain as the ID. + */ + if (perf_cpu_map__empty(cpu_map)) + id = cpu.cpu; + else + id = perf_cpu_map__cpu(cpu_map, 0).cpu; + + /* Free the perf_cpu_map used to find the cache ID */ + perf_cpu_map__put(cpu_map); + + return id; +} + +/** + * cpu__get_cache_id - Returns 0 if successful in populating the + * cache level and cache id. Cache level is read from + * /sys/devices/system/cpu/cpuX/cache/indexY/level where as cache instance ID + * is the first CPU reported by + * /sys/devices/system/cpu/cpuX/cache/indexY/shared_cpu_list + */ +static int cpu__get_cache_details(struct perf_cpu cpu, struct perf_cache *cache) +{ + int ret = 0; + u32 cache_level = stat_config.aggr_level; + struct cpu_cache_level caches[MAX_CACHE_LVL]; + u32 i = 0, caches_cnt = 0; + + cache->cache_lvl = (cache_level > MAX_CACHE_LVL) ? 0 : cache_level; + cache->cache = -1; + + ret = build_caches_for_cpu(cpu.cpu, caches, &caches_cnt); + if (ret) { + /* + * If caches_cnt is not 0, cpu_cache_level data + * was allocated when building the topology. + * Free the allocated data before returning. + */ + if (caches_cnt) + goto free_caches; + + return ret; + } + + if (!caches_cnt) + return -1; + + /* + * Save the data for the highest level if no + * level was specified by the user. 
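+ * The highest level found is effectively the Last Level Cache (LLC).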
+ */ + if (cache_level > MAX_CACHE_LVL) { + int max_level_index = 0; + + for (i = 1; i < caches_cnt; ++i) { + if (caches[i].level > caches[max_level_index].level) + max_level_index = i; + } + + cache->cache_lvl = caches[max_level_index].level; + cache->cache = cpu__get_cache_id_from_map(cpu, caches[max_level_index].map); + + /* Reset i to 0 to free entire caches[] */ + i = 0; + goto free_caches; + } + + for (i = 0; i < caches_cnt; ++i) { + if (caches[i].level == cache_level) { + cache->cache_lvl = cache_level; + cache->cache = cpu__get_cache_id_from_map(cpu, caches[i].map); + } + + cpu_cache_level__free(&caches[i]); + } + +free_caches: + /* + * Free all the allocated cpu_cache_level data. + */ + while (i < caches_cnt) + cpu_cache_level__free(&caches[i++]); + + return ret; +} + +/** + * aggr_cpu_id__cache - Create an aggr_cpu_id with cache instache ID, cache + * level, die and socket populated with the cache instache ID, cache level, + * die and socket for cpu. The function signature is compatible with + * aggr_cpu_id_get_t. + */ +static struct aggr_cpu_id aggr_cpu_id__cache(struct perf_cpu cpu, void *data) +{ + int ret; + struct aggr_cpu_id id; + struct perf_cache cache; + + id = aggr_cpu_id__die(cpu, data); + if (aggr_cpu_id__is_empty(&id)) + return id; + + ret = cpu__get_cache_details(cpu, &cache); + if (ret) + return id; + + id.cache_lvl = cache.cache_lvl; + id.cache = cache.cache; + return id; +} + static const char *const aggr_mode__string[] = { [AGGR_CORE] = "core", + [AGGR_CACHE] = "cache", [AGGR_DIE] = "die", [AGGR_GLOBAL] = "global", [AGGR_NODE] = "node", @@ -1272,6 +1398,12 @@ static struct aggr_cpu_id perf_stat__get_die(struct perf_stat_config *config __m return aggr_cpu_id__die(cpu, /*data=*/NULL); } +static struct aggr_cpu_id perf_stat__get_cache_id(struct perf_stat_config *config __maybe_unused, + struct perf_cpu cpu) +{ + return aggr_cpu_id__cache(cpu, /*data=*/NULL); +} + static struct aggr_cpu_id perf_stat__get_core(struct perf_stat_config *config __maybe_unused, struct perf_cpu cpu) { @@ -1324,6 +1456,12 @@ static struct aggr_cpu_id perf_stat__get_die_cached(struct perf_stat_config *con return perf_stat__get_aggr(config, perf_stat__get_die, cpu); } +static struct aggr_cpu_id perf_stat__get_cache_id_cached(struct perf_stat_config *config, + struct perf_cpu cpu) +{ + return perf_stat__get_aggr(config, perf_stat__get_cache_id, cpu); +} + static struct aggr_cpu_id perf_stat__get_core_cached(struct perf_stat_config *config, struct perf_cpu cpu) { @@ -1355,6 +1493,8 @@ static aggr_cpu_id_get_t aggr_mode__get_aggr(enum aggr_mode aggr_mode) return aggr_cpu_id__socket; case AGGR_DIE: return aggr_cpu_id__die; + case AGGR_CACHE: + return aggr_cpu_id__cache; case AGGR_CORE: return aggr_cpu_id__core; case AGGR_NODE: @@ -1378,6 +1518,8 @@ static aggr_get_id_t aggr_mode__get_id(enum aggr_mode aggr_mode) return perf_stat__get_socket_cached; case AGGR_DIE: return perf_stat__get_die_cached; + case AGGR_CACHE: + return perf_stat__get_cache_id_cached; case AGGR_CORE: return perf_stat__get_core_cached; case AGGR_NODE: @@ -1490,6 +1632,60 @@ static struct aggr_cpu_id perf_env__get_die_aggr_by_cpu(struct perf_cpu cpu, voi return id; } +static void perf_env__get_cache_id_for_cpu(struct perf_cpu cpu, struct perf_env *env, + u32 cache_level, struct aggr_cpu_id *id) +{ + int i; + int caches_cnt = env->caches_cnt; + struct cpu_cache_level *caches = env->caches; + + id->cache_lvl = (cache_level > MAX_CACHE_LVL) ? 
0 : cache_level; + id->cache = -1; + + if (!caches_cnt) + return; + + for (i = caches_cnt - 1; i > -1; --i) { + struct perf_cpu_map *cpu_map; + int map_contains_cpu; + + /* + * If user has not specified a level, find the fist level with + * the cpu in the map. Since building the map is expensive, do + * this only if levels match. + */ + if (cache_level <= MAX_CACHE_LVL && caches[i].level != cache_level) + continue; + + cpu_map = perf_cpu_map__new(caches[i].map); + map_contains_cpu = perf_cpu_map__idx(cpu_map, cpu); + perf_cpu_map__put(cpu_map); + + if (map_contains_cpu != -1) { + id->cache_lvl = caches[i].level; + id->cache = cpu__get_cache_id_from_map(cpu, caches[i].map); + return; + } + } +} + +static struct aggr_cpu_id perf_env__get_cache_aggr_by_cpu(struct perf_cpu cpu, + void *data) +{ + struct perf_env *env = data; + struct aggr_cpu_id id = aggr_cpu_id__empty(); + + if (cpu.cpu != -1) { + u32 cache_level = (perf_stat.aggr_level) ?: stat_config.aggr_level; + + id.socket = env->cpu[cpu.cpu].socket_id; + id.die = env->cpu[cpu.cpu].die_id; + perf_env__get_cache_id_for_cpu(cpu, env, cache_level, &id); + } + + return id; +} + static struct aggr_cpu_id perf_env__get_core_aggr_by_cpu(struct perf_cpu cpu, void *data) { struct perf_env *env = data; @@ -1558,6 +1754,12 @@ static struct aggr_cpu_id perf_stat__get_die_file(struct perf_stat_config *confi return perf_env__get_die_aggr_by_cpu(cpu, &perf_stat.session->header.env); } +static struct aggr_cpu_id perf_stat__get_cache_file(struct perf_stat_config *config __maybe_unused, + struct perf_cpu cpu) +{ + return perf_env__get_cache_aggr_by_cpu(cpu, &perf_stat.session->header.env); +} + static struct aggr_cpu_id perf_stat__get_core_file(struct perf_stat_config *config __maybe_unused, struct perf_cpu cpu) { @@ -1589,6 +1791,8 @@ static aggr_cpu_id_get_t aggr_mode__get_aggr_file(enum aggr_mode aggr_mode) return perf_env__get_socket_aggr_by_cpu; case AGGR_DIE: return perf_env__get_die_aggr_by_cpu; + case AGGR_CACHE: + return perf_env__get_cache_aggr_by_cpu; case AGGR_CORE: return perf_env__get_core_aggr_by_cpu; case AGGR_NODE: @@ -1612,6 +1816,8 @@ static aggr_get_id_t aggr_mode__get_id_file(enum aggr_mode aggr_mode) return perf_stat__get_socket_file; case AGGR_DIE: return perf_stat__get_die_file; + case AGGR_CACHE: + return perf_stat__get_cache_file; case AGGR_CORE: return perf_stat__get_core_file; case AGGR_NODE: @@ -2127,7 +2333,8 @@ static struct perf_stat perf_stat = { .stat = perf_event__process_stat_event, .stat_round = process_stat_round_event, }, - .aggr_mode = AGGR_UNSET, + .aggr_mode = AGGR_UNSET, + .aggr_level = 0, }; static int __cmd_report(int argc, const char **argv) -- cgit From aab667ca8837e45fda0204bed7b59abd634c0b2b Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Wed, 17 May 2023 22:57:44 +0530 Subject: perf stat: Add "--per-cache" aggregation option and document it This patch adds support for "--per-cache" option for aggregation at a particular cache level and documents the same. Following is the output of 'perf stat' with aggregation at L3 for the event "ls_dmnd_fills_from_sys.ext_cache_remote" on a dual socket 3rd Generation EPYC Processor (2 x 64C/128T - 16 LLCs) when running hackbench pinned to 4 LLCs: $ sudo perf stat --per-cache=L3 -a -e ls_dmnd_fills_from_sys.ext_cache_remote -- \ taskset -c 0-15,64-79,128-143,192-207 \ perf bench sched messaging -p -t -l 100000 -g 8 ... 
Performance counter stats for 'system wide': S0-D0-L3-ID0 16 9,500,803 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID8 16 6,338,099 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID16 16 355,005 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID24 16 22,067 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID32 16 16,321 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID40 16 11,619 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID48 16 4,238 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID56 16 31,158 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID64 16 28,242,452 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID72 16 22,906,973 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID80 16 72,898 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID88 16 56,907 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID96 16 20,456 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID104 16 40,913 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID112 16 78,113 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID120 16 37,897 ls_dmnd_fills_from_sys.ext_cache_remote Also support 'perf stat record' and 'perf stat report' with the ability to specify a different cache level to aggregate data at when running 'perf stat report'. $ sudo perf stat record --per-cache=L2 -a -e ls_dmnd_fills_from_sys.ext_cache_remote -- \ taskset -c 0-15,64-79,128-143,192-207 \ perf bench sched messaging -p -t -l 100000 -g 8 ... Performance counter stats for 'system wide': S0-D0-L2-ID0 2 1,442,061 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID1 2 1,548,994 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID2 2 1,553,557 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID3 2 1,420,122 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID4 2 1,465,461 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID5 2 1,455,153 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID6 2 1,595,237 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID7 2 1,499,321 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L2-ID8 2 1,919,025 ls_dmnd_fills_from_sys.ext_cache_remote ... 
S1-D1-L2-ID127 2 21,295 ls_dmnd_fills_from_sys.ext_cache_remote $ sudo perf stat report --per-cache=L3 Performance counter stats for 'perf stat record --per-cache=L2 -a -e ls_dmnd_fills_from_sys.ext_cache_remote --\ taskset -c 0-15,64-79,128-143,192-207 \ perf bench sched messaging -p -t -l 100000 -g 8': S0-D0-L3-ID0 16 11,979,906 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID8 16 14,257,202 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID16 16 377,484 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID24 16 27,224 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID32 16 26,816 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID40 16 14,461 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID48 16 10,499 ls_dmnd_fills_from_sys.ext_cache_remote S0-D0-L3-ID56 16 53,817 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID64 16 27,361,987 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID72 16 37,299,024 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID80 16 84,125 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID88 16 64,561 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID96 16 13,403 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID104 16 20,138 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID112 16 93,220 ls_dmnd_fills_from_sys.ext_cache_remote S1-D1-L3-ID120 16 35,465 ls_dmnd_fills_from_sys.ext_cache_remote On the above system, the domain covered by S0-D0-L3-ID0 contains S0-D0-L2-ID0 to S0-D0-L2-ID7, the corresponding count for L3-ID0 is equal to the sum of counts for L2-ID0 to L2-ID7. Add documentation for the newly introduced "--per-cache" option. Suggested-by: Gautham Shenoy Signed-off-by: K Prateek Nayak Acked-by: Ian Rogers Cc: Alexander Shishkin Cc: Ananth Narayan Cc: Ingo Molnar Cc: Jiri Olsa Cc: Mark Rutland Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sandipan Das Cc: Stephane Eranian Cc: Wen Pu Link: https://lore.kernel.org/r/20230517172745.5833-5-kprateek.nayak@amd.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 0528d1bc15d2..176deeb8ee66 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1113,6 +1113,55 @@ static int parse_cputype(const struct option *opt, return 0; } +static int parse_cache_level(const struct option *opt, + const char *str, + int unset __maybe_unused) +{ + int level; + u32 *aggr_mode = (u32 *)opt->value; + u32 *aggr_level = (u32 *)opt->data; + + /* + * If no string is specified, aggregate based on the topology of + * Last Level Cache (LLC). Since the LLC level can change from + * architecture to architecture, set level greater than + * MAX_CACHE_LVL which will be interpreted as LLC. + */ + if (str == NULL) { + level = MAX_CACHE_LVL + 1; + goto out; + } + + /* + * The format to specify cache level is LX or lX where X is the + * cache level. 
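+ * For example, "L2" or "l2" requests aggregation at the level 2 cache.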
+ */ + if (strlen(str) != 2 || (str[0] != 'l' && str[0] != 'L')) { + pr_err("Cache level must be of form L[1-%d], or l[1-%d]\n", + MAX_CACHE_LVL, + MAX_CACHE_LVL); + return -EINVAL; + } + + level = atoi(&str[1]); + if (level < 1) { + pr_err("Cache level must be of form L[1-%d], or l[1-%d]\n", + MAX_CACHE_LVL, + MAX_CACHE_LVL); + return -EINVAL; + } + + if (level > MAX_CACHE_LVL) { + pr_err("perf only supports max cache level of %d.\n" + "Consider increasing MAX_CACHE_LVL\n", MAX_CACHE_LVL); + return -EINVAL; + } +out: + *aggr_mode = AGGR_CACHE; + *aggr_level = level; + return 0; +} + static struct option stat_options[] = { OPT_BOOLEAN('T', "transaction", &transaction_run, "hardware transaction statistics"), @@ -1190,6 +1239,9 @@ static struct option stat_options[] = { "aggregate counts per processor socket", AGGR_SOCKET), OPT_SET_UINT(0, "per-die", &stat_config.aggr_mode, "aggregate counts per processor die", AGGR_DIE), + OPT_CALLBACK_OPTARG(0, "per-cache", &stat_config.aggr_mode, &stat_config.aggr_level, + "cache level", "aggregate count at this cache level (Default: LLC)", + parse_cache_level), OPT_SET_UINT(0, "per-core", &stat_config.aggr_mode, "aggregate counts per physical processor core", AGGR_CORE), OPT_SET_UINT(0, "per-thread", &stat_config.aggr_mode, @@ -2346,6 +2398,10 @@ static int __cmd_report(int argc, const char **argv) "aggregate counts per processor socket", AGGR_SOCKET), OPT_SET_UINT(0, "per-die", &perf_stat.aggr_mode, "aggregate counts per processor die", AGGR_DIE), + OPT_CALLBACK_OPTARG(0, "per-cache", &perf_stat.aggr_mode, &perf_stat.aggr_level, + "cache level", + "aggregate count at this cache level (Default: LLC)", + parse_cache_level), OPT_SET_UINT(0, "per-core", &perf_stat.aggr_mode, "aggregate counts per physical processor core", AGGR_CORE), OPT_SET_UINT(0, "per-node", &perf_stat.aggr_mode, -- cgit From 8ec984d53714dfa538f3f5b1e22a309ac18edf63 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:21:46 -0700 Subject: perf target: Remove unused hybrid value Previously this was used to modify CPU map propagation, but it is now unnecessary as map propagation ensures core PMUs only have valid CPUs, taken from the user-requested CPUs, in their CPU map.
Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-11-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 1 - 1 file changed, 1 deletion(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 176deeb8ee66..8d4c4f4ca8ea 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -2730,7 +2730,6 @@ int cmd_stat(int argc, const char **argv) goto out; } - target.hybrid = perf_pmu__has_hybrid(); if (evlist__create_maps(evsel_list, &target) < 0) { if (target__has_task(&target)) { pr_err("Problems finding threads of monitor\n"); -- cgit From 5ac72634482143a8be5e04e5d09a2026f6a94315 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:21:47 -0700 Subject: perf tools: Warn if no user requested CPUs match PMU's CPUs In commit 1d3351e631fc ("perf tools: Enable on a list of CPUs for hybrid") perf on hybrid will warn if a user requested CPU doesn't match the PMU of the given event but only for hybrid PMUs. Make the logic generic for all PMUs and remove the hybrid logic. Warn if a CPU is requested that isn't present/offline for events not on the core. Warn if a CPU is requested for a core PMU, but the CPU isn't within the cpu map of that PMU. For example on a 16 (0-15) CPU system: ``` $ perf stat -e imc_free_running/data_read/,cycles -C 16 true WARNING: A requested CPU in '16' is not supported by PMU 'uncore_imc_free_running_1' (CPUs 0-15) for event 'imc_free_running/data_read/' WARNING: A requested CPU in '16' is not supported by PMU 'uncore_imc_free_running_0' (CPUs 0-15) for event 'imc_free_running/data_read/' WARNING: A requested CPU in '16' is not supported by PMU 'cpu' (CPUs 0-15) for event 'cycles' Performance counter stats for 'CPU(s) 16': MiB imc_free_running/data_read/ cycles 0.000575312 seconds time elapsed ``` Remove evlist__fix_hybrid_cpus that previously produced the warnings and also perf_pmu__cpus_match that worked with evlist__fix_hybrid_cpus to change CPU maps for hybrid CPUs, something that is no longer necessary as CPU map propagation properly intersects user requested CPUs with the core PMU's CPU map. 
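The check is conceptually a CPU map intersection. A sketch using libperf's CPU map API (not the exact evlist__warn_user_requested_cpus() body; "requested" and "matched" are illustrative names):

	/* Sketch: warn when some requested CPU is outside the PMU's CPU map. */
	struct perf_cpu_map *requested = perf_cpu_map__new(cpu_list);	/* e.g. "16" */
	struct perf_cpu_map *matched = perf_cpu_map__intersect(requested, pmu->cpus);

	/* Any requested CPU missing from the PMU's map triggers the warning. */
	if (perf_cpu_map__nr(matched) != perf_cpu_map__nr(requested))
		pr_warning("A requested CPU in '%s' is not supported by PMU '%s'\n",
			   cpu_list, pmu->name);

	perf_cpu_map__put(matched);
	perf_cpu_map__put(requested);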
Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-12-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 8d4c4f4ca8ea..84d304cffd2c 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -2725,10 +2725,7 @@ int cmd_stat(int argc, const char **argv) } } - if (evlist__fix_hybrid_cpus(evsel_list, target.cpu_list)) { - pr_err("failed to use cpu list %s\n", target.cpu_list); - goto out; - } + evlist__warn_user_requested_cpus(evsel_list, target.cpu_list); if (evlist__create_maps(evsel_list, &target) < 0) { if (target__has_task(&target)) { -- cgit From b4388dfa3ae5aca7d4d3bbc9b80fe5e483ef78e9 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:21:48 -0700 Subject: perf evlist: Remove evlist__warn_hybrid_group Parse events now corrects PMU groups in parse_events__sort_events_and_fix_groups and so this warning is no longer possible. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-13-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 84d304cffd2c..d414ee30dcf9 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -189,9 +189,6 @@ static void evlist__check_cpu_maps(struct evlist *evlist) { struct evsel *evsel, *warned_leader = NULL; - if (evlist__has_hybrid(evlist)) - evlist__warn_hybrid_group(evlist); - evlist__for_each_entry(evlist, evsel) { struct evsel *leader = evsel__leader(evsel); -- cgit From b167b530eb83dfd791061e1d312236bffde772a4 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:21:50 -0700 Subject: perf evlist: Reduce scope of evlist__has_hybrid Function is only used in printout, reduce scope to stat-display.c. Remove the now empty evlist-hybrid.c and evlist-hybrid.h. 
Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-15-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 1 - 1 file changed, 1 deletion(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index d414ee30dcf9..62bbeea93bf3 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -48,7 +48,6 @@ #include "util/pmu.h" #include "util/event.h" #include "util/evlist.h" -#include "util/evlist-hybrid.h" #include "util/evsel.h" #include "util/debug.h" #include "util/color.h" -- cgit From 1eaf496ed386934f1c2439a120fe84a05194f91a Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Sat, 27 May 2023 00:22:03 -0700 Subject: perf pmu: Separate pmu and pmus Separate and hide the pmus list in pmus.[ch]. Move pmus functionality out of pmu.[ch] into pmus.[ch] renaming pmus functions which were prefixed perf_pmu__ to perf_pmus__. Reviewed-by: Kan Liang Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Athira Rajeev Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Huacai Chen Cc: Ingo Molnar Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: Kajol Jain Cc: Kang Minchul Cc: Leo Yan Cc: Madhavan Srinivasan Cc: Mark Rutland Cc: Mike Leach Cc: Ming Wang Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Rob Herring Cc: Sandipan Das Cc: Sean Christopherson Cc: Suzuki Poulouse Cc: Thomas Richter Cc: Will Deacon Cc: Xing Zhengjun Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230527072210.2900565-28-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 62bbeea93bf3..c87c6897edc9 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -2140,11 +2140,11 @@ static int add_default_attributes(void) if (evlist__add_default_attrs(evsel_list, default_attrs0) < 0) return -1; - if (pmu_have_event("cpu", "stalled-cycles-frontend")) { + if (perf_pmus__have_event("cpu", "stalled-cycles-frontend")) { if (evlist__add_default_attrs(evsel_list, frontend_attrs) < 0) return -1; } - if (pmu_have_event("cpu", "stalled-cycles-backend")) { + if (perf_pmus__have_event("cpu", "stalled-cycles-backend")) { if (evlist__add_default_attrs(evsel_list, backend_attrs) < 0) return -1; } -- cgit From 2b87be183bca9774c8ce238f5fc84d3b3f671b33 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Thu, 8 Jun 2023 16:28:08 -0700 Subject: perf stat: Avoid evlist leak Free evlist before overwriting in "perf stat report" mode. Detected using leak sanitizer. 
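The pattern in miniature (a sketch of the bug and of the one-line fix in the hunk below; the leak-sanitizer build setup is not stated in the commit):

	/* before: the evlist allocated at startup is overwritten and leaked */
	evsel_list = session->evlist;

	/* after: release the original list before adopting the session's */
	evlist__delete(evsel_list);
	evsel_list = session->evlist;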
Signed-off-by: Ian Rogers Cc: Adrian Hunter Cc: Alexander Shishkin Cc: Ali Saidi Cc: Andi Kleen Cc: Athira Rajeev Cc: Brian Robbins Cc: Changbin Du Cc: Dmitrii Dolgov <9erthalion6@gmail.com> Cc: Fangrui Song Cc: German Gomez Cc: Ingo Molnar Cc: Ivan Babrou Cc: James Clark Cc: Jing Zhang Cc: Jiri Olsa Cc: John Garry Cc: K Prateek Nayak Cc: Kan Liang Cc: Leo Yan Cc: Liam Howlett Cc: Mark Rutland Cc: Miguel Ojeda Cc: Mike Leach Cc: Namhyung Kim Cc: Naveen N. Rao Cc: Peter Zijlstra Cc: Ravi Bangoria Cc: Sean Christopherson Cc: Steinar H. Gunderson Cc: Suzuki Poulouse Cc: Wenyu Liu Cc: Will Deacon Cc: Yang Jihong Cc: Ye Xingchen Cc: Yuan Can Cc: coresight@lists.linaro.org Cc: linux-arm-kernel@lists.infradead.org Link: https://lore.kernel.org/r/20230608232823.4027869-12-irogers@google.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index c87c6897edc9..fc615bdeed4f 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -2427,6 +2427,7 @@ static int __cmd_report(int argc, const char **argv) perf_stat.session = session; stat_config.output = stderr; + evlist__delete(evsel_list); evsel_list = session->evlist; ret = perf_session__process_events(session); -- cgit From b0a9e8f81fc45e6d3c9ddf290dabd7f4610f2939 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 15 Jun 2023 06:53:10 -0700 Subject: perf stat,jevents: Introduce Default tags for the default mode Introduce a new metricgroup, Default, to tag all the metric groups which will be collected in the default mode. Add a new field, DefaultMetricgroupName, in the JSON file to indicate the real metric group name. It will be printed in the default output to replace the event names. There is nothing changed for the output format. On SPR, both TopdownL1 and TopdownL2 are displayed in the default output. On ARM, Intel ICL and later platforms (before SPR), only TopdownL1 is displayed in the default output. Suggested-by: Stephane Eranian Reviewed-by: Ian Rogers Signed-off-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230615135315.3662428-4-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index fc615bdeed4f..55601b4b5c34 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -2154,14 +2154,14 @@ static int add_default_attributes(void) * Add TopdownL1 metrics if they exist. To minimize * multiplexing, don't request threshold computation. 
*/ - if (metricgroup__has_metric(pmu, "TopdownL1")) { + if (metricgroup__has_metric(pmu, "Default")) { struct evlist *metric_evlist = evlist__new(); struct evsel *metric_evsel; if (!metric_evlist) return -1; - if (metricgroup__parse_groups(metric_evlist, pmu, "TopdownL1", + if (metricgroup__parse_groups(metric_evlist, pmu, "Default", /*metric_no_group=*/false, /*metric_no_merge=*/false, /*metric_no_threshold=*/true, -- cgit From 6a80d794d796d22910c03d3e52a3bf0d885326a7 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Thu, 15 Jun 2023 20:14:17 -0700 Subject: perf stat: New metricgroup output for the default mode In the default mode, the current output of the metricgroup include both events and metrics, which is not necessary and just makes the output hard to read. Since different ARCHs (even different generations in the same ARCH) may use different events. The output also vary on different platforms. For a metricgroup, only outputting the value of each metric is good enough. Add a new field default_metricgroup in evsel to indicate an event of the default metricgroup. For those events, printout() should print the metricgroup name rather than each event. Add perf_stat__skip_metric_event() to skip the evsel in the Default metricgroup, if it's not running or not the metric event. Add print_metricgroup_header_t to pass the functions which print the display name of each metricgroup in the Default metricgroup. Support all three output methods. Factor out perf_stat__print_shadow_stats_metricgroup() to print out each metrics. On SPR: Before: ./perf_old stat sleep 1 Performance counter stats for 'sleep 1': 0.54 msec task-clock:u # 0.001 CPUs utilized 0 context-switches:u # 0.000 /sec 0 cpu-migrations:u # 0.000 /sec 68 page-faults:u # 125.445 K/sec 540,970 cycles:u # 0.998 GHz 556,325 instructions:u # 1.03 insn per cycle 123,602 branches:u # 228.018 M/sec 6,889 branch-misses:u # 5.57% of all branches 3,245,820 TOPDOWN.SLOTS:u # 18.4 % tma_backend_bound # 17.2 % tma_retiring # 23.1 % tma_bad_speculation # 41.4 % tma_frontend_bound 564,859 topdown-retiring:u 1,370,999 topdown-fe-bound:u 603,271 topdown-be-bound:u 744,874 topdown-bad-spec:u 12,661 INT_MISC.UOP_DROPPING:u # 23.357 M/sec 1.001798215 seconds time elapsed 0.000193000 seconds user 0.001700000 seconds sys After: $ ./perf stat sleep 1 Performance counter stats for 'sleep 1': 0.51 msec task-clock:u # 0.001 CPUs utilized 0 context-switches:u # 0.000 /sec 0 cpu-migrations:u # 0.000 /sec 68 page-faults:u # 132.683 K/sec 545,228 cycles:u # 1.064 GHz 555,509 instructions:u # 1.02 insn per cycle 123,574 branches:u # 241.120 M/sec 6,957 branch-misses:u # 5.63% of all branches TopdownL1 # 17.5 % tma_backend_bound # 22.6 % tma_bad_speculation # 42.7 % tma_frontend_bound # 17.1 % tma_retiring TopdownL2 # 21.8 % tma_branch_mispredicts # 11.5 % tma_core_bound # 13.4 % tma_fetch_bandwidth # 29.3 % tma_fetch_latency # 2.7 % tma_heavy_operations # 14.5 % tma_light_operations # 0.8 % tma_machine_clears # 6.1 % tma_memory_bound 1.001712086 seconds time elapsed 0.000151000 seconds user 0.001618000 seconds sys Reviewed-by: Ian Rogers Signed-off-by: Kan Liang Cc: Adrian Hunter Cc: Ahmad Yasin Cc: Andi Kleen Cc: Ingo Molnar Cc: Jiri Olsa Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Stephane Eranian Link: https://lore.kernel.org/r/20230616031420.3751973-3-kan.liang@linux.intel.com Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 1 + 1 file changed, 1 insertion(+) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c 
b/tools/perf/builtin-stat.c index 55601b4b5c34..3f4e76f76f94 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -2172,6 +2172,7 @@ static int add_default_attributes(void) evlist__for_each_entry(metric_evlist, metric_evsel) { metric_evsel->skippable = true; + metric_evsel->default_metricgroup = true; } evlist__splice_list_tail(evsel_list, &metric_evlist->core.entries); evlist__delete(metric_evlist); -- cgit From ed4090a22c123b9b33368741253edddc6ff8d18f Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 16 Jun 2023 00:32:10 -0700 Subject: perf stat: Reset aggr stats for each run When perf stat runs multiple times with the -r option, it failed to reset the aggregation counters, so the values were added up. The aggregation counters hold the values that are printed at the end. They should be reset at the beginning of each run, but the current code does that only when the -I/--interval-print option is given. Fixes: 91f85f98da7ab8c3 ("perf stat: Display event stats using aggr counts") Reported-by: Jiri Olsa Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Ingo Molnar Cc: Kan Liang Cc: Namhyung Kim Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230616073211.1057936-1-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 3f4e76f76f94..7029e7a7cc2e 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -725,6 +725,8 @@ static int __run_perf_stat(int argc, const char **argv, int run_idx) all_counters_use_bpf = false; } + evlist__reset_aggr_stats(evsel_list); + evlist__for_each_cpu(evlist_cpu_itr, evsel_list, affinity) { counter = evlist_cpu_itr.evsel; -- cgit From dada1a1f5fbccc74e9e6754fc586b1e8b82ac0af Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Fri, 16 Jun 2023 00:32:11 -0700 Subject: perf stat: Show average value on multiple runs When the -r option is used, perf stat runs the command multiple times and updates stats in evsel->stats.res_stats for global aggregation. But that value is never used, and the value printed at the end is just the one from the last run. We should instead print the average across the runs. Add evlist__copy_res_stats() to update the aggr counter (for display) using the values in the evsel->stats.res_stats.
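Concretely, res_stats accumulates one value per run and the displayed count becomes the running mean. A sketch using the helpers from tools/perf/util/stat.h (counts[] and run_count stand in for the real per-evsel plumbing):

	struct stats res_stats;	/* lives in evsel->stats.res_stats */

	init_stats(&res_stats);
	for (int run = 0; run < run_count; run++)
		update_stats(&res_stats, counts[run]);	/* one aggregated total per run */

	/* evlist__copy_res_stats() feeds this mean back into the display counters */
	double shown = avg_stats(&res_stats);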
Signed-off-by: Namhyung Kim Cc: Adrian Hunter Cc: Andi Kleen Cc: Ian Rogers Cc: Ingo Molnar Cc: Jiri Olsa Cc: Kan Liang Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230616073211.1057936-2-namhyung@kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-stat.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 7029e7a7cc2e..a3c04fb265f7 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -2831,8 +2831,11 @@ int cmd_stat(int argc, const char **argv) } } - if (!forever && status != -1 && (!interval || stat_config.summary)) + if (!forever && status != -1 && (!interval || stat_config.summary)) { + if (stat_config.run_count > 1) + evlist__copy_res_stats(&stat_config, evsel_list); print_counters(NULL, argc, argv); + } evlist__finalize_ctlfd(evsel_list); -- cgit From db1f5f1038a2df67f549e2657f56327e28127c27 Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Wed, 14 Jun 2023 02:15:05 +0000 Subject: perf stat: Add missing newline in pr_err messages The newline is missing for error messages in add_default_attributes() Before: # perf stat --topdown Topdown requested but the topdown metric groups aren't present. (See perf list the metric groups have names like TopdownL1)# After: # perf stat --topdown Topdown requested but the topdown metric groups aren't present. (See perf list the metric groups have names like TopdownL1) # In addition, perf_stat_init_aggr_mode() and perf_stat_init_aggr_mode_file() have the same problem, fixed by the way. Signed-off-by: Yang Jihong Acked-by: Ian Rogers Reviewed-by: James Clark Link: https://lore.kernel.org/r/20230614021505.59856-1-yangjihong1@huawei.com Signed-off-by: Namhyung Kim --- tools/perf/builtin-stat.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'tools/perf/builtin-stat.c') diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index a3c04fb265f7..07b48f6df48e 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -1596,7 +1596,7 @@ static int perf_stat_init_aggr_mode(void) stat_config.aggr_map = cpu_aggr_map__new(evsel_list->core.user_requested_cpus, get_id, /*data=*/NULL, needs_sort); if (!stat_config.aggr_map) { - pr_err("cannot build %s map", aggr_mode__string[stat_config.aggr_mode]); + pr_err("cannot build %s map\n", aggr_mode__string[stat_config.aggr_mode]); return -1; } stat_config.aggr_get_id = aggr_mode__get_id(stat_config.aggr_mode); @@ -1912,7 +1912,7 @@ static int perf_stat_init_aggr_mode_file(struct perf_stat *st) stat_config.aggr_map = cpu_aggr_map__new(evsel_list->core.user_requested_cpus, get_id, env, needs_sort); if (!stat_config.aggr_map) { - pr_err("cannot build %s map", aggr_mode__string[stat_config.aggr_mode]); + pr_err("cannot build %s map\n", aggr_mode__string[stat_config.aggr_mode]); return -1; } stat_config.aggr_get_id = aggr_mode__get_id_file(stat_config.aggr_mode); @@ -2052,7 +2052,7 @@ static int add_default_attributes(void) * on an architecture test for such a metric name. 
*/ if (!metricgroup__has_metric(pmu, "transaction")) { - pr_err("Missing transaction metrics"); + pr_err("Missing transaction metrics\n"); return -1; } return metricgroup__parse_groups(evsel_list, pmu, "transaction", @@ -2068,7 +2068,7 @@ static int add_default_attributes(void) int smi; if (sysfs__read_int(FREEZE_ON_SMI_PATH, &smi) < 0) { - pr_err("freeze_on_smi is not supported."); + pr_err("freeze_on_smi is not supported.\n"); return -1; } @@ -2081,7 +2081,7 @@ static int add_default_attributes(void) } if (!metricgroup__has_metric(pmu, "smi")) { - pr_err("Missing smi metrics"); + pr_err("Missing smi metrics\n"); return -1; } @@ -2106,7 +2106,7 @@ static int add_default_attributes(void) if (!max_level) { pr_err("Topdown requested but the topdown metric groups aren't present.\n" - "(See perf list the metric groups have names like TopdownL1)"); + "(See perf list the metric groups have names like TopdownL1)\n"); return -1; } if (stat_config.topdown_level > max_level) { -- cgit
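The convention being enforced, in miniature (pr_err() is perf's logging macro from tools/perf/util/debug.h and does not append a newline itself):

	pr_err("Missing smi metrics\n");	/* good: the message terminates its own line */
	pr_err("Missing smi metrics");		/* bad: the next output lands on the same line */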