summaryrefslogtreecommitdiff
path: root/tools/perf/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'tools/perf/Documentation')
-rw-r--r--tools/perf/Documentation/perf-amd-ibs.txt9
-rw-r--r--tools/perf/Documentation/perf-c2c.txt11
-rw-r--r--tools/perf/Documentation/perf-config.txt4
-rw-r--r--tools/perf/Documentation/perf-list.txt9
-rw-r--r--tools/perf/Documentation/perf-lock.txt15
-rw-r--r--tools/perf/Documentation/perf-mem.txt32
-rw-r--r--tools/perf/Documentation/perf-record.txt16
-rw-r--r--tools/perf/Documentation/perf-report.txt1
-rw-r--r--tools/perf/Documentation/perf-stat.txt7
-rw-r--r--tools/perf/Documentation/perf-trace.txt9
-rw-r--r--tools/perf/Documentation/perf.data-file-format.txt24
11 files changed, 126 insertions, 11 deletions
diff --git a/tools/perf/Documentation/perf-amd-ibs.txt b/tools/perf/Documentation/perf-amd-ibs.txt
index 2fd31d9d7b71..55f80beae037 100644
--- a/tools/perf/Documentation/perf-amd-ibs.txt
+++ b/tools/perf/Documentation/perf-amd-ibs.txt
@@ -85,6 +85,15 @@ System-wide profile, uOps event, sampling period: 100000, L3MissOnly (Zen4 onwar
# perf record -e ibs_op/cnt_ctl=1,l3missonly=1/ -c 100000 -a
+System-wide profile, cycles event, sampling period: 100000, LdLat filtering (Zen5
+onward)
+
+ # perf record -e ibs_op/ldlat=128/ -c 100000 -a
+
+ Supported load latency threshold values are 128 to 2048 (both inclusive).
+ Latency value which is a multiple of 128 incurs a little less profiling
+ overhead compared to other values.
+
Per process(upstream v6.2 onward), uOps event, sampling period: 100000
# perf record -e ibs_op/cnt_ctl=1/ -c 100000 -p 1234
diff --git a/tools/perf/Documentation/perf-c2c.txt b/tools/perf/Documentation/perf-c2c.txt
index 856f0dfb8e5a..f4af2dd6ab31 100644
--- a/tools/perf/Documentation/perf-c2c.txt
+++ b/tools/perf/Documentation/perf-c2c.txt
@@ -54,8 +54,15 @@ RECORD OPTIONS
-l::
--ldlat::
- Configure mem-loads latency. Supported on Intel and Arm64 processors
- only. Ignored on other archs.
+ Configure mem-loads latency. Supported on Intel, Arm64 and some AMD
+ processors. Ignored on other archs.
+
+ On supported AMD processors:
+ - /sys/bus/event_source/devices/ibs_op/caps/ldlat file contains '1'.
+ - Supported latency values are 128 to 2048 (both inclusive).
+ - Latency value which is a multiple of 128 incurs a little less profiling
+ overhead compared to other values.
+ - Load latency filtering is disabled by default.
-k::
--all-kernel::
diff --git a/tools/perf/Documentation/perf-config.txt b/tools/perf/Documentation/perf-config.txt
index 36ebebc875ea..c6f335659667 100644
--- a/tools/perf/Documentation/perf-config.txt
+++ b/tools/perf/Documentation/perf-config.txt
@@ -708,6 +708,10 @@ intel-pt.*::
the maximum is exceeded there will be a "Never-ending loop"
error. The default is 100000.
+ intel-pt.all-switch-events::
+ If the user has permission to do so, always record all context
+ switch events on all CPUs.
+
auxtrace.*::
auxtrace.dumpdir::
diff --git a/tools/perf/Documentation/perf-list.txt b/tools/perf/Documentation/perf-list.txt
index 8914f12d2b85..ce0735021473 100644
--- a/tools/perf/Documentation/perf-list.txt
+++ b/tools/perf/Documentation/perf-list.txt
@@ -289,6 +289,15 @@ Sums up the event counts for all hardware threads in a core, e.g.:
perf stat -e cpu/event=0,umask=0x3,percore=1/
+cpu:
+
+Specifies the CPU to open the event upon. The value may be repeated to
+specify opening the event on multiple CPUs:
+
+
+ perf stat -e instructions/cpu=0,cpu=2/,cycles/cpu=1,cpu=2/ -a sleep 1
+ perf stat -e data_read/cpu=0/,data_write/cpu=1/ -a sleep 1
+
EVENT GROUPS
------------
diff --git a/tools/perf/Documentation/perf-lock.txt b/tools/perf/Documentation/perf-lock.txt
index 859dc11a7372..c17b3e318169 100644
--- a/tools/perf/Documentation/perf-lock.txt
+++ b/tools/perf/Documentation/perf-lock.txt
@@ -216,6 +216,21 @@ CONTENTION OPTIONS
--cgroup-filter=<value>::
Show lock contention only in the given cgroups (comma separated list).
+-J::
+--inject-delay=<time@function>::
+ Add delays to the given lock. It's added to the contention-end part so
+ that the (new) owner of the lock will be delayed. But by slowing down
+ the owner, the waiters will also be delayed as well. This is working
+ only with -b/--use-bpf.
+
+ The 'time' is specified in nsec but it can have a unit suffix. Available
+ units are "ms", "us" and "ns". Currently it accepts up to 10ms of delays
+ for safety reasons.
+
+ Note that it will busy-wait after it gets the lock. Delaying locks can
+ have significant consequences including potential kernel crashes. Please
+ use it at your own risk.
+
SEE ALSO
--------
diff --git a/tools/perf/Documentation/perf-mem.txt b/tools/perf/Documentation/perf-mem.txt
index 8a1bd9ff0f86..965e73d37772 100644
--- a/tools/perf/Documentation/perf-mem.txt
+++ b/tools/perf/Documentation/perf-mem.txt
@@ -28,6 +28,8 @@ and kernel support is required. See linkperf:perf-arm-spe[1] for a setup guide.
Due to the statistical nature of SPE sampling, not every memory operation will
be sampled.
+On AMD this use IBS Op PMU to sample load-store operations.
+
COMMON OPTIONS
--------------
-f::
@@ -67,8 +69,15 @@ RECORD OPTIONS
Configure all used events to run in user space.
--ldlat <n>::
- Specify desired latency for loads event. Supported on Intel and Arm64
- processors only. Ignored on other archs.
+ Specify desired latency for loads event. Supported on Intel, Arm64 and
+ some AMD processors. Ignored on other archs.
+
+ On supported AMD processors:
+ - /sys/bus/event_source/devices/ibs_op/caps/ldlat file contains '1'.
+ - Supported latency values are 128 to 2048 (both inclusive).
+ - Latency value which is a multiple of 128 incurs a little less profiling
+ overhead compared to other values.
+ - Load latency filtering is disabled by default.
REPORT OPTIONS
--------------
@@ -128,6 +137,25 @@ REPORT OPTIONS
In addition, for report all perf report options are valid, and for record
all perf record options.
+OVERHEAD CALCULATION
+--------------------
+Unlike linkperf:perf-report[1], which calculates overhead from the actual
+sample period, perf-mem overhead is calculated using sample weight. E.g.
+there are two samples in perf.data file, both with the same sample period,
+but one sample with weight 180 and the other with weight 20:
+
+ $ perf script -F period,data_src,weight,ip,sym
+ 100000 629080842 |OP LOAD|LVL L3 hit|... 20 7e69b93ca524 strcmp
+ 100000 1a29081042 |OP LOAD|LVL RAM hit|... 180 ffffffff82429168 memcpy
+
+ $ perf report -F overhead,symbol
+ 50% [.] strcmp
+ 50% [k] memcpy
+
+ $ perf mem report -F overhead,symbol
+ 90% [k] memcpy
+ 10% [.] strcmp
+
SEE ALSO
--------
linkperf:perf-record[1], linkperf:perf-report[1], linkperf:perf-arm-spe[1]
diff --git a/tools/perf/Documentation/perf-record.txt b/tools/perf/Documentation/perf-record.txt
index c7fc1ba265e2..612612fa2d80 100644
--- a/tools/perf/Documentation/perf-record.txt
+++ b/tools/perf/Documentation/perf-record.txt
@@ -340,7 +340,7 @@ OPTIONS
-d::
--data::
- Record the sample virtual addresses.
+ Record the sample virtual addresses. Implies --sample-mem-info.
--phys-data::
Record the sample physical addresses.
@@ -368,6 +368,11 @@ OPTIONS
the sample_type member of the struct perf_event_attr argument to the
perf_event_open system call.
+--sample-mem-info::
+ Record the sample data source information for memory operations.
+ It requires hardware supports and may work on specific events only.
+ Please consider using 'perf mem record' instead if you're not sure.
+
-n::
--no-samples::
Don't sample.
@@ -837,6 +842,15 @@ filtered through the mask provided by -C option.
only, as of now. So the applications built without the frame
pointer might see bogus addresses.
+ off-cpu profiling consists two types of samples: direct samples, which
+ share the same behavior as regular samples, and the accumulated
+ samples, stored in BPF stack trace map, presented after all the regular
+ samples.
+
+--off-cpu-thresh::
+ Once a task's off-cpu time reaches this threshold (in milliseconds), it
+ generates a direct off-cpu sample. The default is 500ms.
+
--setup-filter=<action>::
Prepare BPF filter to be used by regular users. The action should be
either "pin" or "unpin". The filter can be used after it's pinned.
diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 3376c4710575..acef3ff4178e 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -94,6 +94,7 @@ OPTIONS
- comm: command (name) of the task which can be read via /proc/<pid>/comm
- pid: command and tid of the task
+ - tgid: command and tgid of the task
- dso: name of library or module executed at the time of sample
- dso_size: size of library or module executed at the time of sample
- symbol: name of function executed at the time of sample
diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt
index 2bc063672486..61d091670dee 100644
--- a/tools/perf/Documentation/perf-stat.txt
+++ b/tools/perf/Documentation/perf-stat.txt
@@ -506,6 +506,13 @@ this option is not set. The TPEBS hardware feature starts from Intel Granite
Rapids microarchitecture. This option only exists in X86_64 and is meaningful on
Intel platforms with TPEBS feature.
+--tpebs-mode=[mean|min|max|last]::
+Set how retirement latency events have their sample times
+combined. The default "mean" gives the average of retirement
+latency. "min" or "max" give the smallest or largest retirment latency
+times respectively. "last" uses the last retirment latency sample's
+time.
+
--td-level::
Print the top-down statistics that equal the input level. It allows
users to print the interested top-down metrics level instead of the
diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
index 887dc37773d0..c1fb6056a0d3 100644
--- a/tools/perf/Documentation/perf-trace.txt
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -152,7 +152,8 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
--summary-mode=mode::
To be used with -s or -S, to select how to show summary. By default it'll
- show the syscall summary by thread. Possible values are: thread, total.
+ show the syscall summary by thread. Possible values are: thread, total,
+ cgroup.
--tool_stats::
Show tool stats such as number of times fd->pathname was discovered thru
@@ -251,6 +252,12 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
pretty-printing serves as a fallback to hand-crafted pretty printers, as the latter can
better pretty-print integer flags and struct pointers.
+--bpf-summary::
+ Collect system call statistics in BPF. This is only for live mode and
+ works well with -s/--summary option where no argument information is
+ required.
+
+
PAGEFAULTS
----------
diff --git a/tools/perf/Documentation/perf.data-file-format.txt b/tools/perf/Documentation/perf.data-file-format.txt
index 010a4edcd384..cd95ba09f727 100644
--- a/tools/perf/Documentation/perf.data-file-format.txt
+++ b/tools/perf/Documentation/perf.data-file-format.txt
@@ -370,7 +370,7 @@ struct {
u32 mmap_len;
};
-Indicates that trace contains records of PERF_RECORD_COMPRESSED type
+Indicates that trace contains records of PERF_RECORD_COMPRESSED2 type
that have perf_events records in compressed form.
HEADER_CPU_PMU_CAPS = 28,
@@ -602,7 +602,14 @@ struct auxtrace_error_event {
Describes a header feature. These are records used in pipe-mode that
contain information that otherwise would be in perf.data file's header.
- PERF_RECORD_COMPRESSED = 81,
+ PERF_RECORD_COMPRESSED = 81, /* deprecated */
+
+The header is followed by compressed data frame that can be decompressed
+into array of perf trace records. The size of the entire compressed event
+record including the header is limited by the max value of header.size.
+
+It is deprecated and new files should use PERF_RECORD_COMPRESSED2 to gurantee
+8-byte alignment.
struct compressed_event {
struct perf_event_header header;
@@ -618,10 +625,17 @@ This is used, for instance, to 'perf inject' events after init and before
regular events, those emitted by the kernel, to support combining guest and
host records.
+ PERF_RECORD_COMPRESSED2 = 83,
-The header is followed by compressed data frame that can be decompressed
-into array of perf trace records. The size of the entire compressed event
-record including the header is limited by the max value of header.size.
+8-byte aligned version of `PERF_RECORD_COMPRESSED`. `header.size` indicates the
+total record size, including padding for 8-byte alignment, and `data_size`
+specifies the actual size of the compressed data.
+
+struct perf_record_compressed2 {
+ struct perf_event_header header;
+ __u64 data_size;
+ char data[];
+};
Event types