Merge tag 'perf-tools-for-v6.15-2025-03-27' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools

Pull perf tools updates from Namhyung Kim:
 "perf record:

   - Introduce latency profiling using scheduler information.

     The latency profiling is to show impacts on wall-time rather than
     cpu-time. By tracking context switches, it can weight samples and
     find which part of the code contributed more to the execution
     latency.

     The value (period) of the sample is weighted by dividing it by the
     number of parallel executions at that moment. The parallelism is
     tracked in perf report with sched-switch records. This will reduce
     the portions that are run in parallel and in turn increase the
     portion of serial executions.

     For now, it's limited to profile processes, IOW system-wide
     profiling is not supported. You can add --latency option to enable
     this.

       $ perf record --latency -- make -C tools/perf

     I've run the above command for perf build which adds -j option to
     make with the number of CPUs in the system internally. Normally
     it'd show something like below:

       $ perf report -F overhead,comm
       ...
       #
       # Overhead  Command
       # ........  ...............
       #
           78.97%  cc1
            6.54%  python3
            4.21%  shellcheck
            3.28%  ld
            1.80%  as
            1.37%  cc1plus
            0.80%  sh
            0.62%  clang
            0.56%  gcc
            0.44%  perl
            0.39%  make
  	 ...

     The cc1 takes around 80% of the overhead as it's the actual
     compiler. However it runs in parallel so its contribution to
     latency may be less than that. Now, perf report will show both
     overhead and latency (if --latency was given at record time) like
     below:

       $ perf report -s comm
       ...
       #
       # Overhead   Latency  Command
       # ........  ........  ...............
       #
           78.97%    48.66%  cc1
            6.54%    25.68%  python3
            4.21%     0.39%  shellcheck
            3.28%    13.70%  ld
            1.80%     2.56%  as
            1.37%     3.08%  cc1plus
            0.80%     0.98%  sh
            0.62%     0.61%  clang
            0.56%     0.33%  gcc
            0.44%     1.71%  perl
            0.39%     0.83%  make
  	 ...

     You can see latency of cc1 goes down to around 50% and python3 and
     ld contribute a lot more than their overhead. You can use --latency
     option in perf report to get the same result but ordered by
     latency.

       $ perf report --latency -s comm

  perf report:

   - As a side effect of the latency profiling work, it adds a new
     output field 'latency' and a sort key 'parallelism'. The below is a
     result from my system with 64 CPUs. The build was well-parallelized
     but contained some serial portions.

       $ perf report -s parallelism
       ...
       #
       # Overhead   Latency  Parallelism
       # ........  ........  ...........
       #
           16.95%     1.54%           62
           13.38%     1.24%           61
           12.50%    70.47%            1
           11.81%     1.06%           63
            7.59%     0.71%           60
            4.33%    12.20%            2
            3.41%     0.33%           59
            2.05%     0.18%           64
            1.75%     1.09%            9
            1.64%     1.85%            5
            ...

   - Support Fedora mini-debuginfo which is an LZMA compressed symbol
     table inside the ".gnu_debugdata" ELF section.

  perf annotate:

   - Add --code-with-type option to enable data-type profiling with the
     usual annotate output.

     Instead of focusing on data structure, it shows code annotation
     together with data type it accesses in case the instruction refers
     to a memory location (and it was able to resolve the target data
     type). Currently it only works with --stdio.

       $ perf annotate --stdio --code-with-type
       ...
        Percent |      Source code & Disassembly of vmlinux for cpu/mem-loads,ldlat=30/pp (18 samples, percent: local period)
       ----------------------------------------------------------------------------------------------------------------------
                : 0                0xffffffff81050610 <__fdget>:
           0.00 :   ffffffff81050610:        callq   0xffffffff81c01b80 <__fentry__>           # data-type: (stack operation)
           0.00 :   ffffffff81050615:        pushq   %rbp              # data-type: (stack operation)
           0.00 :   ffffffff81050616:        movq    %rsp, %rbp
           0.00 :   ffffffff81050619:        pushq   %r15              # data-type: (stack operation)
           0.00 :   ffffffff8105061b:        pushq   %r14              # data-type: (stack operation)
           0.00 :   ffffffff8105061d:        pushq   %rbx              # data-type: (stack operation)
           0.00 :   ffffffff8105061e:        subq    $0x10, %rsp
           0.00 :   ffffffff81050622:        movl    %edi, %ebx
           0.00 :   ffffffff81050624:        movq    %gs:0x7efc4814(%rip), %rax  # 0x14e40 <current_task>              # data-type: struct task_struct* +0
           0.00 :   ffffffff8105062c:        movq    0x8d0(%rax), %r14         # data-type: struct task_struct +0x8d0 (files)
           0.00 :   ffffffff81050633:        movl    (%r14), %eax              # data-type: struct files_struct +0 (count.counter)
           0.00 :   ffffffff81050636:        cmpl    $0x1, %eax
           0.00 :   ffffffff81050639:        je      0xffffffff810506a9 <__fdget+0x99>
           0.00 :   ffffffff8105063b:        movq    0x20(%r14), %rcx          # data-type: struct files_struct +0x20 (fdt)
           0.00 :   ffffffff8105063f:        movl    (%rcx), %eax              # data-type: struct fdtable +0 (max_fds)
           0.00 :   ffffffff81050641:        cmpl    %ebx, %eax
           0.00 :   ffffffff81050643:        jbe     0xffffffff810506ef <__fdget+0xdf>
           0.00 :   ffffffff81050649:        movl    %ebx, %r15d
           5.56 :   ffffffff8105064c:        movq    0x8(%rcx), %rdx           # data-type: struct fdtable +0x8 (fd)
  	...

     The "# data-type:" part was added with this change. The first few
     entries are not very interesting. But later you can see it accesses a
     couple of fields in the task_struct, files_struct and fdtable.

  perf trace:

   - Support syscall tracing for different ABI. For example it can trace
     system calls for 32-bit applications on 64-bit kernel
     transparently.

   - Add --summary-mode=total option to show global syscall summary. The
     default is 'thread' to show per-thread syscall summary.

  Python support:

   - Add more interfaces to 'perf' module to parse events, and config,
     enable or disable the event list properly so that it can implement
     basic functionalities purely in Python. There is an example code
     for these new interfaces in python/tracepoint.py.

   - Add mypy and pylint support to enable build time checking. Fix some
     code based on the findings from these tools.

  Internals:

   - Introduce io_dir__readdir() API to make directory traversal (usually
     for proc or sysfs) efficient with less memory footprint.

  JSON vendor events:

   - Add events and metrics for ARM Neoverse N3 and V3

   - Update events and metrics on various Intel CPUs

   - Add/update events for a number of SiFive processors"

* tag 'perf-tools-for-v6.15-2025-03-27' of git://git.kernel.org/pub/scm/linux/kernel/git/perf/perf-tools: (229 commits)
  perf bpf-filter: Fix a parsing error with comma
  perf report: Fix a memory leak for perf_env on AMD
  perf trace: Fix wrong size to bpf_map__update_elem call
  perf tools: annotate asm_pure_loop.S
  perf python: Fix setup.py mypy errors
  perf test: Address attr.py mypy error
  perf build: Add pylint build tests
  perf build: Add mypy build tests
  perf build: Rename TEST_LOGS to SHELL_TEST_LOGS
  tools/build: Don't pass test log files to linker
  perf bench sched pipe: fix enforced blocking reads in worker_thread
  perf tools: Fix is_compat_mode build break in ppc64
  perf build: filter all combinations of -flto for libperl
  perf vendor events arm64 AmpereOneX: Fix frontend_bound calculation
  perf vendor events arm64: AmpereOne/AmpereOneX: Mark LD_RETIRED impacted by errata
  perf trace: Fix evlist memory leak
  perf trace: Fix BTF memory leak
  perf trace: Make syscall table stable
  perf syscalltbl: Mask off ABI type for MIPS system calls
  perf build: Remove Makefile.syscalls
  ...
This commit is contained in:
Linus Torvalds
2025-03-31 08:52:33 -07:00
519 changed files with 39203 additions and 9731 deletions

View File

@@ -13,7 +13,7 @@
#endif
#include "../include/asm/inat.h" /* __ignore_sync_check__ */
#include "../include/asm/insn.h" /* __ignore_sync_check__ */
#include "../include/linux/unaligned.h" /* __ignore_sync_check__ */
#include <linux/unaligned.h> /* __ignore_sync_check__ */
#include <linux/errno.h>
#include <linux/kconfig.h>

View File

@@ -129,6 +129,10 @@ objprefix := $(subst ./,,$(OUTPUT)$(dir)/)
obj-y := $(addprefix $(objprefix),$(obj-y))
subdir-obj-y := $(addprefix $(objprefix),$(subdir-obj-y))
# Separate out test log files from real build objects.
test-y := $(filter %_log, $(obj-y))
obj-y := $(filter-out %_log, $(obj-y))
# Final '$(obj)-in.o' object
in-target := $(objprefix)$(obj)-in.o
@@ -139,7 +143,7 @@ $(subdir-y):
$(sort $(subdir-obj-y)): $(subdir-y) ;
$(in-target): $(obj-y) FORCE
$(in-target): $(obj-y) $(test-y) FORCE
$(call rule_mkdir)
$(call if_changed,$(host)ld_multi)

View File

@@ -5,7 +5,7 @@
int main(void)
{
void *backtrace_fns[10];
size_t entries;
int entries;
entries = backtrace(backtrace_fns, 10);
backtrace_symbols_fd(backtrace_fns, entries, 1);

View File

@@ -44,5 +44,5 @@ int main(void)
* Test existence of __NR_bpf and BPF_PROG_LOAD.
* This call should fail if we run the testcase.
*/
return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)) == 0;
}

View File

@@ -16,5 +16,5 @@ int main(void)
const char *version = XSTR(__GLIBC__) "." XSTR(__GLIBC_MINOR__);
#endif
return (long)version;
return version == NULL;
}

View File

@@ -4,5 +4,5 @@
int main(void)
{
debuginfod_client* c = debuginfod_begin();
return (long)c;
return !!c;
}

View File

@@ -9,7 +9,7 @@ int test_libdw(void)
{
Dwarf *dbg = dwarf_begin(0, DWARF_C_READ);
return (long)dbg;
return dbg == NULL;
}
int test_libdw_unwind(void)

View File

@@ -4,5 +4,5 @@
int main(void)
{
return gelf_getnote(NULL, 0, NULL, NULL, NULL);
return gelf_getnote(NULL, 0, NULL, NULL, NULL) == 0;
}

View File

@@ -5,5 +5,5 @@ int main(void)
{
Elf *elf = elf_begin(0, ELF_C_READ, 0);
return (long)elf;
return !!elf;
}

View File

@@ -4,7 +4,7 @@
int main(void)
{
lzma_stream strm = LZMA_STREAM_INIT;
int ret;
lzma_ret ret;
ret = lzma_stream_decoder(&strm, UINT64_MAX, LZMA_CONCATENATED);
return ret ? -1 : 0;

View File

@@ -95,7 +95,7 @@ install_lib: $(LIBFILE)
$(call do_install_mkdir,$(libdir_SQ)); \
cp -fpR $(LIBFILE) $(DESTDIR)$(libdir_SQ)
HDRS := cpu.h debug.h io.h
HDRS := cpu.h debug.h io.h io_dir.h
FD_HDRS := fd/array.h
FS_HDRS := fs/fs.h fs/tracing_path.h
INSTALL_HDRS_PFX := $(DESTDIR)$(prefix)/include/api

105
tools/lib/api/io_dir.h Normal file
View File

@@ -0,0 +1,105 @@
/* SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause) */
/*
* Lightweight directory reading library.
*/
#ifndef __API_IO_DIR__
#define __API_IO_DIR__
#include <dirent.h>
#include <fcntl.h>
#include <stdbool.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <linux/limits.h>
/*
 * Fallback definition of the getdents64 system call number, used only when
 * the headers included above did not already provide SYS_getdents64.
 * The value is picked from the compiler's predefined architecture macros;
 * any architecture not listed here falls through to 61.
 *
 * NOTE(review): the __mips__ value looks like it lacks the per-ABI base
 * that Linux/MIPS system call numbers normally carry - confirm.
 */
#if !defined(SYS_getdents64)
#if defined(__x86_64__) || defined(__arm__)
#define SYS_getdents64 217
#elif defined(__i386__) || defined(__s390x__) || defined(__sh__)
#define SYS_getdents64 220
#elif defined(__alpha__)
#define SYS_getdents64 377
#elif defined(__mips__)
#define SYS_getdents64 308
#elif defined(__powerpc64__) || defined(__powerpc__)
#define SYS_getdents64 202
#elif defined(__sparc64__) || defined(__sparc__)
#define SYS_getdents64 154
#elif defined(__xtensa__)
#define SYS_getdents64 60
#else
#define SYS_getdents64 61
#endif
#endif /* !defined(SYS_getdents64) */
/*
 * Issue the getdents64 system call directly through syscall().
 *
 * @fd:    descriptor handed to the system call
 * @dirp:  buffer handed to the system call
 * @count: size of @dirp in bytes
 *
 * Returns the value of syscall() unchanged (converted to ssize_t).
 */
static inline ssize_t perf_getdents64(int fd, void *dirp, size_t count)
{
#ifdef MEMORY_SANITIZER
	/*
	 * Clear the whole buffer before the kernel fills it; presumably this
	 * keeps the sanitizer from flagging the bytes written by the raw
	 * system call as uninitialized. memset() needs <string.h>.
	 */
	memset(dirp, 0, count);
#endif
	return syscall(SYS_getdents64, fd, dirp, count);
}
/*
 * One directory record as it is read out of the buffer that
 * perf_getdents64() fills. Records in that buffer are d_reclen bytes
 * apart (see io_dir__readdir()), so consecutive records are not
 * sizeof(struct io_dirent64) apart even though d_name is declared with
 * room for NAME_MAX + 1 bytes.
 */
struct io_dirent64 {
ino64_t d_ino; /* 64-bit inode number */
off64_t d_off; /* 64-bit offset to next structure */
unsigned short d_reclen; /* Size of this dirent */
unsigned char d_type; /* File type */
char d_name[NAME_MAX + 1]; /* Filename (null-terminated) */
};
/*
 * Reader state for iterating over a directory through a file descriptor.
 * Set up with io_dir__init() and advanced with io_dir__readdir().
 */
struct io_dir {
int dirfd; /* Descriptor used for getdents64, lseek and fstatat */
ssize_t available_bytes; /* Bytes of buff not yet handed out; <= 0 triggers a refill */
struct io_dirent64 *next; /* Next record to return; not set by io_dir__init(), only by a refill */
struct io_dirent64 buff[4]; /* Storage passed to perf_getdents64() as sizeof(buff) raw bytes */
};
/*
 * Prepare @iod for reading the directory referred to by @dirfd.
 *
 * The descriptor is only stored; nothing here opens or closes it. The
 * buffer is marked empty so that the first io_dir__readdir() call fetches
 * entries.
 */
static inline void io_dir__init(struct io_dir *iod, int dirfd)
{
	/* No buffered entries yet. */
	iod->available_bytes = 0;
	iod->dirfd = dirfd;
}
/*
 * Restart iteration from the beginning of the directory.
 *
 * Seeks the descriptor back to offset 0 and discards whatever entries are
 * still buffered. The result of lseek() is not checked.
 */
static inline void io_dir__rewinddir(struct io_dir *iod)
{
	const int fd = iod->dirfd;

	lseek(fd, 0, SEEK_SET);
	/* Force the next io_dir__readdir() call to refill the buffer. */
	iod->available_bytes = 0;
}
/*
 * Return the next directory entry, or NULL when perf_getdents64() reports
 * no more data or an error (a result <= 0).
 *
 * The returned pointer refers into iod->buff, so it is overwritten by a
 * later call that refills the buffer.
 */
static inline struct io_dirent64 *io_dir__readdir(struct io_dir *iod)
{
	struct io_dirent64 *cur;
	unsigned short reclen;

	/* Buffer exhausted: fetch another batch of records. */
	if (iod->available_bytes <= 0) {
		ssize_t nread = perf_getdents64(iod->dirfd, iod->buff, sizeof(iod->buff));

		if (nread <= 0)
			return NULL;

		iod->next = iod->buff;
		iod->available_bytes = nread;
	}

	cur = iod->next;
	reclen = cur->d_reclen;

	/* Records are variable length: step by this record's own size. */
	iod->available_bytes -= reclen;
	iod->next = (struct io_dirent64 *)((char *)cur + reclen);

	return cur;
}
/*
 * Tell whether @dent, an entry read from @iod, is a directory.
 *
 * When the entry's d_type is known it is used directly. When it is
 * DT_UNKNOWN the entry is looked up with fstatat() relative to iod->dirfd
 * (flags 0); if that shows a directory, d_type is updated to DT_DIR so
 * later calls skip the lookup. A failing fstatat() yields false.
 */
static inline bool io_dir__is_dir(const struct io_dir *iod, struct io_dirent64 *dent)
{
	struct stat st;

	if (dent->d_type != DT_UNKNOWN)
		return dent->d_type == DT_DIR;

	if (fstatat(iod->dirfd, dent->d_name, &st, /*flags=*/0) != 0)
		return false;

	if (!S_ISDIR(st.st_mode))
		return false;

	/* Cache the answer in the entry itself. */
	dent->d_type = DT_DIR;
	return true;
}
#endif /* __API_IO_DIR__ */

View File

@@ -41,13 +41,6 @@ libdir_relative_SQ = $(subst ','\'',$(libdir_relative))
TEST_ARGS := $(if $(V),-v)
# Set compile option CFLAGS
ifdef EXTRA_CFLAGS
CFLAGS := $(EXTRA_CFLAGS)
else
CFLAGS := -g -Wall
endif
INCLUDES = \
-I$(srctree)/tools/lib/perf/include \
-I$(srctree)/tools/lib/ \
@@ -57,11 +50,12 @@ INCLUDES = \
-I$(srctree)/tools/include/uapi
# Append required CFLAGS
override CFLAGS += $(EXTRA_WARNINGS)
override CFLAGS += -Werror -Wall
override CFLAGS += -g -Werror -Wall
override CFLAGS += -fPIC
override CFLAGS += $(INCLUDES)
override CFLAGS += -fvisibility=hidden
override CFLAGS += $(EXTRA_WARNINGS)
override CFLAGS += $(EXTRA_CFLAGS)
all:

View File

@@ -185,7 +185,7 @@ struct perf_cpu_map *perf_cpu_map__new(const char *cpu_list)
while (isdigit(*cpu_list)) {
p = NULL;
start_cpu = strtoul(cpu_list, &p, 0);
if (start_cpu >= INT_MAX
if (start_cpu >= INT16_MAX
|| (*p != '\0' && *p != ',' && *p != '-' && *p != '\n'))
goto invalid;
@@ -194,7 +194,7 @@ struct perf_cpu_map *perf_cpu_map__new(const char *cpu_list)
p = NULL;
end_cpu = strtoul(cpu_list, &p, 0);
if (end_cpu >= INT_MAX || (*p != '\0' && *p != ',' && *p != '\n'))
if (end_cpu >= INT16_MAX || (*p != '\0' && *p != ',' && *p != '\n'))
goto invalid;
if (end_cpu < start_cpu)
@@ -209,7 +209,7 @@ struct perf_cpu_map *perf_cpu_map__new(const char *cpu_list)
for (; start_cpu <= end_cpu; start_cpu++) {
/* check for duplicates */
for (i = 0; i < nr_cpus; i++)
if (tmp_cpus[i].cpu == (int)start_cpu)
if (tmp_cpus[i].cpu == (int16_t)start_cpu)
goto invalid;
if (nr_cpus == max_entries) {
@@ -219,7 +219,7 @@ struct perf_cpu_map *perf_cpu_map__new(const char *cpu_list)
goto invalid;
tmp_cpus = tmp;
}
tmp_cpus[nr_cpus++].cpu = (int)start_cpu;
tmp_cpus[nr_cpus++].cpu = (int16_t)start_cpu;
}
if (*p)
++p;

View File

@@ -4,10 +4,11 @@
#include <perf/core.h>
#include <stdbool.h>
#include <stdint.h>
/** A wrapper around a CPU to avoid confusion with the perf_cpu_map's map's indices. */
struct perf_cpu {
int cpu;
int16_t cpu;
};
struct perf_cache {

View File

@@ -65,14 +65,40 @@ gtk-y += ui/gtk/
ifdef SHELLCHECK
SHELL_TESTS := $(wildcard *.sh)
TEST_LOGS := $(SHELL_TESTS:%=%.shellcheck_log)
SHELL_TEST_LOGS := $(SHELL_TESTS:%=%.shellcheck_log)
else
SHELL_TESTS :=
TEST_LOGS :=
SHELL_TEST_LOGS :=
endif
$(OUTPUT)%.shellcheck_log: %
$(call rule_mkdir)
$(Q)$(call echo-cmd,test)shellcheck -s bash -a -S warning "$<" > $@ || (cat $@ && rm $@ && false)
perf-y += $(TEST_LOGS)
perf-y += $(SHELL_TEST_LOGS)
ifdef MYPY
PY_TESTS := $(shell find python -type f -name '*.py')
MYPY_TEST_LOGS := $(PY_TESTS:python/%=python/%.mypy_log)
else
MYPY_TEST_LOGS :=
endif
$(OUTPUT)%.mypy_log: %
$(call rule_mkdir)
$(Q)$(call echo-cmd,test)mypy "$<" > $@ || (cat $@ && rm $@ && false)
perf-y += $(MYPY_TEST_LOGS)
ifdef PYLINT
PY_TESTS := $(shell find python -type f -name '*.py')
PYLINT_TEST_LOGS := $(PY_TESTS:python/%=python/%.pylint_log)
else
PYLINT_TEST_LOGS :=
endif
$(OUTPUT)%.pylint_log: %
$(call rule_mkdir)
$(Q)$(call echo-cmd,test)pylint "$<" > $@ || (cat $@ && rm $@ && false)
perf-y += $(PYLINT_TEST_LOGS)

View File

@@ -1,7 +1,8 @@
Overhead calculation
--------------------
The overhead can be shown in two columns as 'Children' and 'Self' when
perf collects callchains. The 'self' overhead is simply calculated by
The CPU overhead can be shown in two columns as 'Children' and 'Self'
when perf collects callchains (and corresponding 'Wall' columns for
wall-clock overhead). The 'self' overhead is simply calculated by
adding all period values of the entry - usually a function (symbol).
This is the value that perf shows traditionally and sum of all the
'self' overhead values should be 100%.

View File

@@ -0,0 +1,85 @@
CPU and latency overheads
-------------------------
There are two notions of time: wall-clock time and CPU time.
For a single-threaded program, or a program running on a single-core machine,
these notions are the same. However, for a multi-threaded/multi-process program
running on a multi-core machine, these notions are significantly different.
Each second of wall-clock time we have number-of-cores seconds of CPU time.
Perf can measure overhead for both of these times (shown in 'overhead' and
'latency' columns for CPU and wall-clock time correspondingly).
Optimizing CPU overhead is useful to improve 'throughput', while optimizing
latency overhead is useful to improve 'latency'. It's important to understand
which one is useful in a concrete situation at hand. For example, the former
may be useful to improve max throughput of a CI build server that runs on 100%
CPU utilization, while the latter may be useful to improve user-perceived
latency of a single interactive program build.
These overheads may be significantly different in some cases. For example,
consider a program that executes function 'foo' for 9 seconds with 1 thread,
and then executes function 'bar' for 1 second with 128 threads (consumes
128 seconds of CPU time). The CPU overhead is: 'foo' - 6.6%, 'bar' - 93.4%.
While the latency overhead is: 'foo' - 90%, 'bar' - 10%. If we try to optimize
running time of the program looking at the (wrong in this case) CPU overhead,
we would concentrate on the function 'bar', but it can yield only 10% running
time improvement at best.
By default, perf shows only CPU overhead. To show latency overhead, use
'perf record --latency' and 'perf report':
-----------------------------------
Overhead Latency Command
93.88% 25.79% cc1
1.90% 39.87% gzip
0.99% 10.16% dpkg-deb
0.57% 1.00% as
0.40% 0.46% sh
-----------------------------------
To sort by latency overhead, use 'perf report --latency':
-----------------------------------
Latency Overhead Command
39.87% 1.90% gzip
25.79% 93.88% cc1
10.16% 0.99% dpkg-deb
4.17% 0.29% git
2.81% 0.11% objtool
-----------------------------------
To get insight into the difference between the overheads, you may check
parallelization histogram with '--sort=latency,parallelism,comm,symbol --hierarchy'
flags. It shows fraction of (wall-clock) time the workload utilizes different
numbers of cores ('Parallelism' column). For example, in the following case
the workload utilizes only 1 core most of the time, but also has some
highly-parallel phases, which explains significant difference between
CPU and wall-clock overheads:
-----------------------------------
Latency Overhead Parallelism / Command / Symbol
+ 56.98% 2.29% 1
+ 16.94% 1.36% 2
+ 4.00% 20.13% 125
+ 3.66% 18.25% 124
+ 3.48% 17.66% 126
+ 3.26% 0.39% 3
+ 2.61% 12.93% 123
-----------------------------------
By expanding corresponding lines, you may see what commands/functions run
at the given parallelism level:
-----------------------------------
Latency Overhead Parallelism / Command / Symbol
- 56.98% 2.29% 1
32.80% 1.32% gzip
4.46% 0.18% cc1
2.81% 0.11% objtool
2.43% 0.10% dpkg-source
2.22% 0.09% ld
2.10% 0.08% dpkg-genchanges
-----------------------------------
To see the normal function-level profile for particular parallelism levels
(number of threads actively running on CPUs), you may use '--parallelism'
filter. For example, to see the profile only for low parallelism phases
of a workload use '--latency --parallelism=1-2' flags.

View File

@@ -8,15 +8,15 @@ Part of events are available on core cpu, part of events are available
on atom cpu and even part of events are available on both.
Kernel exports two new cpu pmus via sysfs:
/sys/devices/cpu_core
/sys/devices/cpu_atom
/sys/bus/event_source/devices/cpu_core
/sys/bus/event_source/devices/cpu_atom
The 'cpus' files are created under the directories. For example,
cat /sys/devices/cpu_core/cpus
cat /sys/bus/event_source/devices/cpu_core/cpus
0-15
cat /sys/devices/cpu_atom/cpus
cat /sys/bus/event_source/devices/cpu_atom/cpus
16-23
It indicates cpu0-cpu15 are core cpus and cpu16-cpu23 are atom cpus.
@@ -60,8 +60,8 @@ can't carry pmu information. So now this type is extended to be PMU aware
type. The PMU type ID is stored at attr.config[63:32].
PMU type ID is retrieved from sysfs.
/sys/devices/cpu_atom/type
/sys/devices/cpu_core/type
/sys/bus/event_source/devices/cpu_atom/type
/sys/bus/event_source/devices/cpu_core/type
The new attr.config layout for PERF_TYPE_HARDWARE:

View File

@@ -168,6 +168,10 @@ include::itrace.txt[]
--skip-empty::
Do not display empty (or dummy) events.
--code-with-type::
Show data type info in code annotation (for memory instructions only).
Currently it only works with --stdio option.
SEE ALSO
--------

View File

@@ -27,7 +27,7 @@ Don't print descriptions.
-v::
--long-desc::
Print longer event descriptions.
Print longer event descriptions and all similar PMUs with alphanumeric suffixes.
--debug::
Enable debugging output.
@@ -188,7 +188,7 @@ in the CPU vendor specific documentation.
The available PMUs and their raw parameters can be listed with
ls /sys/devices/*/format
ls /sys/bus/event_source/devices/*/format
For example the raw event "LSD.UOPS" core pmu event above could
be specified as

View File

@@ -179,8 +179,9 @@ CONTENTION OPTIONS
-o::
--lock-owner::
Show lock contention stat by owners. Implies --threads and
requires --use-bpf.
Show lock contention stat by owners. This option can be combined with -t,
which shows owner's per thread lock stats, or -v, which shows owner's
stacktrace. Requires --use-bpf.
-Y::
--type-filter=<value>::

View File

@@ -227,6 +227,10 @@ OPTIONS
'--filter' exists, the new filter expression will be combined with
them by '&&'.
--latency::
Enable data collection for latency profiling.
Use perf report --latency for latency-centric profile.
-a::
--all-cpus::
System-wide collection from all CPUs (default if no target is specified).

View File

@@ -44,7 +44,7 @@ OPTIONS
--comms=::
Only consider symbols in these comms. CSV that understands
file://filename entries. This option will affect the percentage of
the overhead column. See --percentage for more info.
the overhead and latency columns. See --percentage for more info.
--pid=::
Only show events for given process ID (comma separated list).
@@ -54,12 +54,12 @@ OPTIONS
--dsos=::
Only consider symbols in these dsos. CSV that understands
file://filename entries. This option will affect the percentage of
the overhead column. See --percentage for more info.
the overhead and latency columns. See --percentage for more info.
-S::
--symbols=::
Only consider these symbols. CSV that understands
file://filename entries. This option will affect the percentage of
the overhead column. See --percentage for more info.
the overhead and latency columns. See --percentage for more info.
--symbol-filter=::
Only show symbols that match (partially) with this filter.
@@ -68,6 +68,21 @@ OPTIONS
--hide-unresolved::
Only display entries resolved to a symbol.
--parallelism::
Only consider these parallelism levels. Parallelism level is the number
of threads that actively run on CPUs at the time of sample. The flag
accepts single number, comma-separated list, and ranges (for example:
"1", "7,8", "1,64-128"). This is useful in understanding what a program
is doing during sequential/low-parallelism phases as compared to
high-parallelism phases. This option will affect the percentage of
the overhead and latency columns. See --percentage for more info.
Also see the `CPU and latency overheads' section for more details.
--latency::
Show latency-centric profile rather than the default
CPU-consumption-centric profile
(requires perf record --latency flag).
-s::
--sort=::
Sort histogram entries by given key(s) - multiple keys can be specified
@@ -87,6 +102,7 @@ OPTIONS
entries are displayed as "[other]".
- cpu: cpu number the task ran at the time of sample
- socket: processor socket number the task ran at the time of sample
- parallelism: number of running threads at the time of sample
- srcline: filename and line number executed at the time of sample. The
DWARF debugging info must be provided.
- srcfile: file name of the source file of the samples. Requires dwarf
@@ -97,12 +113,14 @@ OPTIONS
- cgroup_id: ID derived from cgroup namespace device and inode numbers.
- cgroup: cgroup pathname in the cgroupfs.
- transaction: Transaction abort flags.
- overhead: Overhead percentage of sample
- overhead_sys: Overhead percentage of sample running in system mode
- overhead_us: Overhead percentage of sample running in user mode
- overhead_guest_sys: Overhead percentage of sample running in system mode
- overhead: CPU overhead percentage of sample.
- latency: latency (wall-clock) overhead percentage of sample.
See the `CPU and latency overheads' section for more details.
- overhead_sys: CPU overhead percentage of sample running in system mode
- overhead_us: CPU overhead percentage of sample running in user mode
- overhead_guest_sys: CPU overhead percentage of sample running in system mode
on guest machine
- overhead_guest_us: Overhead percentage of sample running in user mode on
- overhead_guest_us: CPU overhead percentage of sample running in user mode on
guest machine
- sample: Number of sample
- period: Raw number of event count of sample
@@ -125,8 +143,8 @@ OPTIONS
- weight2: Average value of event specific weight (2nd field of weight_struct).
- weight3: Average value of event specific weight (3rd field of weight_struct).
By default, comm, dso and symbol keys are used.
(i.e. --sort comm,dso,symbol)
By default, overhead, comm, dso and symbol keys are used.
(i.e. --sort overhead,comm,dso,symbol).
If --branch-stack option is used, following sort keys are also
available:
@@ -201,9 +219,9 @@ OPTIONS
--fields=::
Specify output field - multiple keys can be specified in CSV format.
Following fields are available:
overhead, overhead_sys, overhead_us, overhead_children, sample, period,
weight1, weight2, weight3, ins_lat, p_stage_cyc and retire_lat. The
last 3 names are alias for the corresponding weights. When the weight
overhead, latency, overhead_sys, overhead_us, overhead_children, sample,
period, weight1, weight2, weight3, ins_lat, p_stage_cyc and retire_lat.
The last 3 names are alias for the corresponding weights. When the weight
fields are used, they will show the average value of the weight.
Also it can contain any sort key(s).
@@ -289,7 +307,7 @@ OPTIONS
Accumulate callchain of children to parent entry so that then can
show up in the output. The output will have a new "Children" column
and will be sorted on the data. It requires callchains are recorded.
See the `overhead calculation' section for more details. Enabled by
See the `Overhead calculation' section for more details. Enabled by
default, disable with --no-children.
--max-stack::
@@ -442,9 +460,9 @@ OPTIONS
--call-graph option for details.
--percentage::
Determine how to display the overhead percentage of filtered entries.
Filters can be applied by --comms, --dsos and/or --symbols options and
Zoom operations on the TUI (thread, dso, etc).
Determine how to display the CPU and latency overhead percentage
of filtered entries. Filters can be applied by --comms, --dsos, --symbols
and/or --parallelism options and Zoom operations on the TUI (thread, dso, etc).
"relative" means it's relative to filtered entries only so that the
sum of shown entries will be always 100%. "absolute" means it retains
@@ -627,6 +645,8 @@ include::itrace.txt[]
--skip-empty::
Do not print 0 results in the --stat output.
include::cpu-and-latency-overheads.txt[]
include::callchain-overhead-calculation.txt[]
SEE ALSO

View File

@@ -239,13 +239,22 @@ OPTIONS
i.e., -F "" is not allowed.
The brstack output includes branch related information with raw addresses using the
/v/v/v/v/cycles syntax in the following order:
FROM: branch source instruction
TO : branch target instruction
M/P/-: M=branch target mispredicted or branch direction was mispredicted, P=target predicted or direction predicted, -=not supported
X/- : X=branch inside a transactional region, -=not in transaction region or not supported
A/- : A=TSX abort entry, -=not aborted region or not supported
cycles
FROM/TO/EVENT/INTX/ABORT/CYCLES/TYPE/SPEC syntax in the following order:
FROM : branch source instruction
TO : branch target instruction
EVENT : M=branch target or direction was mispredicted
P=branch target or direction was predicted
N=branch not-taken
-=no event or not supported
INTX : X=branch inside a transactional region
-=branch not in transaction region or not supported
ABORT : A=TSX abort entry
-=not aborted region or not supported
CYCLES: the number of cycles that have elapsed since the last branch was recorded
TYPE : branch type: COND/UNCOND/IND/CALL/IND_CALL/RET etc.
-=not supported
SPEC : branch speculation info: SPEC_WRONG_PATH/NON_SPEC_CORRECT_PATH/SPEC_CORRECT_PATH
-=not supported
The brstacksym is identical to brstack, except that the FROM and TO addresses are printed in a symbolic form if possible.

View File

@@ -150,6 +150,10 @@ the thread executes on the designated CPUs. Default is to monitor all CPUs.
To be used with -s or -S, to show stats for the errnos experienced by
syscalls, using only this option will trigger --summary.
--summary-mode=mode::
To be used with -s or -S, to select how to show summary. By default it'll
show the syscall summary by thread. Possible values are: thread, total.
--tool_stats::
Show tool stats such as number of times fd->pathname was discovered thru
hooking the open syscall return + vfs_getname or via reading /proc/pid/fd, etc.

View File

@@ -62,3 +62,7 @@ To show context switches in perf report sample context add --switch-events to pe
To show time in nanoseconds in record/report add --ns
To compare hot regions in two workloads use perf record -b -o file ... ; perf diff --stream file1 file2
To compare scalability of two workload samples use perf diff -c ratio file1 file2
For latency profiling, try: perf record/report --latency
For parallelism histogram, try: perf report --hierarchy --sort latency,parallelism,comm,symbol
To analyze particular parallelism levels, try: perf report --latency --parallelism=32-64
To see how parallelism changes over time, try: perf report -F time,latency,parallelism --time-quantum=1s

View File

@@ -497,13 +497,14 @@ ifeq ($(feature-setns), 1)
$(call detected,CONFIG_SETNS)
endif
ifeq ($(feature-reallocarray), 0)
CFLAGS += -DCOMPAT_NEED_REALLOCARRAY
endif
ifdef CORESIGHT
$(call feature_check,libopencsd)
ifeq ($(feature-libopencsd), 1)
CFLAGS += -DHAVE_CSTRACE_SUPPORT $(LIBOPENCSD_CFLAGS)
ifeq ($(feature-reallocarray), 0)
CFLAGS += -DCOMPAT_NEED_REALLOCARRAY
endif
LDFLAGS += $(LIBOPENCSD_LDFLAGS)
EXTLIBS += $(OPENCSDLIBS)
$(call detected,CONFIG_LIBOPENCSD)
@@ -820,7 +821,7 @@ else
PERL_EMBED_LIBADD = $(call grep-libs,$(PERL_EMBED_LDOPTS))
PERL_EMBED_CCOPTS = $(shell perl -MExtUtils::Embed -e ccopts 2>/dev/null)
PERL_EMBED_CCOPTS := $(filter-out -specs=%,$(PERL_EMBED_CCOPTS))
PERL_EMBED_CCOPTS := $(filter-out -flto=auto -ffat-lto-objects, $(PERL_EMBED_CCOPTS))
PERL_EMBED_CCOPTS := $(filter-out -flto% -ffat-lto-objects, $(PERL_EMBED_CCOPTS))
PERL_EMBED_LDOPTS := $(filter-out -specs=%,$(PERL_EMBED_LDOPTS))
FLAGS_PERL_EMBED=$(PERL_EMBED_CCOPTS) $(PERL_EMBED_LDOPTS)
@@ -1103,9 +1104,6 @@ ifndef NO_AUXTRACE
ifndef NO_AUXTRACE
$(call detected,CONFIG_AUXTRACE)
CFLAGS += -DHAVE_AUXTRACE_SUPPORT
ifeq ($(feature-reallocarray), 0)
CFLAGS += -DCOMPAT_NEED_REALLOCARRAY
endif
endif
endif

View File

@@ -158,7 +158,7 @@ ifneq ($(OUTPUT),)
VPATH += $(OUTPUT)
export VPATH
# create symlink to the original source
SOURCE := $(shell ln -sf $(srctree)/tools/perf $(OUTPUT)/source)
SOURCE := $(shell ln -sfn $(srctree)/tools/perf $(OUTPUT)/source)
endif
# Do not use make's built-in rules
@@ -248,7 +248,7 @@ else
force_fixdep := $(config)
endif
# Runs shellcheck on perf test shell scripts
# Runs shellcheck on perf shell scripts
ifeq ($(NO_SHELLCHECK),1)
SHELLCHECK :=
else
@@ -265,8 +265,18 @@ ifneq ($(SHELLCHECK),)
endif
endif
# Runs mypy on perf python files
ifeq ($(MYPY),1)
MYPY := $(shell which mypy 2> /dev/null)
endif
# Runs pylint on perf python files
ifeq ($(PYLINT),1)
PYLINT := $(shell which pylint 2> /dev/null)
endif
export srctree OUTPUT RM CC CXX LD AR CFLAGS CXXFLAGS V BISON FLEX AWK
export HOSTCC HOSTLD HOSTAR HOSTCFLAGS SHELLCHECK
export HOSTCC HOSTLD HOSTAR HOSTCFLAGS SHELLCHECK MYPY PYLINT
include $(srctree)/tools/build/Makefile.include
@@ -298,7 +308,6 @@ ifeq ($(filter feature-dump,$(MAKECMDGOALS)),feature-dump)
FEATURE_TESTS := all
endif
endif
include $(srctree)/tools/perf/scripts/Makefile.syscalls
include Makefile.config
endif
@@ -518,6 +527,14 @@ beauty_ioctl_outdir := $(beauty_outdir)/ioctl
# Create output directory if not already present
$(shell [ -d '$(beauty_ioctl_outdir)' ] || mkdir -p '$(beauty_ioctl_outdir)')
# syscalltbl.c is generated into $(beauty_outdir) by syscalltbl.sh. Its inputs
# are the generic tools/scripts/syscall.tbl plus every per-arch syscall*.tbl
# found under tools/perf/arch/*/entry/syscalls/.
syscall_array := $(beauty_outdir)/syscalltbl.c
syscall_tbl := $(srctree)/tools/perf/trace/beauty/syscalltbl.sh
syscall_tbl_data := $(srctree)/tools/scripts/syscall.tbl \
$(wildcard $(srctree)/tools/perf/arch/*/entry/syscalls/syscall*.tbl)
# Regenerate whenever the generator script or any input table changes; the
# script is invoked with the tools directory and the output file path.
$(syscall_array): $(syscall_tbl) $(syscall_tbl_data)
$(Q)$(SHELL) '$(syscall_tbl)' $(srctree)/tools $@
fs_at_flags_array := $(beauty_outdir)/fs_at_flags_array.c
fs_at_flags_tbl := $(srctree)/tools/perf/trace/beauty/fs_at_flags.sh
@@ -837,6 +854,7 @@ build-dir = $(or $(__build-dir),.)
prepare: $(OUTPUT)PERF-VERSION-FILE $(OUTPUT)common-cmds.h archheaders \
arm64-sysreg-defs \
$(syscall_array) \
$(fs_at_flags_array) \
$(clone_flags_array) \
$(drm_ioctl_array) \

View File

@@ -1,2 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall-y += syscalls_64.h

View File

@@ -1,5 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall_abis_64 +=
syscalltbl = $(srctree)/tools/perf/arch/alpha/entry/syscalls/syscall.tbl

View File

@@ -1,2 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/syscalls_64.h>

View File

@@ -1,2 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall-y += syscalls_32.h

View File

@@ -1,3 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall_abis_32 += arc time32 renameat stat64 rlimit

View File

@@ -1,2 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/syscalls_32.h>

View File

@@ -1,4 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall_abis_32 += oabi
syscalltbl = $(srctree)/tools/perf/arch/arm/entry/syscalls/syscall.tbl

View File

@@ -1,2 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall-y += syscalls_32.h

View File

@@ -1,2 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/syscalls_32.h>

View File

@@ -45,7 +45,7 @@ static int sample_ustack(struct perf_sample *sample,
int test__arch_unwind_sample(struct perf_sample *sample,
struct thread *thread)
{
struct regs_dump *regs = &sample->user_regs;
struct regs_dump *regs = perf_sample__user_regs(sample);
u64 *buf;
buf = calloc(1, sizeof(u64) * PERF_REGS_MAX);

View File

@@ -18,7 +18,7 @@
void perf_pmu__arch_init(struct perf_pmu *pmu)
{
struct perf_cpu_map *intersect;
struct perf_cpu_map *intersect, *online = cpu_map__online();
#ifdef HAVE_AUXTRACE_SUPPORT
if (!strcmp(pmu->name, CORESIGHT_ETM_PMU_NAME)) {
@@ -41,7 +41,8 @@ void perf_pmu__arch_init(struct perf_pmu *pmu)
}
#endif
/* Workaround some ARM PMU's failing to correctly set CPU maps for online processors. */
intersect = perf_cpu_map__intersect(cpu_map__online(), pmu->cpus);
intersect = perf_cpu_map__intersect(online, pmu->cpus);
perf_cpu_map__put(online);
perf_cpu_map__put(pmu->cpus);
pmu->cpus = intersect;
}

View File

@@ -8,7 +8,7 @@
bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
{
struct unwind_info *ui = arg;
struct regs_dump *user_regs = &ui->sample->user_regs;
struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
Dwarf_Word dwarf_regs[PERF_REG_ARM_MAX];
#define REG(r) ({ \

View File

@@ -1,3 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall-y += syscalls_32.h
syscall-y += syscalls_64.h

View File

@@ -1,6 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall_abis_32 +=
syscall_abis_64 += renameat rlimit memfd_secret
syscalltbl = $(srctree)/tools/perf/arch/arm64/entry/syscalls/syscall_%.tbl

View File

@@ -1,8 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/bitsperlong.h>
#if __BITS_PER_LONG == 64
#include <asm/syscalls_64.h>
#else
#include <asm/syscalls_32.h>
#endif

View File

@@ -45,7 +45,7 @@ static int sample_ustack(struct perf_sample *sample,
int test__arch_unwind_sample(struct perf_sample *sample,
struct thread *thread)
{
struct regs_dump *regs = &sample->user_regs;
struct regs_dump *regs = perf_sample__user_regs(sample);
u64 *buf;
buf = calloc(1, sizeof(u64) * PERF_REGS_MAX);

View File

@@ -40,6 +40,19 @@ struct arm_spe_recording {
bool *wrapped;
};
/*
 * Walk the evsel's config term list and report whether any term is of type
 * EVSEL__CONFIG_TERM_FREQ.
 *
 * Returns true as soon as such a term is found, false if the list holds none.
 */
static bool arm_spe_is_set_freq(struct evsel *evsel)
{
	struct evsel_config_term *cfg;
	bool freq_set = false;

	list_for_each_entry(cfg, &evsel->config_terms, list) {
		if (cfg->type != EVSEL__CONFIG_TERM_FREQ)
			continue;

		/* First match is enough; stop scanning. */
		freq_set = true;
		break;
	}

	return freq_set;
}
/*
* arm_spe_find_cpus() returns a new cpu map, and the caller should invoke
* perf_cpu_map__put() to release the map after use.
@@ -389,6 +402,14 @@ static int arm_spe_recording_options(struct auxtrace_record *itr,
return -EINVAL;
}
opts->full_auxtrace = true;
if (opts->user_freq != UINT_MAX ||
arm_spe_is_set_freq(evsel)) {
pr_err("Arm SPE: Frequency is not supported. "
"Set period with -c option or PMU parameter (-e %s/period=NUM/).\n",
evsel->pmu->name);
return -EINVAL;
}
}
}

View File

@@ -8,7 +8,7 @@
bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
{
struct unwind_info *ui = arg;
struct regs_dump *user_regs = &ui->sample->user_regs;
struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
Dwarf_Word dwarf_regs[PERF_REG_ARM64_MAX], dwarf_pc;
#define REG(r) ({ \

View File

@@ -1,2 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall-y += syscalls_32.h

View File

@@ -1,3 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall_abis_32 += csky time32 stat64 rlimit

View File

@@ -1,2 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/syscalls_32.h>

View File

@@ -10,7 +10,7 @@
bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
{
struct unwind_info *ui = arg;
struct regs_dump *user_regs = &ui->sample->user_regs;
struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
Dwarf_Word dwarf_regs[PERF_REG_CSKY_MAX];
#define REG(r) ({ \

View File

@@ -1,2 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall-y += syscalls_64.h

View File

@@ -1,3 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall_abis_64 +=

View File

@@ -1,2 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/syscall_table_64.h>

View File

@@ -10,7 +10,7 @@
bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
{
struct unwind_info *ui = arg;
struct regs_dump *user_regs = &ui->sample->user_regs;
struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
Dwarf_Word dwarf_regs[PERF_REG_LOONGARCH_MAX];
#define REG(r) ({ \

View File

@@ -1,2 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall-y += syscalls_64.h

View File

@@ -1,5 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall_abis_64 += n64
syscalltbl = $(srctree)/tools/perf/arch/mips/entry/syscalls/syscall_n64.tbl

View File

@@ -1,2 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/syscalls_64.h>

View File

@@ -1,3 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall-y += syscalls_32.h
syscall-y += syscalls_64.h

View File

@@ -1,6 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall_abis_32 +=
syscall_abis_64 +=
syscalltbl = $(srctree)/tools/perf/arch/parisc/entry/syscalls/syscall.tbl

View File

@@ -1,8 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/bitsperlong.h>
#if __BITS_PER_LONG == 64
#include <asm/syscalls_64.h>
#else
#include <asm/syscalls_32.h>
#endif

View File

@@ -1,3 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall-y += syscalls_32.h
syscall-y += syscalls_64.h

View File

@@ -1,6 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall_abis_32 += nospu
syscall_abis_64 += nospu
syscalltbl = $(srctree)/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl

View File

@@ -1,8 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/bitsperlong.h>
#if __BITS_PER_LONG == 64
#include <asm/syscalls_64.h>
#else
#include <asm/syscalls_32.h>
#endif

View File

@@ -45,7 +45,7 @@ static int sample_ustack(struct perf_sample *sample,
int test__arch_unwind_sample(struct perf_sample *sample,
struct thread *thread)
{
struct regs_dump *regs = &sample->user_regs;
struct regs_dump *regs = perf_sample__user_regs(sample);
u64 *buf;
buf = calloc(1, sizeof(u64) * PERF_REGS_MAX);

View File

@@ -14,8 +14,8 @@
static bool is_compat_mode(void)
{
u64 base_platform = getauxval(AT_BASE_PLATFORM);
u64 platform = getauxval(AT_PLATFORM);
unsigned long base_platform = getauxval(AT_BASE_PLATFORM);
unsigned long platform = getauxval(AT_PLATFORM);
if (!strcmp((char *)platform, (char *)base_platform))
return false;

View File

@@ -16,7 +16,7 @@ static const int special_regs[3][2] = {
bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
{
struct unwind_info *ui = arg;
struct regs_dump *user_regs = &ui->sample->user_regs;
struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
Dwarf_Word dwarf_regs[32], dwarf_nip;
size_t i;

View File

@@ -1,2 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall-y += syscalls_64.h

View File

@@ -1,4 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall_abis_32 += riscv memfd_secret
syscall_abis_64 += riscv rlimit memfd_secret

View File

@@ -1,8 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/bitsperlong.h>
#if __BITS_PER_LONG == 64
#include <asm/syscalls_64.h>
#else
#include <asm/syscalls_32.h>
#endif

View File

@@ -10,7 +10,7 @@
bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
{
struct unwind_info *ui = arg;
struct regs_dump *user_regs = &ui->sample->user_regs;
struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
Dwarf_Word dwarf_regs[32];
#define REG(r) ({ \

View File

@@ -1,2 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall-y += syscalls_64.h

View File

@@ -1,5 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall_abis_64 += renameat rlimit memfd_secret
syscalltbl = $(srctree)/tools/perf/arch/s390/entry/syscalls/syscall.tbl

View File

@@ -1,2 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/syscalls_64.h>

View File

@@ -11,7 +11,7 @@
bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
{
struct unwind_info *ui = arg;
struct regs_dump *user_regs = &ui->sample->user_regs;
struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
Dwarf_Word dwarf_regs[ARRAY_SIZE(s390_dwarf_regs)];
#define REG(r) ({ \

View File

@@ -1,2 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall-y += syscalls_32.h

View File

@@ -1,4 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall_abis_32 +=
syscalltbl = $(srctree)/tools/perf/arch/sh/entry/syscalls/syscall.tbl

View File

@@ -1,2 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/syscalls_32.h>

View File

@@ -1,3 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall-y += syscalls_32.h
syscall-y += syscalls_64.h

View File

@@ -1,5 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall_abis_32 +=
syscall_abis_64 +=
syscalltbl = $(srctree)/tools/perf/arch/sparc/entry/syscalls/syscall.tbl

View File

@@ -1,8 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/bitsperlong.h>
#if __BITS_PER_LONG == 64
#include <asm/syscalls_64.h>
#else
#include <asm/syscalls_32.h>
#endif

View File

@@ -2,14 +2,14 @@ perf-util-y += util/
perf-test-y += tests/
ifdef SHELLCHECK
TEST_LOGS := $(SHELL_TESTS:%=%.shellcheck_log)
SHELL_TEST_LOGS := $(SHELL_TESTS:%=%.shellcheck_log)
else
SHELL_TESTS :=
TEST_LOGS :=
SHELL_TEST_LOGS :=
endif
$(OUTPUT)%.shellcheck_log: %
$(call rule_mkdir)
$(Q)$(call echo-cmd,test)shellcheck -a -S warning "$<" > $@ || (cat $@ && rm $@ && false)
perf-test-y += $(TEST_LOGS)
perf-test-y += $(SHELL_TEST_LOGS)

View File

@@ -410,7 +410,7 @@ static void update_insn_state_x86(struct type_state *state,
retry:
/* Check stack variables with offset */
if (sreg == fbreg) {
if (sreg == fbreg || sreg == state->stack_reg) {
struct type_state_stack *stack;
int offset = src->offset - fboff;
@@ -433,8 +433,13 @@ static void update_insn_state_x86(struct type_state *state,
return;
}
pr_debug_dtp("mov [%x] -%#x(stack) -> reg%d",
insn_offset, -offset, dst->reg1);
if (sreg == fbreg) {
pr_debug_dtp("mov [%x] -%#x(stack) -> reg%d",
insn_offset, -offset, dst->reg1);
} else {
pr_debug_dtp("mov [%x] %#x(reg%d) -> reg%d",
insn_offset, offset, sreg, dst->reg1);
}
pr_debug_type_name(&tsr->type, tsr->kind);
}
/* And then dereference the pointer if it has one */
@@ -561,7 +566,7 @@ static void update_insn_state_x86(struct type_state *state,
return;
/* Check stack variables with offset */
if (dst->reg1 == fbreg) {
if (dst->reg1 == fbreg || dst->reg1 == state->stack_reg) {
struct type_state_stack *stack;
int offset = dst->offset - fboff;
@@ -584,8 +589,13 @@ static void update_insn_state_x86(struct type_state *state,
&tsr->type);
}
pr_debug_dtp("mov [%x] reg%d -> -%#x(stack)",
insn_offset, src->reg1, -offset);
if (dst->reg1 == fbreg) {
pr_debug_dtp("mov [%x] reg%d -> -%#x(stack)",
insn_offset, src->reg1, -offset);
} else {
pr_debug_dtp("mov [%x] reg%d -> %#x(reg%d)",
insn_offset, src->reg1, offset, dst->reg1);
}
pr_debug_type_name(&tsr->type, tsr->kind);
}
/*

View File

@@ -1,3 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall-y += syscalls_32.h
syscall-y += syscalls_64.h

View File

@@ -1,6 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall_abis_32 += i386
syscall_abis_64 +=
syscalltbl = $(srctree)/tools/perf/arch/x86/entry/syscalls/syscall_%.tbl

View File

@@ -1,8 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/bitsperlong.h>
#if __BITS_PER_LONG == 64
#include <asm/syscalls_64.h>
#else
#include <asm/syscalls_32.h>
#endif

View File

@@ -13,14 +13,14 @@ perf-test-y += amd-ibs-via-core-pmu.o
ifdef SHELLCHECK
SHELL_TESTS := gen-insn-x86-dat.sh
TEST_LOGS := $(SHELL_TESTS:%=%.shellcheck_log)
SHELL_TEST_LOGS := $(SHELL_TESTS:%=%.shellcheck_log)
else
SHELL_TESTS :=
TEST_LOGS :=
SHELL_TEST_LOGS :=
endif
$(OUTPUT)%.shellcheck_log: %
$(call rule_mkdir)
$(Q)$(call echo-cmd,test)shellcheck -a -S warning "$<" > $@ || (cat $@ && rm $@ && false)
perf-test-y += $(TEST_LOGS)
perf-test-y += $(SHELL_TEST_LOGS)

View File

@@ -53,7 +53,7 @@ static int sample_ustack(struct perf_sample *sample,
int test__arch_unwind_sample(struct perf_sample *sample,
struct thread *thread)
{
struct regs_dump *regs = &sample->user_regs;
struct regs_dump *regs = perf_sample__user_regs(sample);
u64 *buf;
buf = malloc(sizeof(u64) * PERF_REGS_MAX);

View File

@@ -39,28 +39,13 @@ int arch_evlist__cmp(const struct evsel *lhs, const struct evsel *rhs)
* 26,319,024 slots
* 2,427,791 instructions
* 2,683,508 topdown-retiring
*
* If slots event and topdown metrics events are not in same group, the
* topdown metrics events must be first event after the slots event group,
* otherwise topdown metrics events can't be regrouped correctly, e.g.
*
* a. perf stat -e "{instructions,slots},cycles,topdown-retiring" -C0 sleep 1
* e. slots event and metrics event are not in a group and not adjacent
* perf stat -e "{instructions,slots},cycles,topdown-retiring" -C0 sleep 1
* WARNING: events were regrouped to match PMUs
* Performance counter stats for 'CPU(s) 0':
* 17,923,134 slots
* 2,154,855 instructions
* 3,015,058 cycles
* <not supported> topdown-retiring
*
* If slots event and topdown metrics events are in two groups, the group which
* has topdown metrics events must contain only the topdown metrics event,
* otherwise topdown metrics event can't be regrouped correctly as well, e.g.
*
* a. perf stat -e "{instructions,slots},{topdown-retiring,cycles}" -C0 sleep 1
* WARNING: events were regrouped to match PMUs
* Error:
* The sys_perf_event_open() syscall returned with 22 (Invalid argument) for
* event (topdown-retiring)
* 68,433,522 slots
* 8,856,102 topdown-retiring
* 7,791,494 instructions
* 11,469,513 cycles
*/
if (topdown_sys_has_perf_metrics() &&
(arch_evsel__must_be_in_group(lhs) || arch_evsel__must_be_in_group(rhs))) {
@@ -76,12 +61,15 @@ int arch_evlist__cmp(const struct evsel *lhs, const struct evsel *rhs)
* topdown metrics events are already in same group with slots
* event, do nothing.
*/
if (arch_is_topdown_metrics(lhs) && !arch_is_topdown_metrics(rhs) &&
lhs->core.leader != rhs->core.leader)
return -1;
if (!arch_is_topdown_metrics(lhs) && arch_is_topdown_metrics(rhs) &&
lhs->core.leader != rhs->core.leader)
return 1;
if (lhs->core.leader != rhs->core.leader) {
bool lhs_topdown = arch_is_topdown_metrics(lhs);
bool rhs_topdown = arch_is_topdown_metrics(rhs);
if (lhs_topdown && !rhs_topdown)
return -1;
if (!lhs_topdown && rhs_topdown)
return 1;
}
}
/* Retire latency event should not be group leader*/

View File

@@ -32,7 +32,7 @@
#define MAX_PATH 1024
#endif
#define UNCORE_IIO_PMU_PATH "devices/uncore_iio_%d"
#define UNCORE_IIO_PMU_PATH "bus/event_source/devices/uncore_iio_%d"
#define SYSFS_UNCORE_PMU_PATH "%s/"UNCORE_IIO_PMU_PATH
#define PLATFORM_MAPPING_PATH UNCORE_IIO_PMU_PATH"/die%d"

View File

@@ -81,7 +81,7 @@ bool arch_topdown_sample_read(struct evsel *leader)
*/
evlist__for_each_entry(leader->evlist, evsel) {
if (evsel->core.leader != leader->core.leader)
return false;
continue;
if (evsel != leader && arch_is_topdown_metrics(evsel))
return true;
}

View File

@@ -8,7 +8,7 @@
bool libdw__arch_set_initial_registers(Dwfl_Thread *thread, void *arg)
{
struct unwind_info *ui = arg;
struct regs_dump *user_regs = &ui->sample->user_regs;
struct regs_dump *user_regs = perf_sample__user_regs(ui->sample);
Dwarf_Word dwarf_regs[17];
unsigned nregs;

View File

@@ -1,2 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall-y += syscalls_32.h

View File

@@ -1,4 +0,0 @@
# SPDX-License-Identifier: GPL-2.0
syscall_abis_32 +=
syscalltbl = $(srctree)/tools/perf/arch/xtensa/entry/syscalls/syscall.tbl

View File

@@ -1,2 +0,0 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <asm/syscalls_32.h>

View File

@@ -204,17 +204,10 @@ static void *worker_thread(void *__tdata)
}
for (i = 0; i < loops; i++) {
if (!td->nr) {
ret = read_pipe(td);
BUG_ON(ret != sizeof(int));
ret = write(td->pipe_write, &m, sizeof(int));
BUG_ON(ret != sizeof(int));
} else {
ret = write(td->pipe_write, &m, sizeof(int));
BUG_ON(ret != sizeof(int));
ret = read_pipe(td);
BUG_ON(ret != sizeof(int));
}
ret = write(td->pipe_write, &m, sizeof(int));
BUG_ON(ret != sizeof(int));
ret = read_pipe(td);
BUG_ON(ret != sizeof(int));
}
return NULL;

View File

@@ -22,8 +22,7 @@
#define __NR_fork -1
#endif
#define LOOPS_DEFAULT 10000000
static int loops = LOOPS_DEFAULT;
static int loops;
static const struct option options[] = {
OPT_INTEGER('l', "loop", &loops, "Specify number of loops"),
@@ -80,6 +79,18 @@ static int bench_syscall_common(int argc, const char **argv, int syscall)
const char *name = NULL;
int i;
switch (syscall) {
case __NR_fork:
case __NR_execve:
/* Limit default loop to 10000 times to save time */
loops = 10000;
break;
default:
loops = 10000000;
break;
}
/* Options -l and --loops override default above */
argc = parse_options(argc, argv, options, bench_syscall_usage, 0);
gettimeofday(&start, NULL);
@@ -94,16 +105,9 @@ static int bench_syscall_common(int argc, const char **argv, int syscall)
break;
case __NR_fork:
test_fork();
/* Only loop 10000 times to save time */
if (i == 10000)
loops = 10000;
break;
case __NR_execve:
test_execve();
/* Only loop 10000 times to save time */
if (i == 10000)
loops = 10000;
break;
default:
break;
}

View File

@@ -321,14 +321,14 @@ static int process_feature_event(struct perf_session *session,
return 0;
}
static int hist_entry__tty_annotate(struct hist_entry *he,
static int hist_entry__stdio_annotate(struct hist_entry *he,
struct evsel *evsel,
struct perf_annotate *ann)
{
if (!ann->use_stdio2)
return symbol__tty_annotate(&he->ms, evsel);
if (ann->use_stdio2)
return hist_entry__tty_annotate2(he, evsel);
return symbol__tty_annotate2(&he->ms, evsel);
return hist_entry__tty_annotate(he, evsel);
}
static void print_annotate_data_stat(struct annotated_data_stat *s)
@@ -541,7 +541,7 @@ static void hists__find_annotations(struct hists *hists,
if (next != NULL)
nd = next;
} else {
hist_entry__tty_annotate(he, evsel, ann);
hist_entry__stdio_annotate(he, evsel, ann);
nd = rb_next(nd);
}
}
@@ -788,6 +788,8 @@ int cmd_annotate(int argc, const char **argv)
"Show instruction stats for the data type annotation"),
OPT_BOOLEAN(0, "skip-empty", &symbol_conf.skip_empty,
"Do not display empty (or dummy) events in the output"),
OPT_BOOLEAN(0, "code-with-type", &annotate_opts.code_with_type,
"Show data type info in code annotation (memory instructions only)"),
OPT_END()
};
int ret;
@@ -913,6 +915,13 @@ int cmd_annotate(int argc, const char **argv)
annotate_opts.annotate_src = false;
symbol_conf.annotate_data_member = true;
symbol_conf.annotate_data_sample = true;
} else if (annotate_opts.code_with_type) {
symbol_conf.annotate_data_member = true;
if (!annotate.use_stdio) {
pr_err("--code-with-type only works with --stdio.\n");
goto out_delete;
}
}
setup_browser(true);

View File

@@ -3239,6 +3239,7 @@ static int perf_c2c__record(int argc, const char **argv)
{
int rec_argc, i = 0, j;
const char **rec_argv;
char *event_name_storage = NULL;
int ret;
bool all_user = false, all_kernel = false;
bool event_set = false;
@@ -3300,7 +3301,7 @@ static int perf_c2c__record(int argc, const char **argv)
rec_argv[i++] = "--phys-data";
rec_argv[i++] = "--sample-cpu";
ret = perf_mem_events__record_args(rec_argv, &i);
ret = perf_mem_events__record_args(rec_argv, &i, &event_name_storage);
if (ret)
goto out;
@@ -3327,6 +3328,7 @@ static int perf_c2c__record(int argc, const char **argv)
ret = cmd_record(i, rec_argv);
out:
free(event_name_storage);
free(rec_argv);
return ret;
}

View File

@@ -733,6 +733,7 @@ static void make_histogram(struct perf_ftrace *ftrace, int buckets[],
{
int min_latency = ftrace->min_latency;
int max_latency = ftrace->max_latency;
unsigned int bucket_num = ftrace->bucket_num;
char *p, *q;
char *unit;
double num;
@@ -797,10 +798,10 @@ static void make_histogram(struct perf_ftrace *ftrace, int buckets[],
if (num > 0) // 1st entry: [ 1 unit .. bucket_range units ]
i = num / ftrace->bucket_range + 1;
if (num >= max_latency - min_latency)
i = NUM_BUCKET -1;
i = bucket_num -1;
}
if (i >= NUM_BUCKET)
i = NUM_BUCKET - 1;
if ((unsigned)i >= bucket_num)
i = bucket_num - 1;
num += min_latency;
do_inc:
@@ -820,13 +821,14 @@ static void display_histogram(struct perf_ftrace *ftrace, int buckets[])
{
int min_latency = ftrace->min_latency;
bool use_nsec = ftrace->use_nsec;
int i;
unsigned int bucket_num = ftrace->bucket_num;
unsigned int i;
int total = 0;
int bar_total = 46; /* to fit in 80 column */
char bar[] = "###############################################";
int bar_len;
for (i = 0; i < NUM_BUCKET; i++)
for (i = 0; i < bucket_num; i++)
total += buckets[i];
if (total == 0) {
@@ -839,14 +841,17 @@ static void display_histogram(struct perf_ftrace *ftrace, int buckets[])
bar_len = buckets[0] * bar_total / total;
printf(" %4d - %4d %s | %10d | %.*s%*s |\n",
0, min_latency ?: 1, use_nsec ? "ns" : "us",
buckets[0], bar_len, bar, bar_total - bar_len, "");
if (!ftrace->hide_empty || buckets[0])
printf(" %4d - %4d %s | %10d | %.*s%*s |\n",
0, min_latency ?: 1, use_nsec ? "ns" : "us",
buckets[0], bar_len, bar, bar_total - bar_len, "");
for (i = 1; i < NUM_BUCKET - 1; i++) {
for (i = 1; i < bucket_num - 1; i++) {
unsigned int start, stop;
const char *unit = use_nsec ? "ns" : "us";
if (ftrace->hide_empty && !buckets[i])
continue;
if (!ftrace->bucket_range) {
start = (1 << (i - 1));
stop = 1 << i;
@@ -881,11 +886,13 @@ static void display_histogram(struct perf_ftrace *ftrace, int buckets[])
bar_total - bar_len, "");
}
bar_len = buckets[NUM_BUCKET - 1] * bar_total / total;
bar_len = buckets[bucket_num - 1] * bar_total / total;
if (ftrace->hide_empty && !buckets[bucket_num - 1])
goto print_stats;
if (!ftrace->bucket_range) {
printf(" %4d - %-4s %s", 1, "...", use_nsec ? "ms" : "s ");
} else {
unsigned int upper_outlier = (NUM_BUCKET - 2) * ftrace->bucket_range + min_latency;
unsigned int upper_outlier = (bucket_num - 2) * ftrace->bucket_range + min_latency;
if (upper_outlier > ftrace->max_latency)
upper_outlier = ftrace->max_latency;
@@ -897,9 +904,10 @@ static void display_histogram(struct perf_ftrace *ftrace, int buckets[])
printf(" %4d - %4s %s", upper_outlier, "...", use_nsec ? "ns" : "us");
}
}
printf(" | %10d | %.*s%*s |\n", buckets[NUM_BUCKET - 1],
printf(" | %10d | %.*s%*s |\n", buckets[bucket_num - 1],
bar_len, bar, bar_total - bar_len, "");
print_stats:
printf("\n# statistics (in %s)\n", ftrace->use_nsec ? "nsec" : "usec");
printf(" total time: %20.0f\n", latency_stats.mean * latency_stats.n);
printf(" avg time: %20.0f\n", latency_stats.mean);
@@ -997,7 +1005,7 @@ static int __cmd_latency(struct perf_ftrace *ftrace)
struct pollfd pollfd = {
.events = POLLIN,
};
int buckets[NUM_BUCKET] = { };
int *buckets;
trace_fd = prepare_func_latency(ftrace);
if (trace_fd < 0)
@@ -1011,6 +1019,12 @@ static int __cmd_latency(struct perf_ftrace *ftrace)
evlist__start_workload(ftrace->evlist);
buckets = calloc(ftrace->bucket_num, sizeof(*buckets));
if (buckets == NULL) {
pr_err("failed to allocate memory for the buckets\n");
goto out;
}
line[0] = '\0';
while (!done) {
if (poll(&pollfd, 1, -1) < 0)
@@ -1030,7 +1044,7 @@ static int __cmd_latency(struct perf_ftrace *ftrace)
if (workload_exec_errno) {
const char *emsg = str_error_r(workload_exec_errno, buf, sizeof(buf));
pr_err("workload failed: %s\n", emsg);
goto out;
goto out_free_buckets;
}
/* read remaining buffer contents */
@@ -1045,6 +1059,8 @@ static int __cmd_latency(struct perf_ftrace *ftrace)
display_histogram(ftrace, buckets);
out_free_buckets:
free(buckets);
out:
close(trace_fd);
cleanup_func_latency(ftrace);
@@ -1634,7 +1650,9 @@ int cmd_ftrace(int argc, const char **argv)
OPT_UINTEGER(0, "min-latency", &ftrace.min_latency,
"Minimum latency (1st bucket). Works only with --bucket-range."),
OPT_UINTEGER(0, "max-latency", &ftrace.max_latency,
"Maximum latency (last bucket). Works only with --bucket-range and total buckets less than 22."),
"Maximum latency (last bucket). Works only with --bucket-range."),
OPT_BOOLEAN(0, "hide-empty", &ftrace.hide_empty,
"Hide empty buckets in the histogram"),
OPT_PARENT(common_options),
};
const struct option profile_options[] = {
@@ -1751,10 +1769,25 @@ int cmd_ftrace(int argc, const char **argv)
ret = -EINVAL;
goto out_delete_filters;
}
if (ftrace.bucket_range && !ftrace.max_latency) {
/* default max latency should depend on bucket range and num_buckets */
ftrace.max_latency = (NUM_BUCKET - 2) * ftrace.bucket_range +
ftrace.min_latency;
if (ftrace.bucket_range && ftrace.max_latency &&
ftrace.max_latency < ftrace.min_latency + ftrace.bucket_range) {
/* we need at least 1 bucket excluding min and max buckets */
pr_err("--max-latency must be larger than min-latency + bucket-range\n");
parse_options_usage(ftrace_usage, options,
"max-latency", /*short_opt=*/false);
ret = -EINVAL;
goto out_delete_filters;
}
/* set default unless max_latency is set and valid */
ftrace.bucket_num = NUM_BUCKET;
if (ftrace.bucket_range) {
if (ftrace.max_latency)
ftrace.bucket_num = (ftrace.max_latency - ftrace.min_latency) /
ftrace.bucket_range + 2;
else
/* default max latency should depend on bucket range and num_buckets */
ftrace.max_latency = (NUM_BUCKET - 2) * ftrace.bucket_range +
ftrace.min_latency;
}
cmd_func = __cmd_latency;
break;

Some files were not shown because too many files have changed in this diff Show More