| .\" Copyright (c) 2012, Vincent Weaver |
| .\" |
| .\" %%%LICENSE_START(GPLv2+_DOC_FULL) |
| .\" This is free documentation; you can redistribute it and/or |
| .\" modify it under the terms of the GNU General Public License as |
| .\" published by the Free Software Foundation; either version 2 of |
| .\" the License, or (at your option) any later version. |
| .\" |
| .\" The GNU General Public License's references to "object code" |
| .\" and "executables" are to be interpreted as the output of any |
| .\" document formatting or typesetting system, including |
| .\" intermediate and printed output. |
| .\" |
| .\" This manual is distributed in the hope that it will be useful, |
| .\" but WITHOUT ANY WARRANTY; without even the implied warranty of |
| .\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| .\" GNU General Public License for more details. |
| .\" |
| .\" You should have received a copy of the GNU General Public |
| .\" License along with this manual; if not, see |
| .\" <http://www.gnu.org/licenses/>. |
| .\" %%%LICENSE_END |
| .\" |
| .\" This document is based on the perf_event.h header file, the |
| .\" tools/perf/design.txt file, and a lot of bitter experience. |
| .\" |
| .TH PERF_EVENT_OPEN 2 2021-03-22 "Linux" "Linux Programmer's Manual" |
| .SH NAME |
| perf_event_open \- set up performance monitoring |
| .SH SYNOPSIS |
| .nf |
| .B #include <linux/perf_event.h> |
| .B #include <linux/hw_breakpoint.h> |
| .PP |
| .BI "int perf_event_open(struct perf_event_attr *" attr , |
| .BI " pid_t " pid ", int " cpu ", int " group_fd , |
| .BI " unsigned long " flags ); |
| .fi |
| .PP |
| .IR Note : |
| There is no glibc wrapper for this system call; see NOTES. |
| .SH DESCRIPTION |
| Given a list of parameters, |
| .BR perf_event_open () |
| returns a file descriptor, for use in subsequent system calls |
| .RB ( read "(2), " mmap "(2), " prctl "(2), " fcntl "(2), etc.)." |
| .PP |
| A call to |
| .BR perf_event_open () |
| creates a file descriptor that allows measuring performance |
| information. |
| Each file descriptor corresponds to one |
| event that is measured; these can be grouped together |
| to measure multiple events simultaneously. |
| .PP |
| Events can be enabled and disabled in two ways: via |
| .BR ioctl (2) |
| and via |
| .BR prctl (2). |
| When an event is disabled it does not count or generate overflows but does |
| continue to exist and maintain its count value. |
| .PP |
| Events come in two flavors: counting and sampling. |
| A |
| .I counting |
| event is one that is used for counting the aggregate number of events |
| that occur. |
| In general, counting event results are gathered with a |
| .BR read (2) |
| call. |
| A |
| .I sampling |
| event periodically writes measurements to a buffer that can then |
| be accessed via |
| .BR mmap (2). |
| .SS Arguments |
| The |
| .I pid |
| and |
| .I cpu |
| arguments allow specifying which process and CPU to monitor: |
| .TP |
| .BR "pid == 0" " and " "cpu == \-1" |
| This measures the calling process/thread on any CPU. |
| .TP |
| .BR "pid == 0" " and " "cpu >= 0" |
| This measures the calling process/thread only |
| when running on the specified CPU. |
| .TP |
| .BR "pid > 0" " and " "cpu == \-1" |
| This measures the specified process/thread on any CPU. |
| .TP |
| .BR "pid > 0" " and " "cpu >= 0" |
| This measures the specified process/thread only |
| when running on the specified CPU. |
| .TP |
| .BR "pid == \-1" " and " "cpu >= 0" |
| This measures all processes/threads on the specified CPU. |
| This requires |
| .B CAP_PERFMON |
| (since Linux 5.8) or |
| .B CAP_SYS_ADMIN |
| capability or a |
| .I /proc/sys/kernel/perf_event_paranoid |
| value of less than 1. |
| .TP |
| .BR "pid == \-1" " and " "cpu == \-1" |
| This setting is invalid and will return an error. |
| .PP |
| When |
| .I pid |
| is greater than zero, permission to perform this system call |
| is governed by |
| .B CAP_PERFMON |
| (since Linux 5.9) and a ptrace access mode |
| .B PTRACE_MODE_READ_REALCREDS |
| check on older Linux versions; see |
| .BR ptrace (2). |
| .PP |
| The |
| .I group_fd |
| argument allows event groups to be created. |
| An event group has one event which is the group leader. |
| The leader is created first, with |
| .IR group_fd " = \-1." |
| The rest of the group members are created with subsequent |
| .BR perf_event_open () |
| calls with |
| .I group_fd |
| being set to the file descriptor of the group leader. |
| (A single event on its own is created with |
| .IR group_fd " = \-1" |
| and is considered to be a group with only 1 member.) |
| An event group is scheduled onto the CPU as a unit: it will |
| be put onto the CPU only if all of the events in the group can be put onto |
| the CPU. |
| This means that the values of the member events can be |
| meaningfully compared\(emadded, divided (to get ratios), and so on\(emwith each |
| other, since they have counted events for the same set of executed |
| instructions. |
| .PP |
| The |
| .I flags |
| argument is formed by ORing together zero or more of the following values: |
| .TP |
| .BR PERF_FLAG_FD_CLOEXEC " (since Linux 3.14)" |
| .\" commit a21b0b354d4ac39be691f51c53562e2c24443d9e |
| This flag enables the close-on-exec flag for the created |
| event file descriptor, |
| so that the file descriptor is automatically closed on |
| .BR execve (2). |
| Setting the close-on-exec flag at creation time, rather than later with |
| .BR fcntl (2), |
| avoids potential race conditions where the calling thread invokes |
| .BR perf_event_open () |
| and |
| .BR fcntl (2) |
| at the same time as another thread calls |
| .BR fork (2) |
| then |
| .BR execve (2). |
| .TP |
| .BR PERF_FLAG_FD_NO_GROUP |
| This flag tells the event to ignore the |
| .I group_fd |
| parameter except for the purpose of setting up output redirection |
| using the |
| .B PERF_FLAG_FD_OUTPUT |
| flag. |
| .TP |
| .BR PERF_FLAG_FD_OUTPUT " (broken since Linux 2.6.35)" |
| .\" commit ac9721f3f54b27a16c7e1afb2481e7ee95a70318 |
| This flag re-routes the event's sampled output to instead |
| be included in the mmap buffer of the event specified by |
| .IR group_fd . |
| .TP |
| .BR PERF_FLAG_PID_CGROUP " (since Linux 2.6.39)" |
| .\" commit e5d1367f17ba6a6fed5fd8b74e4d5720923e0c25 |
| This flag activates per-container system-wide monitoring. |
| A container |
| is an abstraction that isolates a set of resources for finer-grained |
| control (CPUs, memory, etc.). |
| In this mode, the event is measured |
| only if the thread running on the monitored CPU belongs to the designated |
| container (cgroup). |
| The cgroup is identified by passing a file descriptor |
| opened on its directory in the cgroupfs filesystem. |
| For instance, if the |
| cgroup to monitor is called |
| .IR test , |
| then a file descriptor opened on |
| .I /dev/cgroup/test |
| (assuming cgroupfs is mounted on |
| .IR /dev/cgroup ) |
| must be passed as the |
| .I pid |
| parameter. |
| cgroup monitoring is available only |
| for system-wide events and may therefore require extra permissions. |
| .PP |
| The |
| .I perf_event_attr |
| structure provides detailed configuration information |
| for the event being created. |
| .PP |
| .in +4n |
| .EX |
| struct perf_event_attr { |
| __u32 type; /* Type of event */ |
| __u32 size; /* Size of attribute structure */ |
| __u64 config; /* Type\-specific configuration */ |
| |
| union { |
| __u64 sample_period; /* Period of sampling */ |
| __u64 sample_freq; /* Frequency of sampling */ |
| }; |
| |
| __u64 sample_type; /* Specifies values included in sample */ |
| __u64 read_format; /* Specifies values returned in read */ |
| |
| __u64 disabled : 1, /* off by default */ |
| inherit : 1, /* children inherit it */ |
| pinned : 1, /* must always be on PMU */ |
| exclusive : 1, /* only group on PMU */ |
| exclude_user : 1, /* don\(aqt count user */ |
| exclude_kernel : 1, /* don\(aqt count kernel */ |
| exclude_hv : 1, /* don\(aqt count hypervisor */ |
| exclude_idle : 1, /* don\(aqt count when idle */ |
| mmap : 1, /* include mmap data */ |
| comm : 1, /* include comm data */ |
| freq : 1, /* use freq, not period */ |
| inherit_stat : 1, /* per task counts */ |
| enable_on_exec : 1, /* next exec enables */ |
| task : 1, /* trace fork/exit */ |
| watermark : 1, /* wakeup_watermark */ |
| precise_ip : 2, /* skid constraint */ |
| mmap_data : 1, /* non\-exec mmap data */ |
| sample_id_all : 1, /* sample_type all events */ |
| exclude_host : 1, /* don\(aqt count in host */ |
| exclude_guest : 1, /* don\(aqt count in guest */ |
| exclude_callchain_kernel : 1, |
| /* exclude kernel callchains */ |
| exclude_callchain_user : 1, |
| /* exclude user callchains */ |
| mmap2 : 1, /* include mmap with inode data */ |
| comm_exec : 1, /* flag comm events that are |
| due to exec */ |
| use_clockid : 1, /* use clockid for time fields */ |
| context_switch : 1, /* context switch data */ |
| write_backward : 1, /* Write ring buffer from end |
| to beginning */ |
| namespaces : 1, /* include namespaces data */ |
| ksymbol : 1, /* include ksymbol events */ |
| bpf_event : 1, /* include bpf events */ |
| aux_output : 1, /* generate AUX records |
| instead of events */ |
| cgroup : 1, /* include cgroup events */ |
| text_poke : 1, /* include text poke events */ |
| |
| __reserved_1 : 30; |
| |
| union { |
| __u32 wakeup_events; /* wakeup every n events */ |
| __u32 wakeup_watermark; /* bytes before wakeup */ |
| }; |
| |
| __u32 bp_type; /* breakpoint type */ |
| |
| union { |
| __u64 bp_addr; /* breakpoint address */ |
| __u64 kprobe_func; /* for perf_kprobe */ |
| __u64 uprobe_path; /* for perf_uprobe */ |
| __u64 config1; /* extension of config */ |
| }; |
| |
| union { |
| __u64 bp_len; /* breakpoint length */ |
| __u64 kprobe_addr; /* with kprobe_func == NULL */ |
| __u64 probe_offset; /* for perf_[k,u]probe */ |
| __u64 config2; /* extension of config1 */ |
| }; |
| __u64 branch_sample_type; /* enum perf_branch_sample_type */ |
| __u64 sample_regs_user; /* user regs to dump on samples */ |
| __u32 sample_stack_user; /* size of stack to dump on |
| samples */ |
| __s32 clockid; /* clock to use for time fields */ |
| __u64 sample_regs_intr; /* regs to dump on samples */ |
| __u32 aux_watermark; /* aux bytes before wakeup */ |
| __u16 sample_max_stack; /* max frames in callchain */ |
| __u16 __reserved_2; /* align to u64 */ |
| |
| }; |
| .EE |
| .in |
| .PP |
| The fields of the |
| .I perf_event_attr |
| structure are described in more detail below: |
| .TP |
| .I type |
| This field specifies the overall event type. |
| It has one of the following values: |
| .RS |
| .TP |
| .B PERF_TYPE_HARDWARE |
| This indicates one of the "generalized" hardware events provided |
| by the kernel. |
| See the |
| .I config |
| field definition for more details. |
| .TP |
| .B PERF_TYPE_SOFTWARE |
| This indicates one of the software-defined events provided by the kernel |
| (even if no hardware support is available). |
| .TP |
| .B PERF_TYPE_TRACEPOINT |
| This indicates a tracepoint |
| provided by the kernel tracepoint infrastructure. |
| .TP |
| .B PERF_TYPE_HW_CACHE |
| This indicates a hardware cache event. |
| This has a special encoding, described in the |
| .I config |
| field definition. |
| .TP |
| .B PERF_TYPE_RAW |
| This indicates a "raw" implementation-specific event in the |
| .IR config " field." |
| .TP |
| .BR PERF_TYPE_BREAKPOINT " (since Linux 2.6.33)" |
| .\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e |
| This indicates a hardware breakpoint as provided by the CPU. |
| Breakpoints can be read/write accesses to an address as well as |
| execution of an instruction address. |
| .TP |
| dynamic PMU |
| Since Linux 2.6.38, |
| .\" commit 2e80a82a49c4c7eca4e35734380f28298ba5db19 |
| .BR perf_event_open () |
| can support multiple PMUs. |
| To enable this, a value exported by the kernel can be used in the |
| .I type |
| field to indicate which PMU to use. |
| The value to use can be found in the sysfs filesystem: |
| there is a subdirectory per PMU instance under |
| .IR /sys/bus/event_source/devices . |
| In each subdirectory there is a |
| .I type |
| file whose content is an integer that can be used in the |
| .I type |
| field. |
| For instance, |
| .I /sys/bus/event_source/devices/cpu/type |
| contains the value for the core CPU PMU, which is usually 4. |
| .TP |
| .BR kprobe " and " uprobe " (since Linux 4.17)" |
| .\" commit 65074d43fc77bcae32776724b7fa2696923c78e4 |
| .\" commit e12f03d7031a977356e3d7b75a68c2185ff8d155 |
| .\" commit 33ea4b24277b06dbc55d7f5772a46f029600255e |
| These two dynamic PMUs create a kprobe/uprobe and attach it to the |
| file descriptor generated by |
| .BR perf_event_open (). |
| The kprobe/uprobe will be destroyed on the destruction of the file descriptor. |
| See fields |
| .IR kprobe_func , |
| .IR uprobe_path , |
| .IR kprobe_addr , |
| and |
| .I probe_offset |
| for more details. |
| .RE |
| .TP |
| .I "size" |
| The size of the |
| .I perf_event_attr |
| structure for forward/backward compatibility. |
| Set this using |
| .I sizeof(struct perf_event_attr) |
| to allow the kernel to see |
| the struct size at the time of compilation. |
| .IP |
| The related define |
| .B PERF_ATTR_SIZE_VER0 |
| is set to 64; this was the size of the first published struct. |
| .B PERF_ATTR_SIZE_VER1 |
| is 72, corresponding to the addition of breakpoints in Linux 2.6.33. |
| .\" commit cb5d76999029ae7a517cb07dfa732c1b5a934fc2 |
| .\" this was added much later when PERF_ATTR_SIZE_VER2 happened |
| .\" but the actual attr_size had increased in 2.6.33 |
| .B PERF_ATTR_SIZE_VER2 |
| is 80 corresponding to the addition of branch sampling in Linux 3.4. |
| .\" commit cb5d76999029ae7a517cb07dfa732c1b5a934fc2 |
| .B PERF_ATTR_SIZE_VER3 |
| is 96 corresponding to the addition |
| of |
| .I sample_regs_user |
| and |
| .I sample_stack_user |
| in Linux 3.7. |
| .\" commit 1659d129ed014b715b0b2120e6fd929bdd33ed03 |
| .B PERF_ATTR_SIZE_VER4 |
| is 104 corresponding to the addition of |
| .I sample_regs_intr |
| in Linux 3.19. |
| .\" commit 60e2364e60e86e81bc6377f49779779e6120977f |
| .B PERF_ATTR_SIZE_VER5 |
| is 112 corresponding to the addition of |
| .I aux_watermark |
| in Linux 4.1. |
| .\" commit 1a5941312414c71dece6717da9a0fa1303127afa |
| .TP |
| .I "config" |
| This specifies which event you want, in conjunction with |
| the |
| .I type |
| field. |
| The |
| .I config1 |
| and |
| .I config2 |
| fields are also taken into account in cases where 64 bits is not |
| enough to fully specify the event. |
| The encoding of these fields is event dependent. |
| .IP |
| There are various ways to set the |
| .I config |
| field that are dependent on the value of the previously |
| described |
| .I type |
| field. |
| What follows are various possible settings for |
| .I config |
| separated out by |
| .IR type . |
| .IP |
| If |
| .I type |
| is |
| .BR PERF_TYPE_HARDWARE , |
| we are measuring one of the generalized hardware CPU events. |
| Not all of these are available on all platforms. |
| Set |
| .I config |
| to one of the following: |
| .RS 12 |
| .TP |
| .B PERF_COUNT_HW_CPU_CYCLES |
| Total cycles. |
| Be wary of what happens during CPU frequency scaling. |
| .TP |
| .B PERF_COUNT_HW_INSTRUCTIONS |
| Retired instructions. |
| Be careful, these can be affected by various |
| issues, most notably hardware interrupt counts. |
| .TP |
| .B PERF_COUNT_HW_CACHE_REFERENCES |
| Cache accesses. |
| Usually this indicates Last Level Cache accesses but this may |
| vary depending on your CPU. |
| This may include prefetches and coherency messages; again this |
| depends on the design of your CPU. |
| .TP |
| .B PERF_COUNT_HW_CACHE_MISSES |
| Cache misses. |
| Usually this indicates Last Level Cache misses; this is intended to be |
| used in conjunction with the |
| .B PERF_COUNT_HW_CACHE_REFERENCES |
| event to calculate cache miss rates. |
| .TP |
| .B PERF_COUNT_HW_BRANCH_INSTRUCTIONS |
| Retired branch instructions. |
| Prior to Linux 2.6.35, this used |
| the wrong event on AMD processors. |
| .\" commit f287d332ce835f77a4f5077d2c0ef1e3f9ea42d2 |
| .TP |
| .B PERF_COUNT_HW_BRANCH_MISSES |
| Mispredicted branch instructions. |
| .TP |
| .B PERF_COUNT_HW_BUS_CYCLES |
| Bus cycles, which can be different from total cycles. |
| .TP |
| .BR PERF_COUNT_HW_STALLED_CYCLES_FRONTEND " (since Linux 3.0)" |
| .\" commit 8f62242246351b5a4bc0c1f00c0c7003edea128a |
| Stalled cycles during issue. |
| .TP |
| .BR PERF_COUNT_HW_STALLED_CYCLES_BACKEND " (since Linux 3.0)" |
| .\" commit 8f62242246351b5a4bc0c1f00c0c7003edea128a |
| Stalled cycles during retirement. |
| .TP |
| .BR PERF_COUNT_HW_REF_CPU_CYCLES " (since Linux 3.3)" |
| .\" commit c37e17497e01fc0f5d2d6feb5723b210b3ab8890 |
| Total cycles; not affected by CPU frequency scaling. |
| .RE |
| .IP |
| If |
| .I type |
| is |
| .BR PERF_TYPE_SOFTWARE , |
| we are measuring software events provided by the kernel. |
| Set |
| .I config |
| to one of the following: |
| .RS 12 |
| .TP |
| .B PERF_COUNT_SW_CPU_CLOCK |
| This reports the CPU clock, a high-resolution per-CPU timer. |
| .TP |
| .B PERF_COUNT_SW_TASK_CLOCK |
| This reports a clock count specific to the task that is running. |
| .TP |
| .B PERF_COUNT_SW_PAGE_FAULTS |
| This reports the number of page faults. |
| .TP |
| .B PERF_COUNT_SW_CONTEXT_SWITCHES |
| This counts context switches. |
| Until Linux 2.6.34, these were all reported as user-space |
| events; after that, they are reported as happening in the kernel. |
| .\" commit e49a5bd38159dfb1928fd25b173bc9de4bbadb21 |
| .TP |
| .B PERF_COUNT_SW_CPU_MIGRATIONS |
| This reports the number of times the process |
| has migrated to a new CPU. |
| .TP |
| .B PERF_COUNT_SW_PAGE_FAULTS_MIN |
| This counts the number of minor page faults. |
| These did not require disk I/O to handle. |
| .TP |
| .B PERF_COUNT_SW_PAGE_FAULTS_MAJ |
| This counts the number of major page faults. |
| These required disk I/O to handle. |
| .TP |
| .BR PERF_COUNT_SW_ALIGNMENT_FAULTS " (since Linux 2.6.33)" |
| .\" commit f7d7986060b2890fc26db6ab5203efbd33aa2497 |
| This counts the number of alignment faults. |
| These happen when unaligned memory accesses happen; the kernel |
| can handle these but it reduces performance. |
| This happens only on some architectures (never on x86). |
| .TP |
| .BR PERF_COUNT_SW_EMULATION_FAULTS " (since Linux 2.6.33)" |
| .\" commit f7d7986060b2890fc26db6ab5203efbd33aa2497 |
| This counts the number of emulation faults. |
| The kernel sometimes traps on unimplemented instructions |
| and emulates them for user space. |
| This can negatively impact performance. |
| .TP |
| .BR PERF_COUNT_SW_DUMMY " (since Linux 3.12)" |
| .\" commit fa0097ee690693006ab1aea6c01ad3c851b65c77 |
| This is a placeholder event that counts nothing. |
| Informational sample record types such as mmap or comm |
| must be associated with an active event. |
| This dummy event allows gathering such records without requiring |
| a counting event. |
| .RE |
| .PP |
| .RS |
| If |
| .I type |
| is |
| .BR PERF_TYPE_TRACEPOINT , |
| then we are measuring kernel tracepoints. |
| The value to use in |
| .I config |
| can be obtained from under debugfs |
| .I tracing/events/*/*/id |
| if ftrace is enabled in the kernel. |
| .RE |
| .PP |
| .RS |
| If |
| .I type |
| is |
| .BR PERF_TYPE_HW_CACHE , |
| then we are measuring a hardware CPU cache event. |
| To calculate the appropriate |
| .I config |
| value, use the following equation: |
| .RS 4 |
| .PP |
| .in +4n |
| .EX |
| config = (perf_hw_cache_id) | |
| (perf_hw_cache_op_id << 8) | |
| (perf_hw_cache_op_result_id << 16); |
| .EE |
| .in |
| .PP |
| where |
| .I perf_hw_cache_id |
| is one of: |
| .RS 4 |
| .TP |
| .B PERF_COUNT_HW_CACHE_L1D |
| for measuring Level 1 Data Cache |
| .TP |
| .B PERF_COUNT_HW_CACHE_L1I |
| for measuring Level 1 Instruction Cache |
| .TP |
| .B PERF_COUNT_HW_CACHE_LL |
| for measuring Last-Level Cache |
| .TP |
| .B PERF_COUNT_HW_CACHE_DTLB |
| for measuring the Data TLB |
| .TP |
| .B PERF_COUNT_HW_CACHE_ITLB |
| for measuring the Instruction TLB |
| .TP |
| .B PERF_COUNT_HW_CACHE_BPU |
| for measuring the branch prediction unit |
| .TP |
| .BR PERF_COUNT_HW_CACHE_NODE " (since Linux 3.1)" |
| .\" commit 89d6c0b5bdbb1927775584dcf532d98b3efe1477 |
| for measuring local memory accesses |
| .RE |
| .PP |
| and |
| .I perf_hw_cache_op_id |
| is one of: |
| .RS 4 |
| .TP |
| .B PERF_COUNT_HW_CACHE_OP_READ |
| for read accesses |
| .TP |
| .B PERF_COUNT_HW_CACHE_OP_WRITE |
| for write accesses |
| .TP |
| .B PERF_COUNT_HW_CACHE_OP_PREFETCH |
| for prefetch accesses |
| .RE |
| .PP |
| and |
| .I perf_hw_cache_op_result_id |
| is one of: |
| .RS 4 |
| .TP |
| .B PERF_COUNT_HW_CACHE_RESULT_ACCESS |
| to measure accesses |
| .TP |
| .B PERF_COUNT_HW_CACHE_RESULT_MISS |
| to measure misses |
| .RE |
| .RE |
| .PP |
| If |
| .I type |
| is |
| .BR PERF_TYPE_RAW , |
| then a custom "raw" |
| .I config |
| value is needed. |
| Most CPUs support events that are not covered by the "generalized" events. |
| These are implementation defined; see your CPU manual (for example |
| the Intel Volume 3B documentation or the AMD BIOS and Kernel Developer |
| Guide). |
| The libpfm4 library can be used to translate from the name in the |
| architectural manuals to the raw hex value |
| .BR perf_event_open () |
| expects in this field. |
| .PP |
| If |
| .I type |
| is |
| .BR PERF_TYPE_BREAKPOINT , |
| then leave |
| .I config |
| set to zero. |
| Its parameters are set in other places. |
| .PP |
| If |
| .I type |
| is |
| .B kprobe |
| or |
| .BR uprobe , |
| set |
| .I retprobe |
| (bit 0 of |
| .IR config , |
| see |
| .IR /sys/bus/event_source/devices/[k,u]probe/format/retprobe ) |
| for kretprobe/uretprobe. |
| See fields |
| .IR kprobe_func , |
| .IR uprobe_path , |
| .IR kprobe_addr , |
| and |
| .I probe_offset |
| for more details. |
| .RE |
| .TP |
| .IR kprobe_func ", " uprobe_path ", " kprobe_addr ", and " probe_offset |
| These fields describe the kprobe/uprobe for dynamic PMUs |
| .B kprobe |
| and |
| .BR uprobe . |
| For |
| .BR kprobe : |
| use |
| .I kprobe_func |
| and |
| .IR probe_offset , |
| or use |
| .I kprobe_addr |
| and leave |
| .I kprobe_func |
| as NULL. |
| For |
| .BR uprobe : |
| use |
| .I uprobe_path |
| and |
| .IR probe_offset . |
| .TP |
| .IR sample_period ", " sample_freq |
| A "sampling" event is one that generates an overflow notification |
| every N events, where N is given by |
| .IR sample_period . |
| A sampling event has |
| .IR sample_period " > 0." |
| When an overflow occurs, requested data is recorded |
| in the mmap buffer. |
| The |
| .I sample_type |
| field controls what data is recorded on each overflow. |
| .IP |
| .I sample_freq |
| can be used if you wish to use frequency rather than period. |
| In this case, you set the |
| .I freq |
| flag. |
| The kernel will adjust the sampling period |
| to try and achieve the desired rate. |
| The rate of adjustment is a |
| timer tick. |
| .TP |
| .I sample_type |
| The various bits in this field specify which values to include |
| in the sample. |
| They will be recorded in a ring-buffer, |
| which is available to user space using |
| .BR mmap (2). |
| The order in which the values are saved in the |
| sample is documented in the MMAP Layout subsection below; |
| it is not the |
| .I "enum perf_event_sample_format" |
| order. |
| .RS |
| .TP |
| .B PERF_SAMPLE_IP |
| Records instruction pointer. |
| .TP |
| .B PERF_SAMPLE_TID |
| Records the process and thread IDs. |
| .TP |
| .B PERF_SAMPLE_TIME |
| Records a timestamp. |
| .TP |
| .B PERF_SAMPLE_ADDR |
| Records an address, if applicable. |
| .TP |
| .B PERF_SAMPLE_READ |
| Records counter values for all events in a group, not just the group leader. |
| .TP |
| .B PERF_SAMPLE_CALLCHAIN |
| Records the callchain (stack backtrace). |
| .TP |
| .B PERF_SAMPLE_ID |
| Records a unique ID for the opened event's group leader. |
| .TP |
| .B PERF_SAMPLE_CPU |
| Records CPU number. |
| .TP |
| .B PERF_SAMPLE_PERIOD |
| Records the current sampling period. |
| .TP |
| .B PERF_SAMPLE_STREAM_ID |
| Records a unique ID for the opened event. |
| Unlike |
| .B PERF_SAMPLE_ID |
| the actual ID is returned, not that of the group leader. |
| This ID is the same as the one returned by |
| .BR PERF_FORMAT_ID . |
| .TP |
| .B PERF_SAMPLE_RAW |
| Records additional data, if applicable. |
| Usually returned by tracepoint events. |
| .TP |
| .BR PERF_SAMPLE_BRANCH_STACK " (since Linux 3.4)" |
| .\" commit bce38cd53e5ddba9cb6d708c4ef3d04a4016ec7e |
| This provides a record of recent branches, as provided |
| by CPU branch sampling hardware (such as Intel Last Branch Record). |
| Not all hardware supports this feature. |
| .IP |
| See the |
| .I branch_sample_type |
| field for how to filter which branches are reported. |
| .TP |
| .BR PERF_SAMPLE_REGS_USER " (since Linux 3.7)" |
| .\" commit 4018994f3d8785275ef0e7391b75c3462c029e56 |
| Records the current user-level CPU register state |
| (the values in the process before the kernel was called). |
| .TP |
| .BR PERF_SAMPLE_STACK_USER " (since Linux 3.7)" |
| .\" commit c5ebcedb566ef17bda7b02686e0d658a7bb42ee7 |
| Records the user-level stack, allowing stack unwinding. |
| .TP |
| .BR PERF_SAMPLE_WEIGHT " (since Linux 3.10)" |
| .\" commit c3feedf2aaf9ac8bad6f19f5d21e4ee0b4b87e9c |
| Records a hardware-provided weight value that expresses how |
| costly the sampled event was. |
| This allows the hardware to highlight expensive events in |
| a profile. |
| .TP |
| .BR PERF_SAMPLE_DATA_SRC " (since Linux 3.10)" |
| .\" commit d6be9ad6c960f43800a6f118932bc8a5a4eadcd1 |
| Records the data source: where in the memory hierarchy |
| the data associated with the sampled instruction came from. |
| This is available only if the underlying hardware |
| supports this feature. |
| .TP |
| .BR PERF_SAMPLE_IDENTIFIER " (since Linux 3.12)" |
| .\" commit ff3d527cebc1fa3707c617bfe9e74f53fcfb0955 |
| Places the |
| .B SAMPLE_ID |
| value in a fixed position in the record, |
| either at the beginning (for sample events) or at the end |
| (if a non-sample event). |
| .IP |
| This was necessary because a sample stream may have |
| records from various different event sources with different |
| .I sample_type |
| settings. |
| Parsing the event stream properly was not possible because the |
| format of the record was needed to find |
| .BR SAMPLE_ID , |
| but |
| the format could not be found without knowing what |
| event the sample belonged to (causing a circular |
| dependency). |
| .IP |
| The |
| .B PERF_SAMPLE_IDENTIFIER |
| setting makes the event stream always parsable |
| by putting |
| .B SAMPLE_ID |
| in a fixed location, even though |
| it means having duplicate |
| .B SAMPLE_ID |
| values in records. |
| .TP |
| .BR PERF_SAMPLE_TRANSACTION " (since Linux 3.13)" |
| .\" commit fdfbbd07e91f8fe387140776f3fd94605f0c89e5 |
| Records reasons for transactional memory abort events |
| (for example, from Intel TSX transactional memory support). |
| .IP |
| The |
| .I precise_ip |
| setting must be greater than 0 and a transactional memory abort |
| event must be measured or no values will be recorded. |
| Also note that some perf_event measurements, such as sampled |
| cycle counting, may cause extraneous aborts (by causing an |
| interrupt during a transaction). |
| .TP |
| .BR PERF_SAMPLE_REGS_INTR " (since Linux 3.19)" |
| .\" commit 60e2364e60e86e81bc6377f49779779e6120977f |
| Records a subset of the current CPU register state |
| as specified by |
| .IR sample_regs_intr . |
| Unlike |
| .B PERF_SAMPLE_REGS_USER |
| the register values will return kernel register |
| state if the overflow happened while kernel |
| code is running. |
| If the CPU supports hardware sampling of |
| register state (i.e., PEBS on Intel x86) and |
| .I precise_ip |
| is set higher than zero then the register |
| values returned are those captured by |
| hardware at the time of the sampled |
| instruction's retirement. |
| .TP |
| .BR PERF_SAMPLE_PHYS_ADDR " (since Linux 4.13)" |
| .\" commit fc7ce9c74c3ad232b084d80148654f926d01ece7 |
| Records the physical address of data, as in |
| .BR PERF_SAMPLE_ADDR . |
| .TP |
| .BR PERF_SAMPLE_CGROUP " (since Linux 5.7)" |
| .\" commit 96aaab686505c449e24d76e76507290dcc30e008 |
| Records (perf_event) cgroup ID of the process. |
| This corresponds to the |
| .I id |
| field in the |
| .B PERF_RECORD_CGROUP |
| event. |
| .RE |
| .TP |
| .I read_format |
| This field specifies the format of the data returned by |
| .BR read (2) |
| on a |
| .BR perf_event_open () |
| file descriptor. |
| .RS |
| .TP |
| .B PERF_FORMAT_TOTAL_TIME_ENABLED |
| Adds the 64-bit |
| .I time_enabled |
| field. |
| This can be used to calculate estimated totals if |
| the PMU is overcommitted and multiplexing is happening. |
| .TP |
| .B PERF_FORMAT_TOTAL_TIME_RUNNING |
| Adds the 64-bit |
| .I time_running |
| field. |
| This can be used to calculate estimated totals if |
| the PMU is overcommitted and multiplexing is happening. |
| .TP |
| .B PERF_FORMAT_ID |
| Adds a 64-bit unique value that corresponds to the event group. |
| .TP |
| .B PERF_FORMAT_GROUP |
| Allows all counter values in an event group to be read with one read. |
| .RE |
| .TP |
| .I disabled |
| The |
| .I disabled |
| bit specifies whether the counter starts out disabled or enabled. |
| If disabled, the event can later be enabled by |
| .BR ioctl (2), |
| .BR prctl (2), |
| or |
| .IR enable_on_exec . |
| .IP |
| When creating an event group, typically the group leader is initialized |
| with |
| .I disabled |
| set to 1 and any child events are initialized with |
| .I disabled |
| set to 0. |
| Despite |
| .I disabled |
| being 0, the child events will not start until the group leader |
| is enabled. |
| .TP |
| .I inherit |
| The |
| .I inherit |
| bit specifies that this counter should count events of child |
| tasks as well as the task specified. |
| This applies only to new children, not to any existing children at |
| the time the counter is created (nor to any new children of |
| existing children). |
| .IP |
| Inherit does not work for some combinations of |
| .IR read_format |
| values, such as |
| .BR PERF_FORMAT_GROUP . |
| .TP |
| .I pinned |
| The |
| .I pinned |
| bit specifies that the counter should always be on the CPU if at all |
| possible. |
| It applies only to hardware counters and only to group leaders. |
| If a pinned counter cannot be put onto the CPU (e.g., because there are |
| not enough hardware counters or because of a conflict with some other |
| event), then the counter goes into an 'error' state, where reads |
| return end-of-file (i.e., |
| .BR read (2) |
| returns 0) until the counter is subsequently enabled or disabled. |
| .TP |
| .I exclusive |
| The |
| .I exclusive |
| bit specifies that when this counter's group is on the CPU, |
| it should be the only group using the CPU's counters. |
| In the future this may allow monitoring programs to |
| support PMU features that need to run alone so that they do not |
| disrupt other hardware counters. |
| .IP |
| Note that many unexpected situations may prevent events with the |
| .I exclusive |
| bit set from ever running. |
| This includes any users running a system-wide |
| measurement as well as any kernel use of the performance counters |
| (including the commonly enabled NMI Watchdog Timer interface). |
| .TP |
| .I exclude_user |
| If this bit is set, the count excludes events that happen in user space. |
| .TP |
| .I exclude_kernel |
| If this bit is set, the count excludes events that happen in kernel space. |
| .TP |
| .I exclude_hv |
| If this bit is set, the count excludes events that happen in the |
| hypervisor. |
| This is mainly for PMUs that have built-in support for handling this |
| (such as POWER). |
| Extra support is needed for handling hypervisor measurements on most |
| machines. |
| .TP |
| .I exclude_idle |
| If set, don't count when the CPU is running the idle task. |
| While you can currently enable this for any event type, it is ignored |
| for all but software events. |
| .TP |
| .I mmap |
| The |
| .I mmap |
| bit enables generation of |
| .B PERF_RECORD_MMAP |
| samples for every |
| .BR mmap (2) |
| call that has |
| .B PROT_EXEC |
| set. |
| This allows tools to notice new executable code being mapped into |
| a program (dynamic shared libraries for example) |
| so that addresses can be mapped back to the original code. |
| .TP |
| .I comm |
| The |
| .I comm |
| bit enables tracking of process command name as modified by the |
| .BR execve (2) |
| and |
| .BR prctl (PR_SET_NAME) |
| system calls as well as writing to |
| .IR /proc/self/comm . |
| If the |
| .I comm_exec |
| flag is also successfully set (possible since Linux 3.16), |
| .\" commit 82b897782d10fcc4930c9d4a15b175348fdd2871 |
| then the misc flag |
| .B PERF_RECORD_MISC_COMM_EXEC |
| can be used to differentiate the |
| .BR execve (2) |
| case from the others. |
| .TP |
| .I freq |
| If this bit is set, then |
| .I sample_frequency |
| not |
| .I sample_period |
| is used when setting up the sampling interval. |
| .TP |
| .I inherit_stat |
| This bit enables saving of event counts on context switch for |
| inherited tasks. |
| This is meaningful only if the |
| .I inherit |
| field is set. |
| .TP |
| .I enable_on_exec |
| If this bit is set, a counter is automatically |
| enabled after a call to |
| .BR execve (2). |
| .TP |
| .I task |
| If this bit is set, then |
| fork/exit notifications are included in the ring buffer. |
| .TP |
| .I watermark |
| If set, have an overflow notification happen when we cross the |
| .I wakeup_watermark |
| boundary. |
| Otherwise, overflow notifications happen after |
| .I wakeup_events |
| samples. |
| .TP |
| .IR precise_ip " (since Linux 2.6.35)" |
| .\" commit ab608344bcbde4f55ec4cd911b686b0ce3eae076 |
| This controls the amount of skid. |
| Skid is how many instructions |
| execute between an event of interest happening and the kernel |
| being able to stop and record the event. |
| Smaller skid is |
| better and allows more accurate reporting of which events |
| correspond to which instructions, but hardware is often limited |
| with how small this can be. |
| .IP |
| The possible values of this field are the following: |
| .RS |
| .IP 0 3 |
| .B SAMPLE_IP |
| can have arbitrary skid. |
| .IP 1 |
| .B SAMPLE_IP |
| must have constant skid. |
| .IP 2 |
| .B SAMPLE_IP |
| requested to have 0 skid. |
| .IP 3 |
| .B SAMPLE_IP |
| must have 0 skid. |
| See also the description of |
| .BR PERF_RECORD_MISC_EXACT_IP . |
| .RE |
| .TP |
| .IR mmap_data " (since Linux 2.6.36)" |
| .\" commit 3af9e859281bda7eb7c20b51879cf43aa788ac2e |
| This is the counterpart of the |
| .I mmap |
| field. |
| This enables generation of |
| .B PERF_RECORD_MMAP |
| samples for |
| .BR mmap (2) |
| calls that do not have |
| .B PROT_EXEC |
| set (for example data and SysV shared memory). |
| .TP |
| .IR sample_id_all " (since Linux 2.6.38)" |
| .\" commit c980d1091810df13f21aabbce545fd98f545bbf7 |
| If set, then TID, TIME, ID, STREAM_ID, and CPU can |
| additionally be included in |
| .RB non- PERF_RECORD_SAMPLE s |
| if the corresponding |
| .I sample_type |
| is selected. |
| .IP |
| If |
| .B PERF_SAMPLE_IDENTIFIER |
| is specified, then an additional ID value is included |
| as the last value to ease parsing the record stream. |
| This may lead to the |
| .I id |
| value appearing twice. |
| .IP |
| The layout is described by this pseudo-structure: |
| .IP |
| .in +4n |
| .EX |
| struct sample_id { |
| { u32 pid, tid; } /* if PERF_SAMPLE_TID set */ |
| { u64 time; } /* if PERF_SAMPLE_TIME set */ |
| { u64 id; } /* if PERF_SAMPLE_ID set */ |
| { u64 stream_id;} /* if PERF_SAMPLE_STREAM_ID set */ |
| { u32 cpu, res; } /* if PERF_SAMPLE_CPU set */ |
| { u64 id; } /* if PERF_SAMPLE_IDENTIFIER set */ |
| }; |
| .EE |
| .in |
| .TP |
| .IR exclude_host " (since Linux 3.2)" |
| .\" commit a240f76165e6255384d4bdb8139895fac7988799 |
| When conducting measurements that include processes running |
| VM instances (i.e., have executed a |
| .B KVM_RUN |
| .BR ioctl (2)), |
| only measure events happening inside a guest instance. |
| This is only meaningful outside the guests; this setting does |
| not change counts gathered inside of a guest. |
| Currently, this functionality is x86 only. |
| .TP |
| .IR exclude_guest " (since Linux 3.2)" |
| .\" commit a240f76165e6255384d4bdb8139895fac7988799 |
| When conducting measurements that include processes running |
| VM instances (i.e., have executed a |
| .B KVM_RUN |
| .BR ioctl (2)), |
| do not measure events happening inside guest instances. |
| This is only meaningful outside the guests; this setting does |
| not change counts gathered inside of a guest. |
| Currently, this functionality is x86 only. |
| .TP |
| .IR exclude_callchain_kernel " (since Linux 3.7)" |
| .\" commit d077526485d5c9b12fe85d0b2b3b7041e6bc5f91 |
| Do not include kernel callchains. |
| .TP |
| .IR exclude_callchain_user " (since Linux 3.7)" |
| .\" commit d077526485d5c9b12fe85d0b2b3b7041e6bc5f91 |
| Do not include user callchains. |
| .TP |
| .IR mmap2 " (since Linux 3.16)" |
| .\" commit 13d7a2410fa637f450a29ecb515ac318ee40c741 |
| .\" This is tricky; was committed during 3.12 development |
| .\" but right before release was disabled. |
| .\" So while you could select mmap2 starting with 3.12 |
| .\" it did not work until 3.16 |
| .\" commit a5a5ba72843dd05f991184d6cb9a4471acce1005 |
| Generate an extended executable mmap record that contains enough |
| additional information to uniquely identify shared mappings. |
| The |
| .I mmap |
| flag must also be set for this to work. |
| .TP |
| .IR comm_exec " (since Linux 3.16)" |
| .\" commit 82b897782d10fcc4930c9d4a15b175348fdd2871 |
| This is purely a feature-detection flag, it does not change |
| kernel behavior. |
| If this flag can successfully be set, then, when |
| .I comm |
| is enabled, the |
| .B PERF_RECORD_MISC_COMM_EXEC |
| flag will be set in the |
| .I misc |
| field of a comm record header if the rename event being |
| reported was caused by a call to |
| .BR execve (2). |
| This allows tools to distinguish between the various |
| types of process renaming. |
| .TP |
| .IR use_clockid " (since Linux 4.1)" |
| .\" commit 34f439278cef7b1177f8ce24f9fc81dfc6221d3b |
| This allows selecting which internal Linux clock to use |
| when generating timestamps via the |
| .I clockid |
| field. |
| This can make it easier to correlate perf sample times with |
| timestamps generated by other tools. |
| .TP |
| .IR context_switch " (since Linux 4.3)" |
| .\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4 |
| This enables the generation of |
| .B PERF_RECORD_SWITCH |
| records when a context switch occurs. |
| It also enables the generation of |
| .B PERF_RECORD_SWITCH_CPU_WIDE |
| records when sampling in CPU-wide mode. |
| This functionality is in addition to existing tracepoint and |
| software events for measuring context switches. |
| The advantage of this method is that it will give full |
| information even with strict |
| .I perf_event_paranoid |
| settings. |
| .TP |
| .IR write_backward " (since Linux 4.6)" |
| .\" commit 9ecda41acb971ebd07c8fb35faf24005c0baea12 |
| This causes the ring buffer to be written from the end to the beginning. |
| This is to support reading from an overwritable ring buffer. |
| .TP |
| .IR namespaces " (since Linux 4.11)" |
| .\" commit e422267322cd319e2695a535e47c5b1feeac45eb |
| This enables the generation of |
| .B PERF_RECORD_NAMESPACES |
| records when a task enters a new namespace. |
| Each namespace has a combination of device and inode numbers. |
| .TP |
| .IR ksymbol " (since Linux 5.0)" |
| .\" commit 76193a94522f1d4edf2447a536f3f796ce56343b |
| This enables the generation of |
| .B PERF_RECORD_KSYMBOL |
| records when new kernel symbols are registered or unregistered. |
| This is useful for analyzing dynamic kernel functions like eBPF. |
| .TP |
| .IR bpf_event " (since Linux 5.0)" |
| .\" commit 6ee52e2a3fe4ea35520720736e6791df1fb67106 |
| This enables the generation of |
| .B PERF_RECORD_BPF_EVENT |
| records when an eBPF program is loaded or unloaded. |
| .TP |
| .IR aux_output " (since Linux 5.4)" |
| .\" commit ab43762ef010967e4ccd53627f70a2eecbeafefb |
| This allows normal (non-AUX) events to generate data for AUX events |
| if the hardware supports it. |
| .TP |
| .IR cgroup " (since Linux 5.7)" |
| .\" commit 96aaab686505c449e24d76e76507290dcc30e008 |
| This enables the generation of |
| .B PERF_RECORD_CGROUP |
| records when a new cgroup is created (and activated). |
| .TP |
| .IR text_poke " (since Linux 5.8)" |
| .\" commit e17d43b93e544f5016c0251d2074c15568d5d963 |
| This enables the generation of |
| .B PERF_RECORD_TEXT_POKE |
| records when there's a change to the kernel text |
| (i.e., self-modifying code). |
| .TP |
| .IR wakeup_events ", " wakeup_watermark |
| This union sets how many samples |
| .RI ( wakeup_events ) |
| or bytes |
| .RI ( wakeup_watermark ) |
| happen before an overflow notification happens. |
| Which one is used is selected by the |
| .I watermark |
| bit flag. |
| .IP |
| .I wakeup_events |
| counts only |
| .B PERF_RECORD_SAMPLE |
| record types. |
| To receive overflow notification for all |
| .B PERF_RECORD |
| types choose watermark and set |
| .I wakeup_watermark |
| to 1. |
| .IP |
| Prior to Linux 3.0, setting |
| .\" commit f506b3dc0ec454a16d40cab9ee5d75435b39dc50 |
| .I wakeup_events |
| to 0 resulted in no overflow notifications; |
| more recent kernels treat 0 the same as 1. |
| .TP |
| .IR bp_type " (since Linux 2.6.33)" |
| .\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e |
| This chooses the breakpoint type. |
| It is one of: |
| .RS |
| .TP |
| .B HW_BREAKPOINT_EMPTY |
| No breakpoint. |
| .TP |
| .B HW_BREAKPOINT_R |
| Count when we read the memory location. |
| .TP |
| .B HW_BREAKPOINT_W |
| Count when we write the memory location. |
| .TP |
| .B HW_BREAKPOINT_RW |
| Count when we read or write the memory location. |
| .TP |
| .B HW_BREAKPOINT_X |
| Count when we execute code at the memory location. |
| .PP |
| The values can be combined via a bitwise or, but the |
| combination of |
| .B HW_BREAKPOINT_R |
| or |
| .B HW_BREAKPOINT_W |
| with |
| .B HW_BREAKPOINT_X |
| is not allowed. |
| .RE |
| .TP |
| .IR bp_addr " (since Linux 2.6.33)" |
| .\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e |
| This is the address of the breakpoint. |
| For execution breakpoints, this is the memory address of the instruction |
| of interest; for read and write breakpoints, it is the memory address |
| of the memory location of interest. |
| .TP |
| .IR config1 " (since Linux 2.6.39)" |
| .\" commit a7e3ed1e470116c9d12c2f778431a481a6be8ab6 |
| .I config1 |
| is used for setting events that need an extra register or otherwise |
| do not fit in the regular config field. |
| Raw OFFCORE_EVENTS on Nehalem/Westmere/SandyBridge use this field |
| on Linux 3.3 and later kernels. |
| .TP |
| .IR bp_len " (since Linux 2.6.33)" |
| .\" commit 24f1e32c60c45c89a997c73395b69c8af6f0a84e |
| .I bp_len |
| is the length of the breakpoint being measured if |
| .I type |
| is |
| .BR PERF_TYPE_BREAKPOINT . |
| Options are |
| .BR HW_BREAKPOINT_LEN_1 , |
| .BR HW_BREAKPOINT_LEN_2 , |
| .BR HW_BREAKPOINT_LEN_4 , |
| and |
| .BR HW_BREAKPOINT_LEN_8 . |
| For an execution breakpoint, set this to |
| .IR sizeof(long) . |
| .TP |
| .IR config2 " (since Linux 2.6.39)" |
| .\" commit a7e3ed1e470116c9d12c2f778431a481a6be8ab6 |
| .I config2 |
| is a further extension of the |
| .I config1 |
| field. |
| .TP |
| .IR branch_sample_type " (since Linux 3.4)" |
| .\" commit bce38cd53e5ddba9cb6d708c4ef3d04a4016ec7e |
| If |
| .B PERF_SAMPLE_BRANCH_STACK |
| is enabled, then this specifies what branches to include |
| in the branch record. |
| .IP |
| The first part of the value is the privilege level, which |
| is a combination of one of the values listed below. |
| If the user does not set privilege level explicitly, the kernel |
| will use the event's privilege level. |
| Event and branch privilege levels do not have to match. |
| .RS |
| .TP |
| .B PERF_SAMPLE_BRANCH_USER |
| Branch target is in user space. |
| .TP |
| .B PERF_SAMPLE_BRANCH_KERNEL |
| Branch target is in kernel space. |
| .TP |
| .B PERF_SAMPLE_BRANCH_HV |
| Branch target is in hypervisor. |
| .TP |
| .B PERF_SAMPLE_BRANCH_PLM_ALL |
| A convenience value that is the three preceding values ORed together. |
| .PP |
| In addition to the privilege value, at least one or more of the |
| following bits must be set. |
| .TP |
| .B PERF_SAMPLE_BRANCH_ANY |
| Any branch type. |
| .TP |
| .B PERF_SAMPLE_BRANCH_ANY_CALL |
| Any call branch (includes direct calls, indirect calls, and far jumps). |
| .TP |
| .B PERF_SAMPLE_BRANCH_IND_CALL |
| Indirect calls. |
| .TP |
| .BR PERF_SAMPLE_BRANCH_CALL " (since Linux 4.4)" |
| .\" commit c229bf9dc179d2023e185c0f705bdf68484c1e73 |
| Direct calls. |
| .TP |
| .B PERF_SAMPLE_BRANCH_ANY_RETURN |
| Any return branch. |
| .TP |
| .BR PERF_SAMPLE_BRANCH_IND_JUMP " (since Linux 4.2)" |
| .\" commit c9fdfa14c3792c0160849c484e83aa57afd80ccc |
| Indirect jumps. |
| .TP |
| .BR PERF_SAMPLE_BRANCH_COND " (since Linux 3.16)" |
| .\" commit bac52139f0b7ab31330e98fd87fc5a2664951050 |
| Conditional branches. |
| .TP |
| .BR PERF_SAMPLE_BRANCH_ABORT_TX " (since Linux 3.11)" |
| .\" commit 135c5612c460f89657c4698fe2ea753f6f667963 |
| Transactional memory aborts. |
| .TP |
| .BR PERF_SAMPLE_BRANCH_IN_TX " (since Linux 3.11)" |
| .\" commit 135c5612c460f89657c4698fe2ea753f6f667963 |
| Branch in transactional memory transaction. |
| .TP |
| .BR PERF_SAMPLE_BRANCH_NO_TX " (since Linux 3.11)" |
| .\" commit 135c5612c460f89657c4698fe2ea753f6f667963 |
| Branch not in transactional memory transaction. |
| .TP |
| .BR PERF_SAMPLE_BRANCH_CALL_STACK " (since Linux 4.1)" |
| .\" commit 2c44b1936bb3b135a3fac8b3493394d42e51cf70 |
| Branch is part of a hardware-generated call stack. |
| This requires hardware support, currently only found |
| on Intel x86 Haswell or newer. |
| .RE |
| .TP |
| .IR sample_regs_user " (since Linux 3.7)" |
| .\" commit 4018994f3d8785275ef0e7391b75c3462c029e56 |
| This bit mask defines the set of user CPU registers to dump on samples. |
| The layout of the register mask is architecture-specific and |
| is described in the kernel header file |
| .IR arch/ARCH/include/uapi/asm/perf_regs.h . |
| .TP |
| .IR sample_stack_user " (since Linux 3.7)" |
| .\" commit c5ebcedb566ef17bda7b02686e0d658a7bb42ee7 |
| This defines the size of the user stack to dump if |
| .B PERF_SAMPLE_STACK_USER |
| is specified. |
| .TP |
| .IR clockid " (since Linux 4.1)" |
| .\" commit 34f439278cef7b1177f8ce24f9fc81dfc6221d3b |
| If |
| .I use_clockid |
| is set, then this field selects which internal Linux timer to |
| use for timestamps. |
| The available timers are defined in |
| .IR linux/time.h , |
| with |
| .BR CLOCK_MONOTONIC , |
| .BR CLOCK_MONOTONIC_RAW , |
| .BR CLOCK_REALTIME , |
| .BR CLOCK_BOOTTIME , |
| and |
| .B CLOCK_TAI |
| currently supported. |
| .TP |
| .IR aux_watermark " (since Linux 4.1)" |
| .\" commit 1a5941312414c71dece6717da9a0fa1303127afa |
| This specifies how much data is required to trigger a |
| .B PERF_RECORD_AUX |
| sample. |
| .TP |
| .IR sample_max_stack " (since Linux 4.8)" |
| .\" commit 97c79a38cd454602645f0470ffb444b3b75ce574 |
| When |
| .I sample_type |
| includes |
| .BR PERF_SAMPLE_CALLCHAIN , |
| this field specifies how many stack frames to report when |
| generating the callchain. |
| .SS Reading results |
| Once a |
| .BR perf_event_open () |
| file descriptor has been opened, the values |
| of the events can be read from the file descriptor. |
| The values that are there are specified by the |
| .I read_format |
| field in the |
| .I attr |
| structure at open time. |
| .PP |
| If you attempt to read into a buffer that is not big enough to hold the |
| data, the error |
| .B ENOSPC |
| results. |
| .PP |
| Here is the layout of the data returned by a read: |
| .IP * 2 |
| If |
| .B PERF_FORMAT_GROUP |
| was specified to allow reading all events in a group at once: |
| .IP |
| .in +4n |
| .EX |
| struct read_format { |
| u64 nr; /* The number of events */ |
| u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */ |
| u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */ |
| struct { |
| u64 value; /* The value of the event */ |
| u64 id; /* if PERF_FORMAT_ID */ |
| } values[nr]; |
| }; |
| .EE |
| .in |
| .IP * |
| If |
| .B PERF_FORMAT_GROUP |
| was |
| .I not |
| specified: |
| .IP |
| .in +4n |
| .EX |
| struct read_format { |
| u64 value; /* The value of the event */ |
| u64 time_enabled; /* if PERF_FORMAT_TOTAL_TIME_ENABLED */ |
| u64 time_running; /* if PERF_FORMAT_TOTAL_TIME_RUNNING */ |
| u64 id; /* if PERF_FORMAT_ID */ |
| }; |
| .EE |
| .in |
| .PP |
| The values read are as follows: |
| .TP |
| .I nr |
| The number of events in this file descriptor. |
| Available only if |
| .B PERF_FORMAT_GROUP |
| was specified. |
| .TP |
| .IR time_enabled ", " time_running |
| Total time the event was enabled and running. |
| Normally these values are the same. |
| Multiplexing happens if the number of events is more than the |
| number of available PMU counter slots. |
| In that case the events run only part of the time and the |
| .I time_enabled |
| and |
| .I time_running |
| values can be used to scale an estimated value for the count. |
| .TP |
| .I value |
| An unsigned 64-bit value containing the counter result. |
| .TP |
| .I id |
| A globally unique value for this particular event; only present if |
| .B PERF_FORMAT_ID |
| was specified in |
| .IR read_format . |
| .SS MMAP layout |
| When using |
| .BR perf_event_open () |
| in sampled mode, asynchronous events |
| (like counter overflow or |
| .B PROT_EXEC |
| mmap tracking) |
| are logged into a ring-buffer. |
| This ring-buffer is created and accessed through |
| .BR mmap (2). |
| .PP |
| The mmap size should be 1+2^n pages, where the first page is a |
| metadata page |
| .RI ( "struct perf_event_mmap_page" ) |
| that contains various |
| bits of information such as where the ring-buffer head is. |
| .PP |
| Before kernel 2.6.39, there is a bug that means you must allocate an mmap |
| ring buffer when sampling even if you do not plan to access it. |
| .PP |
| The structure of the first metadata mmap page is as follows: |
| .PP |
| .in +4n |
| .EX |
| struct perf_event_mmap_page { |
| __u32 version; /* version number of this structure */ |
| __u32 compat_version; /* lowest version this is compat with */ |
| __u32 lock; /* seqlock for synchronization */ |
| __u32 index; /* hardware counter identifier */ |
| __s64 offset; /* add to hardware counter value */ |
| __u64 time_enabled; /* time event active */ |
| __u64 time_running; /* time event on CPU */ |
| union { |
| __u64 capabilities; |
| struct { |
| __u64 cap_usr_time / cap_usr_rdpmc / cap_bit0 : 1, |
| cap_bit0_is_deprecated : 1, |
| cap_user_rdpmc : 1, |
| cap_user_time : 1, |
| cap_user_time_zero : 1, |
| }; |
| }; |
| __u16 pmc_width; |
| __u16 time_shift; |
| __u32 time_mult; |
| __u64 time_offset; |
| __u64 __reserved[120]; /* Pad to 1 k */ |
| __u64 data_head; /* head in the data section */ |
| __u64 data_tail; /* user\-space written tail */ |
| __u64 data_offset; /* where the buffer starts */ |
| __u64 data_size; /* data buffer size */ |
| __u64 aux_head; |
| __u64 aux_tail; |
| __u64 aux_offset; |
| __u64 aux_size; |
| |
| }; |
| .EE |
| .in |
| .PP |
| The following list describes the fields in the |
| .I perf_event_mmap_page |
| structure in more detail: |
| .TP |
| .I version |
| Version number of this structure. |
| .TP |
| .I compat_version |
| The lowest version this is compatible with. |
| .TP |
| .I lock |
| A seqlock for synchronization. |
| .TP |
| .I index |
| A unique hardware counter identifier. |
| .TP |
| .I offset |
| When using rdpmc for reads this offset value |
| must be added to the one returned by rdpmc to get |
| the current total event count. |
| .TP |
| .I time_enabled |
| Time the event was active. |
| .TP |
| .I time_running |
| Time the event was running. |
| .TP |
| .IR cap_usr_time " / " cap_usr_rdpmc " / " cap_bit0 " (since Linux 3.4)" |
| .\" commit c7206205d00ab375839bd6c7ddb247d600693c09 |
| There was a bug in the definition of |
| .I cap_usr_time |
| and |
| .I cap_usr_rdpmc |
| from Linux 3.4 until Linux 3.11. |
| Both bits were defined to point to the same location, so it was |
| impossible to know if |
| .I cap_usr_time |
| or |
| .I cap_usr_rdpmc |
| were actually set. |
| .IP |
| Starting with Linux 3.12, these are renamed to |
| .\" commit fa7315871046b9a4c48627905691dbde57e51033 |
| .I cap_bit0 |
| and you should use the |
| .I cap_user_time |
| and |
| .I cap_user_rdpmc |
| fields instead. |
| .TP |
| .IR cap_bit0_is_deprecated " (since Linux 3.12)" |
| .\" commit fa7315871046b9a4c48627905691dbde57e51033 |
| If set, this bit indicates that the kernel supports |
| the properly separated |
| .I cap_user_time |
| and |
| .I cap_user_rdpmc |
| bits. |
| .IP |
| If not set, it indicates an older kernel where |
| .I cap_usr_time |
| and |
| .I cap_usr_rdpmc |
| map to the same bit and thus both features should |
| be used with caution. |
| .TP |
| .IR cap_user_rdpmc " (since Linux 3.12)" |
| .\" commit fa7315871046b9a4c48627905691dbde57e51033 |
| If the hardware supports user-space read of performance counters |
| without syscall (this is the "rdpmc" instruction on x86), then |
| the following code can be used to do a read: |
| .IP |
| .in +4n |
| .EX |
| u32 seq, time_mult, time_shift, idx, width; |
| u64 count, enabled, running; |
| u64 cyc, time_offset; |
| |
| do { |
| seq = pc\->lock; |
| barrier(); |
| enabled = pc\->time_enabled; |
| running = pc\->time_running; |
| |
| if (pc\->cap_usr_time && enabled != running) { |
| cyc = rdtsc(); |
| time_offset = pc\->time_offset; |
| time_mult = pc\->time_mult; |
| time_shift = pc\->time_shift; |
| } |
| |
| idx = pc\->index; |
| count = pc\->offset; |
| |
| if (pc\->cap_usr_rdpmc && idx) { |
| width = pc\->pmc_width; |
| count += rdpmc(idx \- 1); |
| } |
| |
| barrier(); |
| } while (pc\->lock != seq); |
| .EE |
| .in |
| .TP |
| .IR cap_user_time " (since Linux 3.12)" |
| .\" commit fa7315871046b9a4c48627905691dbde57e51033 |
| This bit indicates the hardware has a constant, nonstop |
| timestamp counter (TSC on x86). |
| .TP |
| .IR cap_user_time_zero " (since Linux 3.12)" |
| .\" commit fa7315871046b9a4c48627905691dbde57e51033 |
| Indicates the presence of |
| .I time_zero |
| which allows mapping timestamp values to |
| the hardware clock. |
| .TP |
| .I pmc_width |
| If |
| .IR cap_usr_rdpmc , |
| this field provides the bit-width of the value |
| read using the rdpmc or equivalent instruction. |
| This can be used to sign extend the result like: |
| .IP |
| .in +4n |
| .EX |
| pmc <<= 64 \- pmc_width; |
| pmc >>= 64 \- pmc_width; // signed shift right |
| count += pmc; |
| .EE |
| .in |
| .TP |
| .IR time_shift ", " time_mult ", " time_offset |
| .IP |
| If |
| .IR cap_usr_time , |
| these fields can be used to compute the time |
| delta since |
| .I time_enabled |
| (in nanoseconds) using rdtsc or similar. |
| .IP |
| .in +4n |
| .EX |
| u64 quot, rem; |
| u64 delta; |
| |
| quot = cyc >> time_shift; |
| rem = cyc & (((u64)1 << time_shift) \- 1); |
| delta = time_offset + quot * time_mult + |
| ((rem * time_mult) >> time_shift); |
| .EE |
| .in |
| .IP |
| Where |
| .IR time_offset , |
| .IR time_mult , |
| .IR time_shift , |
| and |
| .I cyc |
| are read in the |
| seqcount loop described above. |
| This delta can then be added to |
| enabled and possibly running (if idx), improving the scaling: |
| .IP |
| .in +4n |
| .EX |
| enabled += delta; |
| if (idx) |
| running += delta; |
| quot = count / running; |
| rem = count % running; |
| count = quot * enabled + (rem * enabled) / running; |
| .EE |
| .in |
| .TP |
| .IR time_zero " (since Linux 3.12)" |
| .\" commit fa7315871046b9a4c48627905691dbde57e51033 |
| .IP |
| If |
| .I cap_user_time_zero |
| is set, then the hardware clock (the TSC timestamp counter on x86) |
| can be calculated from the |
| .IR time_zero , |
| .IR time_mult , |
| and |
| .I time_shift |
| values: |
| .IP |
| .in +4n |
| .EX |
| time = timestamp \- time_zero; |
| quot = time / time_mult; |
| rem = time % time_mult; |
| cyc = (quot << time_shift) + (rem << time_shift) / time_mult; |
| .EE |
| .in |
| .IP |
| And vice versa: |
| .IP |
| .in +4n |
| .EX |
| quot = cyc >> time_shift; |
| rem = cyc & (((u64)1 << time_shift) \- 1); |
| timestamp = time_zero + quot * time_mult + |
| ((rem * time_mult) >> time_shift); |
| .EE |
| .in |
| .TP |
| .I data_head |
| This points to the head of the data section. |
| The value continuously increases, it does not wrap. |
| The value needs to be manually wrapped by the size of the mmap buffer |
| before accessing the samples. |
| .IP |
| On SMP-capable platforms, after reading the |
| .I data_head |
| value, |
| user space should issue an rmb(). |
| .TP |
| .I data_tail |
| When the mapping is |
| .BR PROT_WRITE , |
| the |
| .I data_tail |
| value should be written by user space to reflect the last read data. |
| In this case, the kernel will not overwrite unread data. |
| .TP |
| .IR data_offset " (since Linux 4.1)" |
| .\" commit e8c6deac69629c0cb97c3d3272f8631ef17f8f0f |
| Contains the offset of the location in the mmap buffer |
| where perf sample data begins. |
| .TP |
| .IR data_size " (since Linux 4.1)" |
| .\" commit e8c6deac69629c0cb97c3d3272f8631ef17f8f0f |
| Contains the size of the perf sample region within |
| the mmap buffer. |
| .TP |
| .IR aux_head ", " aux_tail ", " aux_offset ", " aux_size " (since Linux 4.1)" |
| .\" commit 45bfb2e50471abbbfd83d40d28c986078b0d24ff |
| The AUX region allows |
| .BR mmap (2)-ing |
| a separate sample buffer for |
| high-bandwidth data streams (separate from the main perf sample buffer). |
| An example of a high-bandwidth stream is instruction tracing support, |
| as is found in newer Intel processors. |
| .IP |
| To set up an AUX area, first |
| .I aux_offset |
| needs to be set with an offset greater than |
| .IR data_offset + data_size |
| and |
| .I aux_size |
| needs to be set to the desired buffer size. |
| The desired offset and size must be page aligned, and the size |
| must be a power of two. |
| These values are then passed to mmap in order to map the AUX buffer. |
| Pages in the AUX buffer are included as part of the |
| .B RLIMIT_MEMLOCK |
| resource limit (see |
| .BR setrlimit (2)), |
| and also as part of the |
| .I perf_event_mlock_kb |
| allowance. |
| .IP |
| By default, the AUX buffer will be truncated if it will not fit |
| in the available space in the ring buffer. |
| If the AUX buffer is mapped as a read only buffer, then it will |
| operate in ring buffer mode where old data will be overwritten |
| by new. |
| In overwrite mode, it might not be possible to infer where the |
| new data began, and it is the consumer's job to disable |
| measurement while reading to avoid possible data races. |
| .IP |
| The |
| .I aux_head |
| and |
| .I aux_tail |
| ring buffer pointers have the same behavior and ordering |
| rules as the previously described |
| .I data_head |
| and |
| .IR data_tail . |
| .PP |
| The following 2^n ring-buffer pages have the layout described below. |
| .PP |
| If |
| .I perf_event_attr.sample_id_all |
| is set, then all event types will |
| have the sample_type selected fields related to where/when (identity) |
| an event took place (TID, TIME, ID, CPU, STREAM_ID) described in |
| .B PERF_RECORD_SAMPLE |
| below; these fields will be stashed just after the |
| .I perf_event_header |
| and the fields already present for the existing |
| fields, that is, at the end of the payload. |
| This allows a newer perf.data |
| file to be supported by older perf tools, with the new optional |
| fields being ignored. |
| .PP |
| The mmap values start with a header: |
| .PP |
| .in +4n |
| .EX |
| struct perf_event_header { |
| __u32 type; |
| __u16 misc; |
| __u16 size; |
| }; |
| .EE |
| .in |
| .PP |
| Below, we describe the |
| .I perf_event_header |
| fields in more detail. |
| For ease of reading, |
| the fields with shorter descriptions are presented first. |
| .TP |
| .I size |
| This indicates the size of the record. |
| .TP |
| .I misc |
| The |
| .I misc |
| field contains additional information about the sample. |
| .IP |
| The CPU mode can be determined from this value by masking with |
| .B PERF_RECORD_MISC_CPUMODE_MASK |
| and looking for one of the following (note these are not |
| bit masks, only one can be set at a time): |
| .RS |
| .TP |
| .B PERF_RECORD_MISC_CPUMODE_UNKNOWN |
| Unknown CPU mode. |
| .TP |
| .B PERF_RECORD_MISC_KERNEL |
| Sample happened in the kernel. |
| .TP |
| .B PERF_RECORD_MISC_USER |
| Sample happened in user code. |
| .TP |
| .B PERF_RECORD_MISC_HYPERVISOR |
| Sample happened in the hypervisor. |
| .TP |
| .BR PERF_RECORD_MISC_GUEST_KERNEL " (since Linux 2.6.35)" |
| .\" commit 39447b386c846bbf1c56f6403c5282837486200f |
| Sample happened in the guest kernel. |
| .TP |
| .BR PERF_RECORD_MISC_GUEST_USER " (since Linux 2.6.35)" |
| .\" commit 39447b386c846bbf1c56f6403c5282837486200f |
| Sample happened in guest user code. |
| .RE |
| .PP |
| .RS |
| Since the following three statuses are generated by |
| different record types, they alias to the same bit: |
| .TP |
| .BR PERF_RECORD_MISC_MMAP_DATA " (since Linux 3.10)" |
| .\" commit 2fe85427e3bf65d791700d065132772fc26e4d75 |
| This is set when the mapping is not executable; |
| otherwise the mapping is executable. |
| .TP |
| .BR PERF_RECORD_MISC_COMM_EXEC " (since Linux 3.16)" |
| .\" commit 82b897782d10fcc4930c9d4a15b175348fdd2871 |
| This is set for a |
| .B PERF_RECORD_COMM |
| record on kernels more recent than Linux 3.16 |
| if a process name change was caused by an |
| .BR execve (2) |
| system call. |
| .TP |
| .BR PERF_RECORD_MISC_SWITCH_OUT " (since Linux 4.3)" |
| .\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4 |
| When a |
| .B PERF_RECORD_SWITCH |
| or |
| .B PERF_RECORD_SWITCH_CPU_WIDE |
| record is generated, this bit indicates that the |
| context switch is away from the current process |
| (instead of into the current process). |
| .RE |
| .PP |
| .RS |
| In addition, the following bits can be set: |
| .TP |
| .B PERF_RECORD_MISC_EXACT_IP |
| This indicates that the content of |
| .B PERF_SAMPLE_IP |
| points |
| to the actual instruction that triggered the event. |
| See also |
| .IR perf_event_attr.precise_ip . |
| .TP |
| .BR PERF_RECORD_MISC_EXT_RESERVED " (since Linux 2.6.35)" |
| .\" commit 1676b8a077c352085d52578fb4f29350b58b6e74 |
| This indicates there is extended data available (currently not used). |
| .TP |
| .B PERF_RECORD_MISC_PROC_MAP_PARSE_TIMEOUT |
| .\" commit 930e6fcd2bcce9bcd9d4aa7e755678d33f3fe6f4 |
| This bit is not set by the kernel. |
| It is reserved for the user-space perf utility to indicate that |
| .I /proc/[pid]/maps |
| parsing was taking too long and was stopped, and thus the mmap |
| records may be truncated. |
| .RE |
| .TP |
| .I type |
| The |
| .I type |
| value is one of the below. |
| The values in the corresponding record (that follows the header) |
| depend on the |
| .I type |
| selected as shown. |
| .RS |
| .TP 4 |
| .B PERF_RECORD_MMAP |
| The MMAP events record the |
| .B PROT_EXEC |
| mappings so that we can correlate |
| user-space IPs to code. |
| They have the following structure: |
| .IP |
| .in +4n |
| .EX |
| struct { |
| struct perf_event_header header; |
| u32 pid, tid; |
| u64 addr; |
| u64 len; |
| u64 pgoff; |
| char filename[]; |
| }; |
| .EE |
| .in |
| .RS |
| .TP |
| .I pid |
| is the process ID. |
| .TP |
| .I tid |
| is the thread ID. |
| .TP |
| .I addr |
| is the address of the allocated memory. |
| .TP |
| .I len |
| is the length of the allocated memory. |
| .TP |
| .I pgoff |
| is the page offset of the allocated memory. |
| .TP |
| .I filename |
| is a string describing the backing of the allocated memory. |
| .RE |
| .TP |
| .B PERF_RECORD_LOST |
| This record indicates when events are lost. |
| .IP |
| .in +4n |
| .EX |
| struct { |
| struct perf_event_header header; |
| u64 id; |
| u64 lost; |
| struct sample_id sample_id; |
| }; |
| .EE |
| .in |
| .RS |
| .TP |
| .I id |
| is the unique event ID for the samples that were lost. |
| .TP |
| .I lost |
| is the number of events that were lost. |
| .RE |
| .TP |
| .B PERF_RECORD_COMM |
| This record indicates a change in the process name. |
| .IP |
| .in +4n |
| .EX |
| struct { |
| struct perf_event_header header; |
| u32 pid; |
| u32 tid; |
| char comm[]; |
| struct sample_id sample_id; |
| }; |
| .EE |
| .in |
| .RS |
| .TP |
| .I pid |
| is the process ID. |
| .TP |
| .I tid |
| is the thread ID. |
| .TP |
| .I comm |
| is a string containing the new name of the process. |
| .RE |
| .TP |
| .B PERF_RECORD_EXIT |
| This record indicates a process exit event. |
| .IP |
| .in +4n |
| .EX |
| struct { |
| struct perf_event_header header; |
| u32 pid, ppid; |
| u32 tid, ptid; |
| u64 time; |
| struct sample_id sample_id; |
| }; |
| .EE |
| .in |
| .TP |
| .BR PERF_RECORD_THROTTLE ", " PERF_RECORD_UNTHROTTLE |
| This record indicates a throttle/unthrottle event. |
| .IP |
| .in +4n |
| .EX |
| struct { |
| struct perf_event_header header; |
| u64 time; |
| u64 id; |
| u64 stream_id; |
| struct sample_id sample_id; |
| }; |
| .EE |
| .in |
| .TP |
| .B PERF_RECORD_FORK |
| This record indicates a fork event. |
| .IP |
| .in +4n |
| .EX |
| struct { |
| struct perf_event_header header; |
| u32 pid, ppid; |
| u32 tid, ptid; |
| u64 time; |
| struct sample_id sample_id; |
| }; |
| .EE |
| .in |
| .TP |
| .B PERF_RECORD_READ |
| This record indicates a read event. |
| .IP |
| .in +4n |
| .EX |
| struct { |
| struct perf_event_header header; |
| u32 pid, tid; |
| struct read_format values; |
| struct sample_id sample_id; |
| }; |
| .EE |
| .in |
| .TP |
| .B PERF_RECORD_SAMPLE |
| This record indicates a sample. |
| .IP |
| .in +4n |
| .EX |
| struct { |
| struct perf_event_header header; |
| u64 sample_id; /* if PERF_SAMPLE_IDENTIFIER */ |
| u64 ip; /* if PERF_SAMPLE_IP */ |
| u32 pid, tid; /* if PERF_SAMPLE_TID */ |
| u64 time; /* if PERF_SAMPLE_TIME */ |
| u64 addr; /* if PERF_SAMPLE_ADDR */ |
| u64 id; /* if PERF_SAMPLE_ID */ |
| u64 stream_id; /* if PERF_SAMPLE_STREAM_ID */ |
| u32 cpu, res; /* if PERF_SAMPLE_CPU */ |
| u64 period; /* if PERF_SAMPLE_PERIOD */ |
| struct read_format v; |
| /* if PERF_SAMPLE_READ */ |
| u64 nr; /* if PERF_SAMPLE_CALLCHAIN */ |
| u64 ips[nr]; /* if PERF_SAMPLE_CALLCHAIN */ |
| u32 size; /* if PERF_SAMPLE_RAW */ |
| char data[size]; /* if PERF_SAMPLE_RAW */ |
| u64 bnr; /* if PERF_SAMPLE_BRANCH_STACK */ |
| struct perf_branch_entry lbr[bnr]; |
| /* if PERF_SAMPLE_BRANCH_STACK */ |
| u64 abi; /* if PERF_SAMPLE_REGS_USER */ |
| u64 regs[weight(mask)]; |
| /* if PERF_SAMPLE_REGS_USER */ |
| u64 size; /* if PERF_SAMPLE_STACK_USER */ |
| char data[size]; /* if PERF_SAMPLE_STACK_USER */ |
| u64 dyn_size; /* if PERF_SAMPLE_STACK_USER && |
| size != 0 */ |
| u64 weight; /* if PERF_SAMPLE_WEIGHT */ |
| u64 data_src; /* if PERF_SAMPLE_DATA_SRC */ |
| u64 transaction; /* if PERF_SAMPLE_TRANSACTION */ |
| u64 abi; /* if PERF_SAMPLE_REGS_INTR */ |
| u64 regs[weight(mask)]; |
| /* if PERF_SAMPLE_REGS_INTR */ |
| u64 phys_addr; /* if PERF_SAMPLE_PHYS_ADDR */ |
| u64 cgroup; /* if PERF_SAMPLE_CGROUP */ |
| }; |
| .EE |
| .in |
| .RS 4 |
| .TP 4 |
| .I sample_id |
| If |
| .B PERF_SAMPLE_IDENTIFIER |
| is enabled, a 64-bit unique ID is included. |
| This is a duplication of the |
| .B PERF_SAMPLE_ID |
| .I id |
| value, but included at the beginning of the sample |
| so parsers can easily obtain the value. |
| .TP |
| .I ip |
| If |
| .B PERF_SAMPLE_IP |
| is enabled, then a 64-bit instruction |
| pointer value is included. |
| .TP |
| .IR pid ", " tid |
| If |
| .B PERF_SAMPLE_TID |
| is enabled, then a 32-bit process ID |
| and 32-bit thread ID are included. |
| .TP |
| .I time |
| If |
| .B PERF_SAMPLE_TIME |
| is enabled, then a 64-bit timestamp |
| is included. |
| This is obtained via local_clock() which is a hardware timestamp |
| if available and the jiffies value if not. |
| .TP |
| .I addr |
| If |
| .B PERF_SAMPLE_ADDR |
| is enabled, then a 64-bit address is included. |
| This is usually the address of a tracepoint, |
| breakpoint, or software event; otherwise the value is 0. |
| .TP |
| .I id |
| If |
| .B PERF_SAMPLE_ID |
| is enabled, a 64-bit unique ID is included. |
| If the event is a member of an event group, the group leader ID is returned. |
| This ID is the same as the one returned by |
| .BR PERF_FORMAT_ID . |
| .TP |
| .I stream_id |
| If |
| .B PERF_SAMPLE_STREAM_ID |
| is enabled, a 64-bit unique ID is included. |
| Unlike |
| .B PERF_SAMPLE_ID |
| the actual ID is returned, not the group leader. |
| This ID is the same as the one returned by |
| .BR PERF_FORMAT_ID . |
| .TP |
| .IR cpu ", " res |
| If |
| .B PERF_SAMPLE_CPU |
| is enabled, this is a 32-bit value indicating |
| which CPU was being used, in addition to a reserved (unused) |
| 32-bit value. |
| .TP |
| .I period |
| If |
| .B PERF_SAMPLE_PERIOD |
| is enabled, a 64-bit value indicating |
| the current sampling period is written. |
| .TP |
| .I v |
| If |
| .B PERF_SAMPLE_READ |
| is enabled, a structure of type read_format |
| is included which has values for all events in the event group. |
| The values included depend on the |
| .I read_format |
| value used at |
| .BR perf_event_open () |
| time. |
| .TP |
| .IR nr ", " ips[nr] |
| If |
| .B PERF_SAMPLE_CALLCHAIN |
| is enabled, then a 64-bit number is included |
| which indicates how many 64-bit instruction pointers will |
| follow. |
| This is the current callchain. |
| .TP |
| .IR size ", " data[size] |
| If |
| .B PERF_SAMPLE_RAW |
| is enabled, then a 32-bit value indicating size |
| is included followed by an array of 8-bit values of length size. |
| The values are padded with 0 to have 64-bit alignment. |
| .IP |
| This RAW record data is opaque with respect to the ABI. |
| The ABI doesn't make any promises with respect to the stability |
| of its content; it may vary depending |
| on event, hardware, and kernel version. |
| .TP |
| .IR bnr ", " lbr[bnr] |
| If |
| .B PERF_SAMPLE_BRANCH_STACK |
| is enabled, then a 64-bit value indicating |
| the number of records is included, followed by |
| .I bnr |
| .I perf_branch_entry |
| structures which each include the fields: |
| .RS |
| .TP |
| .I from |
| This indicates the source instruction (may not be a branch). |
| .TP |
| .I to |
| The branch target. |
| .TP |
| .I mispred |
| The branch target was mispredicted. |
| .TP |
| .I predicted |
| The branch target was predicted. |
| .TP |
| .IR in_tx " (since Linux 3.11)" |
| .\" commit 135c5612c460f89657c4698fe2ea753f6f667963 |
| The branch was in a transactional memory transaction. |
| .TP |
| .IR abort " (since Linux 3.11)" |
| .\" commit 135c5612c460f89657c4698fe2ea753f6f667963 |
| The branch was in an aborted transactional memory transaction. |
| .TP |
| .IR cycles " (since Linux 4.3)" |
| .\" commit 71ef3c6b9d4665ee7afbbe4c208a98917dcfc32f |
| This reports the number of cycles elapsed since the |
| previous branch stack update. |
| .PP |
| The entries are from most to least recent, so the first entry |
| has the most recent branch. |
| .PP |
| Support for |
| .IR mispred , |
| .IR predicted , |
| and |
| .I cycles |
| is optional; if not supported, those |
| values will be 0. |
| .PP |
| The type of branches recorded is specified by the |
| .I branch_sample_type |
| field. |
| .RE |
| .TP |
| .IR abi ", " regs[weight(mask)] |
| If |
| .B PERF_SAMPLE_REGS_USER |
| is enabled, then the user CPU registers are recorded. |
| .IP |
| The |
| .I abi |
| field is one of |
| .BR PERF_SAMPLE_REGS_ABI_NONE , |
| .BR PERF_SAMPLE_REGS_ABI_32 , |
| or |
| .BR PERF_SAMPLE_REGS_ABI_64 . |
| .IP |
| The |
| .I regs |
| field is an array of the CPU registers that were specified by |
| the |
| .I sample_regs_user |
| attr field. |
| The number of values is the number of bits set in the |
| .I sample_regs_user |
| bit mask. |
| .TP |
| .IR size ", " data[size] ", " dyn_size |
| If |
| .B PERF_SAMPLE_STACK_USER |
| is enabled, then the user stack is recorded. |
| This can be used to generate stack backtraces. |
| .I size |
| is the size requested by the user in |
| .I sample_stack_user |
| or else the maximum record size. |
| .I data |
| is the stack data (a raw dump of the memory pointed to by the |
| stack pointer at the time of sampling). |
| .I dyn_size |
| is the amount of data actually dumped (can be less than |
| .IR size ). |
| Note that |
| .I dyn_size |
| is omitted if |
| .I size |
| is 0. |
| .TP |
| .I weight |
| If |
| .B PERF_SAMPLE_WEIGHT |
| is enabled, then a 64-bit value provided by the hardware |
| is recorded that indicates how costly the event was. |
| This allows expensive events to stand out more clearly |
| in profiles. |
| .TP |
| .I data_src |
| If |
| .B PERF_SAMPLE_DATA_SRC |
| is enabled, then a 64-bit value is recorded that is made up of |
| the following fields: |
| .RS |
| .TP 4 |
| .I mem_op |
| Type of opcode, a bitwise combination of: |
| .IP |
| .PD 0 |
| .RS |
| .TP 24 |
| .B PERF_MEM_OP_NA |
| Not available |
| .TP |
| .B PERF_MEM_OP_LOAD |
| Load instruction |
| .TP |
| .B PERF_MEM_OP_STORE |
| Store instruction |
| .TP |
| .B PERF_MEM_OP_PFETCH |
| Prefetch |
| .TP |
| .B PERF_MEM_OP_EXEC |
| Executable code |
| .RE |
| .PD |
| .TP |
| .I mem_lvl |
| Memory hierarchy level hit or miss, a bitwise combination of |
| the following, shifted left by |
| .BR PERF_MEM_LVL_SHIFT : |
| .IP |
| .PD 0 |
| .RS |
| .TP 24 |
| .B PERF_MEM_LVL_NA |
| Not available |
| .TP |
| .B PERF_MEM_LVL_HIT |
| Hit |
| .TP |
| .B PERF_MEM_LVL_MISS |
| Miss |
| .TP |
| .B PERF_MEM_LVL_L1 |
| Level 1 cache |
| .TP |
| .B PERF_MEM_LVL_LFB |
| Line fill buffer |
| .TP |
| .B PERF_MEM_LVL_L2 |
| Level 2 cache |
| .TP |
| .B PERF_MEM_LVL_L3 |
| Level 3 cache |
| .TP |
| .B PERF_MEM_LVL_LOC_RAM |
| Local DRAM |
| .TP |
| .B PERF_MEM_LVL_REM_RAM1 |
| Remote DRAM 1 hop |
| .TP |
| .B PERF_MEM_LVL_REM_RAM2 |
| Remote DRAM 2 hops |
| .TP |
| .B PERF_MEM_LVL_REM_CCE1 |
| Remote cache 1 hop |
| .TP |
| .B PERF_MEM_LVL_REM_CCE2 |
| Remote cache 2 hops |
| .TP |
| .B PERF_MEM_LVL_IO |
| I/O memory |
| .TP |
| .B PERF_MEM_LVL_UNC |
| Uncached memory |
| .RE |
| .PD |
| .TP |
| .I mem_snoop |
| Snoop mode, a bitwise combination of the following, shifted left by |
| .BR PERF_MEM_SNOOP_SHIFT : |
| .IP |
| .PD 0 |
| .RS |
| .TP 24 |
| .B PERF_MEM_SNOOP_NA |
| Not available |
| .TP |
| .B PERF_MEM_SNOOP_NONE |
| No snoop |
| .TP |
| .B PERF_MEM_SNOOP_HIT |
| Snoop hit |
| .TP |
| .B PERF_MEM_SNOOP_MISS |
| Snoop miss |
| .TP |
| .B PERF_MEM_SNOOP_HITM |
| Snoop hit modified |
| .RE |
| .PD |
| .TP |
| .I mem_lock |
| Lock instruction, a bitwise combination of the following, shifted left by |
| .BR PERF_MEM_LOCK_SHIFT : |
| .IP |
| .PD 0 |
| .RS |
| .TP 24 |
| .B PERF_MEM_LOCK_NA |
| Not available |
| .TP |
| .B PERF_MEM_LOCK_LOCKED |
| Locked transaction |
| .RE |
| .PD |
| .TP |
| .I mem_dtlb |
| TLB access hit or miss, a bitwise combination of the following, shifted |
| left by |
| .BR PERF_MEM_TLB_SHIFT : |
| .IP |
| .PD 0 |
| .RS |
| .TP 24 |
| .B PERF_MEM_TLB_NA |
| Not available |
| .TP |
| .B PERF_MEM_TLB_HIT |
| Hit |
| .TP |
| .B PERF_MEM_TLB_MISS |
| Miss |
| .TP |
| .B PERF_MEM_TLB_L1 |
| Level 1 TLB |
| .TP |
| .B PERF_MEM_TLB_L2 |
| Level 2 TLB |
| .TP |
| .B PERF_MEM_TLB_WK |
| Hardware walker |
| .TP |
| .B PERF_MEM_TLB_OS |
| OS fault handler |
| .RE |
| .PD |
| .RE |
| .TP |
| .I transaction |
| If the |
| .B PERF_SAMPLE_TRANSACTION |
| flag is set, then a 64-bit field is recorded describing |
| the sources of any transactional memory aborts. |
| .IP |
| The field is a bitwise combination of the following values: |
| .RS |
| .TP |
| .B PERF_TXN_ELISION |
| Abort from an elision type transaction (Intel-CPU-specific). |
| .TP |
| .B PERF_TXN_TRANSACTION |
| Abort from a generic transaction. |
| .TP |
| .B PERF_TXN_SYNC |
| Synchronous abort (related to the reported instruction). |
| .TP |
| .B PERF_TXN_ASYNC |
| Asynchronous abort (not related to the reported instruction). |
| .TP |
| .B PERF_TXN_RETRY |
| Retryable abort (retrying the transaction may have succeeded). |
| .TP |
| .B PERF_TXN_CONFLICT |
| Abort due to memory conflicts with other threads. |
| .TP |
| .B PERF_TXN_CAPACITY_WRITE |
| Abort due to write capacity overflow. |
| .TP |
| .B PERF_TXN_CAPACITY_READ |
| Abort due to read capacity overflow. |
| .RE |
| .IP |
| In addition, a user-specified abort code can be obtained from |
| the high 32 bits of the field by shifting right by |
| .B PERF_TXN_ABORT_SHIFT |
| and masking with the value |
| .BR PERF_TXN_ABORT_MASK . |
| .TP |
| .IR abi ", " regs[weight(mask)] |
| If |
| .B PERF_SAMPLE_REGS_INTR |
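| is enabled, then the CPU registers in effect at the time of the |
| sampling interrupt are recorded |
| (this is kernel register state if the overflow happened while |
| kernel code was running). |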
| .IP |
| The |
| .I abi |
| field is one of |
| .BR PERF_SAMPLE_REGS_ABI_NONE , |
| .BR PERF_SAMPLE_REGS_ABI_32 , |
| or |
| .BR PERF_SAMPLE_REGS_ABI_64 . |
| .IP |
| The |
| .I regs |
| field is an array of the CPU registers that were specified by |
| the |
| .I sample_regs_intr |
| attr field. |
| The number of values is the number of bits set in the |
| .I sample_regs_intr |
| bit mask. |
| .TP |
| .I phys_addr |
| If the |
| .B PERF_SAMPLE_PHYS_ADDR |
| flag is set, then the 64-bit physical address is recorded. |
| .TP |
| .I cgroup |
| If the |
| .B PERF_SAMPLE_CGROUP |
| flag is set, |
| then the 64-bit cgroup ID (for the perf_event subsystem) is recorded. |
| To get the pathname of the cgroup, match this ID against the one in a |
| .B PERF_RECORD_CGROUP |
| record. |
| .RE |
| .TP |
| .B PERF_RECORD_MMAP2 |
| This record includes extended information on |
| .BR mmap (2) |
| calls returning executable mappings. |
| The format is similar to that of the |
| .B PERF_RECORD_MMAP |
| record, but includes extra values that allow uniquely identifying |
| shared mappings. |
| .IP |
| .in +4n |
| .EX |
| struct { |
| struct perf_event_header header; |
| u32 pid; |
| u32 tid; |
| u64 addr; |
| u64 len; |
| u64 pgoff; |
| u32 maj; |
| u32 min; |
| u64 ino; |
| u64 ino_generation; |
| u32 prot; |
| u32 flags; |
| char filename[]; |
| struct sample_id sample_id; |
| }; |
| .EE |
| .in |
| .RS |
| .TP |
| .I pid |
| is the process ID. |
| .TP |
| .I tid |
| is the thread ID. |
| .TP |
| .I addr |
| is the address of the allocated memory. |
| .TP |
| .I len |
| is the length of the allocated memory. |
| .TP |
| .I pgoff |
| is the page offset of the allocated memory. |
| .TP |
| .I maj |
| is the major ID of the underlying device. |
| .TP |
| .I min |
| is the minor ID of the underlying device. |
| .TP |
| .I ino |
| is the inode number. |
| .TP |
| .I ino_generation |
| is the inode generation. |
| .TP |
| .I prot |
| is the protection information. |
| .TP |
| .I flags |
| is the flags information. |
| .TP |
| .I filename |
| is a string describing the backing of the allocated memory. |
| .RE |
| .TP |
| .BR PERF_RECORD_AUX " (since Linux 4.1)" |
| .\" commit 68db7e98c3a6ebe7284b6cf14906ed7c55f3f7f0 |
| This record reports that new data is available in the separate |
| AUX buffer region. |
| .IP |
| .in +4n |
| .EX |
| struct { |
| struct perf_event_header header; |
| u64 aux_offset; |
| u64 aux_size; |
| u64 flags; |
| struct sample_id sample_id; |
| }; |
| .EE |
| .in |
| .RS |
| .TP |
| .I aux_offset |
| offset in the AUX mmap region where the new data begins. |
| .TP |
| .I aux_size |
| size of the data made available. |
| .TP |
| .I flags |
| describes the AUX update. |
| .RS |
| .TP |
| .B PERF_AUX_FLAG_TRUNCATED |
| if set, then the data returned was truncated to fit the available |
| buffer size. |
| .TP |
| .B PERF_AUX_FLAG_OVERWRITE |
| .\" commit 2023a0d2829e521fe6ad6b9907f3f90bfbf57142 |
| if set, then the data returned has overwritten previous data. |
| .RE |
| .RE |
| .TP |
| .BR PERF_RECORD_ITRACE_START " (since Linux 4.1)" |
| .\" ec0d7729bbaed4b9d2d3fada693278e13a3d1368 |
| This record indicates which process has initiated an instruction |
| trace event, allowing tools to properly correlate the instruction |
| addresses in the AUX buffer with the proper executable. |
| .IP |
| .in +4n |
| .EX |
| struct { |
| struct perf_event_header header; |
| u32 pid; |
| u32 tid; |
| }; |
| .EE |
| .in |
| .RS |
| .TP |
| .I pid |
| process ID of the thread starting an instruction trace. |
| .TP |
| .I tid |
| thread ID of the thread starting an instruction trace. |
| .RE |
| .TP |
| .BR PERF_RECORD_LOST_SAMPLES " (since Linux 4.2)" |
| .\" f38b0dbb491a6987e198aa6b428db8692a6480f8 |
| When using hardware sampling (such as Intel PEBS) this record |
| indicates some number of samples that may have been lost. |
| .IP |
| .in +4n |
| .EX |
| struct { |
| struct perf_event_header header; |
| u64 lost; |
| struct sample_id sample_id; |
| }; |
| .EE |
| .in |
| .RS |
| .TP |
| .I lost |
| the number of potentially lost samples. |
| .RE |
| .TP |
| .BR PERF_RECORD_SWITCH " (since Linux 4.3)" |
| .\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4 |
| This record indicates a context switch has happened. |
| The |
| .B PERF_RECORD_MISC_SWITCH_OUT |
| bit in the |
| .I misc |
| field indicates whether it was a context switch into |
| or away from the current process. |
| .IP |
| .in +4n |
| .EX |
| struct { |
| struct perf_event_header header; |
| struct sample_id sample_id; |
| }; |
| .EE |
| .in |
| .TP |
| .BR PERF_RECORD_SWITCH_CPU_WIDE " (since Linux 4.3)" |
| .\" commit 45ac1403f564f411c6a383a2448688ba8dd705a4 |
| As with |
| .B PERF_RECORD_SWITCH |
| this record indicates a context switch has happened, |
| but it only occurs when sampling in CPU-wide mode |
| and provides additional information on the process |
| being switched to/from. |
| The |
| .B PERF_RECORD_MISC_SWITCH_OUT |
| bit in the |
| .I misc |
| field indicates whether it was a context switch into |
| or away from the current process. |
| .IP |
| .in +4n |
| .EX |
| struct { |
| struct perf_event_header header; |
| u32 next_prev_pid; |
| u32 next_prev_tid; |
| struct sample_id sample_id; |
| }; |
| .EE |
| .in |
| .RS |
| .TP |
| .I next_prev_pid |
| The process ID of the previous (if switching in) |
| or next (if switching out) process on the CPU. |
| .TP |
| .I next_prev_tid |
| The thread ID of the previous (if switching in) |
| or next (if switching out) thread on the CPU. |
| .RE |
| .TP |
| .BR PERF_RECORD_NAMESPACES " (since Linux 4.11)" |
| .\" commit e422267322cd319e2695a535e47c5b1feeac45eb |
| This record includes various namespace information of a process. |
| .IP |
| .in +4n |
| .EX |
| struct { |
| struct perf_event_header header; |
| u32 pid; |
| u32 tid; |
| u64 nr_namespaces; |
| struct { u64 dev, inode; } [nr_namespaces]; |
| struct sample_id sample_id; |
| }; |
| .EE |
| .in |
| .RS |
| .TP |
| .I pid |
| is the process ID |
| .TP |
| .I tid |
| is the thread ID |
| .TP |
| .I nr_namespaces |
| is the number of namespaces in this record |
| .RE |
| .IP |
| Each namespace has |
| .I dev |
| and |
| .I inode |
| fields and is recorded in the |
| fixed position like below: |
| .RS |
| .TP |
| .BR NET_NS_INDEX = 0 |
| Network namespace |
| .TP |
| .BR UTS_NS_INDEX = 1 |
| UTS namespace |
| .TP |
| .BR IPC_NS_INDEX = 2 |
| IPC namespace |
| .TP |
| .BR PID_NS_INDEX = 3 |
| PID namespace |
| .TP |
| .BR USER_NS_INDEX = 4 |
| User namespace |
| .TP |
| .BR MNT_NS_INDEX = 5 |
| Mount namespace |
| .TP |
| .BR CGROUP_NS_INDEX = 6 |
| Cgroup namespace |
| .RE |
| .TP |
| .BR PERF_RECORD_KSYMBOL " (since Linux 5.0)" |
| .\" commit 76193a94522f1d4edf2447a536f3f796ce56343b |
| This record indicates kernel symbol register/unregister events. |
| .IP |
| .in +4n |
| .EX |
| struct { |
| struct perf_event_header header; |
| u64 addr; |
| u32 len; |
| u16 ksym_type; |
| u16 flags; |
| char name[]; |
| struct sample_id sample_id; |
| }; |
| .EE |
| .in |
| .RS |
| .TP |
| .I addr |
| is the address of the kernel symbol. |
| .TP |
| .I len |
| is the length of the kernel symbol. |
| .TP |
| .I ksym_type |
| is the type of the kernel symbol. |
| Currently the following types are available: |
| .RS |
| .TP |
| .B PERF_RECORD_KSYMBOL_TYPE_BPF |
| The kernel symbol is a BPF function. |
| .RE |
| .TP |
| .I flags |
| If the |
| .B PERF_RECORD_KSYMBOL_FLAGS_UNREGISTER |
| is set, then this event is for unregistering the kernel symbol. |
| .RE |
| .TP |
| .BR PERF_RECORD_BPF_EVENT " (since Linux 5.0)" |
| .\" commit 6ee52e2a3fe4ea35520720736e6791df1fb67106 |
| This record indicates that a BPF program was loaded or unloaded. |
| .IP |
| .in +4n |
| .EX |
| struct { |
| struct perf_event_header header; |
| u16 type; |
| u16 flags; |
| u32 id; |
| u8 tag[BPF_TAG_SIZE]; |
| struct sample_id sample_id; |
| }; |
| .EE |
| .in |
| .RS |
| .TP |
| .I type |
| is one of the following values: |
| .RS |
| .TP |
| .B PERF_BPF_EVENT_PROG_LOAD |
| A BPF program is loaded |
| .TP |
| .B PERF_BPF_EVENT_PROG_UNLOAD |
| A BPF program is unloaded |
| .RE |
| .TP |
| .I id |
| is the ID of the BPF program. |
| .TP |
| .I tag |
| is the tag of the BPF program. |
| Currently, |
| .B BPF_TAG_SIZE |
| is defined as 8. |
| .RE |
| .TP |
| .BR PERF_RECORD_CGROUP " (since Linux 5.7)" |
| .\" commit 96aaab686505c449e24d76e76507290dcc30e008 |
| This record indicates a new cgroup is created and activated. |
| .IP |
| .in +4n |
| .EX |
| struct { |
| struct perf_event_header header; |
| u64 id; |
| char path[]; |
| struct sample_id sample_id; |
| }; |
| .EE |
| .in |
| .RS |
| .TP |
| .I id |
| is the cgroup identifier. |
| This can also be retrieved by |
| .BR name_to_handle_at (2) |
| on the cgroup path (as a file handle). |
| .TP |
| .I path |
| is the path of the cgroup from the root. |
| .RE |
| .TP |
| .BR PERF_RECORD_TEXT_POKE " (since Linux 5.8)" |
| .\" commit e17d43b93e544f5016c0251d2074c15568d5d963 |
| This record indicates a change in the kernel text. |
| This includes the addition and removal of text; |
| in those cases the corresponding length |
| .RI ( old_len |
| for an addition, |
| .I new_len |
| for a removal) is zero. |
| .IP |
| .in +4n |
| .EX |
| struct { |
| struct perf_event_header header; |
| u64 addr; |
| u16 old_len; |
| u16 new_len; |
| u8 bytes[]; |
| struct sample_id sample_id; |
| }; |
| .EE |
| .in |
| .RS |
| .TP |
| .I addr |
| is the address of the change |
| .TP |
| .I old_len |
| is the old length |
| .TP |
| .I new_len |
| is the new length |
| .TP |
| .I bytes |
| contains old bytes immediately followed by new bytes. |
| .RE |
| .RE |
| .SS Overflow handling |
| Events can be set to notify when a threshold is crossed, |
| indicating an overflow. |
| Overflow conditions can be captured by monitoring the |
| event file descriptor with |
| .BR poll (2), |
| .BR select (2), |
| or |
| .BR epoll (7). |
| Alternatively, the overflow events can be captured via a signal handler, |
| by enabling I/O signaling on the file descriptor; see the discussion of the |
| .BR F_SETOWN |
| and |
| .BR F_SETSIG |
| operations in |
| .BR fcntl (2). |
| .PP |
| Overflows are generated only by sampling events |
| .RI ( sample_period |
| must have a nonzero value). |
| .PP |
| There are two ways to generate overflow notifications. |
| .PP |
| The first is to set a |
| .I wakeup_events |
| or |
| .I wakeup_watermark |
| value that will trigger if a certain number of samples |
| or bytes have been written to the mmap ring buffer. |
| In this case, |
| .B POLL_IN |
| is indicated. |
| .PP |
| The other way is by use of the |
| .B PERF_EVENT_IOC_REFRESH |
| ioctl. |
| This ioctl adds to a counter that decrements each time the event overflows. |
| When nonzero, |
| .B POLL_IN |
| is indicated, but |
| once the counter reaches 0 |
| .B POLL_HUP |
| is indicated and |
| the underlying event is disabled. |
| .PP |
| Refreshing an event group leader refreshes all siblings and |
| refreshing with a parameter of 0 currently enables infinite |
| refreshes; |
| these behaviors are unsupported and should not be relied on. |
| .\" See https://lkml.org/lkml/2011/5/24/337 |
| .PP |
| Starting with Linux 3.18, |
| .\" commit 179033b3e064d2cd3f5f9945e76b0a0f0fbf4883 |
| .B POLL_HUP |
| is indicated if the event being monitored is attached to a different |
| process and that process exits. |
| .SS rdpmc instruction |
| Starting with Linux 3.4 on x86, you can use the |
| .\" commit c7206205d00ab375839bd6c7ddb247d600693c09 |
| .I rdpmc |
| instruction to get low-latency reads without having to enter the kernel. |
| Note that using |
| .I rdpmc |
| is not necessarily faster than other methods for reading event values. |
| .PP |
| Support for this can be detected with the |
| .I cap_usr_rdpmc |
| field in the mmap page; documentation on how |
| to calculate event values can be found in that section. |
| .PP |
| Originally, when rdpmc support was enabled, any process (not just ones |
| with an active perf event) could use the rdpmc instruction to access |
| the counters. |
| Starting with Linux 4.0, |
| .\" 7911d3f7af14a614617e38245fedf98a724e46a9 |
| rdpmc support is only allowed if an event is currently enabled |
| in a process's context. |
| To restore the old behavior, write the value 2 to |
| .IR /sys/devices/cpu/rdpmc . |
| .SS perf_event ioctl calls |
| Various ioctls act on |
| .BR perf_event_open () |
| file descriptors: |
| .TP |
| .B PERF_EVENT_IOC_ENABLE |
| This enables the individual event or event group specified by the |
| file descriptor argument. |
| .IP |
| If the |
| .B PERF_IOC_FLAG_GROUP |
| bit is set in the ioctl argument, then all events in a group are |
| enabled, even if the event specified is not the group leader |
| (but see BUGS). |
| .TP |
| .B PERF_EVENT_IOC_DISABLE |
| This disables the individual counter or event group specified by the |
| file descriptor argument. |
| .IP |
| Enabling or disabling the leader of a group enables or disables the |
| entire group; that is, while the group leader is disabled, none of the |
| counters in the group will count. |
| Enabling or disabling a member of a group other than the leader |
| affects only that counter; disabling a non-leader |
| stops that counter from counting but doesn't affect any other counter. |
| .IP |
| If the |
| .B PERF_IOC_FLAG_GROUP |
| bit is set in the ioctl argument, then all events in a group are |
| disabled, even if the event specified is not the group leader |
| (but see BUGS). |
| .TP |
| .B PERF_EVENT_IOC_REFRESH |
| Non-inherited overflow counters can use this |
| to enable a counter for a number of overflows specified by the argument, |
| after which it is disabled. |
| Subsequent calls of this ioctl add the argument value to the current |
| count. |
| An overflow notification with |
| .B POLL_IN |
| set will happen on each overflow until the |
| count reaches 0; when that happens a notification with |
| .B POLL_HUP |
| set is sent and the event is disabled. |
| Using an argument of 0 is considered undefined behavior. |
| .TP |
| .B PERF_EVENT_IOC_RESET |
| Reset the event count specified by the |
| file descriptor argument to zero. |
| This resets only the counts; there is no way to reset the |
| multiplexing |
| .I time_enabled |
| or |
| .I time_running |
| values. |
| .IP |
| If the |
| .B PERF_IOC_FLAG_GROUP |
| bit is set in the ioctl argument, then all events in a group are |
| reset, even if the event specified is not the group leader |
| (but see BUGS). |
| .TP |
| .B PERF_EVENT_IOC_PERIOD |
| This updates the overflow period for the event. |
| .IP |
| Since Linux 3.7 (on ARM) |
| .\" commit 3581fe0ef37ce12ac7a4f74831168352ae848edc |
| and Linux 3.14 (all other architectures), |
| .\" commit bad7192b842c83e580747ca57104dd51fe08c223 |
| the new period takes effect immediately. |
| On older kernels, the new period did not take effect until |
| after the next overflow. |
| .IP |
| The argument is a pointer to a 64-bit value containing the |
| desired new period. |
| .IP |
| Prior to Linux 2.6.36, |
| .\" commit ad0cf3478de8677f720ee06393b3147819568d6a |
| this ioctl always failed due to a bug |
| in the kernel. |
| .TP |
| .B PERF_EVENT_IOC_SET_OUTPUT |
| This tells the kernel to report event notifications to the specified |
| file descriptor rather than the default one. |
| The file descriptors must all be on the same CPU. |
| .IP |
| The argument specifies the desired file descriptor, or \-1 if |
| output should be ignored. |
| .TP |
| .BR PERF_EVENT_IOC_SET_FILTER " (since Linux 2.6.33)" |
| .\" commit 6fb2915df7f0747d9044da9dbff5b46dc2e20830 |
| This adds an ftrace filter to this event. |
| .IP |
| The argument is a pointer to the desired ftrace filter. |
| .TP |
| .BR PERF_EVENT_IOC_ID " (since Linux 3.12)" |
| .\" commit cf4957f17f2a89984915ea808876d9c82225b862 |
| This returns the event ID value for the given event file descriptor. |
| .IP |
| The argument is a pointer to a 64-bit unsigned integer |
| to hold the result. |
| .TP |
| .BR PERF_EVENT_IOC_SET_BPF " (since Linux 4.1)" |
| .\" commit 2541517c32be2531e0da59dfd7efc1ce844644f5 |
| This allows attaching a Berkeley Packet Filter (BPF) |
| program to an existing kprobe tracepoint event. |
| You need |
| .B CAP_PERFMON |
| (since Linux 5.8) or |
| .B CAP_SYS_ADMIN |
| privileges to use this ioctl. |
| .IP |
| The argument is a BPF program file descriptor that was created by |
| a previous |
| .BR bpf (2) |
| system call. |
| .TP |
| .BR PERF_EVENT_IOC_PAUSE_OUTPUT " (since Linux 4.7)" |
| .\" commit 86e7972f690c1017fd086cdfe53d8524e68c661c |
| This allows pausing and resuming the event's ring-buffer. |
| A paused ring-buffer does not prevent generation of samples, |
| but simply discards them. |
| The discarded samples are considered lost, and cause a |
| .BR PERF_RECORD_LOST |
| sample to be generated when possible. |
| An overflow signal may still be triggered by the discarded sample |
| even though the ring-buffer remains empty. |
| .IP |
| The argument is an unsigned 32-bit integer. |
| A nonzero value pauses the ring-buffer, while a |
| zero value resumes the ring-buffer. |
| .TP |
| .BR PERF_EVENT_IOC_MODIFY_ATTRIBUTES " (since Linux 4.17)" |
| .\" commit 32ff77e8cc9e66cc4fb38098f64fd54cc8f54573 |
| This allows modifying an existing event without the overhead |
| of closing and reopening a new event. |
| Currently this is supported only for breakpoint events. |
| .IP |
| The argument is a pointer to a |
| .I perf_event_attr |
| structure containing the updated event settings. |
| .TP |
| .BR PERF_EVENT_IOC_QUERY_BPF " (since Linux 4.16)" |
| .\" commit f371b304f12e31fe30207c41ca7754564e0ea4dc |
| This allows querying which Berkeley Packet Filter (BPF) |
| programs are attached to an existing kprobe tracepoint. |
| You can only attach one BPF program per event, but you can |
| have multiple events attached to a tracepoint. |
| Querying this value on one tracepoint event returns the ID |
| of all BPF programs in all events attached to the tracepoint. |
| You need |
| .B CAP_PERFMON |
| (since Linux 5.8) or |
| .B CAP_SYS_ADMIN |
| privileges to use this ioctl. |
| .IP |
| The argument is a pointer to a structure |
| .in +4n |
| .EX |
| struct perf_event_query_bpf { |
| __u32 ids_len; |
| __u32 prog_cnt; |
| __u32 ids[0]; |
| }; |
| .EE |
| .in |
| .IP |
| The |
| .I ids_len |
| field indicates the number of ids that can fit in the provided |
| .I ids |
| array. |
| The |
| .I prog_cnt |
| value is filled in by the kernel with the number of attached |
| BPF programs. |
| The |
| .I ids |
| array is filled with the ID of each attached BPF program. |
| If there are more programs than will fit in the array, then the |
| kernel will return |
| .B ENOSPC |
| and |
| .I ids_len |
| will indicate the number of program IDs that were successfully copied. |
| .\" |
| .SS Using prctl(2) |
| A process can enable or disable all currently open event groups |
| using the |
| .BR prctl (2) |
| .B PR_TASK_PERF_EVENTS_ENABLE |
| and |
| .B PR_TASK_PERF_EVENTS_DISABLE |
| operations. |
| This applies only to events created locally by the calling process. |
| This does not apply to events created by other processes attached |
| to the calling process or inherited events from a parent process. |
| Only group leaders are enabled and disabled, |
| not any other members of the groups. |
| .SS perf_event related configuration files |
| Files in |
| .I /proc/sys/kernel/ |
| .RS 4 |
| .TP |
| .I /proc/sys/kernel/perf_event_paranoid |
| The |
| .I perf_event_paranoid |
| file can be set to restrict access to the performance counters. |
| .IP |
| .PD 0 |
| .RS |
| .IP 2 4 |
| allow only user-space measurements (default since Linux 4.6). |
| .\" default changed in commit 0161028b7c8aebef64194d3d73e43bc3b53b5c66 |
| .IP 1 |
| allow both kernel and user measurements (default before Linux 4.6). |
| .IP 0 |
| allow access to CPU-specific data but not raw tracepoint samples. |
| .IP \-1 |
| no restrictions. |
| .RE |
| .PD |
| .IP |
| The existence of the |
| .I perf_event_paranoid |
| file is the official method for determining if a kernel supports |
| .BR perf_event_open (). |
| .TP |
| .I /proc/sys/kernel/perf_event_max_sample_rate |
| This sets the maximum sample rate. |
| Setting this too high can allow |
| users to sample at a rate that impacts overall machine performance |
| and potentially lock up the machine. |
| The default value is |
| 100000 (samples per second). |
| .TP |
| .I /proc/sys/kernel/perf_event_max_stack |
| .\" Introduced in c5dfd78eb79851e278b7973031b9ca363da87a7e |
| This file sets the maximum depth of stack frame entries reported |
| when generating a call trace. |
| .TP |
| .I /proc/sys/kernel/perf_event_mlock_kb |
| Maximum amount of memory, in kilobytes, that an unprivileged user can |
| .BR mlock (2). |
| The default is 516 (kB). |
| .RE |
| .PP |
| Files in |
| .I /sys/bus/event_source/devices/ |
| .PP |
| .RS 4 |
| Since Linux 2.6.34, the kernel supports having multiple PMUs |
| available for monitoring. |
| Information on how to program these PMUs can be found under |
| .IR /sys/bus/event_source/devices/ . |
| Each subdirectory corresponds to a different PMU. |
| .TP |
| .IR /sys/bus/event_source/devices/*/type " (since Linux 2.6.38)" |
| .\" commit abe43400579d5de0078c2d3a760e6598e183f871 |
| This contains an integer that can be used in the |
| .I type |
| field of |
| .I perf_event_attr |
| to indicate that you wish to use this PMU. |
| .TP |
| .IR /sys/bus/event_source/devices/cpu/rdpmc " (since Linux 3.4)" |
| .\" commit 0c9d42ed4cee2aa1dfc3a260b741baae8615744f |
| If this file is 1, then direct user-space access to the |
| performance counter registers is allowed via the rdpmc instruction. |
| This can be disabled by echoing 0 to the file. |
| .IP |
| As of Linux 4.0 |
| .\" a66734297f78707ce39d756b656bfae861d53f62 |
| .\" 7911d3f7af14a614617e38245fedf98a724e46a9 |
| the behavior has changed, so that 1 now means only allow access |
| to processes with active perf events, with 2 indicating the old |
| allow-anyone-access behavior. |
| .TP |
| .IR /sys/bus/event_source/devices/*/format/ " (since Linux 3.4)" |
| .\" commit 641cc938815dfd09f8fa1ec72deb814f0938ac33 |
| This subdirectory contains information on the architecture-specific |
| subfields available for programming the various |
| .I config |
| fields in the |
| .I perf_event_attr |
| struct. |
| .IP |
| The content of each file is the name of the config field, followed |
| by a colon, followed by a series of integer bit ranges separated by |
| commas. |
| For example, the file |
| .I event |
| may contain the value |
| .I config1:1,6\-10,44 |
| which indicates that event is an attribute that occupies bits 1, 6\(en10, and 44 |
| of |
| .IR perf_event_attr::config1 . |
| .TP |
| .IR /sys/bus/event_source/devices/*/events/ " (since Linux 3.4)" |
| .\" commit 641cc938815dfd09f8fa1ec72deb814f0938ac33 |
| This subdirectory contains files with predefined events. |
| The contents are strings describing the event settings |
| expressed in terms of the fields found in the previously mentioned |
| .I ./format/ |
| directory. |
| These are not necessarily complete lists of all events supported by |
| a PMU, but usually a subset of events deemed useful or interesting. |
| .IP |
| The content of each file is a list of attribute names |
| separated by commas. |
| Each entry has an optional value (either hex or decimal). |
| If no value is specified, then it is assumed to be a single-bit |
| field with a value of 1. |
| An example entry may look like this: |
| .IR event=0x2,inv,ldlat=3 . |
| .TP |
| .I /sys/bus/event_source/devices/*/uevent |
| This file is the standard kernel device interface |
| for injecting hotplug events. |
| .TP |
| .IR /sys/bus/event_source/devices/*/cpumask " (since Linux 3.7)" |
| .\" commit 314d9f63f385096580e9e2a06eaa0745d92fe4ac |
| The |
| .I cpumask |
| file contains a comma-separated list of integers that |
| indicate a representative CPU number for each socket (package) |
| on the motherboard. |
| This is needed when setting up uncore or northbridge events, as |
| those PMUs present socket-wide events. |
| .RE |
| .SH RETURN VALUE |
| On success, |
| .BR perf_event_open () |
| returns the new file descriptor. |
| On error, \-1 is returned and |
| .I errno |
| is set to indicate the error. |
| .SH ERRORS |
| The errors returned by |
| .BR perf_event_open () |
| can be inconsistent, and may |
| vary across processor architectures and performance monitoring units. |
| .TP |
| .B E2BIG |
| Returned if the |
| .I perf_event_attr |
| .I size |
| value is too small |
| (smaller than |
| .BR PERF_ATTR_SIZE_VER0 ), |
| too big (larger than the page size), |
| or larger than the kernel supports and the extra bytes are not zero. |
| When |
| .B E2BIG |
| is returned, the |
| .I perf_event_attr |
| .I size |
| field is overwritten by the kernel to be the size of the structure |
| it was expecting. |
| .TP |
| .B EACCES |
| Returned when the requested event requires |
| .B CAP_PERFMON |
| (since Linux 5.8) or |
| .B CAP_SYS_ADMIN |
| permissions (or a more permissive perf_event paranoid setting). |
| Some common cases where an unprivileged process |
| may encounter this error: |
| attaching to a process owned by a different user; |
| monitoring all processes on a given CPU (i.e., specifying the |
| .I pid |
| argument as \-1); |
| and not setting |
| .I exclude_kernel |
| when the paranoid setting requires it. |
| .TP |
| .B EBADF |
| Returned if the |
| .I group_fd |
| file descriptor is not valid, or, if |
| .B PERF_FLAG_PID_CGROUP |
| is set, |
| the cgroup file descriptor in |
| .I pid |
| is not valid. |
| .TP |
| .BR EBUSY " (since Linux 4.1)" |
| .\" bed5b25ad9c8a2f5d735ef0bc746ec870c01c1b0 |
| Returned if another event already has exclusive |
| access to the PMU. |
| .TP |
| .B EFAULT |
| Returned if the |
| .I attr |
| pointer points at an invalid memory address. |
| .TP |
| .B EINVAL |
| Returned if the specified event is invalid. |
| There are many possible reasons for this. |
| A non-exhaustive list: |
| .I sample_freq |
| is higher than the maximum setting; |
| the |
| .I cpu |
| to monitor does not exist; |
| .I read_format |
| is out of range; |
| .I sample_type |
| is out of range; |
| the |
| .I flags |
| value is out of range; |
| .I exclusive |
| or |
| .I pinned |
| set and the event is not a group leader; |
| the event |
| .I config |
| values are out of range or set reserved bits; |
| the generic event selected is not supported; or |
| there is not enough room to add the selected event. |
| .TP |
| .B EINTR |
| Returned when trying to mix perf and ftrace handling |
| for a uprobe. |
| .TP |
| .B EMFILE |
| Each opened event uses one file descriptor. |
| If a large number of events are opened, |
| the per-process limit on the number of open file descriptors will be reached, |
| and no more events can be created. |
| .TP |
| .B ENODEV |
| Returned when the event involves a feature not supported |
| by the current CPU. |
| .TP |
| .B ENOENT |
| Returned if the |
| .I type |
| setting is not valid. |
| This error is also returned for |
| some unsupported generic events. |
| .TP |
| .B ENOSPC |
| Prior to Linux 3.3, if there was not enough room for the event, |
| .\" commit aa2bc1ade59003a379ffc485d6da2d92ea3370a6 |
| .B ENOSPC |
| was returned. |
| In Linux 3.3, this was changed to |
| .BR EINVAL . |
| .B ENOSPC |
| is still returned if you try to add more breakpoint events |
| than supported by the hardware. |
| .TP |
| .B ENOSYS |
| Returned if |
| .B PERF_SAMPLE_STACK_USER |
| is set in |
| .I sample_type |
| and it is not supported by hardware. |
| .TP |
| .B EOPNOTSUPP |
| Returned if an event requiring a specific hardware feature is |
| requested but there is no hardware support. |
| This includes requesting low-skid events if not supported, |
| branch tracing if it is not available, sampling if no PMU |
| interrupt is available, and branch stacks for software events. |
| .TP |
| .BR EOVERFLOW " (since Linux 4.8)" |
| .\" 97c79a38cd454602645f0470ffb444b3b75ce574 |
| Returned if |
| .B PERF_SAMPLE_CALLCHAIN |
| is requested and |
| .I sample_max_stack |
| is larger than the maximum specified in |
| .IR /proc/sys/kernel/perf_event_max_stack . |
| .TP |
| .B EPERM |
| Returned on many (but not all) architectures when an unsupported |
| .IR exclude_hv ", " exclude_idle ", " exclude_user ", or " exclude_kernel |
| setting is specified. |
| .IP |
| It can also happen, as with |
| .BR EACCES , |
| when the requested event requires |
| .B CAP_PERFMON |
| (since Linux 5.8) or |
| .B CAP_SYS_ADMIN |
| permissions (or a more permissive perf_event paranoid setting). |
| This includes setting a breakpoint on a kernel address, |
| and (since Linux 3.13) setting a kernel function-trace tracepoint. |
| .\" commit a4e95fc2cbb31d70a65beffeaf8773f881328c34 |
| .TP |
| .B ESRCH |
| Returned if attempting to attach to a process that does not exist. |
| .SH VERSIONS |
| .BR perf_event_open () |
| was introduced in Linux 2.6.31 but was called |
| .\" commit 0793a61d4df8daeac6492dbf8d2f3e5713caae5e |
| .BR perf_counter_open (). |
| It was renamed in Linux 2.6.32. |
| .\" commit cdd6c482c9ff9c55475ee7392ec8f672eddb7be6 |
| .SH CONFORMING TO |
| This |
| .BR perf_event_open () |
| system call is Linux-specific |
| and should not be used in programs intended to be portable. |
| .SH NOTES |
| Glibc does not provide a wrapper for this system call; call it using |
| .BR syscall (2). |
| See the example below. |
| .PP |
| The official way of knowing if |
| .BR perf_event_open () |
| support is enabled is checking |
| for the existence of the file |
| .IR /proc/sys/kernel/perf_event_paranoid . |
| .PP |
| The |
| .B CAP_PERFMON |
| capability (since Linux 5.8) provides a secure approach to |
| performance monitoring and observability operations in a system |
| according to the principle of least privilege (POSIX IEEE 1003.1e). |
| Accessing system performance monitoring and observability operations |
| using |
| .B CAP_PERFMON |
| rather than the much more powerful |
| .B CAP_SYS_ADMIN |
| excludes chances to misuse credentials and makes operations more secure. |
| .B CAP_SYS_ADMIN |
| usage for secure system performance monitoring and observability |
| is discouraged in favor of the |
| .B CAP_PERFMON |
| capability. |
| .SH BUGS |
| The |
| .B F_SETOWN_EX |
| option to |
| .BR fcntl (2) |
| is needed to properly get overflow signals in threads. |
| This was introduced in Linux 2.6.32. |
| .\" commit ba0a6c9f6fceed11c6a99e8326f0477fe383e6b5 |
| .PP |
| Prior to Linux 2.6.33 (at least for x86), |
| .\" commit b690081d4d3f6a23541493f1682835c3cd5c54a1 |
| the kernel did not check |
| if events could be scheduled together until read time. |
| The same happens on all known kernels if the NMI watchdog is enabled. |
| This means that, to see if a given set of events works, you have to call |
| .BR perf_event_open (), |
| start the events, and then read them before you know for sure you |
| can get valid measurements. |
| .PP |
| Prior to Linux 2.6.34, |
| .\" FIXME . cannot find a kernel commit for this one |
| event constraints were not enforced by the kernel. |
| In that case, some events would silently return "0" if the kernel |
| scheduled them in an improper counter slot. |
| .PP |
| Prior to Linux 2.6.34, there was a bug when multiplexing where the |
| wrong results could be returned. |
| .\" commit 45e16a6834b6af098702e5ea6c9a40de42ff77d8 |
| .PP |
| Kernels from Linux 2.6.35 to Linux 2.6.39 can quickly crash if |
| "inherit" is enabled and many threads are started. |
| .\" commit 38b435b16c36b0d863efcf3f07b34a6fac9873fd |
| .PP |
| Prior to Linux 2.6.35, |
| .\" commit 050735b08ca8a016bbace4445fa025b88fee770b |
| .B PERF_FORMAT_GROUP |
| did not work with attached processes. |
| .PP |
| There is a bug in the kernel code between |
| Linux 2.6.36 and Linux 3.0 that ignores the |
| "watermark" field and acts as if a wakeup_event |
| was chosen if the union has a |
| nonzero value in it. |
| .\" commit 4ec8363dfc1451f8c8f86825731fe712798ada02 |
| .PP |
| From Linux 2.6.31 to Linux 3.4, the |
| .B PERF_IOC_FLAG_GROUP |
| ioctl argument was broken and would repeatedly operate |
| on the event specified rather than iterating across |
| all sibling events in a group. |
| .\" commit 724b6daa13e100067c30cfc4d1ad06629609dc4e |
| .PP |
| From Linux 3.4 to Linux 3.11, the mmap |
| .\" commit fa7315871046b9a4c48627905691dbde57e51033 |
| .I cap_usr_rdpmc |
| and |
| .I cap_usr_time |
| bits mapped to the same location. |
| Code should migrate to the new |
| .I cap_user_rdpmc |
| and |
| .I cap_user_time |
| fields instead. |
| .PP |
| Always double-check your results! |
| Various generalized events have had wrong values. |
| For example, retired branches measured |
| the wrong thing on AMD machines until Linux 2.6.35. |
| .\" commit f287d332ce835f77a4f5077d2c0ef1e3f9ea42d2 |
| .SH EXAMPLES |
| The following is a short example that measures the total |
| instruction count of a call to |
| .BR printf (3). |
| .PP |
| .EX |
| #include <stdlib.h> |
| #include <stdio.h> |
| #include <unistd.h> |
| #include <string.h> |
| #include <sys/ioctl.h> |
| #include <linux/perf_event.h> |
| #include <asm/unistd.h> |
| |
| static long |
| perf_event_open(struct perf_event_attr *hw_event, pid_t pid, |
| int cpu, int group_fd, unsigned long flags) |
| { |
| int ret; |
| |
| ret = syscall(__NR_perf_event_open, hw_event, pid, cpu, |
| group_fd, flags); |
| return ret; |
| } |
| |
| int |
| main(int argc, char **argv) |
| { |
| struct perf_event_attr pe; |
| long long count; |
| int fd; |
| |
| memset(&pe, 0, sizeof(pe)); |
| pe.type = PERF_TYPE_HARDWARE; |
| pe.size = sizeof(pe); |
| pe.config = PERF_COUNT_HW_INSTRUCTIONS; |
| pe.disabled = 1; |
| pe.exclude_kernel = 1; |
| pe.exclude_hv = 1; |
| |
| fd = perf_event_open(&pe, 0, \-1, \-1, 0); |
| if (fd == \-1) { |
| fprintf(stderr, "Error opening leader %llx\en", pe.config); |
| exit(EXIT_FAILURE); |
| } |
| |
| ioctl(fd, PERF_EVENT_IOC_RESET, 0); |
| ioctl(fd, PERF_EVENT_IOC_ENABLE, 0); |
| |
| printf("Measuring instruction count for this printf\en"); |
| |
| ioctl(fd, PERF_EVENT_IOC_DISABLE, 0); |
| read(fd, &count, sizeof(count)); |
| |
| printf("Used %lld instructions\en", count); |
| |
| close(fd); |
| } |
| .EE |
| .SH SEE ALSO |
| .BR perf (1), |
| .BR fcntl (2), |
| .BR mmap (2), |
| .BR open (2), |
| .BR prctl (2), |
| .BR read (2) |
| .PP |
| .IR Documentation/admin\-guide/perf\-security.rst |
| in the kernel source tree |