perf, tools, script: Add brstackasm output for branch stacks
Implement printing full disassembled sequences for branch stacks in perf
script. This allows directly printing hot paths for individual samples,
together with branch misprediction and even cycle count information.
% perf record -b ...
% perf script -F brstackasm
...
00007f0668d54e88 movsx (%rsi), %ecx
00007f0668d54e8b lea -0x30(%rcx), %eax
00007f0668d54e8e cmp $0x9, %al
00007f0668d54e90 jbe 0x68d54eaf
00007f0668d54e92 cmp %cl, %dl
00007f0668d54e94 jnz 0x68d54eb5
00007f0668d54e96 add $0x1, %rdi
00007f0668d54e9a movsx (%rdi), %edx
00007f0668d54e9d add $0x1, %rsi
00007f0668d54ea1 test %dl, %dl
00007f0668d54ea3 jnz _dl_cache_libcmp+11 # PRED 21 cycles
00007f0668d54dfb lea -0x30(%rdx), %eax
00007f0668d54dfe cmp $0x9, %al
00007f0668d54e00 ja _dl_cache_libcmp+152 # PRED 2 cycles
00007f0668d54e88 movsx (%rsi), %ecx
00007f0668d54e8b lea -0x30(%rcx), %eax
00007f0668d54e8e cmp $0x9, %al
00007f0668d54e90 jbe 0x68d54eaf
00007f0668d54e92 cmp %cl, %dl
00007f0668d54e94 jnz 0x68d54eb5 # PRED 3 cycles
00007f0668d54eb5 movsx %dl, %eax
00007f0668d54eb8 sub %ecx, %eax
00007f0668d54eba ret # PRED 1 cycles
00007f0668d54fae test %eax, %eax
00007f0668d54fb0 jz _dl_load_cache_lookup+688
00007f0668d54fb6 jns 0x68d54f70
00007f0668d54fb8 lea 0x1(%r14), %ebx
00007f0668d54fbc cmp %r15d, %ebx
00007f0668d54fbf nop
00007f0668d54fc0 jle 0x68d54f79 # PRED 2 cycles
Open issues:
- Occasionally the path does not reach up to the sample IP, as the LBRs
may be frozen earlier. Use precise events to avoid that.
v2: Remove bogus hunk. Document --max-blocks. Fix some printfs.
Port to latest tree.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 5453637..b6a979a 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -116,7 +116,7 @@
--fields::
Comma separated list of fields to print. Options are:
comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
- srcline, period, iregs, brstack, brstacksym, flags, asm.
+ srcline, period, iregs, brstack, brstacksym, flags, asm, brstackasm.
Field list can be prepended with the type, trace, sw or hw,
to indicate to which event type the field list applies.
e.g., -f sw:comm,tid,time,ip,sym and -f trace:time,cpu,trace
@@ -176,17 +176,24 @@
i.e., -f "" is not allowed.
The brstack output includes branch related information with raw addresses using the
- /v/v/v/v/ syntax in the following order:
+ /v/v/v/v/cycles syntax in the following order:
FROM: branch source instruction
TO : branch target instruction
M/P/-: M=branch target mispredicted or branch direction was mispredicted, P=target predicted or direction predicted, -=not supported
X/- : X=branch inside a transactional region, -=not in transaction region or not supported
A/- : A=TSX abort entry, -=not aborted region or not supported
+ cycles
The brstacksym is identical to brstack, except that the FROM and TO addresses are printed in a symbolic form if possible.
When asm is specified the assembler instruction of each sample is printed in disassembled form.
+ When brstackasm is specified the full assembler sequences of branch blocks for each sample
+ are printed (a branch block is a sequence of instructions not containing taken branches).
+ This is the full execution path leading to the sample. This is only supported when the
+ sample was recorded with perf record -b or -j any.
+ The maximum number of branch blocks to print can be configured with the --max-blocks option.
+
-k::
--vmlinux=<file>::
vmlinux pathname
@@ -278,6 +285,9 @@
--force::
Don't do ownership validation.
+--max-blocks=N::
+ Maximum number of branch blocks to print with -F brstackasm.
+
SEE ALSO
--------
linkperf:perf-record[1], linkperf:perf-script-perl[1],
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index c8a91a8..147fde1 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -43,6 +43,7 @@
static const char *cpu_list;
static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
static struct perf_stat_config stat_config;
+static int max_blocks;
unsigned int scripting_max_stack = PERF_MAX_STACK_DEPTH;
@@ -68,6 +69,7 @@
PERF_OUTPUT_WEIGHT = 1U << 18,
PERF_OUTPUT_BPF_OUTPUT = 1U << 19,
PERF_OUTPUT_ASM = 1U << 20,
+ PERF_OUTPUT_BRSTACKASM = 1U << 21,
};
struct output_option {
@@ -95,6 +97,7 @@
{.str = "weight", .field = PERF_OUTPUT_WEIGHT},
{.str = "bpf-output", .field = PERF_OUTPUT_BPF_OUTPUT},
{.str = "asm", .field = PERF_OUTPUT_ASM},
+ {.str = "brstackasm", .field = PERF_OUTPUT_BRSTACKASM},
};
/* default set to maintain compatibility with current format */
@@ -294,6 +297,13 @@
"selected.\n");
return -EINVAL;
}
+ if (PRINT_FIELD(BRSTACKASM) &&
+ !(perf_evlist__combined_branch_type(session->evlist) &
+ PERF_SAMPLE_BRANCH_ANY)) {
+ pr_err("Display of branch stack assembler requested, but non all-branch filter set\n");
+ return -EINVAL;
+ }
+
if ((PRINT_FIELD(PID) || PRINT_FIELD(TID)) &&
perf_evsel__check_stype(evsel, PERF_SAMPLE_TID, "TID",
PERF_OUTPUT_TID|PERF_OUTPUT_PID))
@@ -622,6 +632,176 @@
}
}
+#ifdef HAVE_UDIS86
+#define MAXBB 16384UL
+#define MAXINSN 16
+
+/*
+ * Read the code bytes of [start, end) plus MAXINSN bytes of slack into buffer.
+ * Sets *cpumode from start and *is64bit from the backing dso. Returns the
+ * result of dso__data_read_offset(), or 0 when the range is not usable.
+ */
+static int grab_bb(char *buffer, u64 start, u64 end,
+		    struct machine *machine, struct thread *thread,
+		    bool *is64bit, u8 *cpumode)
+{
+	u64 offset;
+	int len;
+	struct addr_location al;
+	bool kernel;
+
+	if (!start || !end)
+		return 0;
+
+	kernel = machine__kernel_ip(machine, start);
+	*cpumode = kernel ? PERF_RECORD_MISC_KERNEL : PERF_RECORD_MISC_USER;
+	if (kernel != machine__kernel_ip(machine, end))
+		return 0;
+
+	memset(&al, 0, sizeof(al));
+	/* Also rejects end < start, where the unsigned difference wraps */
+	if (end - start > MAXBB - MAXINSN) {
+		printf("\tbasic block %" PRIx64 "-%" PRIx64 " (%" PRId64 ") too long to dump\n",
+		       start, end, (s64)(end - start));
+		return 0;
+	}
+
+	thread__find_addr_map(thread, *cpumode, MAP__FUNCTION, start, &al);
+	if (!al.map || !al.map->dso ||
+	    al.map->dso->data.status == DSO_DATA_STATUS_ERROR) {
+		printf("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n",
+		       start, end);
+		return 0;
+	}
+
+	/* Load maps to ensure dso->is_64_bit has been updated */
+	map__load(al.map, machine->symbol_filter);
+
+	/* Keep the offset 64 bits wide; an int would truncate large offsets */
+	offset = al.map->map_ip(al.map, start);
+	len = dso__data_read_offset(al.map->dso, machine, offset,
+				    (u8 *)buffer, end - start + MAXINSN);
+
+	*is64bit = al.map->dso->is_64_bit;
+	return len;
+}
+#endif
+
+/*
+ * Print the disassembled branch blocks of a sample, walking the branch stack
+ * from its highest used entry down to entry 0, then the block from entry 0's
+ * target up to the sample IP. Does nothing unless built with HAVE_UDIS86.
+ */
+static void print_sample_brstackasm(struct perf_sample *sample,
+				    struct thread *thread __maybe_unused,
+				    struct perf_event_attr *attr __maybe_unused,
+				    struct machine *machine __maybe_unused)
+{
+#ifdef HAVE_UDIS86
+	struct branch_stack *br = sample->branch_stack;
+	u64 start, end;
+	static bool ud_initialized = false;
+	static struct perf_ud ud;
+	char buffer[MAXBB];
+	int i, len, nr;
+	bool last, is64bit;
+
+	if (!(br && br->nr))
+		return;
+	nr = br->nr;
+	if (max_blocks && nr > max_blocks + 1)
+		nr = max_blocks + 1;
+
+	if (!ud_initialized) {
+		ud_initialized = true;
+		ud_init(&ud.ud_obj);
+		ud_set_syntax(&ud.ud_obj, UD_SYN_ATT);
+		ud_set_sym_resolver(&ud.ud_obj, dis_resolve);
+	}
+	ud.thread = thread;
+	ud.cpu = sample->cpu;
+
+	putchar('\n');
+	for (i = nr - 2; i >= 0; i--) {
+		if (verbose > 0 && (br->entries[i].from || br->entries[i].to))
+			printf("%d: %" PRIx64 "-%" PRIx64 "\n", i,
+			       br->entries[i].from, br->entries[i].to);
+		start = br->entries[i + 1].to;
+		end = br->entries[i].from;
+
+		/*
+		 * Leave extra bytes for the final jump instruction for
+		 * which we don't know the length
+		 */
+		len = grab_bb(buffer, start, end + MAXINSN,
+			      machine, thread, &is64bit, &ud.cpumode);
+		if (len <= 0)
+			continue;
+
+		ud_set_mode(&ud.ud_obj, is64bit ? 64 : 32);
+		ud_set_pc(&ud.ud_obj, start);
+		ud_set_input_buffer(&ud.ud_obj, (uint8_t *)buffer, len);
+		last = false;
+		/* Test !last first so nothing is decoded past the branch */
+		while (!last && ud_disassemble(&ud.ud_obj)) {
+			if (ud_insn_ptr(&ud.ud_obj) ==
+					(uint8_t *)buffer + end - start) {
+				printf("\t%016" PRIx64 "\t%-30s\t#%s%s%s%s",
+					ud_insn_off(&ud.ud_obj),
+					ud_insn_asm(&ud.ud_obj),
+					br->entries[i].flags.predicted ? " PRED" : "",
+					br->entries[i].flags.mispred ? " MISPRED" : "",
+					br->entries[i].flags.in_tx ? " INTX" : "",
+					br->entries[i].flags.abort ? " ABORT" : "");
+				if (br->entries[i].flags.cycles)
+					printf(" %d cycles", br->entries[i].flags.cycles);
+				putchar('\n');
+				last = true;
+			} else {
+				printf("\t%016" PRIx64 "\t%s\n",
+					ud_insn_off(&ud.ud_obj),
+					ud_insn_asm(&ud.ud_obj));
+			}
+		}
+	}
+
+	/*
+	 * Hit the branch? In this case we are already done, and the target
+	 * has not been executed yet.
+	 */
+	if (br->entries[0].from == sample->ip ||
+	    br->entries[0].flags.abort)
+		return;
+
+	/* Print final block up to the sample */
+	start = br->entries[0].to;
+	end = sample->ip;
+	len = grab_bb(buffer, start, end, machine, thread, &is64bit,
+		      &ud.cpumode);
+	if (len <= 0) {
+		/* Print at least last IP if basic block did not work */
+		len = grab_bb(buffer, sample->ip, sample->ip + MAXINSN,
+			      machine, thread, &is64bit, &ud.cpumode);
+		if (len <= 0)
+			return;
+		ud_set_mode(&ud.ud_obj, is64bit ? 64 : 32);
+		ud_set_pc(&ud.ud_obj, sample->ip);
+		/* Point the decoder at the bytes just read for sample->ip */
+		ud_set_input_buffer(&ud.ud_obj, (uint8_t *)buffer, len);
+		if (ud_disassemble(&ud.ud_obj))
+			printf("\t%016" PRIx64 "\t%s\n", ud_insn_off(&ud.ud_obj),
+			       ud_insn_asm(&ud.ud_obj));
+		return;
+	}
+	ud_set_mode(&ud.ud_obj, is64bit ? 64 : 32);
+	ud_set_pc(&ud.ud_obj, start);
+	ud_set_input_buffer(&ud.ud_obj, (uint8_t *)buffer, len);
+	while (ud_disassemble(&ud.ud_obj) &&
+	       ud_insn_ptr(&ud.ud_obj) <= (uint8_t *)buffer + end - start)
+		printf("\t%016" PRIx64 "\t%s\n", ud_insn_off(&ud.ud_obj),
+		       ud_insn_asm(&ud.ud_obj));
+#endif
+}
static void print_sample_addr(struct perf_sample *sample,
struct thread *thread,
@@ -908,6 +1088,8 @@
if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT))
print_sample_bpf_output(sample);
+ if (PRINT_FIELD(BRSTACKASM))
+ print_sample_brstackasm(sample, thread, attr, machine);
if (PRINT_FIELD(ASM))
print_sample_asm(sample, thread, attr, al, machine);
@@ -2143,6 +2325,8 @@
"Show the mmap events"),
OPT_BOOLEAN('\0', "show-switch-events", &script.show_switch_events,
"Show context switch events (if recorded)"),
+ OPT_INTEGER(0, "max-blocks", &max_blocks,
+ "Maximum number of code blocks to dump with brstackasm"),
OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
OPT_BOOLEAN(0, "ns", &nanosecs,
"Use 9 decimal places when displaying time"),