perf, tools, script: Add brstackasm output for branch stacks

Implement printing full disassembled sequences for branch stacks in perf
script. This makes it possible to directly print hot paths for individual
samples, together with branch misprediction and even cycle count information.

% perf record -b ...
% perf script -F brstackasm
...
        00007f0668d54e88        movsx (%rsi), %ecx
        00007f0668d54e8b        lea -0x30(%rcx), %eax
        00007f0668d54e8e        cmp $0x9, %al
        00007f0668d54e90        jbe 0x68d54eaf
        00007f0668d54e92        cmp %cl, %dl
        00007f0668d54e94        jnz 0x68d54eb5
        00007f0668d54e96        add $0x1, %rdi
        00007f0668d54e9a        movsx (%rdi), %edx
        00007f0668d54e9d        add $0x1, %rsi
        00007f0668d54ea1        test %dl, %dl
        00007f0668d54ea3        jnz _dl_cache_libcmp+11       # PRED 21 cycles
        00007f0668d54dfb        lea -0x30(%rdx), %eax
        00007f0668d54dfe        cmp $0x9, %al
        00007f0668d54e00        ja _dl_cache_libcmp+152       # PRED 2 cycles
        00007f0668d54e88        movsx (%rsi), %ecx
        00007f0668d54e8b        lea -0x30(%rcx), %eax
        00007f0668d54e8e        cmp $0x9, %al
        00007f0668d54e90        jbe 0x68d54eaf
        00007f0668d54e92        cmp %cl, %dl
        00007f0668d54e94        jnz 0x68d54eb5                # PRED 3 cycles
        00007f0668d54eb5        movsx %dl, %eax
        00007f0668d54eb8        sub %ecx, %eax
        00007f0668d54eba        ret                           # PRED 1 cycles
        00007f0668d54fae        test %eax, %eax
        00007f0668d54fb0        jz _dl_load_cache_lookup+688
        00007f0668d54fb6        jns 0x68d54f70
        00007f0668d54fb8        lea 0x1(%r14), %ebx
        00007f0668d54fbc        cmp %r15d, %ebx
        00007f0668d54fbf        nop
        00007f0668d54fc0        jle 0x68d54f79                # PRED 2 cycles

Open issues:
- Occasionally the path does not reach up to the sample IP, as the LBRs
may be frozen earlier. Use precise events to avoid that.

v2: Remove bogus hunk. Document --max-blocks. Fix some printfs.
Port to latest tree.
Signed-off-by: Andi Kleen <ak@linux.intel.com>
diff --git a/tools/perf/Documentation/perf-script.txt b/tools/perf/Documentation/perf-script.txt
index 5453637..b6a979a 100644
--- a/tools/perf/Documentation/perf-script.txt
+++ b/tools/perf/Documentation/perf-script.txt
@@ -116,7 +116,7 @@
 --fields::
         Comma separated list of fields to print. Options are:
         comm, tid, pid, time, cpu, event, trace, ip, sym, dso, addr, symoff,
-	srcline, period, iregs, brstack, brstacksym, flags, asm.
+	srcline, period, iregs, brstack, brstacksym, flags, asm, brstackasm.
         Field list can be prepended with the type, trace, sw or hw,
         to indicate to which event type the field list applies.
         e.g., -f sw:comm,tid,time,ip,sym  and -f trace:time,cpu,trace
@@ -176,17 +176,24 @@
 	i.e., -f "" is not allowed.
 
 	The brstack output includes branch related information with raw addresses using the
-	/v/v/v/v/ syntax in the following order:
+	/v/v/v/v/cycles syntax in the following order:
 	FROM: branch source instruction
 	TO  : branch target instruction
         M/P/-: M=branch target mispredicted or branch direction was mispredicted, P=target predicted or direction predicted, -=not supported
 	X/- : X=branch inside a transactional region, -=not in transaction region or not supported
 	A/- : A=TSX abort entry, -=not aborted region or not supported
+	cycles
 
 	The brstacksym is identical to brstack, except that the FROM and TO addresses are printed in a symbolic form if possible.
 
 	When asm is specified the assembler instruction of each sample is printed in disassembled form.
 
+	When brstackasm is specified the full assembler sequences of the branch blocks of each sample
+	are printed (a branch block is a sequence of instructions not containing taken branches).
+	This is the full execution path leading to the sample. This is only supported when the
+	sample was recorded with perf record -b or -j any.
+	The maximum number of branch blocks to print can be configured with the --max-blocks option.
+
 -k::
 --vmlinux=<file>::
         vmlinux pathname
@@ -278,6 +285,9 @@
 --force::
 	Don't do ownership validation.
 
+--max-blocks=N::
+	Maximum number of branch blocks to print with -F brstackasm.
+
 SEE ALSO
 --------
 linkperf:perf-record[1], linkperf:perf-script-perl[1],
diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c
index c8a91a8..147fde1 100644
--- a/tools/perf/builtin-script.c
+++ b/tools/perf/builtin-script.c
@@ -43,6 +43,7 @@
 static const char		*cpu_list;
 static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);
 static struct perf_stat_config	stat_config;
+static int 			max_blocks;	/* --max-blocks value; 0 means no limit */
 
 unsigned int scripting_max_stack = PERF_MAX_STACK_DEPTH;
 
@@ -68,6 +69,7 @@
 	PERF_OUTPUT_WEIGHT	    = 1U << 18,
 	PERF_OUTPUT_BPF_OUTPUT	    = 1U << 19,
 	PERF_OUTPUT_ASM		    = 1U << 20,
+	PERF_OUTPUT_BRSTACKASM	    = 1U << 21,
 };
 
 struct output_option {
@@ -95,6 +97,7 @@
 	{.str = "weight",   .field = PERF_OUTPUT_WEIGHT},
 	{.str = "bpf-output",   .field = PERF_OUTPUT_BPF_OUTPUT},
 	{.str = "asm", .field = PERF_OUTPUT_ASM},
+	{.str = "brstackasm", .field = PERF_OUTPUT_BRSTACKASM},
 };
 
 /* default set to maintain compatibility with current format */
@@ -294,6 +297,13 @@
 		       "selected.\n");
 		return -EINVAL;
 	}
+	if (PRINT_FIELD(BRSTACKASM) &&
+	    !(perf_evlist__combined_branch_type(session->evlist) &
+	      PERF_SAMPLE_BRANCH_ANY)) {
+		pr_err("Display of branch stack assembler requested, but non all-branch filter set\n");
+		return -EINVAL;
+	}
+
 	if ((PRINT_FIELD(PID) || PRINT_FIELD(TID)) &&
 		perf_evsel__check_stype(evsel, PERF_SAMPLE_TID, "TID",
 					PERF_OUTPUT_TID|PERF_OUTPUT_PID))
@@ -622,6 +632,176 @@
 	}
 }
 
+#ifdef HAVE_UDIS86
+#define MAXBB 16384UL
+#define MAXINSN 16
+
+/*
+ * Read end - start + MAXINSN bytes of code at start into buffer (callers pass
+ * a MAXBB-byte buffer). *cpumode is set once start/end are non-zero, *is64bit
+ * only when the read is attempted. Returns 0 if the block cannot be dumped,
+ * else the dso__data_read_offset() result; callers treat <= 0 as failure.
+ */
+static int grab_bb(char *buffer, u64 start, u64 end,
+		    struct machine *machine, struct thread *thread,
+		    bool *is64bit, u8 *cpumode)
+{
+	u64 offset;
+	int len;
+	struct addr_location al;
+	bool kernel;
+
+	if (!start || !end)
+		return 0;
+
+	kernel = machine__kernel_ip(machine, start);
+	*cpumode = kernel ? PERF_RECORD_MISC_KERNEL : PERF_RECORD_MISC_USER;
+	if (kernel != machine__kernel_ip(machine, end))
+		return 0;
+
+	/* Also rejects end < start: the unsigned difference wraps around */
+	if (end - start > MAXBB - MAXINSN) {
+		printf("\tbasic block %" PRIx64 "-%" PRIx64 " (%" PRIu64 ") too long to dump\n",
+		       start, end, end - start);
+		return 0;
+	}
+
+	memset(&al, 0, sizeof(al));
+	thread__find_addr_map(thread, *cpumode, MAP__FUNCTION, start, &al);
+	if (!al.map || !al.map->dso ||
+	    al.map->dso->data.status == DSO_DATA_STATUS_ERROR) {
+		printf("\tcannot resolve %" PRIx64 "-%" PRIx64 "\n", start, end);
+		return 0;
+	}
+
+	/* Load maps to ensure dso->is_64_bit has been updated */
+	map__load(al.map, machine->symbol_filter);
+
+	/* Keep the full 64-bit offset; an int would truncate it */
+	offset = al.map->map_ip(al.map, start);
+	len = dso__data_read_offset(al.map->dso, machine, offset,
+				    (u8 *)buffer, end - start + MAXINSN);
+
+	*is64bit = al.map->dso->is_64_bit;
+	return len;
+}
+#endif
+
+/*
+ * Print the disassembled code path recorded in the sample's branch stack in
+ * program order, then the code from the newest branch target up to the sample
+ * IP; each taken branch is annotated with its flags and cycle count. Empty
+ * without HAVE_UDIS86; prints nothing if the sample has no branch entries.
+ */
+static void print_sample_brstackasm(struct perf_sample *sample __maybe_unused,
+				    struct thread *thread __maybe_unused,
+				    struct perf_event_attr *attr __maybe_unused,
+				    struct machine *machine __maybe_unused)
+{
+#ifdef HAVE_UDIS86
+	struct branch_stack *br = sample->branch_stack;
+	u64 start, end;
+	static bool ud_initialized = false;
+	static struct perf_ud ud;
+	char buffer[MAXBB];
+	int i, len, nr;
+	bool last, is64bit;
+
+	if (!(br && br->nr))
+		return;
+	nr = br->nr;
+	if (max_blocks && nr > max_blocks + 1)
+		nr = max_blocks + 1;
+
+	if (!ud_initialized) {
+		ud_initialized = true;
+		ud_init(&ud.ud_obj);
+		ud_set_syntax(&ud.ud_obj, UD_SYN_ATT);
+		ud_set_sym_resolver(&ud.ud_obj, dis_resolve);
+	}
+	ud.thread = thread;
+	ud.cpu = sample->cpu;
+
+	putchar('\n');
+	/* Block i runs from the target of entry i + 1 to the source of entry i */
+	for (i = nr - 2; i >= 0; i--) {
+		if (verbose > 0 && (br->entries[i].from || br->entries[i].to))
+			printf("%d: %" PRIx64 "-%" PRIx64 "\n", i,
+			       br->entries[i].from, br->entries[i].to);
+		start = br->entries[i + 1].to;
+		end = br->entries[i].from;
+
+		/*
+		 * Leave extra bytes for the final jump instruction for
+		 * which we don't know the length
+		 */
+		len = grab_bb(buffer, start, end + MAXINSN, machine, thread,
+			      &is64bit, &ud.cpumode);
+		if (len <= 0)
+			continue;
+
+		ud_set_mode(&ud.ud_obj, is64bit ? 64 : 32);
+		ud_set_pc(&ud.ud_obj, start);
+		ud_set_input_buffer(&ud.ud_obj, (uint8_t *)buffer, len);
+		last = false;
+		while (!last && ud_disassemble(&ud.ud_obj)) {
+			if (ud_insn_ptr(&ud.ud_obj) ==
+					(uint8_t *)buffer + end - start) {
+				printf("\t%016" PRIx64 "\t%-30s\t#%s%s%s%s",
+					ud_insn_off(&ud.ud_obj),
+					ud_insn_asm(&ud.ud_obj),
+					br->entries[i].flags.predicted ? " PRED" : "",
+					br->entries[i].flags.mispred ? " MISPRED" : "",
+					br->entries[i].flags.in_tx ? " INTX" : "",
+					br->entries[i].flags.abort ? " ABORT" : "");
+				if (br->entries[i].flags.cycles)
+					printf(" %d cycles", br->entries[i].flags.cycles);
+				putchar('\n');
+				last = true;
+			} else {
+				printf("\t%016" PRIx64 "\t%s\n",
+				       ud_insn_off(&ud.ud_obj), ud_insn_asm(&ud.ud_obj));
+			}
+		}
+	}
+
+	/*
+	 * Hit the branch? In this case we are already done, and the target
+	 * has not been executed yet. Likewise stop after an abort entry.
+	 */
+	if (br->entries[0].from == sample->ip || br->entries[0].flags.abort)
+		return;
+
+	/* Print final block up to sample */
+	start = br->entries[0].to;
+	end = sample->ip;
+	len = grab_bb(buffer, start, end, machine, thread, &is64bit,
+			&ud.cpumode);
+	if (len <= 0) {
+		/* Print at least last IP if basic block did not work */
+		len = grab_bb(buffer, sample->ip, sample->ip + MAXINSN,
+				machine, thread, &is64bit, &ud.cpumode);
+		if (len <= 0)
+			return;
+		ud_set_mode(&ud.ud_obj, is64bit ? 64 : 32);
+		ud_set_pc(&ud.ud_obj, sample->ip);
+		/* Point the decoder at the refetched bytes, not stale input */
+		ud_set_input_buffer(&ud.ud_obj, (uint8_t *)buffer, len);
+		if (ud_disassemble(&ud.ud_obj))
+			printf("\t%016" PRIx64 "\t%s\n", ud_insn_off(&ud.ud_obj),
+			       ud_insn_asm(&ud.ud_obj));
+		return;
+	}
+	ud_set_mode(&ud.ud_obj, is64bit ? 64 : 32);
+	ud_set_pc(&ud.ud_obj, start);
+	/* Only hand over the buffer once len is known to be positive */
+	ud_set_input_buffer(&ud.ud_obj, (uint8_t *)buffer, len);
+	while (ud_disassemble(&ud.ud_obj) &&
+		ud_insn_ptr(&ud.ud_obj) <= (uint8_t *)buffer + end - start)
+		printf("\t%016" PRIx64 "\t%s\n", ud_insn_off(&ud.ud_obj),
+			       ud_insn_asm(&ud.ud_obj));
+#endif
+}
 
 static void print_sample_addr(struct perf_sample *sample,
 			  struct thread *thread,
@@ -908,6 +1088,8 @@
 	if (perf_evsel__is_bpf_output(evsel) && PRINT_FIELD(BPF_OUTPUT))
 		print_sample_bpf_output(sample);
 
+	if (PRINT_FIELD(BRSTACKASM))
+		print_sample_brstackasm(sample, thread, attr, machine);
 	if (PRINT_FIELD(ASM))
 		print_sample_asm(sample, thread, attr, al, machine);
 
@@ -2143,6 +2325,8 @@
 		    "Show the mmap events"),
 	OPT_BOOLEAN('\0', "show-switch-events", &script.show_switch_events,
 		    "Show context switch events (if recorded)"),
+	OPT_INTEGER(0, "max-blocks", &max_blocks,
+		    "Maximum number of code blocks to dump with brstackasm"),
 	OPT_BOOLEAN('f', "force", &file.force, "don't complain, do it"),
 	OPT_BOOLEAN(0, "ns", &nanosecs,
 		    "Use 9 decimal places when displaying time"),