tracing: POC on top of sframe

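Proof of concept; sorta works.

Add a "userstacktrace_delay" trace option. When it is set,
ftrace_trace_userstack() records a TRACE_USER_UNWIND_COOKIE event and
calls unwind_user_deferred() on the trace_array's unwinder. The callback
registered on that unwinder records a TRACE_USER_UNWIND_STACK event
holding a cookie plus three per-frame arrays: the address (as a file
offset when it falls inside a VMA), the inode number and the device.

To carry those arrays, __dynamic_array() in FTRACE_ENTRY() now creates a
u32 __data_loc_<item> word; the previous inline "type item[]" behaviour
is kept under the new name __dynamic_field().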

Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index be62f0e..7991b32 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3076,6 +3076,104 @@ EXPORT_SYMBOL_GPL(trace_dump_stack);
 #ifdef CONFIG_USER_STACKTRACE_SUPPORT
 static DEFINE_PER_CPU(int, user_stack_count);
 
+/*
+ * Deferred user unwind callback: record @trace in the trace_array passed
+ * in @data as a TRACE_USER_UNWIND_STACK event tagged with @ctx_cookie.
+ */
+static void trace_user_unwind_callback(struct unwind_stacktrace *trace,
+				       u64 ctx_cookie, void *data)
+{
+	struct trace_event_call *call = &event_user_unwind_stack;
+	struct trace_array *tr = data;
+	struct trace_buffer *buffer = tr->array_buffer.buffer;
+	struct userunwind_stack_entry *entry;
+	struct ring_buffer_event *event;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma = NULL;
+	unsigned long *caller, *inodes;
+	unsigned int *devs;
+	unsigned int trace_ctx, offset;
+	int len, i;
+
+	len = trace->nr * (sizeof(unsigned long) * 2 + sizeof(unsigned int))
+			   + sizeof(*entry);
+
+	trace_ctx = tracing_gen_ctx();
+	event = __trace_buffer_lock_reserve(buffer, TRACE_USER_UNWIND_STACK,
+					    len, trace_ctx);
+	if (!event)
+		return;
+
+	entry = ring_buffer_event_data(event);
+	entry->cookie = ctx_cookie;
+
+	/* Each __data_loc word is: payload offset | (byte length << 16) */
+	offset = sizeof(*entry);
+	len = sizeof(unsigned long) * trace->nr;
+	entry->__data_loc_stack = offset | (len << 16);
+	caller = (void *)entry + offset;
+
+	offset += len;
+	entry->__data_loc_inodes = offset | (len << 16);
+	inodes = (void *)entry + offset;
+
+	offset += len;
+	len = sizeof(unsigned int) * trace->nr;
+	entry->__data_loc_dev = offset | (len << 16);
+	devs = (void *)entry + offset;
+
+	/*
+	 * Hold mmap_lock for the whole walk: the VMA cached from the previous
+	 * frame must not be dereferenced after the lock has been dropped.
+	 * NOTE(review): mmap_read_lock() may sleep; confirm that is OK here.
+	 */
+	if (mm)
+		mmap_read_lock(mm);
+	for (i = 0; i < trace->nr; i++) {
+		unsigned long addr = trace->entries[i];
+
+		/* Default: raw address, no file; never leave slots unwritten */
+		caller[i] = addr;
+		inodes[i] = 0;
+		devs[i] = 0;
+		if (mm && (!vma || addr < vma->vm_start || addr >= vma->vm_end))
+			vma = vma_lookup(mm, addr);
+		if (!vma)
+			continue;
+		/* Inside a VMA: store the file offset and, if file backed, inode/dev */
+		caller[i] = (addr - vma->vm_start) + (vma->vm_pgoff << PAGE_SHIFT);
+		if (vma->vm_file && vma->vm_file->f_inode) {
+			inodes[i] = vma->vm_file->f_inode->i_ino;
+			devs[i] = vma->vm_file->f_inode->i_sb->s_dev;
+		}
+	}
+	if (mm)
+		mmap_read_unlock(mm);
+
+	if (!call_filter_check_discard(call, entry, buffer, event))
+		__buffer_unlock_commit(buffer, event);
+}
+
+/* Record a cookie event and ask tr->unwinder for a deferred user unwind */
+static void
+ftrace_trace_userstack_delay(struct trace_array *tr,
+			     struct trace_buffer *buffer, unsigned int trace_ctx)
+{
+	struct trace_event_call *call = &event_user_unwind_cookie;
+	struct userunwind_cookie_entry *entry;
+	struct ring_buffer_event *event;
+
+	event = __trace_buffer_lock_reserve(buffer, TRACE_USER_UNWIND_COOKIE,
+					    sizeof(*entry), trace_ctx);
+	if (!event)
+		return;
+	entry = ring_buffer_event_data(event);
+	entry->cookie = 0;	/* no stale bytes if the unwinder writes nothing */
+	unwind_user_deferred(&tr->unwinder, &entry->cookie, tr);
+	if (!call_filter_check_discard(call, entry, buffer, event))
+		__buffer_unlock_commit(buffer, event);
+}
+
 static void
 ftrace_trace_userstack(struct trace_array *tr,
 		       struct trace_buffer *buffer, unsigned int trace_ctx)
@@ -3093,6 +3191,11 @@ ftrace_trace_userstack(struct trace_array *tr,
 	if (unlikely(in_nmi()))
 		return;
 
+	if (tr->trace_flags & TRACE_ITER_USERSTACKTRACE_DELAY) {
+		ftrace_trace_userstack_delay(tr, buffer, trace_ctx);
+		return;
+	}
+
 	/*
 	 * prevent recursion, since the user stack tracing may
 	 * trigger other kernel events.
@@ -9623,6 +9726,8 @@ trace_array_create_systems(const char *name, const char *systems,
 
 	list_add(&tr->list, &ftrace_trace_arrays);
 
+	unwind_user_register(&tr->unwinder, trace_user_unwind_callback);
+
 	tr->ref++;
 
 	return tr;
@@ -9742,6 +9847,8 @@ static int __remove_instance(struct trace_array *tr)
 	if (tr->ref > 1 || (tr->current_trace && tr->trace_ref))
 		return -EBUSY;
 
+	unwind_user_unregister(&tr->unwinder);
+
 	list_del(&tr->list);
 
 	/* Disable all the flags that were enabled coming in */
@@ -10779,6 +10886,8 @@ __init static int tracer_alloc_buffers(void)
 
 	test_can_verify();
 
+	unwind_user_register(&global_trace.unwinder, trace_user_unwind_callback);
+
 	return 0;
 
 out_free_pipe_cpumask:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 266740b..9c17f9ce 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -8,6 +8,7 @@
 #include <linux/sched.h>
 #include <linux/clocksource.h>
 #include <linux/ring_buffer.h>
+#include <linux/unwind_user.h>
 #include <linux/mmiotrace.h>
 #include <linux/tracepoint.h>
 #include <linux/ftrace.h>
@@ -48,7 +49,10 @@ enum trace_type {
 	TRACE_GRAPH_ENT,
 	TRACE_GRAPH_RETADDR_ENT,
 	TRACE_USER_STACK,
+	/* trace-cmd manually adds blktrace after USER_STACK */
 	TRACE_BLK,
+	TRACE_USER_UNWIND_STACK,
+	TRACE_USER_UNWIND_COOKIE,
 	TRACE_BPUTS,
 	TRACE_HWLAT,
 	TRACE_OSNOISE,
@@ -92,7 +96,10 @@ enum trace_type {
 #define __array_desc(type, container, item, size)
 
 #undef __dynamic_array
-#define __dynamic_array(type, item)	type	item[];
+#define __dynamic_array(type, item)	u32	__data_loc_##item;
+/* __dynamic_array is a u32 locator word; __dynamic_field an inline type[] */
+#undef __dynamic_field
+#define __dynamic_field(type, item)	type	item[];
 
 #undef __rel_dynamic_array
 #define __rel_dynamic_array(type, item)	type	item[];
@@ -422,6 +429,7 @@ struct trace_array {
 	struct cond_snapshot	*cond_snapshot;
 #endif
 	struct trace_func_repeats	__percpu *last_func_repeats;
+	struct unwind_callback	unwinder;
 	/*
 	 * On boot up, the ring buffer is set to the minimum size, so that
 	 * we do not waste memory on systems that are not using tracing.
@@ -499,6 +507,9 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct ctx_switch_entry, 0);	\
 		IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK);	\
 		IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
+		IF_ASSIGN(var, ent, struct userunwind_stack_entry, TRACE_USER_UNWIND_STACK);\
+		IF_ASSIGN(var, ent, struct userunwind_cookie_entry, TRACE_USER_UNWIND_COOKIE);\
+		IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
 		IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);	\
 		IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT);	\
 		IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS);	\
@@ -1321,6 +1332,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
 		C(PRINTK,		"trace_printk"),	\
 		C(ANNOTATE,		"annotate"),		\
 		C(USERSTACKTRACE,	"userstacktrace"),	\
+		C(USERSTACKTRACE_DELAY,	"userstacktrace_delay"),\
 		C(SYM_USEROBJ,		"sym-userobj"),		\
 		C(PRINTK_MSGONLY,	"printk-msg-only"),	\
 		C(CONTEXT_INFO,		"context-info"),   /* Print pid/cpu/time */ \
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 82fd174..e928704 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -247,6 +247,34 @@ FTRACE_ENTRY(user_stack, userstack_entry,
 		 (void *)__entry->caller[6], (void *)__entry->caller[7])
 );
 
+/* Deferred user stack: per-frame stack/inodes/dev arrays tagged by a cookie */
+FTRACE_ENTRY(user_unwind_stack, userunwind_stack_entry,
+	TRACE_USER_UNWIND_STACK,
+
+	F_STRUCT(
+		__field(		u64,		cookie	)
+		__dynamic_array(	unsigned long,	stack	)
+		__dynamic_array(	unsigned long,	inodes	)
+		__dynamic_array(	unsigned int,	dev	)
+	),
+
+	F_printk("cookie=%llx\n%s%s%s", __entry->cookie,
+		 __print_dynamic_array(stack, sizeof(unsigned long)),
+		 __print_dynamic_array(inodes, sizeof(unsigned long)),
+		 __print_dynamic_array(dev, sizeof(unsigned int)))
+);
+
+/* Marker logged at trace time; unwind_user_deferred() is handed &cookie */
+FTRACE_ENTRY(user_unwind_cookie, userunwind_cookie_entry,
+	TRACE_USER_UNWIND_COOKIE,
+
+	F_STRUCT(
+		__field(		u64,		cookie	)
+	),
+
+	F_printk("cookie=%llx", __entry->cookie)
+);
+
 /*
  * trace_printk entry:
  */
@@ -257,7 +285,7 @@ FTRACE_ENTRY(bprint, bprint_entry,
 	F_STRUCT(
 		__field(	unsigned long,	ip	)
 		__field(	const char *,	fmt	)
-		__dynamic_array(	u32,	buf	)
+		__dynamic_field(	u32,	buf	)
 	),
 
 	F_printk("%ps: %s",
@@ -270,7 +298,7 @@ FTRACE_ENTRY_REG(print, print_entry,
 
 	F_STRUCT(
 		__field(	unsigned long,	ip	)
-		__dynamic_array(	char,	buf	)
+		__dynamic_field(	char,	buf	)
 	),
 
 	F_printk("%ps: %s",
@@ -285,7 +313,7 @@ FTRACE_ENTRY(raw_data, raw_data_entry,
 
 	F_STRUCT(
 		__field(	unsigned int,	id	)
-		__dynamic_array(	char,	buf	)
+		__dynamic_field(	char,	buf	)
 	),
 
 	F_printk("id:%04x %08x",
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 1698fc2..831999f 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -58,7 +58,10 @@ static int ftrace_event_register(struct trace_event_call *call,
 #define __array_desc(type, container, item, size)	type item[size];
 
 #undef __dynamic_array
-#define __dynamic_array(type, item)			type item[];
+#define __dynamic_array(type, item)			u32 __data_loc_##item;
+/* __dynamic_array is a u32 locator word; __dynamic_field an inline type[] */
+#undef __dynamic_field
+#define __dynamic_field(type, item)			type item[];
 
 #undef F_STRUCT
 #define F_STRUCT(args...)				args
@@ -66,6 +69,16 @@ static int ftrace_event_register(struct trace_event_call *call,
 #undef F_printk
 #define F_printk(fmt, args...) fmt, args
 
+/* Only used for ftrace event format output; this stub just returns NULL */
+static inline char * __print_dynamic_array(int array, size_t size)
+{
+	return NULL;
+}
+/* Forward the array name to the stub as its __entry->__data_loc_<array> word */
+#undef __print_dynamic_array
+#define __print_dynamic_array(array, el_size)				\
+	__print_dynamic_array(__entry->__data_loc_##array, el_size)
+
 #undef FTRACE_ENTRY
 #define FTRACE_ENTRY(name, struct_name, id, tstruct, print)		\
 struct ____ftrace_##name {						\
@@ -74,6 +87,7 @@ struct ____ftrace_##name {						\
 static void __always_unused ____ftrace_check_##name(void)		\
 {									\
 	struct ____ftrace_##name *__entry = NULL;			\
+	struct trace_seq __maybe_unused *p = NULL;			\
 									\
 	/* force compile-time check on F_printk() */			\
 	printk(print);							\
@@ -125,6 +139,12 @@ static void __always_unused ____ftrace_check_##name(void)		\
 
 #undef __dynamic_array
 #define __dynamic_array(_type, _item) {					\
+	.type = "__data_loc " #_type "[]", .name = #_item,		\
+	.size = sizeof(u32), .align = __alignof__(u32), /* the u32 loc word */ \
+	is_signed_type(_type), .filter_type = FILTER_OTHER },
+
+#undef __dynamic_field	/* described as "<type>[]" with size 0 */
+#define __dynamic_field(_type, _item) {					\
 	.type = #_type "[]", .name = #_item,				\
 	.size = 0, .align = __alignof__(_type),				\
 	is_signed_type(_type), .filter_type = FILTER_OTHER },
@@ -164,6 +184,9 @@ static struct trace_event_fields ftrace_event_fields_##name[] = {	\
 #undef __dynamic_array
 #define __dynamic_array(type, item)
 
+#undef __dynamic_field	/* expands to nothing in this pass */
+#define __dynamic_field(type, item)
+
 #undef F_printk
 #define F_printk(fmt, args...) __stringify(fmt) ", "  __stringify(args)
 
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index da748b7..d504790 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1278,6 +1278,56 @@ static struct trace_event trace_stack_event = {
 };
 
 /* TRACE_USER_STACK */
+/*
+ * Print a TRACE_USER_UNWIND_STACK entry: its cookie, then one " => " line
+ * per stored stack value, stopping at a zero value or on seq overflow.
+ */
+static enum print_line_t trace_user_unwind_stack_print(struct trace_iterator *iter,
+						int flags, struct trace_event *event)
+{
+	struct userunwind_stack_entry *field;
+	struct trace_seq *s = &iter->seq;
+	unsigned long *caller;
+	unsigned int offset, nr, i;
+
+	trace_assign_type(field, iter->ent);
+
+	trace_seq_puts(s, "<user stack unwind>\n");
+	trace_seq_printf(s, "cookie=%llx\n", field->cookie);
+
+	/* __data_loc word: payload offset in bits 0-15, byte length above */
+	offset = field->__data_loc_stack & 0xffff;
+	nr = (field->__data_loc_stack >> 16) / sizeof(*caller);
+	caller = (void *)iter->ent + offset;
+
+	/* Walk elements, not bytes: caller[] is indexed in unsigned longs */
+	for (i = 0; i < nr; i++) {
+		unsigned long ip = caller[i];
+
+		if (!ip || trace_seq_has_overflowed(s))
+			break;
+
+		trace_seq_puts(s, " => ");
+		seq_print_user_ip(s, NULL, ip, flags);
+		trace_seq_putc(s, '\n');
+	}
+
+	return trace_handle_return(s);
+}
+
+/* Print a TRACE_USER_UNWIND_COOKIE entry as a single "cookie=<hex>" line */
+static enum print_line_t trace_user_unwind_cookie_print(struct trace_iterator *iter,
+						 int flags, struct trace_event *event)
+{
+	struct trace_seq *seq = &iter->seq;
+	struct userunwind_cookie_entry *entry;
+
+	trace_assign_type(entry, iter->ent);
+	trace_seq_printf(seq, "cookie=%llx\n", entry->cookie);
+
+	return trace_handle_return(seq);
+}
+
 static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
 						int flags, struct trace_event *event)
 {
@@ -1321,6 +1371,24 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
 	return trace_handle_return(s);
 }
 
+static struct trace_event_functions trace_userunwind_stack_funcs = {
+	.trace		= trace_user_unwind_stack_print,
+};
+/* Pairs the TRACE_USER_UNWIND_STACK entry type with the callbacks above */
+static struct trace_event trace_userunwind_stack_event = {
+	.type		= TRACE_USER_UNWIND_STACK,
+	.funcs		= &trace_userunwind_stack_funcs,
+};
+
+static struct trace_event_functions trace_userunwind_cookie_funcs = {
+	.trace		= trace_user_unwind_cookie_print,
+};
+/* Pairs the TRACE_USER_UNWIND_COOKIE entry type with the callbacks above */
+static struct trace_event trace_userunwind_cookie_event = {
+	.type		= TRACE_USER_UNWIND_COOKIE,
+	.funcs		= &trace_userunwind_cookie_funcs,
+};
+
 static struct trace_event_functions trace_user_stack_funcs = {
 	.trace		= trace_user_stack_print,
 };
@@ -1720,6 +1788,8 @@ static struct trace_event *events[] __initdata = {
 	&trace_ctx_event,
 	&trace_wake_event,
 	&trace_stack_event,
+	&trace_userunwind_cookie_event,
+	&trace_userunwind_stack_event,
 	&trace_user_stack_event,
 	&trace_bputs_event,
 	&trace_bprint_event,