tracing: POC on top of sframe
Sorta works
Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index be62f0e..7991b32 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3076,6 +3076,104 @@ EXPORT_SYMBOL_GPL(trace_dump_stack);
#ifdef CONFIG_USER_STACKTRACE_SUPPORT
static DEFINE_PER_CPU(int, user_stack_count);
+/*
+ * trace_user_unwind_callback - record an unwound user space stack trace
+ * @trace:	the unwound stack (trace->nr addresses in trace->entries[])
+ * @ctx_cookie:	value stored in the event's cookie field
+ * @data:	the trace_array to record into
+ *
+ * Writes one TRACE_USER_UNWIND_STACK event made of the fixed entry followed
+ * by three parallel arrays of trace->nr elements: "stack", "inodes", "dev".
+ * For an address that lies inside a VMA, stack[] holds
+ * (addr - vm_start) + (vm_pgoff << PAGE_SHIFT) and inodes[]/dev[] hold the
+ * inode number and device of the backing file.  In every other case (no mm,
+ * no VMA, no backing file) stack[] holds the raw address and/or
+ * inodes[]/dev[] are zero.
+ *
+ * NOTE(review): mmap_read_lock() may sleep and is taken here between
+ * reserving and committing the ring buffer event - confirm that this is
+ * allowed in the context this callback runs in.
+ */
+static void trace_user_unwind_callback(struct unwind_stacktrace *trace,
+				       u64 ctx_cookie, void *data)
+{
+	struct trace_event_call *call = &event_user_unwind_stack;
+	struct trace_array *tr = data;
+	struct trace_buffer *buffer = tr->array_buffer.buffer;
+	struct userunwind_stack_entry *entry;
+	struct ring_buffer_event *event;
+	struct mm_struct *mm = current->mm;
+	unsigned int trace_ctx;
+	struct vm_area_struct *vma = NULL;
+	unsigned long *caller;
+	unsigned long *inodes;
+	unsigned int *devs;
+	unsigned int offset;
+	int len;
+	int i;
+
+	/* Fixed part + two unsigned long arrays + one unsigned int array */
+	len = trace->nr * (sizeof(unsigned long) * 2 + sizeof(unsigned int))
+		+ sizeof(*entry);
+
+	trace_ctx = tracing_gen_ctx();
+	event = __trace_buffer_lock_reserve(buffer, TRACE_USER_UNWIND_STACK,
+					    len, trace_ctx);
+	if (!event)
+		return;
+
+	entry = ring_buffer_event_data(event);
+
+	entry->cookie = ctx_cookie;
+
+	/*
+	 * Each __data_loc word holds the array's offset from the start of
+	 * the entry in its low 16 bits and its byte length in the high 16.
+	 */
+	offset = sizeof(*entry);
+	len = sizeof(unsigned long) * trace->nr;
+
+	entry->__data_loc_stack = offset | (len << 16);
+	caller = (void *)entry + offset;
+
+	offset += len;
+	entry->__data_loc_inodes = offset | (len << 16);
+	inodes = (void *)entry + offset;
+
+	offset += len;
+	len = sizeof(unsigned int) * trace->nr;
+	entry->__data_loc_dev = offset | (len << 16);
+	devs = (void *)entry + offset;
+
+	/*
+	 * Hold the mmap lock over the whole walk: @vma is cached from one
+	 * iteration to the next and must not be looked at once the lock
+	 * has been dropped.
+	 */
+	if (mm)
+		mmap_read_lock(mm);
+
+	for (i = 0; i < trace->nr; i++) {
+		unsigned long addr = trace->entries[i];
+
+		/*
+		 * Always write every slot so that no stale ring buffer
+		 * contents are exposed (e.g. for anonymous mappings).
+		 */
+		caller[i] = addr;
+		inodes[i] = 0;
+		devs[i] = 0;
+
+		if (!mm)
+			continue;
+
+		/* Only look up a new VMA when addr is outside the cached one */
+		if (!vma || addr < vma->vm_start || addr >= vma->vm_end)
+			vma = vma_lookup(mm, addr);
+
+		if (!vma)
+			continue;
+
+		caller[i] = (addr - vma->vm_start) + (vma->vm_pgoff << PAGE_SHIFT);
+		if (vma->vm_file && vma->vm_file->f_inode) {
+			inodes[i] = vma->vm_file->f_inode->i_ino;
+			devs[i] = vma->vm_file->f_inode->i_sb->s_dev;
+		}
+	}
+
+	if (mm)
+		mmap_read_unlock(mm);
+
+	if (!call_filter_check_discard(call, entry, buffer, event))
+		__buffer_unlock_commit(buffer, event);
+}
+
+/*
+ * ftrace_trace_userstack_delay - request a deferred user stack trace
+ * @tr:		trace array whose unwinder is used
+ * @buffer:	ring buffer to record the cookie event into
+ * @trace_ctx:	tracing context flags for the event
+ *
+ * Records a TRACE_USER_UNWIND_COOKIE event whose cookie is filled in by
+ * unwind_user_deferred().  Presumably the same cookie is later handed to
+ * the registered unwind callback so the two events can be matched -
+ * confirm against the unwind_user API.
+ */
+static void
+ftrace_trace_userstack_delay(struct trace_array *tr,
+			     struct trace_buffer *buffer, unsigned int trace_ctx)
+{
+	struct trace_event_call *call = &event_user_unwind_cookie;
+	struct userunwind_cookie_entry *entry;
+	struct ring_buffer_event *event;
+
+	event = __trace_buffer_lock_reserve(buffer, TRACE_USER_UNWIND_COOKIE,
+					    sizeof(*entry), trace_ctx);
+	if (!event)
+		return;
+	entry = ring_buffer_event_data(event);
+
+	/*
+	 * The result of unwind_user_deferred() is not checked here, so make
+	 * sure the event never carries stale ring buffer bytes if the
+	 * request fails before a cookie is written.
+	 */
+	entry->cookie = 0;
+	unwind_user_deferred(&tr->unwinder, &entry->cookie, tr);
+
+	if (!call_filter_check_discard(call, entry, buffer, event))
+		__buffer_unlock_commit(buffer, event);
+}
+
static void
ftrace_trace_userstack(struct trace_array *tr,
struct trace_buffer *buffer, unsigned int trace_ctx)
@@ -3093,6 +3191,11 @@ ftrace_trace_userstack(struct trace_array *tr,
if (unlikely(in_nmi()))
return;
+ if (tr->trace_flags & TRACE_ITER_USERSTACKTRACE_DELAY) {
+ ftrace_trace_userstack_delay(tr, buffer, trace_ctx);
+ return;
+ }
+
/*
* prevent recursion, since the user stack tracing may
* trigger other kernel events.
@@ -9623,6 +9726,8 @@ trace_array_create_systems(const char *name, const char *systems,
list_add(&tr->list, &ftrace_trace_arrays);
+ unwind_user_register(&tr->unwinder, trace_user_unwind_callback);
+
tr->ref++;
return tr;
@@ -9742,6 +9847,8 @@ static int __remove_instance(struct trace_array *tr)
if (tr->ref > 1 || (tr->current_trace && tr->trace_ref))
return -EBUSY;
+ unwind_user_unregister(&tr->unwinder);
+
list_del(&tr->list);
/* Disable all the flags that were enabled coming in */
@@ -10779,6 +10886,8 @@ __init static int tracer_alloc_buffers(void)
test_can_verify();
+ unwind_user_register(&global_trace.unwinder, trace_user_unwind_callback);
+
return 0;
out_free_pipe_cpumask:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 266740b..9c17f9ce 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -8,6 +8,7 @@
#include <linux/sched.h>
#include <linux/clocksource.h>
#include <linux/ring_buffer.h>
+#include <linux/unwind_user.h>
#include <linux/mmiotrace.h>
#include <linux/tracepoint.h>
#include <linux/ftrace.h>
@@ -48,7 +49,10 @@ enum trace_type {
TRACE_GRAPH_ENT,
TRACE_GRAPH_RETADDR_ENT,
TRACE_USER_STACK,
+ /* trace-cmd manually adds blktrace after USER_STACK */
TRACE_BLK,
+ TRACE_USER_UNWIND_STACK,
+ TRACE_USER_UNWIND_COOKIE,
TRACE_BPUTS,
TRACE_HWLAT,
TRACE_OSNOISE,
@@ -92,7 +96,10 @@ enum trace_type {
#define __array_desc(type, container, item, size)
+/*
+ * A __dynamic_array() is represented in the entry structure by a single
+ * u32 "__data_loc_<item>" word instead of by the array itself.
+ */
#undef __dynamic_array
-#define __dynamic_array(type, item) type item[];
+#define __dynamic_array(type, item) u32 __data_loc_##item;
+
+/* A __dynamic_field() is a trailing "type item[]" array member */
+#undef __dynamic_field
+#define __dynamic_field(type, item) type item[];
#undef __rel_dynamic_array
#define __rel_dynamic_array(type, item) type item[];
@@ -422,6 +429,7 @@ struct trace_array {
struct cond_snapshot *cond_snapshot;
#endif
struct trace_func_repeats __percpu *last_func_repeats;
+ struct unwind_callback unwinder;
/*
* On boot up, the ring buffer is set to the minimum size, so that
* we do not waste memory on systems that are not using tracing.
@@ -499,6 +507,9 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct ctx_switch_entry, 0); \
IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK); \
IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
+ IF_ASSIGN(var, ent, struct userunwind_stack_entry, TRACE_USER_UNWIND_STACK);\
+ IF_ASSIGN(var, ent, struct userunwind_cookie_entry, TRACE_USER_UNWIND_COOKIE);\
+ IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \
@@ -1321,6 +1332,7 @@ extern int trace_get_user(struct trace_parser *parser, const char __user *ubuf,
C(PRINTK, "trace_printk"), \
C(ANNOTATE, "annotate"), \
C(USERSTACKTRACE, "userstacktrace"), \
+ C(USERSTACKTRACE_DELAY, "userstacktrace_delay"),\
C(SYM_USEROBJ, "sym-userobj"), \
C(PRINTK_MSGONLY, "printk-msg-only"), \
C(CONTEXT_INFO, "context-info"), /* Print pid/cpu/time */ \
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 82fd174..e928704 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -247,6 +247,34 @@ FTRACE_ENTRY(user_stack, userstack_entry,
(void *)__entry->caller[6], (void *)__entry->caller[7])
);
+/*
+ * User space stack trace produced by the user unwinder:
+ *  cookie - identifies the request this stack trace belongs to
+ *  stack  - one unsigned long per frame
+ *  inodes - one unsigned long per frame
+ *  dev    - one unsigned int per frame
+ */
+FTRACE_ENTRY(user_unwind_stack, userunwind_stack_entry,
+
+	TRACE_USER_UNWIND_STACK,
+
+	F_STRUCT(
+		__field(		u64,		cookie	)
+		__dynamic_array(	unsigned long,	stack	)
+		__dynamic_array(	unsigned long,	inodes	)
+		__dynamic_array(	unsigned int,	dev	)
+	),
+
+	/* The element size passed for each array must match its type above */
+	F_printk("cookie=%lld\n%s%s%s", __entry->cookie,
+		 __print_dynamic_array(stack, sizeof(unsigned long)),
+		 __print_dynamic_array(inodes, sizeof(unsigned long)),
+		 __print_dynamic_array(dev, sizeof(unsigned int)))
+);
+
+/*
+ * Marker event of type TRACE_USER_UNWIND_COOKIE.  It carries only the u64
+ * cookie; the user_unwind_stack event has a cookie field as well.
+ */
+FTRACE_ENTRY(user_unwind_cookie, userunwind_cookie_entry,
+
+ TRACE_USER_UNWIND_COOKIE,
+
+ F_STRUCT(
+ __field( u64, cookie )
+ ),
+
+ F_printk("cookie=%lld", __entry->cookie)
+);
+
/*
* trace_printk entry:
*/
@@ -257,7 +285,7 @@ FTRACE_ENTRY(bprint, bprint_entry,
F_STRUCT(
__field( unsigned long, ip )
__field( const char *, fmt )
- __dynamic_array( u32, buf )
+ __dynamic_field( u32, buf )
),
F_printk("%ps: %s",
@@ -270,7 +298,7 @@ FTRACE_ENTRY_REG(print, print_entry,
F_STRUCT(
__field( unsigned long, ip )
- __dynamic_array( char, buf )
+ __dynamic_field( char, buf )
),
F_printk("%ps: %s",
@@ -285,7 +313,7 @@ FTRACE_ENTRY(raw_data, raw_data_entry,
F_STRUCT(
__field( unsigned int, id )
- __dynamic_array( char, buf )
+ __dynamic_field( char, buf )
),
F_printk("id:%04x %08x",
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 1698fc2..831999f 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -58,7 +58,10 @@ static int ftrace_event_register(struct trace_event_call *call,
#define __array_desc(type, container, item, size) type item[size];
+/*
+ * In the structure used for the compile-time F_printk() check, a
+ * __dynamic_array() is a u32 "__data_loc_<item>" word and a
+ * __dynamic_field() is a trailing "type item[]" array member.
+ */
#undef __dynamic_array
-#define __dynamic_array(type, item) type item[];
+#define __dynamic_array(type, item) u32 __data_loc_##item;
+
+#undef __dynamic_field
+#define __dynamic_field(type, item) type item[];
#undef F_STRUCT
#define F_STRUCT(args...) args
@@ -66,6 +69,16 @@ static int ftrace_event_register(struct trace_event_call *call,
#undef F_printk
#define F_printk(fmt, args...) fmt, args
+/*
+ * Only used for ftrace event format output.
+ *
+ * The function is a stub that always returns NULL; it lets F_printk()
+ * arguments that use __print_dynamic_array() pass the printk() compile-time
+ * check.  The macro defined below deliberately has the same name: the
+ * preprocessor does not re-expand a macro inside its own expansion, so it
+ * becomes a call to the stub with the entry's __data_loc_<array> word.
+ */
+static inline char * __print_dynamic_array(int array, size_t size)
+{
+ return NULL;
+}
+
+#undef __print_dynamic_array
+#define __print_dynamic_array(array, el_size) \
+ __print_dynamic_array(__entry->__data_loc_##array, el_size)
+
#undef FTRACE_ENTRY
#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \
struct ____ftrace_##name { \
@@ -74,6 +87,7 @@ struct ____ftrace_##name { \
static void __always_unused ____ftrace_check_##name(void) \
{ \
struct ____ftrace_##name *__entry = NULL; \
+ struct trace_seq __maybe_unused *p = NULL; \
\
/* force compile-time check on F_printk() */ \
printk(print); \
@@ -125,6 +139,12 @@ static void __always_unused ____ftrace_check_##name(void) \
+/*
+ * Field descriptors:
+ *  __dynamic_array() describes the u32 __data_loc word that locates the
+ *  array, so its size and alignment are those of a u32.
+ *  __dynamic_field() describes a trailing array of unknown length (size 0).
+ */
#undef __dynamic_array
#define __dynamic_array(_type, _item) { \
+	.type = "__data_loc " #_type "[]", .name = #_item, \
+	.size = sizeof(u32), .align = __alignof__(u32), \
+	is_signed_type(_type), .filter_type = FILTER_OTHER },
+
+#undef __dynamic_field
+#define __dynamic_field(_type, _item) { \
.type = #_type "[]", .name = #_item, \
.size = 0, .align = __alignof__(_type), \
is_signed_type(_type), .filter_type = FILTER_OTHER },
@@ -164,6 +184,9 @@ static struct trace_event_fields ftrace_event_fields_##name[] = { \
#undef __dynamic_array
#define __dynamic_array(type, item)
+/* Like __dynamic_array(), __dynamic_field() expands to nothing here */
+#undef __dynamic_field
+#define __dynamic_field(type, item)
+
#undef F_printk
#define F_printk(fmt, args...) __stringify(fmt) ", " __stringify(args)
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index da748b7..d504790 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -1278,6 +1278,56 @@ static struct trace_event trace_stack_event = {
};
/* TRACE_USER_STACK */
+/*
+ * Print a TRACE_USER_UNWIND_STACK event: a header line, the cookie in hex,
+ * then one " => <ip>" line per element of the "stack" dynamic array.
+ * Printing stops early at a zero entry or when the trace_seq overflows.
+ */
+static enum print_line_t trace_user_unwind_stack_print(struct trace_iterator *iter,
+						       int flags, struct trace_event *event)
+{
+	struct userunwind_stack_entry *field;
+	struct trace_seq *s = &iter->seq;
+	unsigned long *caller;
+	unsigned int offset;
+	unsigned int len;
+	unsigned int nr;
+	unsigned int i;
+
+	trace_assign_type(field, iter->ent);
+
+	trace_seq_puts(s, "<user stack unwind>\n");
+
+	trace_seq_printf(s, "cookie=%llx\n", field->cookie);
+
+	/*
+	 * The stack field is a dynamic pointer: the low 16 bits are the
+	 * offset of the array from the start of the entry and the high
+	 * 16 bits are its length in bytes.
+	 */
+	offset = field->__data_loc_stack;
+	len = offset >> 16;
+	offset = offset & 0xffff;
+
+	caller = (void *)iter->ent + offset;
+
+	/* len counts bytes, but caller[] is indexed by element */
+	nr = len / sizeof(*caller);
+
+	for (i = 0; i < nr; i++) {
+		unsigned long ip = caller[i];
+
+		if (!ip || trace_seq_has_overflowed(s))
+			break;
+
+		trace_seq_puts(s, " => ");
+		seq_print_user_ip(s, NULL, ip, flags);
+		trace_seq_putc(s, '\n');
+	}
+
+	return trace_handle_return(s);
+}
+
+/* Print a TRACE_USER_UNWIND_COOKIE event as a single "cookie=<hex>" line */
+static enum print_line_t trace_user_unwind_cookie_print(struct trace_iterator *iter,
+							int flags, struct trace_event *event)
+{
+	struct trace_seq *seq = &iter->seq;
+	struct userunwind_cookie_entry *cookie_ent;
+
+	trace_assign_type(cookie_ent, iter->ent);
+	trace_seq_printf(seq, "cookie=%llx\n", cookie_ent->cookie);
+
+	return trace_handle_return(seq);
+}
+
static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
int flags, struct trace_event *event)
{
@@ -1321,6 +1371,24 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
return trace_handle_return(s);
}
+/* Output callbacks and event descriptor for TRACE_USER_UNWIND_STACK */
+static struct trace_event_functions trace_userunwind_stack_funcs = {
+ .trace = trace_user_unwind_stack_print,
+};
+
+static struct trace_event trace_userunwind_stack_event = {
+ .type = TRACE_USER_UNWIND_STACK,
+ .funcs = &trace_userunwind_stack_funcs,
+};
+
+/* Output callbacks and event descriptor for TRACE_USER_UNWIND_COOKIE */
+static struct trace_event_functions trace_userunwind_cookie_funcs = {
+ .trace = trace_user_unwind_cookie_print,
+};
+
+static struct trace_event trace_userunwind_cookie_event = {
+ .type = TRACE_USER_UNWIND_COOKIE,
+ .funcs = &trace_userunwind_cookie_funcs,
+};
+
static struct trace_event_functions trace_user_stack_funcs = {
.trace = trace_user_stack_print,
};
@@ -1720,6 +1788,8 @@ static struct trace_event *events[] __initdata = {
&trace_ctx_event,
&trace_wake_event,
&trace_stack_event,
+ &trace_userunwind_cookie_event,
+ &trace_userunwind_stack_event,
&trace_user_stack_event,
&trace_bputs_event,
&trace_bprint_event,