Merge branch 'trace/ftrace/core' into trace/for-next
diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index bd36598..2479b2a 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -2003,6 +2003,32 @@
360.774530 | 1) 0.594 us | __phys_addr();
+The function name is always displayed after the closing brace
+of a function if the start of that function is not in the
+trace buffer.
+
+Display of the function name after the closing brace may be
+enabled for functions whose start is in the trace buffer,
+allowing easier searching with grep for function durations
+(see the grep example below). It is disabled by default.
+
+  hide: echo nofuncgraph-tail > trace_options
+  show: echo funcgraph-tail > trace_options
+
+ Example with nofuncgraph-tail (default):
+ 0) | putname() {
+ 0) | kmem_cache_free() {
+ 0) 0.518 us | __phys_addr();
+ 0) 1.757 us | }
+ 0) 2.861 us | }
+
+ Example with funcgraph-tail:
+ 0) | putname() {
+ 0) | kmem_cache_free() {
+ 0) 0.518 us | __phys_addr();
+ 0) 1.757 us | } /* kmem_cache_free() */
+ 0) 2.861 us | } /* putname() */
+
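+ With funcgraph-tail enabled, every completed call of a function,
+ along with its duration, can be found in the trace with a single
+ grep (a usage sketch, run from the tracing directory):
+
+   grep '/\* kmem_cache_free' trace
+   0) 1.757 us | } /* kmem_cache_free() */
+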
You can put some comments on specific functions by using
trace_printk(). For example, if you want to put a comment inside
the __might_sleep() function, you just have to include
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8639819..d440935 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -535,6 +535,36 @@
Say N, unless you absolutely know what you are doing.
+config TRACEPOINT_BENCHMARK
+ bool "Add tracepoint that benchmarks tracepoints"
+ help
+ This option creates the tracepoint "benchmark:benchmark_event".
+ When the tracepoint is enabled, it kicks off a kernel thread that
+ goes into an infinite loop (calling cond_resched() to let other tasks
+ run), and calls the tracepoint. Each iteration records the time
+ it took to write to the tracepoint, and on the next iteration that
+ data is passed to the tracepoint itself. That is, the tracepoint
+ reports the time it took to do the previous tracepoint write.
+ The string written to the tracepoint is a static string of 128 bytes
+ to keep the time the same. The initial string is simply a write of
+ "START". The second string records the cold cache time of the first
+ write, which is not added to the rest of the calculations.
+
+ As it is a tight loop, it benchmarks as hot cache. That's fine because
+ we care most about hot paths that are probably in cache already.
+
+ An example of the output:
+
+ START
+ first=3672 [COLD CACHED]
+ last=632 first=3672 max=632 min=632 avg=316 std=446 std^2=199712
+ last=278 first=3672 max=632 min=278 avg=303 std=316 std^2=100337
+ last=277 first=3672 max=632 min=277 avg=296 std=258 std^2=67064
+ last=273 first=3672 max=632 min=273 avg=292 std=224 std^2=50411
+ last=273 first=3672 max=632 min=273 avg=288 std=200 std^2=40389
+ last=281 first=3672 max=632 min=273 avg=287 std=183 std^2=33666
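+
+ The tracepoint can be enabled like any other trace event (a usage
+ sketch; this assumes debugfs is mounted at /sys/kernel/debug):
+
+   echo 1 > /sys/kernel/debug/tracing/events/benchmark/benchmark_event/enable
+   cat /sys/kernel/debug/tracing/trace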
+
config RING_BUFFER_BENCHMARK
tristate "Ring buffer benchmark stress tester"
depends on RING_BUFFER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 1378e84..2611613 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -17,6 +17,7 @@
KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
endif
+CFLAGS_trace_benchmark.o := -I$(src)
CFLAGS_trace_events_filter.o := -I$(src)
obj-$(CONFIG_TRACE_CLOCK) += trace_clock.o
@@ -62,4 +63,6 @@
obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o
+obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
+
libftrace-y := ftrace.o
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0543169..626dbfd 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1441,12 +1441,12 @@
void trace_stop_cmdline_recording(void);
-static void trace_save_cmdline(struct task_struct *tsk)
+static int trace_save_cmdline(struct task_struct *tsk)
{
unsigned pid, idx;
if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
- return;
+ return 0;
/*
* It's not the end of the world if we don't get
@@ -1455,7 +1455,7 @@
* so if we miss here, then better luck next time.
*/
if (!arch_spin_trylock(&trace_cmdline_lock))
- return;
+ return 0;
idx = map_pid_to_cmdline[tsk->pid];
if (idx == NO_CMDLINE_MAP) {
@@ -1480,9 +1480,11 @@
memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
arch_spin_unlock(&trace_cmdline_lock);
+
+ return 1;
}
-void trace_find_cmdline(int pid, char comm[])
+static void __trace_find_cmdline(int pid, char comm[])
{
unsigned map;
@@ -1501,13 +1503,19 @@
return;
}
- preempt_disable();
- arch_spin_lock(&trace_cmdline_lock);
map = map_pid_to_cmdline[pid];
if (map != NO_CMDLINE_MAP)
strcpy(comm, saved_cmdlines[map]);
else
strcpy(comm, "<...>");
+}
+
+void trace_find_cmdline(int pid, char comm[])
+{
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+
+ __trace_find_cmdline(pid, comm);
arch_spin_unlock(&trace_cmdline_lock);
preempt_enable();
@@ -1521,9 +1529,8 @@
if (!__this_cpu_read(trace_cmdline_save))
return;
- __this_cpu_write(trace_cmdline_save, false);
-
- trace_save_cmdline(tsk);
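+	/*
+	 * Only clear the flag when the cmdline was actually saved;
+	 * trace_save_cmdline() fails when the lock is contended or the
+	 * pid is out of range, so we retry on the next event.
+	 */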
+ if (trace_save_cmdline(tsk))
+ __this_cpu_write(trace_cmdline_save, false);
}
void
@@ -1975,7 +1982,21 @@
if (alloc_percpu_trace_buffer())
return;
- pr_info("ftrace: Allocated trace_printk buffers\n");
+ /* trace_printk() is for debug use only. Don't use it in production. */
+
+ pr_warning("\n**********************************************************\n");
+ pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
+ pr_warning("** **\n");
+ pr_warning("** trace_printk() being used. Allocating extra memory. **\n");
+ pr_warning("** **\n");
+ pr_warning("** This means that this is a DEBUG kernel and it is **\n");
+ pr_warning("** unsafe for produciton use. **\n");
+ pr_warning("** **\n");
+ pr_warning("** If you see this message and you are not debugging **\n");
+ pr_warning("** the kernel, report this immediately to your vendor! **\n");
+ pr_warning("** **\n");
+ pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n");
+ pr_warning("**********************************************************\n");
/* Expand the buffers to set size */
tracing_update_buffers();
@@ -3685,55 +3706,79 @@
.llseek = generic_file_llseek,
};
-static ssize_t
-tracing_saved_cmdlines_read(struct file *file, char __user *ubuf,
- size_t cnt, loff_t *ppos)
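+/*
+ * The saved_cmdlines file is implemented as a seq_file iterator over
+ * map_cmdline_to_pid[]. The cmdline lock is held from ->start() to
+ * ->stop(), so ->show() can use the lockless __trace_find_cmdline().
+ */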
+static void *saved_cmdlines_next(struct seq_file *m, void *v, loff_t *pos)
{
- char *buf_comm;
- char *file_buf;
- char *buf;
- int len = 0;
- int pid;
- int i;
+ unsigned int *ptr = v;
- file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL);
- if (!file_buf)
- return -ENOMEM;
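+	/*
+	 * On the first call (*pos == 0 and nothing buffered yet) start
+	 * scanning at the current slot; otherwise step past the entry
+	 * that was just shown.
+	 */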
+ if (*pos || m->count)
+ ptr++;
- buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL);
- if (!buf_comm) {
- kfree(file_buf);
- return -ENOMEM;
- }
+ (*pos)++;
- buf = file_buf;
-
- for (i = 0; i < SAVED_CMDLINES; i++) {
- int r;
-
- pid = map_cmdline_to_pid[i];
- if (pid == -1 || pid == NO_CMDLINE_MAP)
+ for (; ptr < &map_cmdline_to_pid[SAVED_CMDLINES]; ptr++) {
+ if (*ptr == -1 || *ptr == NO_CMDLINE_MAP)
continue;
- trace_find_cmdline(pid, buf_comm);
- r = sprintf(buf, "%d %s\n", pid, buf_comm);
- buf += r;
- len += r;
+ return ptr;
}
- len = simple_read_from_buffer(ubuf, cnt, ppos,
- file_buf, len);
+ return NULL;
+}
- kfree(file_buf);
- kfree(buf_comm);
+static void *saved_cmdlines_start(struct seq_file *m, loff_t *pos)
+{
+ void *v;
+ loff_t l = 0;
- return len;
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+
+ v = &map_cmdline_to_pid[0];
+ while (l <= *pos) {
+ v = saved_cmdlines_next(m, v, &l);
+ if (!v)
+ return NULL;
+ }
+
+ return v;
+}
+
+static void saved_cmdlines_stop(struct seq_file *m, void *v)
+{
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+}
+
+static int saved_cmdlines_show(struct seq_file *m, void *v)
+{
+ char buf[TASK_COMM_LEN];
+ unsigned int *pid = v;
+
+ __trace_find_cmdline(*pid, buf);
+ seq_printf(m, "%d %s\n", *pid, buf);
+ return 0;
+}
+
+static const struct seq_operations tracing_saved_cmdlines_seq_ops = {
+ .start = saved_cmdlines_start,
+ .next = saved_cmdlines_next,
+ .stop = saved_cmdlines_stop,
+ .show = saved_cmdlines_show,
+};
+
+static int tracing_saved_cmdlines_open(struct inode *inode, struct file *filp)
+{
+ if (tracing_disabled)
+ return -ENODEV;
+
+ return seq_open(filp, &tracing_saved_cmdlines_seq_ops);
}
static const struct file_operations tracing_saved_cmdlines_fops = {
- .open = tracing_open_generic,
- .read = tracing_saved_cmdlines_read,
- .llseek = generic_file_llseek,
+ .open = tracing_saved_cmdlines_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = seq_release,
};
static ssize_t
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3b3e09e..217207a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -724,6 +724,8 @@
#define TRACE_GRAPH_PRINT_PROC 0x8
#define TRACE_GRAPH_PRINT_DURATION 0x10
#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
+#define TRACE_GRAPH_PRINT_IRQS 0x40
+#define TRACE_GRAPH_PRINT_TAIL 0x80
#define TRACE_GRAPH_PRINT_FILL_SHIFT 28
#define TRACE_GRAPH_PRINT_FILL_MASK (0x3 << TRACE_GRAPH_PRINT_FILL_SHIFT)
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
new file mode 100644
index 0000000..7dc1c42d
--- /dev/null
+++ b/kernel/trace/trace_benchmark.c
@@ -0,0 +1,176 @@
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/trace_clock.h>
+
+#define CREATE_TRACE_POINTS
+#include "trace_benchmark.h"
+
+static struct task_struct *bm_event_thread;
+
+static char bm_str[BENCHMARK_EVENT_STRLEN] = "START";
+
+static u64 bm_total;
+static u64 bm_totalsq;
+static u64 bm_last;
+static u64 bm_max;
+static u64 bm_min;
+static u64 bm_first;
+static s64 bm_cnt;
+
+/*
+ * This gets called in a loop recording the time it took to write
+ * the tracepoint. What it writes is the time statistics of the last
+ * tracepoint write. As there is nothing to write the first time,
+ * it simply writes "START". As the first write is cold cache and
+ * the rest are hot, we save off that time in bm_first and it is
+ * reported as "first", which is shown in the second write to the
+ * tracepoint. The "first" field is written as part of the statistics
+ * from then on but never changes.
+ */
+static void trace_do_benchmark(void)
+{
+ u64 start;
+ u64 stop;
+ u64 delta;
+ s64 stddev;
+ u64 seed;
+ u64 seedsq;
+ u64 last_seed;
+ unsigned int avg;
+ unsigned int std = 0;
+
+ /* Only run if the tracepoint is actually active */
+ if (!trace_benchmark_event_enabled())
+ return;
+
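+	/*
+	 * Time a single tracepoint write with interrupts disabled so a
+	 * stray interrupt does not inflate the measured delta.
+	 */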
+ local_irq_disable();
+ start = trace_clock_local();
+ trace_benchmark_event(bm_str);
+ stop = trace_clock_local();
+ local_irq_enable();
+
+ bm_cnt++;
+
+ delta = stop - start;
+
+ /*
+ * The first read is cold cached, keep it separate from the
+ * other calculations.
+ */
+ if (bm_cnt == 1) {
+ bm_first = delta;
+ scnprintf(bm_str, BENCHMARK_EVENT_STRLEN,
+ "first=%llu [COLD CACHED]", bm_first);
+ return;
+ }
+
+ bm_last = delta;
+
+ bm_total += delta;
+ bm_totalsq += delta * delta;
+
+ if (delta > bm_max)
+ bm_max = delta;
+ if (!bm_min || delta < bm_min)
+ bm_min = delta;
+
+ if (bm_cnt > 1) {
+ /*
+	 * Compute the sample variance from the running sums:
+ * s^2 = 1 / (n * (n-1)) * (n * \Sum (x_i)^2 - (\Sum x_i)^2)
+ */
+ stddev = (u64)bm_cnt * bm_totalsq - bm_total * bm_total;
+ do_div(stddev, bm_cnt);
+ do_div(stddev, bm_cnt - 1);
+ } else
+ stddev = 0;
+
+ delta = bm_total;
+ do_div(delta, bm_cnt);
+ avg = delta;
+
+ if (stddev > 0) {
+ int i = 0;
+ /*
+ * stddev is the square of standard deviation but
+	 * we want the actual number. Use the average
+ * as our seed to find the std.
+ *
+ * The next try is:
+ * x = (x + N/x) / 2
+ *
+ * Where N is the squared number to find the square
+ * root of.
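+	 * For example, std^2 = 100337 from the sample output in the
+	 * Kconfig help settles on std = 316 after a few iterations.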
+ */
+ seed = avg;
+ do {
+ last_seed = seed;
+ seed = stddev;
+ if (!last_seed)
+ break;
+ do_div(seed, last_seed);
+ seed += last_seed;
+ do_div(seed, 2);
+ } while (i++ < 10 && last_seed != seed);
+
+ std = seed;
+ }
+
+ scnprintf(bm_str, BENCHMARK_EVENT_STRLEN,
+ "last=%llu first=%llu max=%llu min=%llu avg=%u std=%d std^2=%lld",
+ bm_last, bm_first, bm_max, bm_min, avg, std, stddev);
+}
+
+static int benchmark_event_kthread(void *arg)
+{
+ /* sleep a bit to make sure the tracepoint gets activated */
+ msleep(100);
+
+ while (!kthread_should_stop()) {
+
+ trace_do_benchmark();
+
+ /*
+ * We don't go to sleep, but let others
+ * run as well.
+ */
+ cond_resched();
+ }
+
+ return 0;
+}
+
+/*
+ * When the benchmark tracepoint is enabled, it calls this
+ * function and the thread that calls the tracepoint is created.
+ */
+void trace_benchmark_reg(void)
+{
+ bm_event_thread = kthread_run(benchmark_event_kthread,
+ NULL, "event_benchmark");
+	/* kthread_run() returns an ERR_PTR on failure, never NULL */
+	if (WARN_ON(IS_ERR(bm_event_thread)))
+		bm_event_thread = NULL;
+}
+
+/*
+ * When the benchmark tracepoint is disabled, it calls this
+ * function and the thread that calls the tracepoint is deleted
+ * and all the numbers are reset.
+ */
+void trace_benchmark_unreg(void)
+{
+ if (!bm_event_thread)
+ return;
+
+	kthread_stop(bm_event_thread);
+	bm_event_thread = NULL;
+
+ strcpy(bm_str, "START");
+ bm_total = 0;
+ bm_totalsq = 0;
+ bm_last = 0;
+ bm_max = 0;
+ bm_min = 0;
+ bm_cnt = 0;
+ /* bm_first doesn't need to be reset but reset it anyway */
+ bm_first = 0;
+}
diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h
new file mode 100644
index 0000000..3c1df1df
--- /dev/null
+++ b/kernel/trace/trace_benchmark.h
@@ -0,0 +1,41 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM benchmark
+
+#if !defined(_TRACE_BENCHMARK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_BENCHMARK_H
+
+#include <linux/tracepoint.h>
+
+extern void trace_benchmark_reg(void);
+extern void trace_benchmark_unreg(void);
+
+#define BENCHMARK_EVENT_STRLEN 128
+
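+/*
+ * TRACE_EVENT_FN() is TRACE_EVENT() plus register/unregister callbacks:
+ * trace_benchmark_reg() is called when the event is enabled (starting
+ * the benchmark thread) and trace_benchmark_unreg() when it is
+ * disabled.
+ */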
+TRACE_EVENT_FN(benchmark_event,
+
+ TP_PROTO(const char *str),
+
+ TP_ARGS(str),
+
+ TP_STRUCT__entry(
+ __array( char, str, BENCHMARK_EVENT_STRLEN )
+ ),
+
+ TP_fast_assign(
+ memcpy(__entry->str, str, BENCHMARK_EVENT_STRLEN);
+ ),
+
+ TP_printk("%s", __entry->str),
+
+ trace_benchmark_reg, trace_benchmark_unreg
+);
+
+#endif /* _TRACE_BENCHMARK_H */
+
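+/*
+ * This header lives in kernel/trace/ rather than include/trace/events/,
+ * so tell define_trace.h to look in the current directory (made
+ * visible by the -I$(src) added to CFLAGS_trace_benchmark.o in the
+ * Makefile).
+ */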
+#undef TRACE_INCLUDE_FILE
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE trace_benchmark
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index b86dd4d..4de3e57 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -38,15 +38,6 @@
#define TRACE_GRAPH_INDENT 2
-/* Flag options */
-#define TRACE_GRAPH_PRINT_OVERRUN 0x1
-#define TRACE_GRAPH_PRINT_CPU 0x2
-#define TRACE_GRAPH_PRINT_OVERHEAD 0x4
-#define TRACE_GRAPH_PRINT_PROC 0x8
-#define TRACE_GRAPH_PRINT_DURATION 0x10
-#define TRACE_GRAPH_PRINT_ABS_TIME 0x20
-#define TRACE_GRAPH_PRINT_IRQS 0x40
-
static unsigned int max_depth;
static struct tracer_opt trace_opts[] = {
@@ -64,11 +55,13 @@
{ TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
/* Display interrupts */
{ TRACER_OPT(funcgraph-irqs, TRACE_GRAPH_PRINT_IRQS) },
+ /* Display function name after trailing } */
+ { TRACER_OPT(funcgraph-tail, TRACE_GRAPH_PRINT_TAIL) },
{ } /* Empty entry */
};
static struct tracer_flags tracer_flags = {
- /* Don't display overruns and proc by default */
+ /* Don't display overruns, proc, or tail by default */
.val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
TRACE_GRAPH_PRINT_DURATION | TRACE_GRAPH_PRINT_IRQS,
.opts = trace_opts
@@ -1176,9 +1169,10 @@
* If the return function does not have a matching entry,
* then the entry was lost. Instead of just printing
* the '}' and letting the user guess what function this
- * belongs to, write out the function name.
+ * belongs to, write out the function name. Always do
+ * that if the funcgraph-tail option is enabled.
*/
- if (func_match) {
+ if (func_match && !(flags & TRACE_GRAPH_PRINT_TAIL)) {
ret = trace_seq_puts(s, "}\n");
if (!ret)
return TRACE_TYPE_PARTIAL_LINE;