| From 6c16f5a7abd6dc840351abc8d5d48b9c04eeb80c Mon Sep 17 00:00:00 2001 |
| From: Jiri Olsa <jolsa@redhat.com> |
| Date: Sun, 23 Sep 2018 18:13:43 +0200 |
| Subject: perf/ring_buffer: Prevent concurrent ring buffer access |
| |
| [ Upstream commit cd6fb677ce7e460c25bdd66f689734102ec7d642 ] |
| |
| Some of the scheduling tracepoints allow the perf_tp_event |
| code to write to the ring buffer of a different cpu than the |
| one the code is running on. |
| |
| This results in corrupted ring buffer data demonstrated in |
| following perf commands: |
| |
| # perf record -e 'sched:sched_switch,sched:sched_wakeup' perf bench sched messaging |
| # Running 'sched/messaging' benchmark: |
| # 20 sender and receiver processes per group |
| # 10 groups == 400 processes run |
| |
| Total time: 0.383 [sec] |
| [ perf record: Woken up 8 times to write data ] |
| 0x42b890 [0]: failed to process type: -1765585640 |
| [ perf record: Captured and wrote 4.825 MB perf.data (29669 samples) ] |
| |
| # perf report --stdio |
| 0x42b890 [0]: failed to process type: -1765585640 |
| |
| The corruption is caused by some of the scheduling tracepoints |
| that have __perf_task defined and thus allow data to be stored to |
| another cpu's ring buffer: |
| |
| sched_waking |
| sched_wakeup |
| sched_wakeup_new |
| sched_stat_wait |
| sched_stat_sleep |
| sched_stat_iowait |
| sched_stat_blocked |
| |
| The perf_tp_event function first stores samples for the current |
| cpu's events defined for the tracepoint: |
| |
| hlist_for_each_entry_rcu(event, head, hlist_entry) |
| perf_swevent_event(event, count, &data, regs); |
| |
| It then iterates over the events of the 'task' and stores the sample |
| for any of the task's events that pass the tracepoint checks: |
| |
| ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]); |
| |
| list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { |
| if (event->attr.type != PERF_TYPE_TRACEPOINT) |
| continue; |
| if (event->attr.config != entry->type) |
| continue; |
| |
| perf_swevent_event(event, count, &data, regs); |
| } |
| |
| The above code can race with the same code running on another cpu, |
| ending up with 2 cpus trying to store into the same ring |
| buffer, which is specifically not allowed. |
| |
| This patch prevents the problem by allowing only events whose cpu |
| matches the current cpu to receive the event. |
| |
| NOTE: this requires the use of (per-task-)per-cpu buffers for this |
| feature to work; perf-record does this. |
| |
| Signed-off-by: Jiri Olsa <jolsa@kernel.org> |
| [peterz: small edits to Changelog] |
| Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> |
| Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com> |
| Cc: Andrew Vagin <avagin@openvz.org> |
| Cc: Arnaldo Carvalho de Melo <acme@kernel.org> |
| Cc: Arnaldo Carvalho de Melo <acme@redhat.com> |
| Cc: Jiri Olsa <jolsa@redhat.com> |
| Cc: Linus Torvalds <torvalds@linux-foundation.org> |
| Cc: Namhyung Kim <namhyung@kernel.org> |
| Cc: Peter Zijlstra <peterz@infradead.org> |
| Cc: Stephane Eranian <eranian@google.com> |
| Cc: Thomas Gleixner <tglx@linutronix.de> |
| Cc: Vince Weaver <vincent.weaver@maine.edu> |
| Fixes: e6dab5ffab59 ("perf/trace: Add ability to set a target task for events") |
| Link: http://lkml.kernel.org/r/20180923161343.GB15054@krava |
| Signed-off-by: Ingo Molnar <mingo@kernel.org> |
| Signed-off-by: Sasha Levin <sashal@kernel.org> |
| --- |
| kernel/events/core.c | 2 ++ |
| 1 file changed, 2 insertions(+) |
| |
| diff --git a/kernel/events/core.c b/kernel/events/core.c |
| index 95bd00d9f2c3..3caf1a863a0b 100644 |
| --- a/kernel/events/core.c |
| +++ b/kernel/events/core.c |
| @@ -7737,6 +7737,8 @@ void perf_tp_event(u16 event_type, u64 count, void *record, int entry_size, |
| goto unlock; |
| |
| list_for_each_entry_rcu(event, &ctx->event_list, event_entry) { |
| + if (event->cpu != smp_processor_id()) |
| + continue; |
| if (event->attr.type != PERF_TYPE_TRACEPOINT) |
| continue; |
| if (event->attr.config != entry->type) |
| -- |
| 2.17.1 |
| |