| From: Peter Zijlstra <peterz@infradead.org> |
| Date: Fri, 23 Jan 2015 12:24:14 +0100 |
| Subject: perf: Fix event->ctx locking |
| |
| commit f63a8daa5812afef4f06c962351687e1ff9ccb2b upstream. |
| |
| There have been a few reported issues wrt. the lack of locking around |
| changing event->ctx. This patch tries to address those. |
| |
| It avoids the whole rwsem thing; and while it appears to work, please |
| give it some thought in review. |
| |
| What I did fail at is sensible runtime checks on the use of |
| event->ctx; the RCU use makes it very hard. |
| |
| Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> |
| Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> |
| Cc: Jiri Olsa <jolsa@redhat.com> |
| Cc: Arnaldo Carvalho de Melo <acme@kernel.org> |
| Cc: Linus Torvalds <torvalds@linux-foundation.org> |
| Link: http://lkml.kernel.org/r/20150123125834.209535886@infradead.org |
| Signed-off-by: Ingo Molnar <mingo@kernel.org> |
| Signed-off-by: Ben Hutchings <ben@decadent.org.uk> |
| --- |
| kernel/events/core.c | 244 +++++++++++++++++++++++++++++++++++++++++++-------- |
| 1 file changed, 207 insertions(+), 37 deletions(-) |
| |
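| Reviewer note (below the cut, not part of the commit message or the patch): |
| the heart of the change is the lock-and-recheck idiom in |
| perf_event_ctx_lock(). A minimal userspace sketch of that idiom follows; the |
| types and names are simplified stand-ins, a plain atomic pointer load stands |
| in for the RCU dereference, and the RCU-deferred freeing that lets the kernel |
| touch ctx->refcount safely is left out entirely. |
| |
| /* |
|  * Hypothetical analogue of perf_event_ctx_lock(): pin the current ctx with a |
|  * reference, take its mutex, then verify event->ctx still points at it; if |
|  * not, unwind and retry. |
|  */ |
| #include <pthread.h> |
| #include <stdatomic.h> |
| |
| struct ctx { |
|         pthread_mutex_t mutex; |
|         atomic_int refcount;            /* 0 means the ctx is going away */ |
| }; |
| |
| struct event { |
|         struct ctx *_Atomic ctx;        /* may be swizzled by another thread */ |
| }; |
| |
| static void put_ctx(struct ctx *c) |
| { |
|         /* the kernel frees the ctx (RCU-deferred) when this drops to zero */ |
|         atomic_fetch_sub(&c->refcount, 1); |
| } |
| |
| static struct ctx *event_ctx_lock(struct event *e) |
| { |
|         struct ctx *c; |
|         int ref; |
| |
| again: |
|         c = atomic_load(&e->ctx); |
| |
|         /* inc_not_zero: pin the ctx unless it is already dying */ |
|         ref = atomic_load(&c->refcount); |
|         do { |
|                 if (ref == 0) |
|                         goto again; |
|         } while (!atomic_compare_exchange_weak(&c->refcount, &ref, ref + 1)); |
| |
|         pthread_mutex_lock(&c->mutex); |
|         if (atomic_load(&e->ctx) != c) { |
|                 /* lost a race with a ctx change: drop everything and retry */ |
|                 pthread_mutex_unlock(&c->mutex); |
|                 put_ctx(c); |
|                 goto again; |
|         } |
|         return c;                       /* caller holds c->mutex + a reference */ |
| } |
| |
| static void event_ctx_unlock(struct event *e, struct ctx *c) |
| { |
|         (void)e; |
|         pthread_mutex_unlock(&c->mutex); |
|         put_ctx(c); |
| } |
| |
| The recheck under the mutex is the important part: holding a reference only |
| guarantees the old ctx stays around, not that it is still event->ctx once we |
| finally get the mutex. |
| |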
| --- a/kernel/events/core.c |
| +++ b/kernel/events/core.c |
| @@ -903,6 +903,77 @@ static void put_ctx(struct perf_event_co |
| } |
| |
| /* |
| + * Because of perf_event::ctx migration in sys_perf_event_open::move_group and |
| + * perf_pmu_migrate_context() we need some magic. |
| + * |
| + * Those places that change perf_event::ctx will hold both |
| + * perf_event_context::mutex of the 'old' and 'new' ctx values. |
| + * |
| + * Lock ordering is by mutex address. There is one other site where |
| + * perf_event_context::mutex nests and that is put_event(). But remember that |
| + * that is a parent<->child context relation, and migration does not affect |
| + * children, therefore these two orderings should not interact. |
| + * |
| + * The change in perf_event::ctx does not affect children (as claimed above) |
| + * because the sys_perf_event_open() case will install a new event and break |
| + * the ctx parent<->child relation, and perf_pmu_migrate_context() is only |
| + * concerned with cpuctx and that doesn't have children. |
| + * |
| + * The places that change perf_event::ctx will issue: |
| + * |
| + * perf_remove_from_context(); |
| + * synchronize_rcu(); |
| + * perf_install_in_context(); |
| + * |
| + * to effect the change. The remove_from_context() + synchronize_rcu() should |
| + * quiesce the event, after which we can install it in the new location. This |
| + * means that only external vectors (perf_fops, prctl) can perturb the event |
| + * while in transit. Therefore all such accessors should also acquire |
| + * perf_event_context::mutex to serialize against this. |
| + * |
| + * However, because event->ctx can change while we're waiting to acquire |
| + * ctx->mutex we must be careful and use the below perf_event_ctx_lock() |
| + * function. |
| + * |
| + * Lock order: |
| + * task_struct::perf_event_mutex |
| + * perf_event_context::mutex |
| + * perf_event_context::lock |
| + * perf_event::child_mutex |
| + * perf_event::mmap_mutex |
| + * mmap_sem |
| + */ |
| +static struct perf_event_context *perf_event_ctx_lock(struct perf_event *event) |
| +{ |
| + struct perf_event_context *ctx; |
| + |
| +again: |
| + rcu_read_lock(); |
| + ctx = ACCESS_ONCE(event->ctx); |
| + if (!atomic_inc_not_zero(&ctx->refcount)) { |
| + rcu_read_unlock(); |
| + goto again; |
| + } |
| + rcu_read_unlock(); |
| + |
| + mutex_lock(&ctx->mutex); |
| + if (event->ctx != ctx) { |
| + mutex_unlock(&ctx->mutex); |
| + put_ctx(ctx); |
| + goto again; |
| + } |
| + |
| + return ctx; |
| +} |
| + |
| +static void perf_event_ctx_unlock(struct perf_event *event, |
| + struct perf_event_context *ctx) |
| +{ |
| + mutex_unlock(&ctx->mutex); |
| + put_ctx(ctx); |
| +} |
| + |
| +/* |
| * This must be done under the ctx->lock, such as to serialize against |
| * context_equiv(), therefore we cannot call put_ctx() since that might end up |
| * calling scheduler related locks and ctx->lock nests inside those. |
| @@ -1606,7 +1677,7 @@ int __perf_event_disable(void *info) |
| * is the current context on this CPU and preemption is disabled, |
| * hence we can't get into perf_event_task_sched_out for this context. |
| */ |
| -void perf_event_disable(struct perf_event *event) |
| +static void _perf_event_disable(struct perf_event *event) |
| { |
| struct perf_event_context *ctx = event->ctx; |
| struct task_struct *task = ctx->task; |
| @@ -1647,6 +1718,19 @@ retry: |
| } |
| raw_spin_unlock_irq(&ctx->lock); |
| } |
| + |
| +/* |
| + * Strictly speaking kernel users cannot create groups and therefore this |
| + * interface does not need the perf_event_ctx_lock() magic. |
| + */ |
| +void perf_event_disable(struct perf_event *event) |
| +{ |
| + struct perf_event_context *ctx; |
| + |
| + ctx = perf_event_ctx_lock(event); |
| + _perf_event_disable(event); |
| + perf_event_ctx_unlock(event, ctx); |
| +} |
| EXPORT_SYMBOL_GPL(perf_event_disable); |
| |
| static void perf_set_shadow_time(struct perf_event *event, |
| @@ -2107,7 +2191,7 @@ unlock: |
| * perf_event_for_each_child or perf_event_for_each as described |
| * for perf_event_disable. |
| */ |
| -void perf_event_enable(struct perf_event *event) |
| +static void _perf_event_enable(struct perf_event *event) |
| { |
| struct perf_event_context *ctx = event->ctx; |
| struct task_struct *task = ctx->task; |
| @@ -2163,9 +2247,21 @@ retry: |
| out: |
| raw_spin_unlock_irq(&ctx->lock); |
| } |
| + |
| +/* |
| + * See perf_event_disable(); |
| + */ |
| +void perf_event_enable(struct perf_event *event) |
| +{ |
| + struct perf_event_context *ctx; |
| + |
| + ctx = perf_event_ctx_lock(event); |
| + _perf_event_enable(event); |
| + perf_event_ctx_unlock(event, ctx); |
| +} |
| EXPORT_SYMBOL_GPL(perf_event_enable); |
| |
| -int perf_event_refresh(struct perf_event *event, int refresh) |
| +static int _perf_event_refresh(struct perf_event *event, int refresh) |
| { |
| /* |
| * not supported on inherited events |
| @@ -2174,10 +2270,25 @@ int perf_event_refresh(struct perf_event |
| return -EINVAL; |
| |
| atomic_add(refresh, &event->event_limit); |
| - perf_event_enable(event); |
| + _perf_event_enable(event); |
| |
| return 0; |
| } |
| + |
| +/* |
| + * See perf_event_disable() |
| + */ |
| +int perf_event_refresh(struct perf_event *event, int refresh) |
| +{ |
| + struct perf_event_context *ctx; |
| + int ret; |
| + |
| + ctx = perf_event_ctx_lock(event); |
| + ret = _perf_event_refresh(event, refresh); |
| + perf_event_ctx_unlock(event, ctx); |
| + |
| + return ret; |
| +} |
| EXPORT_SYMBOL_GPL(perf_event_refresh); |
| |
| static void ctx_sched_out(struct perf_event_context *ctx, |
| @@ -3373,7 +3484,16 @@ static void put_event(struct perf_event |
| rcu_read_unlock(); |
| |
| if (owner) { |
| - mutex_lock(&owner->perf_event_mutex); |
| + /* |
| + * If we're here through perf_event_exit_task() we're already |
| + * holding ctx->mutex, which would be an inversion wrt. the |
| + * normal lock order. |
| + * |
| + * However, we can safely take this lock because it's the child |
| + * ctx->mutex. |
| + */ |
| + mutex_lock_nested(&owner->perf_event_mutex, SINGLE_DEPTH_NESTING); |
| + |
| /* |
| * We have to re-check the event->owner field, if it is cleared |
| * we raced with perf_event_exit_task(), acquiring the mutex |
| @@ -3449,12 +3569,13 @@ static int perf_event_read_group(struct |
| u64 read_format, char __user *buf) |
| { |
| struct perf_event *leader = event->group_leader, *sub; |
| - int n = 0, size = 0, ret = -EFAULT; |
| struct perf_event_context *ctx = leader->ctx; |
| - u64 values[5]; |
| + int n = 0, size = 0, ret; |
| u64 count, enabled, running; |
| + u64 values[5]; |
| + |
| + lockdep_assert_held(&ctx->mutex); |
| |
| - mutex_lock(&ctx->mutex); |
| count = perf_event_read_value(leader, &enabled, &running); |
| |
| values[n++] = 1 + leader->nr_siblings; |
| @@ -3469,7 +3590,7 @@ static int perf_event_read_group(struct |
| size = n * sizeof(u64); |
| |
| if (copy_to_user(buf, values, size)) |
| - goto unlock; |
| + return -EFAULT; |
| |
| ret = size; |
| |
| @@ -3483,14 +3604,11 @@ static int perf_event_read_group(struct |
| size = n * sizeof(u64); |
| |
| if (copy_to_user(buf + ret, values, size)) { |
| - ret = -EFAULT; |
| - goto unlock; |
| + return -EFAULT; |
| } |
| |
| ret += size; |
| } |
| -unlock: |
| - mutex_unlock(&ctx->mutex); |
| |
| return ret; |
| } |
| @@ -3549,8 +3667,14 @@ static ssize_t |
| perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) |
| { |
| struct perf_event *event = file->private_data; |
| + struct perf_event_context *ctx; |
| + int ret; |
| |
| - return perf_read_hw(event, buf, count); |
| + ctx = perf_event_ctx_lock(event); |
| + ret = perf_read_hw(event, buf, count); |
| + perf_event_ctx_unlock(event, ctx); |
| + |
| + return ret; |
| } |
| |
| static unsigned int perf_poll(struct file *file, poll_table *wait) |
| @@ -3574,7 +3698,7 @@ static unsigned int perf_poll(struct fil |
| return events; |
| } |
| |
| -static void perf_event_reset(struct perf_event *event) |
| +static void _perf_event_reset(struct perf_event *event) |
| { |
| (void)perf_event_read(event); |
| local64_set(&event->count, 0); |
| @@ -3593,6 +3717,7 @@ static void perf_event_for_each_child(st |
| struct perf_event *child; |
| |
| WARN_ON_ONCE(event->ctx->parent_ctx); |
| + |
| mutex_lock(&event->child_mutex); |
| func(event); |
| list_for_each_entry(child, &event->child_list, child_list) |
| @@ -3606,14 +3731,13 @@ static void perf_event_for_each(struct p |
| struct perf_event_context *ctx = event->ctx; |
| struct perf_event *sibling; |
| |
| - WARN_ON_ONCE(ctx->parent_ctx); |
| - mutex_lock(&ctx->mutex); |
| + lockdep_assert_held(&ctx->mutex); |
| + |
| event = event->group_leader; |
| |
| perf_event_for_each_child(event, func); |
| list_for_each_entry(sibling, &event->sibling_list, group_entry) |
| perf_event_for_each_child(sibling, func); |
| - mutex_unlock(&ctx->mutex); |
| } |
| |
| struct period_event { |
| @@ -3725,25 +3849,24 @@ static int perf_event_set_output(struct |
| struct perf_event *output_event); |
| static int perf_event_set_filter(struct perf_event *event, void __user *arg); |
| |
| -static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) |
| +static long _perf_ioctl(struct perf_event *event, unsigned int cmd, unsigned long arg) |
| { |
| - struct perf_event *event = file->private_data; |
| void (*func)(struct perf_event *); |
| u32 flags = arg; |
| |
| switch (cmd) { |
| case PERF_EVENT_IOC_ENABLE: |
| - func = perf_event_enable; |
| + func = _perf_event_enable; |
| break; |
| case PERF_EVENT_IOC_DISABLE: |
| - func = perf_event_disable; |
| + func = _perf_event_disable; |
| break; |
| case PERF_EVENT_IOC_RESET: |
| - func = perf_event_reset; |
| + func = _perf_event_reset; |
| break; |
| |
| case PERF_EVENT_IOC_REFRESH: |
| - return perf_event_refresh(event, arg); |
| + return _perf_event_refresh(event, arg); |
| |
| case PERF_EVENT_IOC_PERIOD: |
| return perf_event_period(event, (u64 __user *)arg); |
| @@ -3790,6 +3913,19 @@ static long perf_ioctl(struct file *file |
| return 0; |
| } |
| |
| +static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) |
| +{ |
| + struct perf_event *event = file->private_data; |
| + struct perf_event_context *ctx; |
| + long ret; |
| + |
| + ctx = perf_event_ctx_lock(event); |
| + ret = _perf_ioctl(event, cmd, arg); |
| + perf_event_ctx_unlock(event, ctx); |
| + |
| + return ret; |
| +} |
| + |
| #ifdef CONFIG_COMPAT |
| static long perf_compat_ioctl(struct file *file, unsigned int cmd, |
| unsigned long arg) |
| @@ -3812,11 +3948,15 @@ static long perf_compat_ioctl(struct fil |
| |
| int perf_event_task_enable(void) |
| { |
| + struct perf_event_context *ctx; |
| struct perf_event *event; |
| |
| mutex_lock(¤t->perf_event_mutex); |
| - list_for_each_entry(event, ¤t->perf_event_list, owner_entry) |
| - perf_event_for_each_child(event, perf_event_enable); |
| + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) { |
| + ctx = perf_event_ctx_lock(event); |
| + perf_event_for_each_child(event, _perf_event_enable); |
| + perf_event_ctx_unlock(event, ctx); |
| + } |
| mutex_unlock(¤t->perf_event_mutex); |
| |
| return 0; |
| @@ -3824,11 +3964,15 @@ int perf_event_task_enable(void) |
| |
| int perf_event_task_disable(void) |
| { |
| + struct perf_event_context *ctx; |
| struct perf_event *event; |
| |
| mutex_lock(¤t->perf_event_mutex); |
| - list_for_each_entry(event, ¤t->perf_event_list, owner_entry) |
| - perf_event_for_each_child(event, perf_event_disable); |
| + list_for_each_entry(event, ¤t->perf_event_list, owner_entry) { |
| + ctx = perf_event_ctx_lock(event); |
| + perf_event_for_each_child(event, _perf_event_disable); |
| + perf_event_ctx_unlock(event, ctx); |
| + } |
| mutex_unlock(¤t->perf_event_mutex); |
| |
| return 0; |
| @@ -7158,6 +7302,15 @@ out: |
| return ret; |
| } |
| |
| +static void mutex_lock_double(struct mutex *a, struct mutex *b) |
| +{ |
| + if (b < a) |
| + swap(a, b); |
| + |
| + mutex_lock(a); |
| + mutex_lock_nested(b, SINGLE_DEPTH_NESTING); |
| +} |
| + |
| /** |
| * sys_perf_event_open - open a performance event, associate it to a task/cpu |
| * |
| @@ -7173,7 +7326,7 @@ SYSCALL_DEFINE5(perf_event_open, |
| struct perf_event *group_leader = NULL, *output_event = NULL; |
| struct perf_event *event, *sibling; |
| struct perf_event_attr attr; |
| - struct perf_event_context *ctx; |
| + struct perf_event_context *ctx, *uninitialized_var(gctx); |
| struct file *event_file = NULL; |
| struct fd group = {NULL, 0}; |
| struct task_struct *task = NULL; |
| @@ -7369,9 +7522,14 @@ SYSCALL_DEFINE5(perf_event_open, |
| } |
| |
| if (move_group) { |
| - struct perf_event_context *gctx = group_leader->ctx; |
| + gctx = group_leader->ctx; |
| + |
| + /* |
| + * See perf_event_ctx_lock() for comments on the details |
| + * of swizzling perf_event::ctx. |
| + */ |
| + mutex_lock_double(&gctx->mutex, &ctx->mutex); |
| |
| - mutex_lock(&gctx->mutex); |
| perf_remove_from_context(group_leader, false); |
| |
| /* |
| @@ -7386,15 +7544,19 @@ SYSCALL_DEFINE5(perf_event_open, |
| perf_event__state_init(sibling); |
| put_ctx(gctx); |
| } |
| - mutex_unlock(&gctx->mutex); |
| - put_ctx(gctx); |
| + } else { |
| + mutex_lock(&ctx->mutex); |
| } |
| |
| WARN_ON_ONCE(ctx->parent_ctx); |
| - mutex_lock(&ctx->mutex); |
| |
| if (move_group) { |
| + /* |
| + * Wait for everybody to stop referencing the events through |
| + * the old lists, before installing it on new lists. |
| + */ |
| synchronize_rcu(); |
| + |
| perf_install_in_context(ctx, group_leader, group_leader->cpu); |
| get_ctx(ctx); |
| list_for_each_entry(sibling, &group_leader->sibling_list, |
| @@ -7406,6 +7568,11 @@ SYSCALL_DEFINE5(perf_event_open, |
| |
| perf_install_in_context(ctx, event, event->cpu); |
| perf_unpin_context(ctx); |
| + |
| + if (move_group) { |
| + mutex_unlock(&gctx->mutex); |
| + put_ctx(gctx); |
| + } |
| mutex_unlock(&ctx->mutex); |
| |
| put_online_cpus(); |
| @@ -7508,7 +7675,11 @@ void perf_pmu_migrate_context(struct pmu |
| src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx; |
| dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx; |
| |
| - mutex_lock(&src_ctx->mutex); |
| + /* |
| + * See perf_event_ctx_lock() for comments on the details |
| + * of swizzling perf_event::ctx. |
| + */ |
| + mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); |
| list_for_each_entry_safe(event, tmp, &src_ctx->event_list, |
| event_entry) { |
| perf_remove_from_context(event, false); |
| @@ -7516,11 +7687,9 @@ void perf_pmu_migrate_context(struct pmu |
| put_ctx(src_ctx); |
| list_add(&event->migrate_entry, &events); |
| } |
| - mutex_unlock(&src_ctx->mutex); |
| |
| synchronize_rcu(); |
| |
| - mutex_lock(&dst_ctx->mutex); |
| list_for_each_entry_safe(event, tmp, &events, migrate_entry) { |
| list_del(&event->migrate_entry); |
| if (event->state >= PERF_EVENT_STATE_OFF) |
| @@ -7530,6 +7699,7 @@ void perf_pmu_migrate_context(struct pmu |
| get_ctx(dst_ctx); |
| } |
| mutex_unlock(&dst_ctx->mutex); |
| + mutex_unlock(&src_ctx->mutex); |
| } |
| EXPORT_SYMBOL_GPL(perf_pmu_migrate_context); |
| |
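| Post-diff note (not part of the patch): mutex_lock_double() above avoids |
| deadlock by always acquiring the two context mutexes in ascending address |
| order, so the move_group path in sys_perf_event_open() and |
| perf_pmu_migrate_context() can never lock the same pair of contexts in |
| opposite order. A userspace analogue, with illustrative names and assuming |
| the two mutexes are distinct: |
| |
| #include <pthread.h> |
| #include <stdint.h> |
| |
| /* Take the lower-addressed mutex first, so any two threads locking the same |
|  * pair agree on the acquisition order. Assumes a != b, as the kernel callers |
|  * do. */ |
| static void lock_double(pthread_mutex_t *a, pthread_mutex_t *b) |
| { |
|         if ((uintptr_t)b < (uintptr_t)a) { |
|                 pthread_mutex_t *tmp = a; |
|                 a = b; |
|                 b = tmp; |
|         } |
|         pthread_mutex_lock(a); |
|         pthread_mutex_lock(b); |
| } |
| |
| static void unlock_double(pthread_mutex_t *a, pthread_mutex_t *b) |
| { |
|         /* release order does not matter for deadlock avoidance */ |
|         pthread_mutex_unlock(b); |
|         pthread_mutex_unlock(a); |
| } |
| |
| A migration would then look like: lock_double(&src->mutex, &dst->mutex), move |
| the events, unlock both; the same shape as the perf_pmu_migrate_context() |
| hunk above (minus the lockdep SINGLE_DEPTH_NESTING annotation the kernel |
| needs). |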