From 61661bafa876a4e1949202354edb9b821c70b63d Mon Sep 17 00:00:00 2001
From: Sasha Levin <sashal@kernel.org>
Date: Mon, 24 Feb 2020 12:47:14 -0800
Subject: signal: avoid double atomic counter increments for user accounting

From: Linus Torvalds <torvalds@linux-foundation.org>

[ Upstream commit fda31c50292a5062332fa0343c084bd9f46604d9 ]

When queueing a signal, we increment both the user's count of pending
signals (for RLIMIT_SIGPENDING tracking) and the refcount of the user
struct itself (because we keep a reference to the user in the signal
structure in order to correctly account for it when freeing).

That turns out to be fairly expensive, because both of them are atomic
updates, and particularly under extreme signal handling pressure on big
machines, you can get a lot of cache contention on the user struct.
That can then cause horrid cacheline ping-pong when you do these
multiple accesses.

So change the reference counting to only pin the user for the _first_
pending signal, and to unpin it when the last pending signal is
dequeued. That means that when a user sees a lot of concurrent signal
queuing - which is the only situation when this matters - the only
atomic access needed is generally the 'sigpending' count update.

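As a rough illustration of the scheme (not kernel code), here is a
minimal userspace sketch of the same pin-on-first/unpin-on-last
pattern using C11 atomics; the accounted_user struct and the
queue_one()/free_one() helpers are made-up names that only model what
the patch does with user_struct, get_uid() and free_uid():

    /*
     * Illustration only, not kernel code: a hypothetical userspace model
     * of the accounting scheme.  The reference is taken only on the
     * 0 -> 1 transition of the pending count and dropped only on the
     * 1 -> 0 transition, mirroring the atomic_inc_return() == 1 and
     * atomic_dec_and_test() checks in the patch below.
     */
    #include <stdatomic.h>
    #include <stdio.h>

    struct accounted_user {
        atomic_int refcount;   /* models the user_struct refcount */
        atomic_int sigpending; /* models user_struct->sigpending  */
    };

    /* Queue one signal: only the first pending signal pins the user. */
    static void queue_one(struct accounted_user *u)
    {
        if (atomic_fetch_add(&u->sigpending, 1) == 0)
            atomic_fetch_add(&u->refcount, 1);
    }

    /* Free one queued signal: only the last pending signal unpins it. */
    static void free_one(struct accounted_user *u)
    {
        if (atomic_fetch_sub(&u->sigpending, 1) == 1)
            atomic_fetch_sub(&u->refcount, 1);
    }

    int main(void)
    {
        struct accounted_user u;

        atomic_init(&u.refcount, 1);   /* base reference held elsewhere */
        atomic_init(&u.sigpending, 0);

        queue_one(&u);  /* refcount 1 -> 2, sigpending 0 -> 1 */
        queue_one(&u);  /* refcount stays 2, sigpending 1 -> 2 */
        free_one(&u);
        free_one(&u);   /* refcount back to 1, sigpending back to 0 */

        printf("refcount=%d sigpending=%d\n",
               atomic_load(&u.refcount), atomic_load(&u.sigpending));
        return 0;
    }

Note that the rlimit check in the patch can also reuse the value
returned by the increment instead of doing a separate atomic_read(),
so the common queueing path is left with a single atomic RMW on the
user struct.
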
This was noticed because of a particularly odd timing artifact on a
dual-socket 96C/192T Cascade Lake platform: when you get into bad
contention, that machine for some reason seems to do much worse when
the contention happens in the upper 32-byte half of the cacheline.

As a result, the kernel test robot will-it-scale 'signal1' benchmark had
an odd performance regression simply due to random alignment of the
'struct user_struct' (and pointed to a completely unrelated and
apparently nonsensical commit for the regression).

Avoiding the double increments (and decrements on the dequeueing side,
of course) makes for much less contention and hugely improved
performance on that will-it-scale microbenchmark.

Quoting Feng Tang:

| "It makes a big difference, that the performance score is tripled! bump |
| from original 17000 to 54000. Also the gap between 5.0-rc6 and |
| 5.0-rc6+Jiri's patch is reduced to around 2%" |
| |
| [ The "2% gap" is the odd cacheline placement difference on that |
| platform: under the extreme contention case, the effect of which half |
| of the cacheline was hot was 5%, so with the reduced contention the |
| odd timing artifact is reduced too ] |
| |
It does help in the non-contended case too, but is not nearly as
noticeable.

Reported-and-tested-by: Feng Tang <feng.tang@intel.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Huang, Ying <ying.huang@intel.com>
Cc: Philip Li <philip.li@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 kernel/signal.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index bcd46f547db39..eea748174ade9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -413,27 +413,32 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
 {
 	struct sigqueue *q = NULL;
 	struct user_struct *user;
+	int sigpending;
 
 	/*
 	 * Protect access to @t credentials. This can go away when all
 	 * callers hold rcu read lock.
+	 *
+	 * NOTE! A pending signal will hold on to the user refcount,
+	 * and we get/put the refcount only when the sigpending count
+	 * changes from/to zero.
 	 */
 	rcu_read_lock();
-	user = get_uid(__task_cred(t)->user);
-	atomic_inc(&user->sigpending);
+	user = __task_cred(t)->user;
+	sigpending = atomic_inc_return(&user->sigpending);
+	if (sigpending == 1)
+		get_uid(user);
 	rcu_read_unlock();
 
-	if (override_rlimit ||
-	    atomic_read(&user->sigpending) <=
-			task_rlimit(t, RLIMIT_SIGPENDING)) {
+	if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
 		q = kmem_cache_alloc(sigqueue_cachep, flags);
 	} else {
 		print_dropped_signal(sig);
 	}
 
 	if (unlikely(q == NULL)) {
-		atomic_dec(&user->sigpending);
-		free_uid(user);
+		if (atomic_dec_and_test(&user->sigpending))
+			free_uid(user);
 	} else {
 		INIT_LIST_HEAD(&q->list);
 		q->flags = 0;
@@ -447,8 +452,8 @@ static void __sigqueue_free(struct sigqueue *q)
 {
 	if (q->flags & SIGQUEUE_PREALLOC)
 		return;
-	atomic_dec(&q->user->sigpending);
-	free_uid(q->user);
+	if (atomic_dec_and_test(&q->user->sigpending))
+		free_uid(q->user);
 	kmem_cache_free(sigqueue_cachep, q);
 }
 
-- 
2.20.1
