From 61661bafa876a4e1949202354edb9b821c70b63d Mon Sep 17 00:00:00 2001
From: Sasha Levin <sashal@kernel.org>
Date: Mon, 24 Feb 2020 12:47:14 -0800
Subject: signal: avoid double atomic counter increments for user accounting

From: Linus Torvalds <torvalds@linux-foundation.org>

[ Upstream commit fda31c50292a5062332fa0343c084bd9f46604d9 ]

When queueing a signal, we increment both the user's count of pending
signals (for RLIMIT_SIGPENDING tracking) and the refcount of the user
struct itself (because we keep a reference to the user in the signal
structure in order to correctly account for it when freeing).

That turns out to be fairly expensive, because both of them are atomic
updates, and particularly under extreme signal handling pressure on big
machines, you can get a lot of cache contention on the user struct.
That can then cause horrid cacheline ping-pong when you do these
multiple accesses.

So change the reference counting to only pin the user for the _first_
pending signal, and to unpin it when the last pending signal is
dequeued. That means that when a user sees a lot of concurrent signal
queuing - which is the only situation when this matters - the only
atomic access needed is generally the 'sigpending' count update.

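As a rough illustration of the scheme (not kernel code), here is a
minimal userspace sketch of the same pin-on-first/unpin-on-last
pattern using C11 atomics; the accounted_user struct and the
queue_one()/free_one() helpers are made-up names that only model what
the patch does with user_struct, get_uid() and free_uid():

    /*
     * Illustration only, not kernel code: a hypothetical userspace model
     * of the accounting scheme.  The reference is taken only on the
     * 0 -> 1 transition of the pending count and dropped only on the
     * 1 -> 0 transition, mirroring the atomic_inc_return() == 1 and
     * atomic_dec_and_test() checks in the patch below.
     */
    #include <stdatomic.h>
    #include <stdio.h>

    struct accounted_user {
        atomic_int refcount;   /* models the user_struct refcount */
        atomic_int sigpending; /* models user_struct->sigpending  */
    };

    /* Queue one signal: only the first pending signal pins the user. */
    static void queue_one(struct accounted_user *u)
    {
        if (atomic_fetch_add(&u->sigpending, 1) == 0)
            atomic_fetch_add(&u->refcount, 1);
    }

    /* Free one queued signal: only the last pending signal unpins it. */
    static void free_one(struct accounted_user *u)
    {
        if (atomic_fetch_sub(&u->sigpending, 1) == 1)
            atomic_fetch_sub(&u->refcount, 1);
    }

    int main(void)
    {
        struct accounted_user u;

        atomic_init(&u.refcount, 1);   /* base reference held elsewhere */
        atomic_init(&u.sigpending, 0);

        queue_one(&u);  /* refcount 1 -> 2, sigpending 0 -> 1 */
        queue_one(&u);  /* refcount stays 2, sigpending 1 -> 2 */
        free_one(&u);
        free_one(&u);   /* refcount back to 1, sigpending back to 0 */

        printf("refcount=%d sigpending=%d\n",
               atomic_load(&u.refcount), atomic_load(&u.sigpending));
        return 0;
    }

Note that the rlimit check in the patch can also reuse the value
returned by the increment instead of doing a separate atomic_read(),
so the common queueing path is left with a single atomic RMW on the
user struct.
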
This was noticed because of a particularly odd timing artifact on a
dual-socket 96C/192T Cascade Lake platform: when you get into bad
contention, that machine for some reason seems to do much worse when
the contention happens in the upper 32-byte half of the cacheline.

As a result, the kernel test robot will-it-scale 'signal1' benchmark had
an odd performance regression simply due to random alignment of the
'struct user_struct' (and pointed to a completely unrelated and
apparently nonsensical commit for the regression).

Avoiding the double increments (and decrements on the dequeueing side,
of course) makes for much less contention and hugely improved
performance on that will-it-scale microbenchmark.

Quoting Feng Tang:

| "It makes a big difference, that the performance score is tripled! bump |
| from original 17000 to 54000. Also the gap between 5.0-rc6 and |
| 5.0-rc6+Jiri's patch is reduced to around 2%" |
| |
| [ The "2% gap" is the odd cacheline placement difference on that |
| platform: under the extreme contention case, the effect of which half |
| of the cacheline was hot was 5%, so with the reduced contention the |
| odd timing artifact is reduced too ] |
| |
It does help in the non-contended case too, but is not nearly as
noticeable.

Reported-and-tested-by: Feng Tang <feng.tang@intel.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Huang, Ying <ying.huang@intel.com>
Cc: Philip Li <philip.li@intel.com>
Cc: Andi Kleen <andi.kleen@intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
---
 kernel/signal.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/kernel/signal.c b/kernel/signal.c
index bcd46f547db39..eea748174ade9 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -413,27 +413,32 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
 {
 	struct sigqueue *q = NULL;
 	struct user_struct *user;
+	int sigpending;
 
 	/*
 	 * Protect access to @t credentials. This can go away when all
 	 * callers hold rcu read lock.
+	 *
+	 * NOTE! A pending signal will hold on to the user refcount,
+	 * and we get/put the refcount only when the sigpending count
+	 * changes from/to zero.
 	 */
 	rcu_read_lock();
-	user = get_uid(__task_cred(t)->user);
-	atomic_inc(&user->sigpending);
+	user = __task_cred(t)->user;
+	sigpending = atomic_inc_return(&user->sigpending);
+	if (sigpending == 1)
+		get_uid(user);
 	rcu_read_unlock();
 
-	if (override_rlimit ||
-	    atomic_read(&user->sigpending) <=
-			task_rlimit(t, RLIMIT_SIGPENDING)) {
+	if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
 		q = kmem_cache_alloc(sigqueue_cachep, flags);
 	} else {
 		print_dropped_signal(sig);
 	}
 
 	if (unlikely(q == NULL)) {
-		atomic_dec(&user->sigpending);
-		free_uid(user);
+		if (atomic_dec_and_test(&user->sigpending))
+			free_uid(user);
 	} else {
 		INIT_LIST_HEAD(&q->list);
 		q->flags = 0;
@@ -447,8 +452,8 @@ static void __sigqueue_free(struct sigqueue *q)
 {
 	if (q->flags & SIGQUEUE_PREALLOC)
 		return;
-	atomic_dec(&q->user->sigpending);
-	free_uid(q->user);
+	if (atomic_dec_and_test(&q->user->sigpending))
+		free_uid(q->user);
 	kmem_cache_free(sigqueue_cachep, q);
 }
 
-- 
2.20.1
