releases/5.13.14/pipe-do-fasync-notifications-for-every-pipe-io-not-just-state-changes.patch - pub/scm/linux/kernel/git/stable/stable-queue - Git at Google

 From fe67f4dd8daa252eb9aa7acb61555f3cc3c1ce4c Mon Sep 17 00:00:00 2001
 From: Linus Torvalds <torvalds@linux-foundation.org>
 Date: Tue, 24 Aug 2021 10:39:25 -0700
 Subject: pipe: do FASYNC notifications for every pipe IO, not just state changes

 From: Linus Torvalds <torvalds@linux-foundation.org>

 commit fe67f4dd8daa252eb9aa7acb61555f3cc3c1ce4c upstream.

 It turns out that the SIGIO/FASYNC situation is almost exactly the same
 as the EPOLLET case was: user space really wants to be notified after
 every operation.

 Now, in a perfect world it should be sufficient to only notify user
 space on "state transitions" when the IO state changes (ie when a pipe
 goes from unreadable to readable, or from unwritable to writable).  User
 space should then do as much as possible - fully emptying the buffer or
 what not - and we'll notify it again the next time the state changes.

 But as with EPOLLET, we have at least one case (stress-ng) where the
 kernel sent SIGIO due to the pipe being marked for asynchronous
 notification, but the user space signal handler then didn't actually
 necessarily read it all before returning (it read more than what was
 written, but since there could be multiple writes, it could leave data
 pending).

 The user space code then expected to get another SIGIO for subsequent
 writes - even though the pipe had been readable the whole time - and
 would only then read more.

 This is arguably a user space bug - and Colin King already fixed the
 stress-ng code in question - but the kernel regression rules are clear:
 it doesn't matter if kernel people think that user space did something
 silly and wrong.  What matters is that it used to work.

 So if user space depends on specific historical kernel behavior, it's a
 regression when that behavior changes.  It's on us: we were silly to
 have that non-optimal historical behavior, and our old kernel behavior
 was what user space was tested against.

 Because of how the FASYNC notification was tied to wakeup behavior, this
 was first broken by commits f467a6a66419 and 1b6b26ae7053 ("pipe: fix
 and clarify pipe read/write wakeup logic"), but at the time it seems
 nobody noticed.  Probably because the stress-ng problem case ends up
 being timing-dependent too.

 It was then unwittingly fixed by commit 3a34b13a88ca ("pipe: make pipe
 writes always wake up readers") only to be broken again when by commit
 3b844826b6c6 ("pipe: avoid unnecessary EPOLLET wakeups under normal
 loads").

 And at that point the kernel test robot noticed the performance
 refression in the stress-ng.sigio.ops_per_sec case.  So the "Fixes" tag
 below is somewhat ad hoc, but it matches when the issue was noticed.

 Fix it for good (knock wood) by simply making the kill_fasync() case
 separate from the wakeup case.  FASYNC is quite rare, and we clearly
 shouldn't even try to use the "avoid unnecessary wakeups" logic for it.

 Link: https://lore.kernel.org/lkml/20210824151337.GC27667@xsang-OptiPlex-9020/
 Fixes: 3b844826b6c6 ("pipe: avoid unnecessary EPOLLET wakeups under normal loads")
 Reported-by: kernel test robot <oliver.sang@intel.com>
 Tested-by: Oliver Sang <oliver.sang@intel.com>
 Cc: Eric Biederman <ebiederm@xmission.com>
 Cc: Colin Ian King <colin.king@canonical.com>
 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
 ---
  fs/pipe.c |   20 ++++++++------------
  1 file changed, 8 insertions(+), 12 deletions(-)

 --- a/fs/pipe.c
 +++ b/fs/pipe.c
 @@ -363,10 +363,9 @@ pipe_read(struct kiocb *iocb, struct iov
  		 * _very_ unlikely case that the pipe was full, but we got
  		 * no data.
  		 */
 -		if (unlikely(was_full)) {
 +		if (unlikely(was_full))
  			wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
 -			kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 -		}
 +		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);

  		/*
  		 * But because we didn't read anything, at this point we can
 @@ -385,12 +384,11 @@ pipe_read(struct kiocb *iocb, struct iov
  		wake_next_reader = false;
  	__pipe_unlock(pipe);

 -	if (was_full) {
 +	if (was_full)
  		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
 -		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 -	}
  	if (wake_next_reader)
  		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
 +	kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
  	if (ret > 0)
  		file_accessed(filp);
  	return ret;
 @@ -565,10 +563,9 @@ pipe_write(struct kiocb *iocb, struct io
  		 * become empty while we dropped the lock.
  		 */
  		__pipe_unlock(pipe);
 -		if (was_empty) {
 +		if (was_empty)
  			wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
 -			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 -		}
 +		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
  		wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
  		__pipe_lock(pipe);
  		was_empty = pipe_empty(pipe->head, pipe->tail);
 @@ -591,10 +588,9 @@ out:
  	 * Epoll nonsensically wants a wakeup whether the pipe
  	 * was already empty or not.
  	 */
 -	if (was_empty || pipe->poll_usage) {
 +	if (was_empty || pipe->poll_usage)
  		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
 -		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 -	}
 +	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
  	if (wake_next_writer)
  		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
  	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
	From fe67f4dd8daa252eb9aa7acb61555f3cc3c1ce4c Mon Sep 17 00:00:00 2001
	From: Linus Torvalds <torvalds@linux-foundation.org>
	Date: Tue, 24 Aug 2021 10:39:25 -0700
	Subject: pipe: do FASYNC notifications for every pipe IO, not just state changes

	From: Linus Torvalds <torvalds@linux-foundation.org>

	commit fe67f4dd8daa252eb9aa7acb61555f3cc3c1ce4c upstream.

	It turns out that the SIGIO/FASYNC situation is almost exactly the same
	as the EPOLLET case was: user space really wants to be notified after
	every operation.

	Now, in a perfect world it should be sufficient to only notify user
	space on "state transitions" when the IO state changes (ie when a pipe
	goes from unreadable to readable, or from unwritable to writable). User
	space should then do as much as possible - fully emptying the buffer or
	what not - and we'll notify it again the next time the state changes.

	But as with EPOLLET, we have at least one case (stress-ng) where the
	kernel sent SIGIO due to the pipe being marked for asynchronous
	notification, but the user space signal handler then didn't actually
	necessarily read it all before returning (it read more than what was
	written, but since there could be multiple writes, it could leave data
	pending).

	The user space code then expected to get another SIGIO for subsequent
	writes - even though the pipe had been readable the whole time - and
	would only then read more.

	This is arguably a user space bug - and Colin King already fixed the
	stress-ng code in question - but the kernel regression rules are clear:
	it doesn't matter if kernel people think that user space did something
	silly and wrong. What matters is that it used to work.

	So if user space depends on specific historical kernel behavior, it's a
	regression when that behavior changes. It's on us: we were silly to
	have that non-optimal historical behavior, and our old kernel behavior
	was what user space was tested against.

	Because of how the FASYNC notification was tied to wakeup behavior, this
	was first broken by commits f467a6a66419 and 1b6b26ae7053 ("pipe: fix
	and clarify pipe read/write wakeup logic"), but at the time it seems
	nobody noticed. Probably because the stress-ng problem case ends up
	being timing-dependent too.

	It was then unwittingly fixed by commit 3a34b13a88ca ("pipe: make pipe
	writes always wake up readers") only to be broken again when by commit
	3b844826b6c6 ("pipe: avoid unnecessary EPOLLET wakeups under normal
	loads").

	And at that point the kernel test robot noticed the performance
	refression in the stress-ng.sigio.ops_per_sec case. So the "Fixes" tag
	below is somewhat ad hoc, but it matches when the issue was noticed.

	Fix it for good (knock wood) by simply making the kill_fasync() case
	separate from the wakeup case. FASYNC is quite rare, and we clearly
	shouldn't even try to use the "avoid unnecessary wakeups" logic for it.

	Link: https://lore.kernel.org/lkml/20210824151337.GC27667@xsang-OptiPlex-9020/
	Fixes: 3b844826b6c6 ("pipe: avoid unnecessary EPOLLET wakeups under normal loads")
	Reported-by: kernel test robot <oliver.sang@intel.com>
	Tested-by: Oliver Sang <oliver.sang@intel.com>
	Cc: Eric Biederman <ebiederm@xmission.com>
	Cc: Colin Ian King <colin.king@canonical.com>
	Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
	Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	---
	fs/pipe.c \| 20 ++++++++------------
	1 file changed, 8 insertions(+), 12 deletions(-)

	--- a/fs/pipe.c
	+++ b/fs/pipe.c
	@@ -363,10 +363,9 @@ pipe_read(struct kiocb *iocb, struct iov
	* _very_ unlikely case that the pipe was full, but we got
	* no data.
	*/
	- if (unlikely(was_full)) {
	+ if (unlikely(was_full))
	wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT \| EPOLLWRNORM);
	- kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	- }
	+ kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);

	/*
	* But because we didn't read anything, at this point we can
	@@ -385,12 +384,11 @@ pipe_read(struct kiocb *iocb, struct iov
	wake_next_reader = false;
	__pipe_unlock(pipe);

	- if (was_full) {
	+ if (was_full)
	wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT \| EPOLLWRNORM);
	- kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	- }
	if (wake_next_reader)
	wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN \| EPOLLRDNORM);
	+ kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
	if (ret > 0)
	file_accessed(filp);
	return ret;
	@@ -565,10 +563,9 @@ pipe_write(struct kiocb *iocb, struct io
	* become empty while we dropped the lock.
	*/
	__pipe_unlock(pipe);
	- if (was_empty) {
	+ if (was_empty)
	wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN \| EPOLLRDNORM);
	- kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	- }
	+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
	__pipe_lock(pipe);
	was_empty = pipe_empty(pipe->head, pipe->tail);
	@@ -591,10 +588,9 @@ out:
	* Epoll nonsensically wants a wakeup whether the pipe
	* was already empty or not.
	*/
	- if (was_empty \|\| pipe->poll_usage) {
	+ if (was_empty \|\| pipe->poll_usage)
	wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN \| EPOLLRDNORM);
	- kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	- }
	+ kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
	if (wake_next_writer)
	wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT \| EPOLLWRNORM);
	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {