| From fe67f4dd8daa252eb9aa7acb61555f3cc3c1ce4c Mon Sep 17 00:00:00 2001 |
| From: Linus Torvalds <torvalds@linux-foundation.org> |
| Date: Tue, 24 Aug 2021 10:39:25 -0700 |
| Subject: pipe: do FASYNC notifications for every pipe IO, not just state changes |
| |
| From: Linus Torvalds <torvalds@linux-foundation.org> |
| |
| commit fe67f4dd8daa252eb9aa7acb61555f3cc3c1ce4c upstream. |
| |
| It turns out that the SIGIO/FASYNC situation is almost exactly the same |
| as the EPOLLET case was: user space really wants to be notified after |
| every operation. |
| |
| Now, in a perfect world it should be sufficient to only notify user |
| space on "state transitions" when the IO state changes (ie when a pipe |
| goes from unreadable to readable, or from unwritable to writable). User |
| space should then do as much as possible - fully emptying the buffer or |
| what not - and we'll notify it again the next time the state changes. |
| |
| But as with EPOLLET, we have at least one case (stress-ng) where the |
| kernel sent SIGIO due to the pipe being marked for asynchronous |
| notification, but the user space signal handler then didn't actually |
| necessarily read it all before returning (it read more than what was |
| written, but since there could be multiple writes, it could leave data |
| pending). |
| |
| The user space code then expected to get another SIGIO for subsequent |
| writes - even though the pipe had been readable the whole time - and |
| would only then read more. |
| |
| This is arguably a user space bug - and Colin King already fixed the |
| stress-ng code in question - but the kernel regression rules are clear: |
| it doesn't matter if kernel people think that user space did something |
| silly and wrong. What matters is that it used to work. |
| |
| So if user space depends on specific historical kernel behavior, it's a |
| regression when that behavior changes. It's on us: we were silly to |
| have that non-optimal historical behavior, and our old kernel behavior |
| was what user space was tested against. |
| |
| Because of how the FASYNC notification was tied to wakeup behavior, this |
| was first broken by commits f467a6a66419 and 1b6b26ae7053 ("pipe: fix |
| and clarify pipe read/write wakeup logic"), but at the time it seems |
| nobody noticed. Probably because the stress-ng problem case ends up |
| being timing-dependent too. |
| |
| It was then unwittingly fixed by commit 3a34b13a88ca ("pipe: make pipe |
| writes always wake up readers") only to be broken again when by commit |
| 3b844826b6c6 ("pipe: avoid unnecessary EPOLLET wakeups under normal |
| loads"). |
| |
| And at that point the kernel test robot noticed the performance |
| refression in the stress-ng.sigio.ops_per_sec case. So the "Fixes" tag |
| below is somewhat ad hoc, but it matches when the issue was noticed. |
| |
| Fix it for good (knock wood) by simply making the kill_fasync() case |
| separate from the wakeup case. FASYNC is quite rare, and we clearly |
| shouldn't even try to use the "avoid unnecessary wakeups" logic for it. |
| |
| Link: https://lore.kernel.org/lkml/20210824151337.GC27667@xsang-OptiPlex-9020/ |
| Fixes: 3b844826b6c6 ("pipe: avoid unnecessary EPOLLET wakeups under normal loads") |
| Reported-by: kernel test robot <oliver.sang@intel.com> |
| Tested-by: Oliver Sang <oliver.sang@intel.com> |
| Cc: Eric Biederman <ebiederm@xmission.com> |
| Cc: Colin Ian King <colin.king@canonical.com> |
| Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| --- |
| fs/pipe.c | 20 ++++++++------------ |
| 1 file changed, 8 insertions(+), 12 deletions(-) |
| |
| --- a/fs/pipe.c |
| +++ b/fs/pipe.c |
| @@ -363,10 +363,9 @@ pipe_read(struct kiocb *iocb, struct iov |
| * _very_ unlikely case that the pipe was full, but we got |
| * no data. |
| */ |
| - if (unlikely(was_full)) { |
| + if (unlikely(was_full)) |
| wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); |
| - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); |
| - } |
| + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); |
| |
| /* |
| * But because we didn't read anything, at this point we can |
| @@ -385,12 +384,11 @@ pipe_read(struct kiocb *iocb, struct iov |
| wake_next_reader = false; |
| __pipe_unlock(pipe); |
| |
| - if (was_full) { |
| + if (was_full) |
| wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); |
| - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); |
| - } |
| if (wake_next_reader) |
| wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); |
| + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); |
| if (ret > 0) |
| file_accessed(filp); |
| return ret; |
| @@ -565,10 +563,9 @@ pipe_write(struct kiocb *iocb, struct io |
| * become empty while we dropped the lock. |
| */ |
| __pipe_unlock(pipe); |
| - if (was_empty) { |
| + if (was_empty) |
| wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); |
| - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); |
| - } |
| + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); |
| wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe)); |
| __pipe_lock(pipe); |
| was_empty = pipe_empty(pipe->head, pipe->tail); |
| @@ -591,10 +588,9 @@ out: |
| * Epoll nonsensically wants a wakeup whether the pipe |
| * was already empty or not. |
| */ |
| - if (was_empty || pipe->poll_usage) { |
| + if (was_empty || pipe->poll_usage) |
| wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); |
| - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); |
| - } |
| + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); |
| if (wake_next_writer) |
| wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); |
| if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { |