| From foo@baz Sat 16 May 2020 02:04:40 PM CEST |
| From: Eric Dumazet <edumazet@google.com> |
| Date: Tue, 12 May 2020 06:54:30 -0700 |
| Subject: tcp: fix SO_RCVLOWAT hangs with fat skbs |
| |
| From: Eric Dumazet <edumazet@google.com> |
| |
| [ Upstream commit 24adbc1676af4e134e709ddc7f34cf2adc2131e4 ] |
| |
| We autotune rcvbuf whenever SO_RCVLOWAT is set to account for 100% |
| overhead in tcp_set_rcvlowat() |
| |
| This works well when skb->len/skb->truesize ratio is bigger than 0.5 |
| |
| But if we receive packets with small MSS, we can end up in a situation |
| where not enough bytes are available in the receive queue to satisfy |
| RCVLOWAT setting. |
| As our sk_rcvbuf limit is hit, we send zero windows in ACK packets, |
| preventing remote peer from sending more data. |
| |
| Even autotuning does not help, because it only triggers at the time |
| the user process drains the queue. If no EPOLLIN is generated, this |
| cannot happen. |
| |
| Note poll() has a similar issue, after commit |
| c7004482e8dc ("tcp: Respect SO_RCVLOWAT in tcp_poll().") |
| |
| Fixes: 03f45c883c6f ("tcp: avoid extra wakeups for SO_RCVLOWAT users") |
| Signed-off-by: Eric Dumazet <edumazet@google.com> |
| Acked-by: Soheil Hassas Yeganeh <soheil@google.com> |
| Signed-off-by: David S. Miller <davem@davemloft.net> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| --- |
| include/net/tcp.h | 13 +++++++++++++ |
| net/ipv4/tcp.c | 14 +++++++++++--- |
| net/ipv4/tcp_input.c | 3 ++- |
| 3 files changed, 26 insertions(+), 4 deletions(-) |
| |
| --- a/include/net/tcp.h |
| +++ b/include/net/tcp.h |
| @@ -1421,6 +1421,19 @@ static inline int tcp_full_space(const s |
| return tcp_win_from_space(sk, READ_ONCE(sk->sk_rcvbuf)); |
| } |
| |
| +/* We provision sk_rcvbuf around 200% of sk_rcvlowat. |
| + * If 87.5 % (7/8) of the space has been consumed, we want to override |
| + * SO_RCVLOWAT constraint, since we are receiving skbs with too small |
| + * len/truesize ratio. |
| + */ |
| +static inline bool tcp_rmem_pressure(const struct sock *sk) |
| +{ |
| + int rcvbuf = READ_ONCE(sk->sk_rcvbuf); |
| + int threshold = rcvbuf - (rcvbuf >> 3); |
| + |
| + return atomic_read(&sk->sk_rmem_alloc) > threshold; |
| +} |
| + |
| extern void tcp_openreq_init_rwin(struct request_sock *req, |
| const struct sock *sk_listener, |
| const struct dst_entry *dst); |
| --- a/net/ipv4/tcp.c |
| +++ b/net/ipv4/tcp.c |
| @@ -476,9 +476,17 @@ static void tcp_tx_timestamp(struct sock |
| static inline bool tcp_stream_is_readable(const struct tcp_sock *tp, |
| int target, struct sock *sk) |
| { |
| - return (READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq) >= target) || |
| - (sk->sk_prot->stream_memory_read ? |
| - sk->sk_prot->stream_memory_read(sk) : false); |
| + int avail = READ_ONCE(tp->rcv_nxt) - READ_ONCE(tp->copied_seq); |
| + |
| + if (avail > 0) { |
| + if (avail >= target) |
| + return true; |
| + if (tcp_rmem_pressure(sk)) |
| + return true; |
| + } |
| + if (sk->sk_prot->stream_memory_read) |
| + return sk->sk_prot->stream_memory_read(sk); |
| + return false; |
| } |
| |
| /* |
| --- a/net/ipv4/tcp_input.c |
| +++ b/net/ipv4/tcp_input.c |
| @@ -4761,7 +4761,8 @@ void tcp_data_ready(struct sock *sk) |
| const struct tcp_sock *tp = tcp_sk(sk); |
| int avail = tp->rcv_nxt - tp->copied_seq; |
| |
| - if (avail < sk->sk_rcvlowat && !sock_flag(sk, SOCK_DONE)) |
| + if (avail < sk->sk_rcvlowat && !tcp_rmem_pressure(sk) && |
| + !sock_flag(sk, SOCK_DONE)) |
| return; |
| |
| sk->sk_data_ready(sk); |