| From e9d35808c2b9684f8b4020144821e23427044406 Mon Sep 17 00:00:00 2001 |
| From: Sasha Levin <sashal@kernel.org> |
| Date: Wed, 23 Jun 2021 14:44:38 -0700 |
| Subject: net: ip: avoid OOM kills with large UDP sends over loopback |
| |
| From: Jakub Kicinski <kuba@kernel.org> |
| |
| [ Upstream commit 6d123b81ac615072a8525c13c6c41b695270a15d ] |
| |
| Dave observed number of machines hitting OOM on the UDP send |
| path. The workload seems to be sending large UDP packets over |
| loopback. Since loopback has MTU of 64k kernel will try to |
| allocate an skb with up to 64k of head space. This has a good |
| chance of failing under memory pressure. What's worse if |
| the message length is <32k the allocation may trigger an |
| OOM killer. |
| |
| This is entirely avoidable, we can use an skb with page frags. |
| |
| af_unix solves a similar problem by limiting the head |
| length to SKB_MAX_ALLOC. This seems like a good and simple |
| approach. It means that UDP messages > 16kB will now |
| use fragments if underlying device supports SG, if extra |
| allocator pressure causes regressions in real workloads |
| we can switch to trying the large allocation first and |
| falling back. |
| |
| v4: pre-calculate all the additions to alloclen so |
| we can be sure it won't go over order-2 |
| |
| Reported-by: Dave Jones <dsj@fb.com> |
| Signed-off-by: Jakub Kicinski <kuba@kernel.org> |
| Signed-off-by: David S. Miller <davem@davemloft.net> |
| Signed-off-by: Sasha Levin <sashal@kernel.org> |
| --- |
| net/ipv4/ip_output.c | 32 ++++++++++++++++++-------------- |
| net/ipv6/ip6_output.c | 32 +++++++++++++++++--------------- |
| 2 files changed, 35 insertions(+), 29 deletions(-) |
| |
| diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c |
| index e411c42d8428..e63905f7f6f9 100644 |
| --- a/net/ipv4/ip_output.c |
| +++ b/net/ipv4/ip_output.c |
| @@ -940,7 +940,7 @@ static int __ip_append_data(struct sock *sk, |
| unsigned int datalen; |
| unsigned int fraglen; |
| unsigned int fraggap; |
| - unsigned int alloclen; |
| + unsigned int alloclen, alloc_extra; |
| unsigned int pagedlen; |
| struct sk_buff *skb_prev; |
| alloc_new_skb: |
| @@ -960,35 +960,39 @@ alloc_new_skb: |
| fraglen = datalen + fragheaderlen; |
| pagedlen = 0; |
| |
| + alloc_extra = hh_len + 15; |
| + alloc_extra += exthdrlen; |
| + |
| + /* The last fragment gets additional space at tail. |
| + * Note, with MSG_MORE we overallocate on fragments, |
| + * because we have no idea what fragment will be |
| + * the last. |
| + */ |
| + if (datalen == length + fraggap) |
| + alloc_extra += rt->dst.trailer_len; |
| + |
| if ((flags & MSG_MORE) && |
| !(rt->dst.dev->features&NETIF_F_SG)) |
| alloclen = mtu; |
| - else if (!paged) |
| + else if (!paged && |
| + (fraglen + alloc_extra < SKB_MAX_ALLOC || |
| + !(rt->dst.dev->features & NETIF_F_SG))) |
| alloclen = fraglen; |
| else { |
| alloclen = min_t(int, fraglen, MAX_HEADER); |
| pagedlen = fraglen - alloclen; |
| } |
| |
| - alloclen += exthdrlen; |
| - |
| - /* The last fragment gets additional space at tail. |
| - * Note, with MSG_MORE we overallocate on fragments, |
| - * because we have no idea what fragment will be |
| - * the last. |
| - */ |
| - if (datalen == length + fraggap) |
| - alloclen += rt->dst.trailer_len; |
| + alloclen += alloc_extra; |
| |
| if (transhdrlen) { |
| - skb = sock_alloc_send_skb(sk, |
| - alloclen + hh_len + 15, |
| + skb = sock_alloc_send_skb(sk, alloclen, |
| (flags & MSG_DONTWAIT), &err); |
| } else { |
| skb = NULL; |
| if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <= |
| 2 * sk->sk_sndbuf) |
| - skb = alloc_skb(alloclen + hh_len + 15, |
| + skb = alloc_skb(alloclen, |
| sk->sk_allocation); |
| if (unlikely(!skb)) |
| err = -ENOBUFS; |
| diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c |
| index e1bb7db88483..aa8f19f852cc 100644 |
| --- a/net/ipv6/ip6_output.c |
| +++ b/net/ipv6/ip6_output.c |
| @@ -1394,7 +1394,7 @@ emsgsize: |
| unsigned int datalen; |
| unsigned int fraglen; |
| unsigned int fraggap; |
| - unsigned int alloclen; |
| + unsigned int alloclen, alloc_extra; |
| unsigned int pagedlen; |
| alloc_new_skb: |
| /* There's no room in the current skb */ |
| @@ -1421,17 +1421,28 @@ alloc_new_skb: |
| fraglen = datalen + fragheaderlen; |
| pagedlen = 0; |
| |
| + alloc_extra = hh_len; |
| + alloc_extra += dst_exthdrlen; |
| + alloc_extra += rt->dst.trailer_len; |
| + |
| + /* We just reserve space for fragment header. |
| + * Note: this may be overallocation if the message |
| + * (without MSG_MORE) fits into the MTU. |
| + */ |
| + alloc_extra += sizeof(struct frag_hdr); |
| + |
| if ((flags & MSG_MORE) && |
| !(rt->dst.dev->features&NETIF_F_SG)) |
| alloclen = mtu; |
| - else if (!paged) |
| + else if (!paged && |
| + (fraglen + alloc_extra < SKB_MAX_ALLOC || |
| + !(rt->dst.dev->features & NETIF_F_SG))) |
| alloclen = fraglen; |
| else { |
| alloclen = min_t(int, fraglen, MAX_HEADER); |
| pagedlen = fraglen - alloclen; |
| } |
| - |
| - alloclen += dst_exthdrlen; |
| + alloclen += alloc_extra; |
| |
| if (datalen != length + fraggap) { |
| /* |
| @@ -1441,30 +1452,21 @@ alloc_new_skb: |
| datalen += rt->dst.trailer_len; |
| } |
| |
| - alloclen += rt->dst.trailer_len; |
| fraglen = datalen + fragheaderlen; |
| |
| - /* |
| - * We just reserve space for fragment header. |
| - * Note: this may be overallocation if the message |
| - * (without MSG_MORE) fits into the MTU. |
| - */ |
| - alloclen += sizeof(struct frag_hdr); |
| - |
| copy = datalen - transhdrlen - fraggap - pagedlen; |
| if (copy < 0) { |
| err = -EINVAL; |
| goto error; |
| } |
| if (transhdrlen) { |
| - skb = sock_alloc_send_skb(sk, |
| - alloclen + hh_len, |
| + skb = sock_alloc_send_skb(sk, alloclen, |
| (flags & MSG_DONTWAIT), &err); |
| } else { |
| skb = NULL; |
| if (refcount_read(&sk->sk_wmem_alloc) + wmem_alloc_delta <= |
| 2 * sk->sk_sndbuf) |
| - skb = alloc_skb(alloclen + hh_len, |
| + skb = alloc_skb(alloclen, |
| sk->sk_allocation); |
| if (unlikely(!skb)) |
| err = -ENOBUFS; |
| -- |
| 2.30.2 |
| |