From foo@baz Wed Apr 29 11:59:49 CEST 2015
From: Eric Dumazet <edumazet@google.com>
Date: Thu, 23 Apr 2015 10:42:39 -0700
Subject: tcp: avoid looping in tcp_send_fin()

From: Eric Dumazet <edumazet@google.com>

[ Upstream commit 845704a535e9b3c76448f52af1b70e4422ea03fd ]

The presence of an unbounded loop in tcp_send_fin() has always been
hard to explain when analyzing crash dumps involving gigantic dying
processes with millions of sockets.

Let's try a different strategy:

In case of memory pressure, try to add the FIN flag to the last packet
in the write queue, even if that packet was already sent. The TCP
stack will be able to deliver this FIN after a timeout event. Note
that since this FIN is delivered by a retransmit, it also carries a
Push flag, given our current implementation.

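Condensed, the coalescing path looks roughly like this (a simplified
sketch of the hunk further below; tskb is tcp_write_queue_tail(sk)):

	if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) {
		/* Piggyback the FIN on the tail skb, no new allocation. */
		TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
		TCP_SKB_CB(tskb)->end_seq++;
		tp->write_seq++;
		if (!tcp_send_head(sk)) {
			/* tskb was already transmitted: advance snd_nxt
			 * so the retransmit path resends it, FIN included.
			 */
			tp->snd_nxt++;
			return;
		}
	}
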
By checking sk_under_memory_pressure(), we anticipate that cooking
many FIN packets might deplete tcp memory.

In case we could not allocate a packet, even with a __GFP_WAIT
allocation, not sending a FIN seems quite reasonable if it allows us
to get rid of this socket, free memory, and avoid blocking the process
from eventually doing other useful work.

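If a fresh skb cannot be obtained, the fallback is thus best effort
rather than the old infinite yield() loop (again a sketch of the hunk
below):

	skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
	if (unlikely(!skb)) {
		if (tskb)
			goto coalesce;	/* reuse the tail skb for the FIN */
		return;			/* give up: close without a FIN */
	}
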
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 net/ipv4/tcp_output.c | 50 +++++++++++++++++++++++++++++---------------------
 1 file changed, 29 insertions(+), 21 deletions(-)

--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -2753,7 +2753,8 @@ begin_fwd:
 
 /* We allow to exceed memory limits for FIN packets to expedite
  * connection tear down and (memory) recovery.
- * Otherwise tcp_send_fin() could loop forever.
+ * Otherwise tcp_send_fin() could be tempted to either delay FIN
+ * or even be forced to close flow without any FIN.
  */
 static void sk_forced_wmem_schedule(struct sock *sk, int size)
 {
@@ -2766,33 +2767,40 @@ static void sk_forced_wmem_schedule(stru
 	sk_memory_allocated_add(sk, amt, &status);
 }
 
-/* Send a fin. The caller locks the socket for us. This cannot be
- * allowed to fail queueing a FIN frame under any circumstances.
+/* Send a FIN. The caller locks the socket for us.
+ * We should try to send a FIN packet really hard, but eventually give up.
  */
 void tcp_send_fin(struct sock *sk)
 {
+	struct sk_buff *skb, *tskb = tcp_write_queue_tail(sk);
 	struct tcp_sock *tp = tcp_sk(sk);
-	struct sk_buff *skb = tcp_write_queue_tail(sk);
-	int mss_now;
 
-	/* Optimization, tack on the FIN if we have a queue of
-	 * unsent frames. But be careful about outgoing SACKS
-	 * and IP options.
+	/* Optimization, tack on the FIN if we have one skb in write queue and
+	 * this skb was not yet sent, or we are under memory pressure.
+	 * Note: in the latter case, FIN packet will be sent after a timeout,
+	 * as TCP stack thinks it has already been transmitted.
 	 */
-	mss_now = tcp_current_mss(sk);
-
-	if (tcp_send_head(sk) != NULL) {
-		TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_FIN;
-		TCP_SKB_CB(skb)->end_seq++;
+	if (tskb && (tcp_send_head(sk) || sk_under_memory_pressure(sk))) {
+coalesce:
+		TCP_SKB_CB(tskb)->tcp_flags |= TCPHDR_FIN;
+		TCP_SKB_CB(tskb)->end_seq++;
 		tp->write_seq++;
+		if (!tcp_send_head(sk)) {
+			/* This means tskb was already sent.
+			 * Pretend we included the FIN on previous transmit.
+			 * We need to set tp->snd_nxt to the value it would have
+			 * if FIN had been sent. This is because retransmit path
+			 * does not change tp->snd_nxt.
+			 */
+			tp->snd_nxt++;
+			return;
+		}
 	} else {
-		/* Socket is locked, keep trying until memory is available. */
-		for (;;) {
-			skb = alloc_skb_fclone(MAX_TCP_HEADER,
-					       sk->sk_allocation);
-			if (skb)
-				break;
-			yield();
+		skb = alloc_skb_fclone(MAX_TCP_HEADER, sk->sk_allocation);
+		if (unlikely(!skb)) {
+			if (tskb)
+				goto coalesce;
+			return;
 		}
 		skb_reserve(skb, MAX_TCP_HEADER);
 		sk_forced_wmem_schedule(sk, skb->truesize);
@@ -2801,7 +2809,7 @@ void tcp_send_fin(struct sock *sk)
 			     TCPHDR_ACK | TCPHDR_FIN);
 		tcp_queue_skb(sk, skb);
 	}
-	__tcp_push_pending_frames(sk, mss_now, TCP_NAGLE_OFF);
+	__tcp_push_pending_frames(sk, tcp_current_mss(sk), TCP_NAGLE_OFF);
 }
 
 /* We get here when a process closes a file descriptor (either due to