| From foo@baz Sat Jun 13 09:48:35 PDT 2015 |
| From: Neal Cardwell <ncardwell@google.com> |
| Date: Fri, 29 May 2015 13:47:07 -0400 |
| Subject: tcp: fix child sockets to use system default congestion control if not set |
| |
| From: Neal Cardwell <ncardwell@google.com> |
| |
| [ Upstream commit 9f950415e4e28e7cfae2e416b43e862e8101d996 ] |
| |
| Linux 3.17 and earlier are explicitly engineered so that if the app |
| doesn't specifically request a CC module on a listener before the SYN |
| arrives, then the child gets the system default CC when the connection |
| is established. See tcp_init_congestion_control() in 3.17 or earlier, |
| which says "if no choice made yet assign the current value set as |
| default". The change ("net: tcp: assign tcp cong_ops when tcp sk is |
| created") altered these semantics, so that children got their parent |
| listener's congestion control even if the system default had changed |
| after the listener was created. |
| |
| This commit returns to those original semantics from 3.17 and earlier, |
| since they are the original semantics from 2007 in 4d4d3d1e8 ("[TCP]: |
| Congestion control initialization."), and some Linux congestion |
| control workflows depend on that. |
| |
| In summary, if a listener socket specifically sets TCP_CONGESTION to |
| "x", or the route locks the CC module to "x", then the child gets |
| "x". Otherwise the child gets current system default from |
| net.ipv4.tcp_congestion_control. That's the behavior in 3.17 and |
| earlier, and this commit restores that. |
| |
| Fixes: 55d8694fa82c ("net: tcp: assign tcp cong_ops when tcp sk is created") |
| Cc: Florian Westphal <fw@strlen.de> |
| Cc: Daniel Borkmann <dborkman@redhat.com> |
| Cc: Glenn Judd <glenn.judd@morganstanley.com> |
| Cc: Stephen Hemminger <stephen@networkplumber.org> |
| Signed-off-by: Neal Cardwell <ncardwell@google.com> |
| Signed-off-by: Eric Dumazet <edumazet@google.com> |
| Signed-off-by: Yuchung Cheng <ycheng@google.com> |
| Acked-by: Daniel Borkmann <daniel@iogearbox.net> |
| Signed-off-by: David S. Miller <davem@davemloft.net> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| --- |
| include/net/inet_connection_sock.h | 3 ++- |
| net/ipv4/tcp_cong.c | 5 ++++- |
| net/ipv4/tcp_minisocks.c | 5 ++++- |
| 3 files changed, 10 insertions(+), 3 deletions(-) |
| |
| --- a/include/net/inet_connection_sock.h |
| +++ b/include/net/inet_connection_sock.h |
| @@ -98,7 +98,8 @@ struct inet_connection_sock { |
| const struct tcp_congestion_ops *icsk_ca_ops; |
| const struct inet_connection_sock_af_ops *icsk_af_ops; |
| unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); |
| - __u8 icsk_ca_state:7, |
| + __u8 icsk_ca_state:6, |
| + icsk_ca_setsockopt:1, |
| icsk_ca_dst_locked:1; |
| __u8 icsk_retransmits; |
| __u8 icsk_pending; |
| --- a/net/ipv4/tcp_cong.c |
| +++ b/net/ipv4/tcp_cong.c |
| @@ -187,6 +187,7 @@ static void tcp_reinit_congestion_contro |
| |
| tcp_cleanup_congestion_control(sk); |
| icsk->icsk_ca_ops = ca; |
| + icsk->icsk_ca_setsockopt = 1; |
| |
| if (sk->sk_state != TCP_CLOSE && icsk->icsk_ca_ops->init) |
| icsk->icsk_ca_ops->init(sk); |
| @@ -335,8 +336,10 @@ int tcp_set_congestion_control(struct so |
| rcu_read_lock(); |
| ca = __tcp_ca_find_autoload(name); |
| /* No change asking for existing value */ |
| - if (ca == icsk->icsk_ca_ops) |
| + if (ca == icsk->icsk_ca_ops) { |
| + icsk->icsk_ca_setsockopt = 1; |
| goto out; |
| + } |
| if (!ca) |
| err = -ENOENT; |
| else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || |
| --- a/net/ipv4/tcp_minisocks.c |
| +++ b/net/ipv4/tcp_minisocks.c |
| @@ -437,7 +437,10 @@ void tcp_ca_openreq_child(struct sock *s |
| rcu_read_unlock(); |
| } |
| |
| - if (!ca_got_dst && !try_module_get(icsk->icsk_ca_ops->owner)) |
| + /* If no valid choice made yet, assign current system default ca. */ |
| + if (!ca_got_dst && |
| + (!icsk->icsk_ca_setsockopt || |
| + !try_module_get(icsk->icsk_ca_ops->owner))) |
| tcp_assign_congestion_control(sk); |
| |
| tcp_set_ca_state(sk, TCP_CA_Open); |