| From foo@baz Tue Jan 26 21:37:04 PST 2016 |
| From: Eric Dumazet <edumazet@google.com> |
| Date: Sat, 19 Sep 2015 09:08:34 -0700 |
| Subject: tcp/dccp: fix timewait races in timer handling |
| |
| From: Eric Dumazet <edumazet@google.com> |
| |
| [ Upstream commit ed2e923945892a8372ab70d2f61d364b0b6d9054 ] |
| |
| When creating a timewait socket, we need to arm the timer before |
| allowing other cpus to find it. The signal allowing cpus to find |
| the socket is setting tw_refcnt to a non-zero value. |
| |
| As we set tw_refcnt in __inet_twsk_hashdance(), we therefore need to |
| call inet_twsk_schedule() first. |
| |
| This also means we need to remove tw_refcnt changes from |
| inet_twsk_schedule() and let the caller handle it. |
| |
| Note that because we use mod_timer_pinned(), we have the guarantee |
| the timer won't expire before we set tw_refcnt as we run in BH context. |
| |
| To make things more readable I introduced inet_twsk_reschedule() helper. |
| |
| When rearming the timer, we can use mod_timer_pending() to make sure |
| we do not rearm a canceled timer. |
| |
| Note: This bug can possibly trigger if packets of a flow can hit |
| multiple cpus. This does not normally happen, unless flow steering |
| is broken somehow. This explains why this bug was spotted ~5 months after |
| its introduction. |
| |
| A similar fix is needed for SYN_RECV sockets in reqsk_queue_hash_req(), |
| but will be provided in a separate patch for proper tracking. |
| |
| Fixes: 789f558cfb36 ("tcp/dccp: get rid of central timewait timer") |
| Signed-off-by: Eric Dumazet <edumazet@google.com> |
| Reported-by: Ying Cai <ycai@google.com> |
| Signed-off-by: David S. Miller <davem@davemloft.net> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| --- |
| include/net/inet_timewait_sock.h | 14 +++++++++++++- |
| net/dccp/minisocks.c | 4 ++-- |
| net/ipv4/inet_timewait_sock.c | 16 ++++++++++------ |
| net/ipv4/tcp_minisocks.c | 13 ++++++------- |
| 4 files changed, 31 insertions(+), 16 deletions(-) |
| |
| --- a/include/net/inet_timewait_sock.h |
| +++ b/include/net/inet_timewait_sock.h |
| @@ -112,7 +112,19 @@ struct inet_timewait_sock *inet_twsk_all |
| void __inet_twsk_hashdance(struct inet_timewait_sock *tw, struct sock *sk, |
| struct inet_hashinfo *hashinfo); |
| |
| -void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo); |
| +void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, |
| + bool rearm); |
| + |
| +static void inline inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo) |
| +{ |
| + __inet_twsk_schedule(tw, timeo, false); |
| +} |
| + |
| +static void inline inet_twsk_reschedule(struct inet_timewait_sock *tw, int timeo) |
| +{ |
| + __inet_twsk_schedule(tw, timeo, true); |
| +} |
| + |
| void inet_twsk_deschedule(struct inet_timewait_sock *tw); |
| |
| void inet_twsk_purge(struct inet_hashinfo *hashinfo, |
| --- a/net/dccp/minisocks.c |
| +++ b/net/dccp/minisocks.c |
| @@ -48,8 +48,6 @@ void dccp_time_wait(struct sock *sk, int |
| tw->tw_ipv6only = sk->sk_ipv6only; |
| } |
| #endif |
| - /* Linkage updates. */ |
| - __inet_twsk_hashdance(tw, sk, &dccp_hashinfo); |
| |
| /* Get the TIME_WAIT timeout firing. */ |
| if (timeo < rto) |
| @@ -60,6 +58,8 @@ void dccp_time_wait(struct sock *sk, int |
| timeo = DCCP_TIMEWAIT_LEN; |
| |
| inet_twsk_schedule(tw, timeo); |
| + /* Linkage updates. */ |
| + __inet_twsk_hashdance(tw, sk, &dccp_hashinfo); |
| inet_twsk_put(tw); |
| } else { |
| /* Sorry, if we're out of memory, just CLOSE this |
| --- a/net/ipv4/inet_timewait_sock.c |
| +++ b/net/ipv4/inet_timewait_sock.c |
| @@ -153,13 +153,15 @@ void __inet_twsk_hashdance(struct inet_t |
| /* |
| * Step 2: Hash TW into tcp ehash chain. |
| * Notes : |
| - * - tw_refcnt is set to 3 because : |
| + * - tw_refcnt is set to 4 because : |
| * - We have one reference from bhash chain. |
| * - We have one reference from ehash chain. |
| + * - We have one reference from timer. |
| + * - One reference for ourself (our caller will release it). |
| * We can use atomic_set() because prior spin_lock()/spin_unlock() |
| * committed into memory all tw fields. |
| */ |
| - atomic_set(&tw->tw_refcnt, 1 + 1 + 1); |
| + atomic_set(&tw->tw_refcnt, 4); |
| inet_twsk_add_node_rcu(tw, &ehead->chain); |
| |
| /* Step 3: Remove SK from hash chain */ |
| @@ -243,7 +245,7 @@ void inet_twsk_deschedule(struct inet_ti |
| } |
| EXPORT_SYMBOL(inet_twsk_deschedule); |
| |
| -void inet_twsk_schedule(struct inet_timewait_sock *tw, const int timeo) |
| +void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm) |
| { |
| /* timeout := RTO * 3.5 |
| * |
| @@ -271,12 +273,14 @@ void inet_twsk_schedule(struct inet_time |
| */ |
| |
| tw->tw_kill = timeo <= 4*HZ; |
| - if (!mod_timer_pinned(&tw->tw_timer, jiffies + timeo)) { |
| - atomic_inc(&tw->tw_refcnt); |
| + if (!rearm) { |
| + BUG_ON(mod_timer_pinned(&tw->tw_timer, jiffies + timeo)); |
| atomic_inc(&tw->tw_dr->tw_count); |
| + } else { |
| + mod_timer_pending(&tw->tw_timer, jiffies + timeo); |
| } |
| } |
| -EXPORT_SYMBOL_GPL(inet_twsk_schedule); |
| +EXPORT_SYMBOL_GPL(__inet_twsk_schedule); |
| |
| void inet_twsk_purge(struct inet_hashinfo *hashinfo, |
| struct inet_timewait_death_row *twdr, int family) |
| --- a/net/ipv4/tcp_minisocks.c |
| +++ b/net/ipv4/tcp_minisocks.c |
| @@ -163,9 +163,9 @@ kill_with_rst: |
| if (tcp_death_row.sysctl_tw_recycle && |
| tcptw->tw_ts_recent_stamp && |
| tcp_tw_remember_stamp(tw)) |
| - inet_twsk_schedule(tw, tw->tw_timeout); |
| + inet_twsk_reschedule(tw, tw->tw_timeout); |
| else |
| - inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); |
| + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); |
| return TCP_TW_ACK; |
| } |
| |
| @@ -203,7 +203,7 @@ kill: |
| return TCP_TW_SUCCESS; |
| } |
| } |
| - inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); |
| + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); |
| |
| if (tmp_opt.saw_tstamp) { |
| tcptw->tw_ts_recent = tmp_opt.rcv_tsval; |
| @@ -253,7 +253,7 @@ kill: |
| * Do not reschedule in the last case. |
| */ |
| if (paws_reject || th->ack) |
| - inet_twsk_schedule(tw, TCP_TIMEWAIT_LEN); |
| + inet_twsk_reschedule(tw, TCP_TIMEWAIT_LEN); |
| |
| return tcp_timewait_check_oow_rate_limit( |
| tw, skb, LINUX_MIB_TCPACKSKIPPEDTIMEWAIT); |
| @@ -324,9 +324,6 @@ void tcp_time_wait(struct sock *sk, int |
| } while (0); |
| #endif |
| |
| - /* Linkage updates. */ |
| - __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); |
| - |
| /* Get the TIME_WAIT timeout firing. */ |
| if (timeo < rto) |
| timeo = rto; |
| @@ -340,6 +337,8 @@ void tcp_time_wait(struct sock *sk, int |
| } |
| |
| inet_twsk_schedule(tw, timeo); |
| + /* Linkage updates. */ |
| + __inet_twsk_hashdance(tw, sk, &tcp_hashinfo); |
| inet_twsk_put(tw); |
| } else { |
| /* Sorry, if we're out of memory, just CLOSE this |