| From foo@baz Wed 29 Jul 2020 11:19:56 AM CEST |
| From: Kuniyuki Iwashima <kuniyu@amazon.co.jp> |
| Date: Tue, 21 Jul 2020 15:15:31 +0900 |
| Subject: udp: Improve load balancing for SO_REUSEPORT. |
| |
| From: Kuniyuki Iwashima <kuniyu@amazon.co.jp> |
| |
| [ Upstream commit efc6b6f6c3113e8b203b9debfb72d81e0f3dcace ] |
| |
| Currently, SO_REUSEPORT does not work well if connected sockets are in a |
| UDP reuseport group. |
| |
| Then reuseport_has_conns() returns true and the result of |
| reuseport_select_sock() is discarded. Also, unconnected sockets have the |
| same score, hence only does the first unconnected socket in udp_hslot |
| always receive all packets sent to unconnected sockets. |
| |
| So, the result of reuseport_select_sock() should be used for load |
| balancing. |
| |
| The noteworthy point is that the unconnected sockets placed after |
| connected sockets in sock_reuseport.socks will receive more packets than |
| others because of the algorithm in reuseport_select_sock(). |
| |
| index | connected | reciprocal_scale | result |
| --------------------------------------------- |
| 0 | no | 20% | 40% |
| 1 | no | 20% | 20% |
| 2 | yes | 20% | 0% |
| 3 | no | 20% | 40% |
| 4 | yes | 20% | 0% |
| |
| If most of the sockets are connected, this can be a problem, but it still |
| works better than now. |
| |
| Fixes: acdcecc61285 ("udp: correct reuseport selection with connected sockets") |
| CC: Willem de Bruijn <willemb@google.com> |
| Reviewed-by: Benjamin Herrenschmidt <benh@amazon.com> |
| Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.co.jp> |
| Acked-by: Willem de Bruijn <willemb@google.com> |
| Signed-off-by: David S. Miller <davem@davemloft.net> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| --- |
| net/ipv4/udp.c | 15 +++++++++------ |
| net/ipv6/udp.c | 15 +++++++++------ |
| 2 files changed, 18 insertions(+), 12 deletions(-) |
| |
| --- a/net/ipv4/udp.c |
| +++ b/net/ipv4/udp.c |
| @@ -413,7 +413,7 @@ static struct sock *udp4_lib_lookup2(str |
| struct udp_hslot *hslot2, |
| struct sk_buff *skb) |
| { |
| - struct sock *sk, *result; |
| + struct sock *sk, *result, *reuseport_result; |
| int score, badness; |
| u32 hash = 0; |
| |
| @@ -423,17 +423,20 @@ static struct sock *udp4_lib_lookup2(str |
| score = compute_score(sk, net, saddr, sport, |
| daddr, hnum, dif, sdif); |
| if (score > badness) { |
| + reuseport_result = NULL; |
| + |
| if (sk->sk_reuseport && |
| sk->sk_state != TCP_ESTABLISHED) { |
| hash = udp_ehashfn(net, daddr, hnum, |
| saddr, sport); |
| - result = reuseport_select_sock(sk, hash, skb, |
| - sizeof(struct udphdr)); |
| - if (result && !reuseport_has_conns(sk, false)) |
| - return result; |
| + reuseport_result = reuseport_select_sock(sk, hash, skb, |
| + sizeof(struct udphdr)); |
| + if (reuseport_result && !reuseport_has_conns(sk, false)) |
| + return reuseport_result; |
| } |
| + |
| + result = reuseport_result ? : sk; |
| badness = score; |
| - result = sk; |
| } |
| } |
| return result; |
| --- a/net/ipv6/udp.c |
| +++ b/net/ipv6/udp.c |
| @@ -148,7 +148,7 @@ static struct sock *udp6_lib_lookup2(str |
| int dif, int sdif, struct udp_hslot *hslot2, |
| struct sk_buff *skb) |
| { |
| - struct sock *sk, *result; |
| + struct sock *sk, *result, *reuseport_result; |
| int score, badness; |
| u32 hash = 0; |
| |
| @@ -158,17 +158,20 @@ static struct sock *udp6_lib_lookup2(str |
| score = compute_score(sk, net, saddr, sport, |
| daddr, hnum, dif, sdif); |
| if (score > badness) { |
| + reuseport_result = NULL; |
| + |
| if (sk->sk_reuseport && |
| sk->sk_state != TCP_ESTABLISHED) { |
| hash = udp6_ehashfn(net, daddr, hnum, |
| saddr, sport); |
| |
| - result = reuseport_select_sock(sk, hash, skb, |
| - sizeof(struct udphdr)); |
| - if (result && !reuseport_has_conns(sk, false)) |
| - return result; |
| + reuseport_result = reuseport_select_sock(sk, hash, skb, |
| + sizeof(struct udphdr)); |
| + if (reuseport_result && !reuseport_has_conns(sk, false)) |
| + return reuseport_result; |
| } |
| - result = sk; |
| + |
| + result = reuseport_result ? : sk; |
| badness = score; |
| } |
| } |