| From fb49f67d21b8eb1a652ef4473c03510a0ffe9442 Mon Sep 17 00:00:00 2001 |
| From: Julian Anastasov <ja@ssi.bg> |
| Date: Wed, 1 Jul 2020 18:17:19 +0300 |
| Subject: [PATCH] ipvs: allow connection reuse for unconfirmed conntrack |
| MIME-Version: 1.0 |
| Content-Type: text/plain; charset=UTF-8 |
| Content-Transfer-Encoding: 8bit |
| |
| commit f0a5e4d7a594e0fe237d3dfafb069bb82f80f42f upstream. |
| |
| YangYuxi is reporting that connection reuse |
| is causing one-second delay when SYN hits |
| existing connection in TIME_WAIT state. |
| Such delay was added to give time to expire |
| both the IPVS connection and the corresponding |
| conntrack. This was considered a rare case |
| at that time but it is causing problem for |
| some environments such as Kubernetes. |
| |
| As nf_conntrack_tcp_packet() can decide to |
| release the conntrack in TIME_WAIT state and |
| to replace it with a fresh NEW conntrack, we |
| can use this to allow rescheduling just by |
| tuning our check: if the conntrack is |
| confirmed we can not schedule it to different |
| real server and the one-second delay still |
| applies but if new conntrack was created, |
| we are free to select new real server without |
| any delays. |
| |
| YangYuxi lists some of the problem reports: |
| |
| - One second connection delay in masquerading mode: |
| https://marc.info/?t=151683118100004&r=1&w=2 |
| |
| - IPVS low throughput #70747 |
| https://github.com/kubernetes/kubernetes/issues/70747 |
| |
| - Apache Bench can fill up ipvs service proxy in seconds #544 |
| https://github.com/cloudnativelabs/kube-router/issues/544 |
| |
| - Additional 1s latency in `host -> service IP -> pod` |
| https://github.com/kubernetes/kubernetes/issues/90854 |
| |
| Fixes: f719e3754ee2 ("ipvs: drop first packet to redirect conntrack") |
| Co-developed-by: YangYuxi <yx.atom1@gmail.com> |
| Signed-off-by: YangYuxi <yx.atom1@gmail.com> |
| Signed-off-by: Julian Anastasov <ja@ssi.bg> |
| Reviewed-by: Simon Horman <horms@verge.net.au> |
| Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org> |
| Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com> |
| |
| diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h |
| index b71ad28fdc98..01e034774c5b 100644 |
| --- a/include/net/ip_vs.h |
| +++ b/include/net/ip_vs.h |
| @@ -1616,18 +1616,16 @@ static inline void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) |
| } |
| #endif /* CONFIG_IP_VS_NFCT */ |
| |
| -/* Really using conntrack? */ |
| -static inline bool ip_vs_conn_uses_conntrack(struct ip_vs_conn *cp, |
| - struct sk_buff *skb) |
| +/* Using old conntrack that can not be redirected to another real server? */ |
| +static inline bool ip_vs_conn_uses_old_conntrack(struct ip_vs_conn *cp, |
| + struct sk_buff *skb) |
| { |
| #ifdef CONFIG_IP_VS_NFCT |
| enum ip_conntrack_info ctinfo; |
| struct nf_conn *ct; |
| |
| - if (!(cp->flags & IP_VS_CONN_F_NFCT)) |
| - return false; |
| ct = nf_ct_get(skb, &ctinfo); |
| - if (ct) |
| + if (ct && nf_ct_is_confirmed(ct)) |
| return true; |
| #endif |
| return false; |
| diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c |
| index d5103a9eb302..c85752f8bd1a 100644 |
| --- a/net/netfilter/ipvs/ip_vs_core.c |
| +++ b/net/netfilter/ipvs/ip_vs_core.c |
| @@ -1955,14 +1955,14 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int |
| |
| conn_reuse_mode = sysctl_conn_reuse_mode(ipvs); |
| if (conn_reuse_mode && !iph.fragoffs && is_new_conn(skb, &iph) && cp) { |
| - bool uses_ct = false, resched = false; |
| + bool old_ct = false, resched = false; |
| |
| if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest && |
| unlikely(!atomic_read(&cp->dest->weight))) { |
| resched = true; |
| - uses_ct = ip_vs_conn_uses_conntrack(cp, skb); |
| + old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); |
| } else if (is_new_conn_expected(cp, conn_reuse_mode)) { |
| - uses_ct = ip_vs_conn_uses_conntrack(cp, skb); |
| + old_ct = ip_vs_conn_uses_old_conntrack(cp, skb); |
| if (!atomic_read(&cp->n_control)) { |
| resched = true; |
| } else { |
| @@ -1970,15 +1970,17 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int |
| * that uses conntrack while it is still |
| * referenced by controlled connection(s). |
| */ |
| - resched = !uses_ct; |
| + resched = !old_ct; |
| } |
| } |
| |
| if (resched) { |
| + if (!old_ct) |
| + cp->flags &= ~IP_VS_CONN_F_NFCT; |
| if (!atomic_read(&cp->n_control)) |
| ip_vs_conn_expire_now(cp); |
| __ip_vs_conn_put(cp); |
| - if (uses_ct) |
| + if (old_ct) |
| return NF_DROP; |
| cp = NULL; |
| } |
| -- |
| 2.27.0 |
| |