| From: Julian Anastasov <ja@ssi.bg> |
| Date: Wed, 2 May 2018 09:41:19 +0300 |
| Subject: ipv4: fix fnhe usage by non-cached routes |
| |
| commit 94720e3aee6884d8c8beb678001629da60ec6366 upstream. |
| |
| Allow some non-cached routes to use non-expired fnhe: |
| |
| 1. ip_del_fnhe: moved above and now called by find_exception. |
| The 4.5+ commit deed49df7390 expires fnhe only when caching |
| routes. Change that to: |
| |
| 1.1. use fnhe for non-cached local output routes, with the help |
| from (2) |
| |
| 1.2. allow __mkroute_input to detect expired fnhe (outdated |
| fnhe_gw, for example) when do_cache is false, e.g. when itag!=0 |
| for unicast destinations. |
| |
| 2. __mkroute_output: keep fi to allow local routes with orig_oif != 0 |
| to use fnhe info even when the new route will not be cached into fnhe. |
| After commit 839da4d98960 ("net: ipv4: set orig_oif based on fib |
| result for local traffic") it means all local routes will be affected |
| because they are not cached. This change is used to solve a PMTU |
| problem with IPVS (and probably Netfilter DNAT) setups that redirect |
| local clients from target local IP (local route to Virtual IP) |
| to new remote IP target, e.g. IPVS TUN real server. Loopback has |
| 64K MTU and we need to create fnhe on the local route that will |
| keep the reduced PMTU for the Virtual IP. Without this change |
| fnhe_pmtu is updated from ICMP but never exposed to non-cached |
| local routes. This includes routes with flowi4_oif!=0 for 4.6+ and |
| with flowi4_oif=any for 4.14+. |
| |
| 3. update_or_create_fnhe: make sure fnhe_expires is not 0 for |
| new entries |
| |
| Fixes: 839da4d98960 ("net: ipv4: set orig_oif based on fib result for local traffic") |
| Fixes: d6d5e999e5df ("route: do not cache fib route info on local routes with oif") |
| Fixes: deed49df7390 ("route: check and remove route cache when we get route") |
| Cc: David Ahern <dsahern@gmail.com> |
| Cc: Xin Long <lucien.xin@gmail.com> |
| Signed-off-by: Julian Anastasov <ja@ssi.bg> |
| Acked-by: David Ahern <dsahern@gmail.com> |
| Signed-off-by: David S. Miller <davem@davemloft.net> |
| [bwh: Backported to 3.16: adjust context] |
| Signed-off-by: Ben Hutchings <ben@decadent.org.uk> |
| --- |
| net/ipv4/route.c | 118 +++++++++++++++++++++-------------------------- |
| 1 file changed, 53 insertions(+), 65 deletions(-) |
| |
| --- a/net/ipv4/route.c |
| +++ b/net/ipv4/route.c |
| @@ -684,7 +684,7 @@ static void update_or_create_fnhe(struct |
| fnhe->fnhe_gw = gw; |
| fnhe->fnhe_pmtu = pmtu; |
| fnhe->fnhe_mtu_locked = lock; |
| - fnhe->fnhe_expires = expires; |
| + fnhe->fnhe_expires = max(1UL, expires); |
| |
| /* Exception created; mark the cached routes for the nexthop |
| * stale, so anyone caching it rechecks if this exception |
| @@ -1259,6 +1259,36 @@ static unsigned int ipv4_mtu(const struc |
| return min_t(unsigned int, mtu, IP_MAX_MTU); |
| } |
| |
| +static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr) |
| +{ |
| + struct fnhe_hash_bucket *hash; |
| + struct fib_nh_exception *fnhe, __rcu **fnhe_p; |
| + u32 hval = fnhe_hashfun(daddr); |
| + |
| + spin_lock_bh(&fnhe_lock); |
| + |
| + hash = rcu_dereference_protected(nh->nh_exceptions, |
| + lockdep_is_held(&fnhe_lock)); |
| + hash += hval; |
| + |
| + fnhe_p = &hash->chain; |
| + fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock)); |
| + while (fnhe) { |
| + if (fnhe->fnhe_daddr == daddr) { |
| + rcu_assign_pointer(*fnhe_p, rcu_dereference_protected( |
| + fnhe->fnhe_next, lockdep_is_held(&fnhe_lock))); |
| + fnhe_flush_routes(fnhe); |
| + kfree_rcu(fnhe, rcu); |
| + break; |
| + } |
| + fnhe_p = &fnhe->fnhe_next; |
| + fnhe = rcu_dereference_protected(fnhe->fnhe_next, |
| + lockdep_is_held(&fnhe_lock)); |
| + } |
| + |
| + spin_unlock_bh(&fnhe_lock); |
| +} |
| + |
| static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr) |
| { |
| struct fnhe_hash_bucket *hash = nh->nh_exceptions; |
| @@ -1272,8 +1302,14 @@ static struct fib_nh_exception *find_exc |
| |
| for (fnhe = rcu_dereference(hash[hval].chain); fnhe; |
| fnhe = rcu_dereference(fnhe->fnhe_next)) { |
| - if (fnhe->fnhe_daddr == daddr) |
| + if (fnhe->fnhe_daddr == daddr) { |
| + if (fnhe->fnhe_expires && |
| + time_after(jiffies, fnhe->fnhe_expires)) { |
| + ip_del_fnhe(nh, daddr); |
| + break; |
| + } |
| return fnhe; |
| + } |
| } |
| return NULL; |
| } |
| @@ -1568,36 +1604,6 @@ static void ip_handle_martian_source(str |
| #endif |
| } |
| |
| -static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr) |
| -{ |
| - struct fnhe_hash_bucket *hash; |
| - struct fib_nh_exception *fnhe, __rcu **fnhe_p; |
| - u32 hval = fnhe_hashfun(daddr); |
| - |
| - spin_lock_bh(&fnhe_lock); |
| - |
| - hash = rcu_dereference_protected(nh->nh_exceptions, |
| - lockdep_is_held(&fnhe_lock)); |
| - hash += hval; |
| - |
| - fnhe_p = &hash->chain; |
| - fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock)); |
| - while (fnhe) { |
| - if (fnhe->fnhe_daddr == daddr) { |
| - rcu_assign_pointer(*fnhe_p, rcu_dereference_protected( |
| - fnhe->fnhe_next, lockdep_is_held(&fnhe_lock))); |
| - fnhe_flush_routes(fnhe); |
| - kfree_rcu(fnhe, rcu); |
| - break; |
| - } |
| - fnhe_p = &fnhe->fnhe_next; |
| - fnhe = rcu_dereference_protected(fnhe->fnhe_next, |
| - lockdep_is_held(&fnhe_lock)); |
| - } |
| - |
| - spin_unlock_bh(&fnhe_lock); |
| -} |
| - |
| /* called in rcu_read_lock() section */ |
| static int __mkroute_input(struct sk_buff *skb, |
| const struct fib_result *res, |
| @@ -1651,20 +1657,10 @@ static int __mkroute_input(struct sk_buf |
| |
| fnhe = find_exception(&FIB_RES_NH(*res), daddr); |
| if (do_cache) { |
| - if (fnhe) { |
| + if (fnhe) |
| rth = rcu_dereference(fnhe->fnhe_rth_input); |
| - if (rth && rth->dst.expires && |
| - time_after(jiffies, rth->dst.expires)) { |
| - ip_del_fnhe(&FIB_RES_NH(*res), daddr); |
| - fnhe = NULL; |
| - } else { |
| - goto rt_cache; |
| - } |
| - } |
| - |
| - rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); |
| - |
| -rt_cache: |
| + else |
| + rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input); |
| if (rt_cache_valid(rth)) { |
| skb_dst_set_noref(skb, &rth->dst); |
| goto out; |
| @@ -2000,39 +1996,31 @@ static struct rtable *__mkroute_output(c |
| * the loopback interface and the IP_PKTINFO ipi_ifindex will |
| * be set to the loopback interface as well. |
| */ |
| - fi = NULL; |
| + do_cache = false; |
| } |
| |
| fnhe = NULL; |
| do_cache &= fi != NULL; |
| - if (do_cache) { |
| + if (fi) { |
| struct rtable __rcu **prth; |
| struct fib_nh *nh = &FIB_RES_NH(*res); |
| |
| fnhe = find_exception(nh, fl4->daddr); |
| + if (!do_cache) |
| + goto add; |
| if (fnhe) { |
| prth = &fnhe->fnhe_rth_output; |
| - rth = rcu_dereference(*prth); |
| - if (rth && rth->dst.expires && |
| - time_after(jiffies, rth->dst.expires)) { |
| - ip_del_fnhe(nh, fl4->daddr); |
| - fnhe = NULL; |
| - } else { |
| - goto rt_cache; |
| + } else { |
| + if (unlikely(fl4->flowi4_flags & |
| + FLOWI_FLAG_KNOWN_NH && |
| + !(nh->nh_gw && |
| + nh->nh_scope == RT_SCOPE_LINK))) { |
| + do_cache = false; |
| + goto add; |
| } |
| + prth = raw_cpu_ptr(nh->nh_pcpu_rth_output); |
| } |
| - |
| - if (unlikely(fl4->flowi4_flags & |
| - FLOWI_FLAG_KNOWN_NH && |
| - !(nh->nh_gw && |
| - nh->nh_scope == RT_SCOPE_LINK))) { |
| - do_cache = false; |
| - goto add; |
| - } |
| - prth = raw_cpu_ptr(nh->nh_pcpu_rth_output); |
| rth = rcu_dereference(*prth); |
| - |
| -rt_cache: |
| if (rt_cache_valid(rth)) { |
| dst_hold(&rth->dst); |
| return rth; |