| From foo@baz Tue Oct 16 16:47:53 CEST 2018 |
| From: Sabrina Dubroca <sd@queasysnail.net> |
| Date: Tue, 9 Oct 2018 17:48:14 +0200 |
| Subject: net: ipv4: update fnhe_pmtu when first hop's MTU changes |
| |
| From: Sabrina Dubroca <sd@queasysnail.net> |
| |
| [ Upstream commit af7d6cce53694a88d6a1bb60c9a239a6a5144459 ] |
| |
| Since commit 5aad1de5ea2c ("ipv4: use separate genid for next hop |
| exceptions"), exceptions get deprecated separately from cached |
| routes. In particular, administrative changes don't clear PMTU anymore. |
| |
| As Stefano described in commit e9fa1495d738 ("ipv6: Reflect MTU changes |
| on PMTU of exceptions for MTU-less routes"), the PMTU discovered before |
| the local MTU change can become stale: |
| - if the local MTU is now lower than the PMTU, that PMTU is now |
| incorrect |
| - if the local MTU was the lowest value in the path, and is increased, |
| we might discover a higher PMTU |
| |
| Similarly to what commit e9fa1495d738 did for IPv6, update PMTU in those |
| cases. |
| |
| If the exception was locked, the discovered PMTU was smaller than the |
| minimal accepted PMTU. In that case, if the new local MTU is smaller |
| than the current PMTU, let PMTU discovery figure out if locking of the |
| exception is still needed. |
| |
| To do this, we need to know the old link MTU in the NETDEV_CHANGEMTU |
| notifier. By the time the notifier is called, dev->mtu has been |
| changed. This patch adds the old MTU as additional information in the |
| notifier structure, and a new call_netdevice_notifiers_u32() function. |
| |
| Fixes: 5aad1de5ea2c ("ipv4: use separate genid for next hop exceptions") |
| Signed-off-by: Sabrina Dubroca <sd@queasysnail.net> |
| Reviewed-by: Stefano Brivio <sbrivio@redhat.com> |
| Reviewed-by: David Ahern <dsahern@gmail.com> |
| Signed-off-by: David S. Miller <davem@davemloft.net> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| --- |
| include/linux/netdevice.h | 7 ++++++ |
| include/net/ip_fib.h | 1 |
| net/core/dev.c | 28 +++++++++++++++++++++++-- |
| net/ipv4/fib_frontend.c | 12 +++++++---- |
| net/ipv4/fib_semantics.c | 50 ++++++++++++++++++++++++++++++++++++++++++++++ |
| 5 files changed, 92 insertions(+), 6 deletions(-) |
| |
| --- a/include/linux/netdevice.h |
| +++ b/include/linux/netdevice.h |
| @@ -2168,6 +2168,13 @@ struct netdev_notifier_info { |
| struct net_device *dev; |
| }; |
| |
| +struct netdev_notifier_info_ext { |
| + struct netdev_notifier_info info; /* must be first */ |
| + union { |
| + u32 mtu; |
| + } ext; |
| +}; |
| + |
| struct netdev_notifier_change_info { |
| struct netdev_notifier_info info; /* must be first */ |
| unsigned int flags_changed; |
| --- a/include/net/ip_fib.h |
| +++ b/include/net/ip_fib.h |
| @@ -322,6 +322,7 @@ int ip_fib_check_default(__be32 gw, stru |
| int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force); |
| int fib_sync_down_addr(struct net *net, __be32 local); |
| int fib_sync_up(struct net_device *dev, unsigned int nh_flags); |
| +void fib_sync_mtu(struct net_device *dev, u32 orig_mtu); |
| |
| extern u32 fib_multipath_secret __read_mostly; |
| |
| --- a/net/core/dev.c |
| +++ b/net/core/dev.c |
| @@ -1660,6 +1660,28 @@ int call_netdevice_notifiers(unsigned lo |
| } |
| EXPORT_SYMBOL(call_netdevice_notifiers); |
| |
| +/** |
| + * call_netdevice_notifiers_mtu - call all network notifier blocks |
| + * @val: value passed unmodified to notifier function |
| + * @dev: net_device pointer passed unmodified to notifier function |
| + * @arg: additional u32 argument passed to the notifier function |
| + * |
| + * Call all network notifier blocks. Parameters and return value |
| + * are as for raw_notifier_call_chain(). |
| + */ |
| +static int call_netdevice_notifiers_mtu(unsigned long val, |
| + struct net_device *dev, u32 arg) |
| +{ |
| + struct netdev_notifier_info_ext info = { |
| + .info.dev = dev, |
| + .ext.mtu = arg, |
| + }; |
| + |
| + BUILD_BUG_ON(offsetof(struct netdev_notifier_info_ext, info) != 0); |
| + |
| + return call_netdevice_notifiers_info(val, dev, &info.info); |
| +} |
| + |
| #ifdef CONFIG_NET_INGRESS |
| static struct static_key ingress_needed __read_mostly; |
| |
| @@ -6134,14 +6156,16 @@ int dev_set_mtu(struct net_device *dev, |
| err = __dev_set_mtu(dev, new_mtu); |
| |
| if (!err) { |
| - err = call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); |
| + err = call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, |
| + orig_mtu); |
| err = notifier_to_errno(err); |
| if (err) { |
| /* setting mtu back and notifying everyone again, |
| * so that they have a chance to revert changes. |
| */ |
| __dev_set_mtu(dev, orig_mtu); |
| - call_netdevice_notifiers(NETDEV_CHANGEMTU, dev); |
| + call_netdevice_notifiers_mtu(NETDEV_CHANGEMTU, dev, |
| + new_mtu); |
| } |
| } |
| return err; |
| --- a/net/ipv4/fib_frontend.c |
| +++ b/net/ipv4/fib_frontend.c |
| @@ -1170,7 +1170,8 @@ static int fib_inetaddr_event(struct not |
| static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) |
| { |
| struct net_device *dev = netdev_notifier_info_to_dev(ptr); |
| - struct netdev_notifier_changeupper_info *info; |
| + struct netdev_notifier_changeupper_info *upper_info = ptr; |
| + struct netdev_notifier_info_ext *info_ext = ptr; |
| struct in_device *in_dev; |
| struct net *net = dev_net(dev); |
| unsigned int flags; |
| @@ -1205,16 +1206,19 @@ static int fib_netdev_event(struct notif |
| fib_sync_up(dev, RTNH_F_LINKDOWN); |
| else |
| fib_sync_down_dev(dev, event, false); |
| - /* fall through */ |
| + rt_cache_flush(net); |
| + break; |
| case NETDEV_CHANGEMTU: |
| + fib_sync_mtu(dev, info_ext->ext.mtu); |
| rt_cache_flush(net); |
| break; |
| case NETDEV_CHANGEUPPER: |
| - info = ptr; |
| + upper_info = ptr; |
| /* flush all routes if dev is linked to or unlinked from |
| * an L3 master device (e.g., VRF) |
| */ |
| - if (info->upper_dev && netif_is_l3_master(info->upper_dev)) |
| + if (upper_info->upper_dev && |
| + netif_is_l3_master(upper_info->upper_dev)) |
| fib_disable_ip(dev, NETDEV_DOWN, true); |
| break; |
| } |
| --- a/net/ipv4/fib_semantics.c |
| +++ b/net/ipv4/fib_semantics.c |
| @@ -1373,6 +1373,56 @@ int fib_sync_down_addr(struct net *net, |
| return ret; |
| } |
| |
| +/* Update the PMTU of exceptions when: |
| + * - the new MTU of the first hop becomes smaller than the PMTU |
| + * - the old MTU was the same as the PMTU, and it limited discovery of |
| + * larger MTUs on the path. With that limit raised, we can now |
| + * discover larger MTUs |
| + * A special case is locked exceptions, for which the PMTU is smaller |
| + * than the minimal accepted PMTU: |
| + * - if the new MTU is greater than the PMTU, don't make any change |
| + * - otherwise, unlock and set PMTU |
| + */ |
| +static void nh_update_mtu(struct fib_nh *nh, u32 new, u32 orig) |
| +{ |
| + struct fnhe_hash_bucket *bucket; |
| + int i; |
| + |
| + bucket = rcu_dereference_protected(nh->nh_exceptions, 1); |
| + if (!bucket) |
| + return; |
| + |
| + for (i = 0; i < FNHE_HASH_SIZE; i++) { |
| + struct fib_nh_exception *fnhe; |
| + |
| + for (fnhe = rcu_dereference_protected(bucket[i].chain, 1); |
| + fnhe; |
| + fnhe = rcu_dereference_protected(fnhe->fnhe_next, 1)) { |
| + if (fnhe->fnhe_mtu_locked) { |
| + if (new <= fnhe->fnhe_pmtu) { |
| + fnhe->fnhe_pmtu = new; |
| + fnhe->fnhe_mtu_locked = false; |
| + } |
| + } else if (new < fnhe->fnhe_pmtu || |
| + orig == fnhe->fnhe_pmtu) { |
| + fnhe->fnhe_pmtu = new; |
| + } |
| + } |
| + } |
| +} |
| + |
| +void fib_sync_mtu(struct net_device *dev, u32 orig_mtu) |
| +{ |
| + unsigned int hash = fib_devindex_hashfn(dev->ifindex); |
| + struct hlist_head *head = &fib_info_devhash[hash]; |
| + struct fib_nh *nh; |
| + |
| + hlist_for_each_entry(nh, head, nh_hash) { |
| + if (nh->nh_dev == dev) |
| + nh_update_mtu(nh, dev->mtu, orig_mtu); |
| + } |
| +} |
| + |
| /* Event force Flags Description |
| * NETDEV_CHANGE 0 LINKDOWN Carrier OFF, not for scope host |
| * NETDEV_DOWN 0 LINKDOWN|DEAD Link down, not for scope host |