blob: 49b954d6d0fa44ea0c4427e2918b3ab9c1610fe0 [file] [log] [blame]
/*
* Linux INET6 implementation
* FIB front-end.
*
* Authors:
* Pedro Roque <roque@di.fc.ul.pt>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
/* Changes:
*
* YOSHIFUJI Hideaki @USAGI
* reworked default router selection.
* - respect outgoing interface
* - select from (probably) reachable routers (i.e.
* routers in REACHABLE, STALE, DELAY or PROBE states).
* - always select the same router if it is (probably)
* reachable. otherwise, round-robin the list.
* Ville Nuorvala
* Fixed routing subtrees.
*/
#define pr_fmt(fmt) "IPv6: " fmt
#include <linux/capability.h>
#include <linux/errno.h>
#include <linux/export.h>
#include <linux/types.h>
#include <linux/times.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/net.h>
#include <linux/route.h>
#include <linux/netdevice.h>
#include <linux/in6.h>
#include <linux/mroute6.h>
#include <linux/init.h>
#include <linux/if_arp.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/nsproxy.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/net_namespace.h>
#include <net/snmp.h>
#include <net/ipv6.h>
#include <net/ip6_fib.h>
#include <net/ip6_route.h>
#include <net/ndisc.h>
#include <net/addrconf.h>
#include <net/tcp.h>
#include <linux/rtnetlink.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/xfrm.h>
#include <net/netevent.h>
#include <net/netlink.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include <trace/events/fib6.h>
#include <linux/uaccess.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
/*
 * Result codes used while scoring a route's next hop.  Negative values are
 * failures; rt6_score_route() hands a negative rt6_check_neigh() result
 * straight back to its caller.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD	= -3,	/* find_match() skips the route */
	RT6_NUD_FAIL_PROBE	= -2,	/* neighbour entry is NUD_FAILED */
	RT6_NUD_FAIL_DO_RR	= -1,	/* find_match() scores 0, asks for round-robin */
	RT6_NUD_SUCCEED		= 1,
};
/*
 * Forward declarations.  These static helpers and dst_ops callbacks are
 * referenced (for instance by the dst_ops tables and route templates below)
 * before their definitions appear.
 */
static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ip6_default_advmss(const struct dst_entry *dst);
static unsigned int ip6_mtu(const struct dst_entry *dst);
static struct dst_entry *ip6_negative_advice(struct dst_entry *);
static void ip6_dst_destroy(struct dst_entry *);
static void ip6_dst_ifdown(struct dst_entry *,
struct net_device *dev, int how);
static int ip6_dst_gc(struct dst_ops *ops);
/* Input/output handlers installed in the reject-route templates. */
static int ip6_pkt_discard(struct sk_buff *skb);
static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static int ip6_pkt_prohibit(struct sk_buff *skb);
static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
static void ip6_link_failure(struct sk_buff *skb);
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb, u32 mtu);
static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb);
static void rt6_dst_from_metrics_check(struct rt6_info *rt);
static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
static size_t rt6_nlmsg_size(struct rt6_info *rt);
static int rt6_fill_node(struct net *net,
struct sk_buff *skb, struct rt6_info *rt,
struct in6_addr *dst, struct in6_addr *src,
int iif, int type, u32 portid, u32 seq,
unsigned int flags);
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
struct in6_addr *daddr,
struct in6_addr *saddr);
/* Helpers used by rt6_route_rcv(); only built with CONFIG_IPV6_ROUTE_INFO. */
#ifdef CONFIG_IPV6_ROUTE_INFO
static struct rt6_info *rt6_add_route_info(struct net *net,
const struct in6_addr *prefix, int prefixlen,
const struct in6_addr *gwaddr,
struct net_device *dev,
unsigned int pref);
static struct rt6_info *rt6_get_route_info(struct net *net,
const struct in6_addr *prefix, int prefixlen,
const struct in6_addr *gwaddr,
struct net_device *dev);
#endif
/*
 * A list of rt6_info entries linked through their rt6i_uncached member,
 * plus the spinlock that the add/del/flush helpers below take around every
 * list modification or traversal.
 */
struct uncached_list {
spinlock_t lock;
struct list_head head;
};
/* One uncached_list instance per CPU. */
static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
/*
 * Append @rt to the uncached list of the CPU this code runs on.  The list is
 * recorded in rt->rt6i_uncached_list so that rt6_uncached_list_del() can
 * later find the lock protecting it.
 */
void rt6_uncached_list_add(struct rt6_info *rt)
{
	struct uncached_list *list = raw_cpu_ptr(&rt6_uncached_list);

	rt->rt6i_uncached_list = list;

	spin_lock_bh(&list->lock);
	list_add_tail(&rt->rt6i_uncached, &list->head);
	spin_unlock_bh(&list->lock);
}
/*
 * Unlink @rt from the uncached list it was added to, if any, and decrement
 * the per-netns fib_rt_uncache counter.  Does nothing when rt6i_uncached is
 * an empty list head.
 */
void rt6_uncached_list_del(struct rt6_info *rt)
{
	struct uncached_list *list;
	struct net *net;

	if (list_empty(&rt->rt6i_uncached))
		return;

	list = rt->rt6i_uncached_list;
	net = dev_net(rt->dst.dev);

	spin_lock_bh(&list->lock);
	list_del(&rt->rt6i_uncached);
	atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
	spin_unlock_bh(&list->lock);
}
/*
 * Re-point every uncached route that still references @dev (through its
 * inet6_dev or its dst.dev) at the namespace's loopback device, moving the
 * held references accordingly.  Nothing is done when @dev is the loopback
 * device itself.
 */
static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
{
	struct net_device *lo = net->loopback_dev;
	int cpu;

	if (dev == lo)
		return;

	for_each_possible_cpu(cpu) {
		struct uncached_list *list = per_cpu_ptr(&rt6_uncached_list, cpu);
		struct rt6_info *entry;

		spin_lock_bh(&list->lock);
		list_for_each_entry(entry, &list->head, rt6i_uncached) {
			struct inet6_dev *old_idev = entry->rt6i_idev;
			struct net_device *old_dev = entry->dst.dev;

			/* Take the new reference before dropping the old one. */
			if (old_idev->dev == dev) {
				entry->rt6i_idev = in6_dev_get(lo);
				in6_dev_put(old_idev);
			}

			if (old_dev == dev) {
				entry->dst.dev = lo;
				dev_hold(lo);
				dev_put(old_dev);
			}
		}
		spin_unlock_bh(&list->lock);
	}
}
/* Return a writable metrics pointer for the route @rt was derived from. */
static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
{
	struct dst_entry *parent = &rt->from->dst;

	return dst_metrics_write_ptr(parent);
}
/*
 * dst_ops->cow_metrics callback.
 *
 * RTF_PCPU routes write through to the metrics of the route they came from,
 * RTF_CACHE routes get no writable metrics (NULL), and everything else uses
 * the generic copy-on-write helper.
 */
static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	struct rt6_info *rt = (struct rt6_info *)dst;

	if (rt->rt6i_flags & RTF_PCPU)
		return rt6_pcpu_cow_metrics(rt);

	if (rt->rt6i_flags & RTF_CACHE)
		return NULL;

	return dst_cow_metrics_generic(dst, old);
}
/*
 * Pick the address to use as neighbour key, in order of preference: the
 * route's gateway when it is not the unspecified address, else the
 * destination in @skb's IPv6 header when an skb is given, else @daddr
 * (which is returned unchanged and may therefore be NULL).
 */
static inline const void *choose_neigh_daddr(struct rt6_info *rt,
					     struct sk_buff *skb,
					     const void *daddr)
{
	struct in6_addr *gw = &rt->rt6i_gateway;

	if (!ipv6_addr_any(gw))
		return gw;

	if (skb)
		return &ipv6_hdr(skb)->daddr;

	return daddr;
}
/*
 * dst_ops->neigh_lookup callback: look up the neighbour entry for the key
 * chosen by choose_neigh_daddr() on dst->dev, creating one in nd_tbl when
 * the lookup finds nothing.  The result of neigh_create() is returned
 * unmodified.
 */
static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
					  struct sk_buff *skb,
					  const void *daddr)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	const void *key = choose_neigh_daddr(rt, skb, daddr);
	struct neighbour *neigh = __ipv6_neigh_lookup(dst->dev, key);

	return neigh ? neigh : neigh_create(&nd_tbl, key, dst->dev);
}
/*
 * dst_ops->confirm_neigh callback.  Confirms the neighbour entry for the
 * gateway (or @daddr when the route has no gateway) unless there is no
 * address to confirm, the device is NOARP or loopback, or the address is
 * multicast.
 */
static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct net_device *dev = dst->dev;
	const struct in6_addr *target = choose_neigh_daddr(rt, NULL, daddr);

	if (!target ||
	    (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) ||
	    ipv6_addr_is_multicast(target))
		return;

	__ipv6_confirm_neigh(dev, target);
}
/*
 * dst_ops callback table for AF_INET6 routes.
 * NOTE(review): __ip6_dst_alloc() allocates from net->ipv6.ip6_dst_ops;
 * presumably that per-netns table is initialised from this template in the
 * netns setup code, which is outside this part of the file -- confirm.
 */
static struct dst_ops ip6_dst_ops_template = {
.family = AF_INET6,
.gc = ip6_dst_gc,
.gc_thresh = 1024,
.check = ip6_dst_check,
.default_advmss = ip6_default_advmss,
.mtu = ip6_mtu,
.cow_metrics = ipv6_cow_metrics,
.destroy = ip6_dst_destroy,
.ifdown = ip6_dst_ifdown,
.negative_advice = ip6_negative_advice,
.link_failure = ip6_link_failure,
.update_pmtu = ip6_rt_update_pmtu,
.redirect = rt6_do_redirect,
.local_out = __ip6_local_out,
.neigh_lookup = ip6_neigh_lookup,
.confirm_neigh = ip6_confirm_neigh,
};
/*
 * dst_ops->mtu callback for blackhole dsts: the raw RTAX_MTU metric when it
 * is non-zero, otherwise the MTU of the dst's device.
 */
static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int metric_mtu = dst_metric_raw(dst, RTAX_MTU);

	if (metric_mtu)
		return metric_mtu;

	return dst->dev->mtu;
}
/* No-op update_pmtu callback installed in ip6_dst_blackhole_ops. */
static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb, u32 mtu)
{
}
/* No-op redirect callback installed in ip6_dst_blackhole_ops. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb)
{
}
/*
 * dst_ops table for blackhole dsts.  Compared with ip6_dst_ops_template the
 * update_pmtu and redirect callbacks are no-ops, the MTU comes from
 * ip6_blackhole_mtu(), metrics use the generic copy-on-write helper, and no
 * gc/ifdown/negative_advice/link_failure/local_out/confirm_neigh callbacks
 * are set.
 */
static struct dst_ops ip6_dst_blackhole_ops = {
.family = AF_INET6,
.destroy = ip6_dst_destroy,
.check = ip6_dst_check,
.mtu = ip6_blackhole_mtu,
.default_advmss = ip6_default_advmss,
.update_pmtu = ip6_rt_blackhole_update_pmtu,
.redirect = ip6_rt_blackhole_redirect,
.cow_metrics = dst_cow_metrics_generic,
.neigh_lookup = ip6_neigh_lookup,
};
/*
 * Read-only metrics array with RTAX_MAX entries.  The only explicit
 * initializer sets the RTAX_HOPLIMIT slot to 0; every other slot is
 * implicitly zero as well.
 */
static const u32 ip6_template_metrics[RTAX_MAX] = {
[RTAX_HOPLIMIT - 1] = 0,
};
/*
 * Template for the "null" reject route: dst.error is -ENETUNREACH and
 * packets are handed to ip6_pkt_discard() / ip6_pkt_discard_out().
 * NOTE(review): presumably copied into net->ipv6.ip6_null_entry during
 * netns setup, which is not visible in this part of the file -- confirm.
 */
static const struct rt6_info ip6_null_entry_template = {
.dst = {
.__refcnt = ATOMIC_INIT(1),
.__use = 1,
.obsolete = DST_OBSOLETE_FORCE_CHK,
.error = -ENETUNREACH,
.input = ip6_pkt_discard,
.output = ip6_pkt_discard_out,
},
.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
.rt6i_protocol = RTPROT_KERNEL,
/* largest possible u32 metric value */
.rt6i_metric = ~(u32) 0,
.rt6i_ref = ATOMIC_INIT(1),
};
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
/*
 * Template for the "prohibit" reject route: dst.error is -EACCES and
 * packets are handed to ip6_pkt_prohibit() / ip6_pkt_prohibit_out().
 * Only built with CONFIG_IPV6_MULTIPLE_TABLES.
 */
static const struct rt6_info ip6_prohibit_entry_template = {
.dst = {
.__refcnt = ATOMIC_INIT(1),
.__use = 1,
.obsolete = DST_OBSOLETE_FORCE_CHK,
.error = -EACCES,
.input = ip6_pkt_prohibit,
.output = ip6_pkt_prohibit_out,
},
.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
.rt6i_protocol = RTPROT_KERNEL,
.rt6i_metric = ~(u32) 0,
.rt6i_ref = ATOMIC_INIT(1),
};
/*
 * Template for the "blackhole" reject route: dst.error is -EINVAL and
 * packets are handed to the generic dst_discard() / dst_discard_out().
 * Only built with CONFIG_IPV6_MULTIPLE_TABLES.
 */
static const struct rt6_info ip6_blk_hole_entry_template = {
.dst = {
.__refcnt = ATOMIC_INIT(1),
.__use = 1,
.obsolete = DST_OBSOLETE_FORCE_CHK,
.error = -EINVAL,
.input = dst_discard,
.output = dst_discard_out,
},
.rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
.rt6i_protocol = RTPROT_KERNEL,
.rt6i_metric = ~(u32) 0,
.rt6i_ref = ATOMIC_INIT(1),
};
#endif
static void rt6_info_init(struct rt6_info *rt)
{
struct dst_entry *dst = &rt->dst;
memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
INIT_LIST_HEAD(&rt->rt6i_siblings);
INIT_LIST_HEAD(&rt->rt6i_uncached);
}
/* allocate dst with ip6_dst_ops */
static struct rt6_info *__ip6_dst_alloc(struct net *net,
struct net_device *dev,
int flags)
{
struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
1, DST_OBSOLETE_FORCE_CHK, flags);
if (rt) {
rt6_info_init(rt);
atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
}
return rt;
}
/*
 * Like __ip6_dst_alloc(), but additionally allocates the per-CPU array of
 * rt6_info pointers (rt6i_pcpu) with GFP_ATOMIC.  If that allocation fails
 * the freshly allocated dst is released and NULL is returned.
 */
struct rt6_info *ip6_dst_alloc(struct net *net,
			       struct net_device *dev,
			       int flags)
{
	struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);

	if (!rt)
		return NULL;

	rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
	if (rt->rt6i_pcpu)
		return rt;

	dst_release_immediate(&rt->dst);
	return NULL;
}
EXPORT_SYMBOL(ip6_dst_alloc);
/*
 * dst_ops->destroy callback: release everything an rt6_info owns -- metrics,
 * the per-CPU pointer array, its uncached-list membership, the inet6_dev
 * reference, the exception bucket array and the reference on the route it
 * was derived from.
 */
static void ip6_dst_destroy(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct rt6_info *parent = rt->from;
	struct rt6_exception_bucket *buckets;
	struct inet6_dev *idev;

	dst_destroy_metrics_generic(dst);
	free_percpu(rt->rt6i_pcpu);
	rt6_uncached_list_del(rt);

	idev = rt->rt6i_idev;
	if (idev) {
		rt->rt6i_idev = NULL;
		in6_dev_put(idev);
	}

	buckets = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
	if (buckets) {
		rt->rt6i_exception_bucket = NULL;
		kfree(buckets);
	}

	/*
	 * NOTE(review): &parent->dst is passed even when parent is NULL; this
	 * looks like it relies on dst being the first member of rt6_info and
	 * on dst_release() tolerating NULL -- confirm.
	 */
	rt->from = NULL;
	dst_release(&parent->dst);
}
/*
 * dst_ops->ifdown callback: when the route's inet6_dev does not already
 * belong to the loopback device, switch it to the loopback inet6_dev and
 * drop the old reference.  The route is left alone if no loopback
 * inet6_dev can be obtained.  @how is unused.
 */
static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
			   int how)
{
	struct rt6_info *rt = (struct rt6_info *)dst;
	struct inet6_dev *old_idev = rt->rt6i_idev;
	struct net_device *lo = dev_net(dev)->loopback_dev;
	struct inet6_dev *lo_idev;

	if (!old_idev || old_idev->dev == lo)
		return;

	lo_idev = in6_dev_get(lo);
	if (!lo_idev)
		return;

	rt->rt6i_idev = lo_idev;
	in6_dev_put(old_idev);
}
/* True only when @rt carries RTF_EXPIRES and its expiry time has passed. */
static bool __rt6_check_expired(const struct rt6_info *rt)
{
	return (rt->rt6i_flags & RTF_EXPIRES) &&
	       time_after(jiffies, rt->dst.expires);
}
/*
 * Expiry check that also follows the ->from chain.
 *
 * A route with RTF_EXPIRES is judged on its own timer only.  A route
 * without it but with a parent is expired when it is no longer marked
 * DST_OBSOLETE_FORCE_CHK or when the parent is expired.  Anything else is
 * not expired.
 */
static bool rt6_check_expired(const struct rt6_info *rt)
{
	if (rt->rt6i_flags & RTF_EXPIRES)
		return time_after(jiffies, rt->dst.expires);

	if (!rt->from)
		return false;

	return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
	       rt6_check_expired(rt->from);
}
/*
 * Choose one route out of a multipath group by comparing the flow hash with
 * each member's rt6i_nh_upper_bound.  @match itself is returned when the
 * hash falls in its range, when no sibling's range covers the hash, or when
 * the first sibling whose range does cover it scores negatively.
 */
static struct rt6_info *rt6_multipath_select(const struct net *net,
					     struct rt6_info *match,
					     struct flowi6 *fl6, int oif,
					     const struct sk_buff *skb,
					     int strict)
{
	struct rt6_info *sibling, *next_sibling;

	/* The hash may already have been computed for ICMPv6 errors, in
	 * which case it is non-zero; compute it now otherwise.
	 */
	if (!fl6->mp_hash)
		fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);

	if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
		return match;

	list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
				 rt6i_siblings) {
		if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
			continue;

		/* Only the first sibling covering the hash is considered. */
		if (rt6_score_route(sibling, oif, strict) >= 0)
			match = sibling;
		break;
	}

	return match;
}
/*
 * Route lookup. rcu_read_lock() should be held.
 *
 * Walk the rt6_next chain starting at @rt and return the entry that fits
 * the requested output interface @oif or, when @oif is 0, the source
 * address @saddr.  Entries flagged RTNH_F_DEAD are skipped.
 *
 * With @oif set, an entry on exactly that interface wins immediately.  A
 * loopback entry is remembered in 'local' as a fallback, unless it is not
 * tied to @oif and either RT6_LOOKUP_F_IFACE is set or a better 'local' is
 * already known.  If nothing matches: 'local' is returned when present, the
 * null entry when RT6_LOOKUP_F_IFACE is set, and otherwise @rt itself (or
 * the null entry if @rt is dead).
 */
static inline struct rt6_info *rt6_device_match(struct net *net,
						    struct rt6_info *rt,
						    const struct in6_addr *saddr,
						    int oif,
						    int flags)
{
	struct rt6_info *local = NULL;
	struct rt6_info *sprt;

	if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
		return rt;

	for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
		struct net_device *dev = sprt->dst.dev;

		if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
			continue;

		if (oif) {
			if (dev->ifindex == oif)
				return sprt;
			if (dev->flags & IFF_LOOPBACK) {
				if (!sprt->rt6i_idev ||
				    sprt->rt6i_idev->dev->ifindex != oif) {
					if (flags & RT6_LOOKUP_F_IFACE)
						continue;
					/* 'local' may have been stored from an
					 * entry whose rt6i_idev is NULL (see the
					 * check above), so test it before
					 * dereferencing.
					 */
					if (local && local->rt6i_idev &&
					    local->rt6i_idev->dev->ifindex == oif)
						continue;
				}
				local = sprt;
			}
		} else {
			if (ipv6_chk_addr(net, saddr, dev,
					  flags & RT6_LOOKUP_F_IFACE))
				return sprt;
		}
	}

	if (oif) {
		if (local)
			return local;

		if (flags & RT6_LOOKUP_F_IFACE)
			return net->ipv6.ip6_null_entry;
	}

	return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
}
#ifdef CONFIG_IPV6_ROUTER_PREF
/*
 * Deferred-work request filled in by rt6_probe() and consumed (and freed)
 * by rt6_probe_deferred().
 */
struct __rt6_probe_work {
struct work_struct work;
/* address to solicit; rt6_probe() copies the route's gateway here */
struct in6_addr target;
/* device to send on; rt6_probe() holds a reference that the worker drops */
struct net_device *dev;
};
static void rt6_probe_deferred(struct work_struct *w)
{
struct in6_addr mcaddr;
struct __rt6_probe_work *work =
container_of(w, struct __rt6_probe_work, work);
addrconf_addr_solict_mult(&work->target, &mcaddr);
ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
dev_put(work->dev);
kfree(work);
}
/*
 * Schedule a reachability probe for the gateway of @rt.
 *
 * Nothing happens for a NULL route or one without RTF_GATEWAY.  When a
 * neighbour entry exists, a probe is queued only if the entry is not
 * NUD_VALID and its last update is older than the interface's
 * rtr_probe_interval; when no entry exists a probe is always queued.
 */
static void rt6_probe(struct rt6_info *rt)
{
	struct __rt6_probe_work *work = NULL;
	struct neighbour *neigh;

	/*
	 * Okay, this does not seem to be appropriate
	 * for now, however, we need to check if it
	 * is really so; aka Router Reachability Probing.
	 *
	 * Router Reachability Probe MUST be rate-limited
	 * to no more than one per minute.
	 */
	if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
		return;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (!neigh) {
		work = kmalloc(sizeof(*work), GFP_ATOMIC);
	} else if (!(neigh->nud_state & NUD_VALID)) {
		/* Re-check the state under the neighbour lock. */
		write_lock(&neigh->lock);
		if (!(neigh->nud_state & NUD_VALID) &&
		    time_after(jiffies,
			       neigh->updated +
			       rt->rt6i_idev->cnf.rtr_probe_interval)) {
			work = kmalloc(sizeof(*work), GFP_ATOMIC);
			if (work)
				__neigh_set_probe_once(neigh);
		}
		write_unlock(&neigh->lock);
	}

	if (work) {
		INIT_WORK(&work->work, rt6_probe_deferred);
		work->target = rt->rt6i_gateway;
		/* Reference is dropped by rt6_probe_deferred(). */
		dev_hold(rt->dst.dev);
		work->dev = rt->dst.dev;
		schedule_work(&work->work);
	}

	rcu_read_unlock_bh();
}
#else
/* Without CONFIG_IPV6_ROUTER_PREF, probing is compiled out to a no-op. */
static inline void rt6_probe(struct rt6_info *rt)
{
}
#endif
/*
 * Default Router Selection (RFC 2461 6.3.6)
 *
 * Device part of the route score:
 *   2 - no interface requested, or the route's device is @oif;
 *   1 - the route is on a loopback device whose inet6_dev belongs to @oif;
 *   0 - no match.
 */
static inline int rt6_check_dev(struct rt6_info *rt, int oif)
{
	struct net_device *dev = rt->dst.dev;

	if (!oif || dev->ifindex == oif)
		return 2;

	if (!(dev->flags & IFF_LOOPBACK))
		return 0;

	if (rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
		return 1;

	return 0;
}
/*
 * Classify the reachability of the gateway of @rt.
 *
 * Routes flagged RTF_NONEXTHOP or lacking RTF_GATEWAY always succeed.
 * Otherwise the gateway's neighbour entry decides:
 *   - NUD_VALID: success;
 *   - with CONFIG_IPV6_ROUTER_PREF: any state other than NUD_FAILED is
 *     success, NUD_FAILED yields RT6_NUD_FAIL_PROBE;
 *   - without it: a non-valid entry leaves the RT6_NUD_FAIL_HARD default;
 *   - no entry at all: success with CONFIG_IPV6_ROUTER_PREF, else
 *     RT6_NUD_FAIL_DO_RR.
 */
static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
{
	enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
	struct neighbour *neigh;

	if ((rt->rt6i_flags & RTF_NONEXTHOP) ||
	    !(rt->rt6i_flags & RTF_GATEWAY))
		return RT6_NUD_SUCCEED;

	rcu_read_lock_bh();
	neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
	if (!neigh) {
		ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
		      RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
	} else {
		read_lock(&neigh->lock);
		if (neigh->nud_state & NUD_VALID)
			ret = RT6_NUD_SUCCEED;
#ifdef CONFIG_IPV6_ROUTER_PREF
		else if (!(neigh->nud_state & NUD_FAILED))
			ret = RT6_NUD_SUCCEED;
		else
			ret = RT6_NUD_FAIL_PROBE;
#endif
		read_unlock(&neigh->lock);
	}
	rcu_read_unlock_bh();

	return ret;
}
/*
 * Score @rt for the lookup described by @oif and @strict.
 *
 * The low bits come from rt6_check_dev(); with CONFIG_IPV6_ROUTER_PREF the
 * decoded router preference is OR-ed in above them.  A negative
 * rt6_nud_state is returned instead when the device does not match under
 * RT6_LOOKUP_F_IFACE, or when RT6_LOOKUP_F_REACHABLE is set and
 * rt6_check_neigh() reports a failure.
 */
static int rt6_score_route(struct rt6_info *rt, int oif,
			   int strict)
{
	int score = rt6_check_dev(rt, oif);

	if (!score && (strict & RT6_LOOKUP_F_IFACE))
		return RT6_NUD_FAIL_HARD;

#ifdef CONFIG_IPV6_ROUTER_PREF
	score |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
#endif

	if (strict & RT6_LOOKUP_F_REACHABLE) {
		int nud = rt6_check_neigh(rt);

		if (nud < 0)
			return nud;
	}

	return score;
}
/*
 * Compare @rt with the best candidate so far.
 *
 * @mpri holds the best score seen and @match the route that achieved it;
 * both, together with @do_rr, are updated when @rt scores strictly higher.
 * Dead, ignored-because-link-down and expired routes, and routes scoring
 * RT6_NUD_FAIL_HARD, never replace @match.  Returns the (possibly updated)
 * best route.
 */
static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
				   int *mpri, struct rt6_info *match,
				   bool *do_rr)
{
	bool wants_rr = false;
	int score;

	if (rt->rt6i_nh_flags & RTNH_F_DEAD)
		return match;

	if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown &&
	    (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) &&
	    !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
		return match;

	if (rt6_check_expired(rt))
		return match;

	score = rt6_score_route(rt, oif, strict);
	if (score == RT6_NUD_FAIL_HARD)
		return match;

	if (score == RT6_NUD_FAIL_DO_RR) {
		wants_rr = true;
		score = 0; /* lowest valid score */
	}

	if (strict & RT6_LOOKUP_F_REACHABLE)
		rt6_probe(rt);

	/* note that score can be RT6_NUD_FAIL_PROBE at this point */
	if (score > *mpri) {
		*do_rr = wants_rr;
		*mpri = score;
		match = rt;
	}

	return match;
}
/*
 * Find the best route among the entries sharing @metric.
 *
 * The scan starts at @rr_head and runs to the end of the same-metric run,
 * then restarts at @leaf and runs up to @rr_head.  Only if that yields no
 * match are the entries beyond the run (starting at the first one with a
 * different metric) considered.  @fn is unused.
 */
static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
				     struct rt6_info *leaf,
				     struct rt6_info *rr_head,
				     u32 metric, int oif, int strict,
				     bool *do_rr)
{
	struct rt6_info *best = NULL;
	struct rt6_info *cont = NULL;
	struct rt6_info *rt;
	int mpri = -1;

	for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}
		best = find_match(rt, oif, strict, &mpri, best, do_rr);
	}

	for (rt = leaf; rt && rt != rr_head;
	     rt = rcu_dereference(rt->rt6_next)) {
		if (rt->rt6i_metric != metric) {
			cont = rt;
			break;
		}
		best = find_match(rt, oif, strict, &mpri, best, do_rr);
	}

	if (best || !cont)
		return best;

	for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
		best = find_match(rt, oif, strict, &mpri, best, do_rr);

	return best;
}
/*
 * Select the route to use from the leaf list of @fn, advancing the node's
 * round-robin pointer when find_rr_leaf() asks for it.  Returns the
 * namespace's null entry when the node has no usable route.
 */
static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
				   int oif, int strict)
{
	struct rt6_info *leaf = rcu_dereference(fn->leaf);
	struct rt6_info *best, *rr_head, *next;
	bool do_rr = false;
	int key_plen;

	if (!leaf || leaf == net->ipv6.ip6_null_entry)
		return net->ipv6.ip6_null_entry;

	rr_head = rcu_dereference(fn->rr_ptr);
	if (!rr_head)
		rr_head = leaf;

	/* Double check to make sure fn is not an intermediate node
	 * and fn->leaf does not point to its child's leaf
	 * (This might happen if all routes under fn are deleted from
	 * the tree and fib6_repair_tree() is called on the node.)
	 */
	key_plen = rr_head->rt6i_dst.plen;
#ifdef CONFIG_IPV6_SUBTREES
	if (rr_head->rt6i_src.plen)
		key_plen = rr_head->rt6i_src.plen;
#endif
	if (fn->fn_bit != key_plen)
		return net->ipv6.ip6_null_entry;

	best = find_rr_leaf(fn, leaf, rr_head, rr_head->rt6i_metric, oif,
			    strict, &do_rr);
	if (!do_rr)
		return best ? best : net->ipv6.ip6_null_entry;

	/* no entries matched; do round-robin */
	next = rcu_dereference(rr_head->rt6_next);
	if (!next || next->rt6i_metric != rr_head->rt6i_metric)
		next = leaf;

	if (next != rr_head) {
		spin_lock_bh(&leaf->rt6i_table->tb6_lock);
		/* make sure next is not being deleted from the tree */
		if (next->rt6i_node)
			rcu_assign_pointer(fn->rr_ptr, next);
		spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
	}

	return best ? best : net->ipv6.ip6_null_entry;
}
/* True when @rt has RTF_GATEWAY and/or RTF_NONEXTHOP set. */
static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
{
	return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY)) != 0;
}
#ifdef CONFIG_IPV6_ROUTE_INFO
/*
 * Process a Route Information option received from router @gwaddr on @dev.
 *
 * @opt points at the option and @len is its length in bytes.  After
 * validation the matching route (the default router entry for a zero-length
 * prefix, a route-info route otherwise) is looked up and then deleted,
 * created or updated according to the advertised lifetime and preference.
 *
 * Returns 0 on success or -EINVAL for a malformed option.
 */
int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
		  const struct in6_addr *gwaddr)
{
	struct route_info *rinfo = (struct route_info *) opt;
	struct net *net = dev_net(dev);
	struct in6_addr prefix_buf, *prefix;
	unsigned long lifetime;
	unsigned int pref;
	struct rt6_info *rt;

	if (len < sizeof(struct route_info))
		return -EINVAL;

	/* Sanity check for prefix_len and length */
	if (rinfo->length > 3 || rinfo->prefix_len > 128)
		return -EINVAL;
	if (rinfo->prefix_len > 64 && rinfo->length < 2)
		return -EINVAL;
	if (rinfo->prefix_len > 0 && rinfo->length < 1)
		return -EINVAL;

	pref = rinfo->route_pref;
	if (pref == ICMPV6_ROUTER_PREF_INVALID)
		return -EINVAL;

	lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);

	if (rinfo->length == 3) {
		prefix = (struct in6_addr *)rinfo->prefix;
	} else {
		/* this function is safe */
		ipv6_addr_prefix(&prefix_buf,
				 (struct in6_addr *)rinfo->prefix,
				 rinfo->prefix_len);
		prefix = &prefix_buf;
	}

	if (rinfo->prefix_len == 0)
		rt = rt6_get_dflt_router(gwaddr, dev);
	else
		rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
					gwaddr, dev);

	/* A zero lifetime withdraws an existing route. */
	if (rt && !lifetime) {
		ip6_del_rt(rt);
		rt = NULL;
	}

	if (!rt) {
		if (lifetime)
			rt = rt6_add_route_info(net, prefix, rinfo->prefix_len,
						gwaddr, dev, pref);
	} else {
		rt->rt6i_flags = RTF_ROUTEINFO |
				 (rt->rt6i_flags & ~RTF_PREF_MASK) |
				 RTF_PREF(pref);
	}

	if (!rt)
		return 0;

	if (addrconf_finite_timeout(lifetime))
		rt6_set_expires(rt, jiffies + HZ * lifetime);
	else
		rt6_clean_expires(rt);

	ip6_rt_put(rt);
	return 0;
}
#endif
/*
 * Climb from @fn towards the root until a node carrying route information
 * (RTN_RTINFO) is found.  At each step, if the parent has a subtree that is
 * not the node we came from, that subtree is searched for @saddr instead of
 * moving to the parent.  Returns NULL when a node flagged RTN_TL_ROOT is
 * reached.
 */
static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
					struct in6_addr *saddr)
{
	for (;;) {
		struct fib6_node *parent, *subtree;

		if (fn->fn_flags & RTN_TL_ROOT)
			return NULL;

		parent = rcu_dereference(fn->parent);
		subtree = FIB6_SUBTREE(parent);
		fn = (subtree && subtree != fn) ?
		     fib6_lookup(subtree, NULL, saddr) : parent;

		if (fn->fn_flags & RTN_RTINFO)
			return fn;
	}
}
/*
 * Try to take a reference on *@prt with dst_hold_safe().
 *
 * Returns true on success and leaves *@prt alone.  On failure returns false
 * and replaces *@prt with either the namespace's null entry (with a
 * reference taken) when @null_fallback is set, or NULL otherwise.  @net is
 * only used for the null-entry fallback.
 */
static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
			  bool null_fallback)
{
	struct rt6_info *fallback = NULL;

	if (dst_hold_safe(&(*prt)->dst))
		return true;

	if (null_fallback) {
		fallback = net->ipv6.ip6_null_entry;
		dst_hold(&fallback->dst);
	}

	*prt = fallback;
	return false;
}
/*
 * Per-table lookup used by ip6_route_lookup() and rt6_lookup().
 *
 * Finds the FIB node for the flow, picks a route with rt6_device_match()
 * (plus multipath selection when no output interface was requested), and
 * backtracks through the tree while only the null entry is found.  A
 * matching, unexpired exception route takes precedence over the result.
 * The returned route always carries a reference.
 */
static struct rt6_info *ip6_pol_route_lookup(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	struct rt6_info *rt, *cached;
	struct fib6_node *fn;

	if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
		flags &= ~RT6_LOOKUP_F_IFACE;

	rcu_read_lock();
	fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
	for (;;) {
		rt = rcu_dereference(fn->leaf);
		if (!rt) {
			rt = net->ipv6.ip6_null_entry;
		} else {
			rt = rt6_device_match(net, rt, &fl6->saddr,
					      fl6->flowi6_oif, flags);
			if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
				rt = rt6_multipath_select(net, rt, fl6,
							  fl6->flowi6_oif,
							  skb, flags);
		}

		if (rt != net->ipv6.ip6_null_entry)
			break;

		/* Nothing usable here: retry from a less specific node. */
		fn = fib6_backtrack(fn, &fl6->saddr);
		if (!fn)
			break;
	}

	/* Search through exception table */
	cached = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
	if (cached)
		rt = cached;

	if (ip6_hold_safe(net, &rt, true))
		dst_use_noref(&rt->dst, jiffies);

	rcu_read_unlock();

	trace_fib6_table_lookup(net, rt, table, fl6);

	return rt;
}
/*
 * Look up a route for @fl6 by passing ip6_pol_route_lookup() to
 * fib6_rule_lookup() as the per-table lookup function.
 */
struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
				   const struct sk_buff *skb, int flags)
{
	struct dst_entry *dst;

	dst = fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);

	return dst;
}
EXPORT_SYMBOL_GPL(ip6_route_lookup);
struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
const struct in6_addr *saddr, int oif,
const struct sk_buff *skb, int strict)
{
struct flowi6 fl6 = {
.flowi6_oif = oif,
.daddr = *daddr,
};
struct dst_entry *dst;
int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
if (saddr) {
memcpy(&fl6.saddr, saddr, sizeof(*saddr));
flags |= RT6_LOOKUP_F_HAS_SADDR;
}
dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
if (dst->error == 0)
return (struct rt6_info *) dst;
dst_release(dst);
return NULL;
}
EXPORT_SYMBOL(rt6_lookup);
/* ip6_ins_rt is called with table->tb6_lock NOT held.
 * It takes a new route entry; if the addition fails for any reason the
 * route is released.
 * Caller must hold dst before calling it.
 *
 * This helper takes the table lock of the route's table around fib6_add()
 * and returns fib6_add()'s result.
 */
static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
			struct mx6_config *mxc,
			struct netlink_ext_ack *extack)
{
	struct fib6_table *table = rt->rt6i_table;
	int err;

	spin_lock_bh(&table->tb6_lock);
	err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
	spin_unlock_bh(&table->tb6_lock);

	return err;
}
/*
 * Insert @rt into its table with default netlink info (only the namespace
 * is filled in), no metrics and no extended ack.  Returns the result of
 * __ip6_ins_rt().
 */
int ip6_ins_rt(struct rt6_info *rt)
{
	struct mx6_config mxc = { .mx = NULL, };
	struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };

	/* Hold dst to account for the reference from the fib6 tree */
	dst_hold(&rt->dst);

	return __ip6_ins_rt(rt, &info, &mxc, NULL);
}
/* called with rcu_lock held
 *
 * Pick the device a copy of @rt should use.  Routes that are neither
 * RTF_LOCAL nor RTF_ANYCAST simply keep dst.dev.  For copies of local
 * routes, dst->dev needs to be the device if it is a master device, the
 * master device if the device is enslaved (and the destination does not
 * need strict handling), and the loopback device otherwise.
 */
static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
{
	struct net_device *dev = rt->dst.dev;

	if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)))
		return dev;

	if (netif_is_l3_slave(dev) &&
	    !rt6_need_strict(&rt->rt6i_dst.addr))
		return l3mdev_master_dev_rcu(dev);

	if (!netif_is_l3_master(dev))
		return dev_net(dev)->loopback_dev;

	/* dev is an L3 master device: use it as is */
	return dev;
}
/*
 * Clone @ort into a new RTF_CACHE host route for @daddr.
 *
 * If @ort is itself a cache or per-CPU route, the route it came from is
 * cloned instead.  The clone gets metric 0, DST_HOST and a /128
 * destination.  For routes that are neither gateway nor nonexthop routes,
 * RTF_ANYCAST is added when @daddr equals the (non-/128) prefix address
 * and, with CONFIG_IPV6_SUBTREES, the source key is narrowed to @saddr/128.
 * Returns NULL when allocation fails.
 */
static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
					   const struct in6_addr *daddr,
					   const struct in6_addr *saddr)
{
	struct net_device *dev;
	struct rt6_info *clone;

	if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
		ort = ort->from;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(ort);
	clone = __ip6_dst_alloc(dev_net(dev), dev, 0);
	rcu_read_unlock();
	if (!clone)
		return NULL;

	ip6_rt_copy_init(clone, ort);
	clone->rt6i_flags |= RTF_CACHE;
	clone->rt6i_metric = 0;
	clone->dst.flags |= DST_HOST;
	clone->rt6i_dst.addr = *daddr;
	clone->rt6i_dst.plen = 128;

	if (rt6_is_gw_or_nonexthop(ort))
		return clone;

	if (ort->rt6i_dst.plen != 128 &&
	    ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
		clone->rt6i_flags |= RTF_ANYCAST;
#ifdef CONFIG_IPV6_SUBTREES
	if (clone->rt6i_src.plen && saddr) {
		clone->rt6i_src.addr = *saddr;
		clone->rt6i_src.plen = 128;
	}
#endif

	return clone;
}
/*
 * Allocate a per-CPU copy of @rt: same dst flags and protocol, initialised
 * from @rt via ip6_rt_copy_init() and marked RTF_PCPU.  Returns NULL when
 * allocation fails.
 */
static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
{
	struct rt6_info *copy;
	struct net_device *dev;

	rcu_read_lock();
	dev = ip6_rt_get_dev_rcu(rt);
	copy = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
	rcu_read_unlock();

	if (!copy)
		return NULL;

	ip6_rt_copy_init(copy, rt);
	copy->rt6i_protocol = rt->rt6i_protocol;
	copy->rt6i_flags |= RTF_PCPU;

	return copy;
}
/* It should be called with rcu_read_lock() acquired
 *
 * Return this CPU's copy of @rt with a reference held, or NULL when there
 * is none or the reference could not be taken.
 */
static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt = *this_cpu_ptr(rt->rt6i_pcpu);

	/* ip6_hold_safe() resets pcpu_rt to NULL when the hold fails. */
	if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
		rt6_dst_from_metrics_check(pcpu_rt);

	return pcpu_rt;
}
/*
 * Create this CPU's copy of @rt and publish it in rt->rt6i_pcpu.
 *
 * Returns the new copy with a reference held for the caller.  When the
 * allocation fails, the namespace's null entry is returned instead (also
 * with a reference).  Finding the per-CPU slot already occupied is treated
 * as a fatal bug.
 */
static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
{
	struct rt6_info *pcpu_rt = ip6_rt_pcpu_alloc(rt);
	struct rt6_info *prev, **slot;

	if (!pcpu_rt) {
		struct rt6_info *null_rt =
			dev_net(rt->dst.dev)->ipv6.ip6_null_entry;

		dst_hold(&null_rt->dst);
		return null_rt;
	}

	dst_hold(&pcpu_rt->dst);
	slot = this_cpu_ptr(rt->rt6i_pcpu);
	prev = cmpxchg(slot, NULL, pcpu_rt);
	BUG_ON(prev);

	rt6_dst_from_metrics_check(pcpu_rt);
	return pcpu_rt;
}
/* exception hash table implementation
 *
 * rt6_exception_lock is taken (with the _bh variants) by the insert, flush
 * and remove paths below; lookups on the read side go through RCU instead.
*/
static DEFINE_SPINLOCK(rt6_exception_lock);
/* Remove rt6_ex from hash table and free the memory
* Caller must hold rt6_exception_lock
*/
static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
struct rt6_exception *rt6_ex)
{
struct net *net;
if (!bucket || !rt6_ex)
return;
net = dev_net(rt6_ex->rt6i->dst.dev);
rt6_ex->rt6i->rt6i_node = NULL;
hlist_del_rcu(&rt6_ex->hlist);
rt6_release(rt6_ex->rt6i);
kfree_rcu(rt6_ex, rcu);
WARN_ON_ONCE(!bucket->depth);
bucket->depth--;
net->ipv6.rt6_stats->fib_rt_cache--;
}
/* Remove oldest rt6_ex in bucket and free the memory
 * Caller must hold rt6_exception_lock
 *
 * "Oldest" is the entry with the earliest stamp; an empty chain results in
 * rt6_remove_exception() being called with NULL, which does nothing.
 */
static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
{
	struct rt6_exception *oldest = NULL;
	struct rt6_exception *cur;

	if (!bucket)
		return;

	hlist_for_each_entry(cur, &bucket->chain, hlist) {
		if (oldest && !time_before(cur->stamp, oldest->stamp))
			continue;
		oldest = cur;
	}

	rt6_remove_exception(bucket, oldest);
}
/*
 * Hash a (dst, src) address pair into an exception bucket index of
 * FIB6_EXCEPTION_BUCKET_SIZE_SHIFT bits, using a seed that is randomised
 * once.  @src only contributes with CONFIG_IPV6_SUBTREES and when non-NULL.
 */
static u32 rt6_exception_hash(const struct in6_addr *dst,
			      const struct in6_addr *src)
{
	static u32 seed __read_mostly;
	u32 hash;

	net_get_random_once(&seed, sizeof(seed));

	hash = jhash(dst, sizeof(*dst), seed);
#ifdef CONFIG_IPV6_SUBTREES
	if (src)
		hash = jhash(src, sizeof(*src), hash);
#endif

	return hash_32(hash, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
}
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rt6_exception_lock
 *
 * Returns the matching exception entry or NULL.  *bucket is left untouched
 * when it is NULL on entry or when @daddr is NULL.
 */
static struct rt6_exception *
__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
			      const struct in6_addr *daddr,
			      const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *cand = rt6_ex->rt6i;

		if (!ipv6_addr_equal(daddr, &cand->rt6i_dst.addr))
			continue;
#ifdef CONFIG_IPV6_SUBTREES
		if (saddr && !ipv6_addr_equal(saddr, &cand->rt6i_src.addr))
			continue;
#endif
		return rt6_ex;
	}

	return NULL;
}
/* Helper function to find the cached rt in the hash table
 * and update bucket pointer to point to the bucket for this
 * (daddr, saddr) pair
 * Caller must hold rcu_read_lock()
 *
 * Returns the matching exception entry or NULL.  *bucket is left untouched
 * when it is NULL on entry or when @daddr is NULL.
 */
static struct rt6_exception *
__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
			 const struct in6_addr *daddr,
			 const struct in6_addr *saddr)
{
	struct rt6_exception *rt6_ex;
	u32 hval;

	WARN_ON_ONCE(!rcu_read_lock_held());

	if (!(*bucket) || !daddr)
		return NULL;

	hval = rt6_exception_hash(daddr, saddr);
	*bucket += hval;

	hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
		struct rt6_info *cand = rt6_ex->rt6i;

		if (!ipv6_addr_equal(daddr, &cand->rt6i_dst.addr))
			continue;
#ifdef CONFIG_IPV6_SUBTREES
		if (saddr && !ipv6_addr_equal(saddr, &cand->rt6i_src.addr))
			continue;
#endif
		return rt6_ex;
	}

	return NULL;
}
/*
 * Insert the cached route @nrt into the exception table of the route it was
 * derived from (@ort, or ort->from when @ort is itself a cache/pcpu route).
 *
 * Returns 0 on success, -EINVAL when the parent's exception table has been
 * flushed or when nrt's PMTU is not below the parent's MTU, and -ENOMEM when
 * the bucket array or the exception entry cannot be allocated.
 * On success the parent's fib6 node serial number is bumped and fib6 GC is
 * kicked.
 */
static int rt6_insert_exception(struct rt6_info *nrt,
struct rt6_info *ort)
{
/* net is taken from the route passed in, before ort may be redirected */
struct net *net = dev_net(ort->dst.dev);
struct rt6_exception_bucket *bucket;
struct in6_addr *src_key = NULL;
struct rt6_exception *rt6_ex;
int err = 0;
/* ort can't be a cache or pcpu route */
if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
ort = ort->from;
WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
spin_lock_bh(&rt6_exception_lock);
/* rt6_flush_exceptions() sets this flag; refuse to rebuild the table */
if (ort->exception_bucket_flushed) {
err = -EINVAL;
goto out;
}
bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
lockdep_is_held(&rt6_exception_lock));
/* The bucket array is allocated lazily on first insertion. */
if (!bucket) {
bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
GFP_ATOMIC);
if (!bucket) {
err = -ENOMEM;
goto out;
}
rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
}
#ifdef CONFIG_IPV6_SUBTREES
/* rt6i_src.plen != 0 indicates ort is in subtree
* and exception table is indexed by a hash of
* both rt6i_dst and rt6i_src.
* Otherwise, the exception table is indexed by
* a hash of only rt6i_dst.
*/
if (ort->rt6i_src.plen)
src_key = &nrt->rt6i_src.addr;
#endif
/* Update rt6i_prefsrc as it could be changed
* in rt6_remove_prefsrc()
*/
nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
/* rt6_mtu_change() might lower mtu on ort.
* Only insert this exception route if its mtu
* is less than ort's mtu value.
*/
if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
err = -EINVAL;
goto out;
}
/* The lookup also advances 'bucket' to the slot for this key. */
rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
src_key);
/* An existing entry for the same key is replaced. */
if (rt6_ex)
rt6_remove_exception(bucket, rt6_ex);
rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
if (!rt6_ex) {
err = -ENOMEM;
goto out;
}
rt6_ex->rt6i = nrt;
rt6_ex->stamp = jiffies;
/* reference held by the exception entry; dropped in rt6_remove_exception() */
atomic_inc(&nrt->rt6i_ref);
nrt->rt6i_node = ort->rt6i_node;
hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
bucket->depth++;
net->ipv6.rt6_stats->fib_rt_cache++;
/* Keep each chain bounded by evicting the oldest entry. */
if (bucket->depth > FIB6_MAX_DEPTH)
rt6_exception_remove_oldest(bucket);
out:
spin_unlock_bh(&rt6_exception_lock);
/* Update fn->fn_sernum to invalidate all cached dst */
if (!err) {
spin_lock_bh(&ort->rt6i_table->tb6_lock);
fib6_update_sernum(ort);
spin_unlock_bh(&ort->rt6i_table->tb6_lock);
fib6_force_start_gc(net);
}
return err;
}
/*
 * Remove every exception route hanging off @rt and mark the table as
 * flushed so that rt6_insert_exception() refuses further insertions.  The
 * bucket array itself is not freed here.
 */
void rt6_flush_exceptions(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_exception *rt6_ex;
	struct hlist_node *next;
	int slot;

	spin_lock_bh(&rt6_exception_lock);

	/* Prevent rt6_insert_exception() from recreating the bucket list */
	rt->exception_bucket_flushed = 1;

	bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
				lockdep_is_held(&rt6_exception_lock));
	if (bucket) {
		for (slot = 0; slot < FIB6_EXCEPTION_BUCKET_SIZE; slot++) {
			hlist_for_each_entry_safe(rt6_ex, next,
						  &bucket->chain, hlist)
				rt6_remove_exception(bucket, rt6_ex);
			WARN_ON_ONCE(bucket->depth);
			bucket++;
		}
	}

	spin_unlock_bh(&rt6_exception_lock);
}
/* Find cached rt in the hash table inside passed in rt
 * Caller has to hold rcu_read_lock()
 *
 * Returns the exception route for (@daddr, @saddr), or NULL when there is
 * none or the one found has expired.
 */
static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
					   struct in6_addr *daddr,
					   struct in6_addr *saddr)
{
	struct rt6_exception_bucket *bucket;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *rt6_ex;

	bucket = rcu_dereference(rt->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates rt is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (rt->rt6i_src.plen)
		src_key = saddr;
#endif
	rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
	if (!rt6_ex || rt6_check_expired(rt6_ex->rt6i))
		return NULL;

	return rt6_ex->rt6i;
}
/* Remove the cached route rt from the exception table of the route it
 * was cloned from (rt->from).
 *
 * Returns 0 when the entry was found and removed, -EINVAL when rt has no
 * parent or is not an RTF_CACHE route, and -ENOENT when the parent has no
 * exception table or the table holds no entry for rt.
 */
int rt6_remove_exception_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_info *parent = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *found;
	int ret = -ENOENT;

	if (!parent)
		return -EINVAL;
	if (!(rt->rt6i_flags & RTF_CACHE))
		return -EINVAL;

	/* cheap unlocked test before taking the lock */
	if (!rcu_access_pointer(parent->rt6i_exception_bucket))
		return -ENOENT;

	spin_lock_bh(&rt6_exception_lock);
	bucket = rcu_dereference_protected(parent->rt6i_exception_bucket,
					   lockdep_is_held(&rt6_exception_lock));
#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (parent->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	found = __rt6_find_exception_spinlock(&bucket, &rt->rt6i_dst.addr,
					      src_key);
	if (found) {
		rt6_remove_exception(bucket, found);
		ret = 0;
	}
	spin_unlock_bh(&rt6_exception_lock);

	return ret;
}
/* Locate the exception entry wrapping the cached route rt in the
 * exception table of rt->from and set its stamp to the current jiffies.
 * Does nothing when rt has no parent or is not an RTF_CACHE route.
 */
static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
{
	struct rt6_exception_bucket *bucket;
	struct rt6_info *parent = rt->from;
	struct in6_addr *src_key = NULL;
	struct rt6_exception *hit;

	if (!parent)
		return;
	if (!(rt->rt6i_flags & RTF_CACHE))
		return;

	rcu_read_lock();
	bucket = rcu_dereference(parent->rt6i_exception_bucket);

#ifdef CONFIG_IPV6_SUBTREES
	/* rt6i_src.plen != 0 indicates 'from' is in subtree
	 * and exception table is indexed by a hash of
	 * both rt6i_dst and rt6i_src.
	 * Otherwise, the exception table is indexed by
	 * a hash of only rt6i_dst.
	 */
	if (parent->rt6i_src.plen)
		src_key = &rt->rt6i_src.addr;
#endif
	hit = __rt6_find_exception_rcu(&bucket, &rt->rt6i_dst.addr, src_key);
	if (hit)
		hit->stamp = jiffies;

	rcu_read_unlock();
}
static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
{
struct rt6_exception_bucket *bucket;
struct rt6_exception *rt6_ex;
int i;
bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
lockdep_is_held(&rt6_exception_lock));
if (bucket) {
for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
}
bucket++;
}
}
}
/* Decide whether the PMTU of rt may be replaced by mtu after a device
 * MTU change.
 *
 * A decrease (new MTU not above the current route MTU) is always allowed:
 * the new value becomes the lowest MTU on the path.
 *
 * An increase is allowed only when the route MTU currently equals the
 * local device MTU (idev->cnf.mtu6), i.e. the local link was the
 * bottleneck; should another hop be smaller, PMTU discovery will find it.
 */
static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
					 struct rt6_info *rt, int mtu)
{
	return dst_mtu(&rt->dst) >= mtu ||
	       dst_mtu(&rt->dst) == idev->cnf.mtu6;
}
static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
struct rt6_info *rt, int mtu)
{
struct rt6_exception_bucket *bucket;
struct rt6_exception *rt6_ex;
int i;
bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
lockdep_is_held(&rt6_exception_lock));
if (!bucket)
return;
for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
struct rt6_info *entry = rt6_ex->rt6i;
/* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
* route), the metrics of its rt->dst.from have already
* been updated.
*/
if (entry->rt6i_pmtu &&
rt6_mtu_change_route_allowed(idev, entry, mtu))
entry->rt6i_pmtu = mtu;
}
bucket++;
}
}
#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
struct in6_addr *gateway)
{
struct rt6_exception_bucket *bucket;
struct rt6_exception *rt6_ex;
struct hlist_node *tmp;
int i;
if (!rcu_access_pointer(rt->rt6i_exception_bucket))
return;
spin_lock_bh(&rt6_exception_lock);
bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
lockdep_is_held(&rt6_exception_lock));
if (bucket) {
for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
hlist_for_each_entry_safe(rt6_ex, tmp,
&bucket->chain, hlist) {
struct rt6_info *entry = rt6_ex->rt6i;
if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
RTF_CACHE_GATEWAY &&
ipv6_addr_equal(gateway,
&entry->rt6i_gateway)) {
rt6_remove_exception(bucket, rt6_ex);
}
}
bucket++;
}
}
spin_unlock_bh(&rt6_exception_lock);
}
/* Examine one exception entry during garbage collection: either remove
 * it from bucket or count it in gc_args->more as still alive.
 */
static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
				      struct rt6_exception *rt6_ex,
				      struct fib6_gc_args *gc_args,
				      unsigned long now)
{
	struct rt6_info *rt = rt6_ex->rt6i;

	/* we are pruning and obsoleting aged-out and non gateway exceptions
	 * even if others have still references to them, so that on next
	 * dst_check() such references can be dropped.
	 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
	 * expired, independently from their aging, as per RFC 8201 section 4
	 */
	if (rt->rt6i_flags & RTF_EXPIRES) {
		if (time_after(jiffies, rt->dst.expires)) {
			RT6_TRACE("purging expired route %p\n", rt);
			goto remove;
		}
	} else if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
		RT6_TRACE("aging clone %p\n", rt);
		goto remove;
	}

	if (rt->rt6i_flags & RTF_GATEWAY) {
		struct neighbour *neigh;

		/* a gateway without a neighbour entry flagged NTF_ROUTER
		 * is not kept
		 */
		neigh = __ipv6_neigh_lookup_noref(rt->dst.dev,
						  &rt->rt6i_gateway);
		if (!neigh || !(neigh->flags & NTF_ROUTER)) {
			RT6_TRACE("purging route %p via non-router but gateway\n",
				  rt);
			goto remove;
		}
	}

	gc_args->more++;
	return;

remove:
	rt6_remove_exception(bucket, rt6_ex);
}
void rt6_age_exceptions(struct rt6_info *rt,
struct fib6_gc_args *gc_args,
unsigned long now)
{
struct rt6_exception_bucket *bucket;
struct rt6_exception *rt6_ex;
struct hlist_node *tmp;
int i;
if (!rcu_access_pointer(rt->rt6i_exception_bucket))
return;
rcu_read_lock_bh();
spin_lock(&rt6_exception_lock);
bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
lockdep_is_held(&rt6_exception_lock));
if (bucket) {
for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
hlist_for_each_entry_safe(rt6_ex, tmp,
&bucket->chain, hlist) {
rt6_age_examine_exception(bucket, rt6_ex,
gc_args, now);
}
bucket++;
}
}
spin_unlock(&rt6_exception_lock);
rcu_read_unlock_bh();
}
/* Look up fl6 in one FIB table and return the route to use.
 *
 * A route is selected from the FIB node matching fl6->daddr/fl6->saddr.
 * While the selection yields ip6_null_entry the lookup backtracks through
 * the tree; if RT6_LOOKUP_F_REACHABLE was being enforced it is dropped
 * once and the selection restarts from the originally matched node.
 * An entry found in the selected route's exception table replaces it.
 *
 * The result is one of: ip6_null_entry, a cached (RTF_CACHE) route, a
 * freshly allocated uncached clone (FLOWI_FLAG_KNOWN_NH on a non-gateway
 * route) or a per-cpu copy of the selected route.
 *
 * NOTE(review): each return path takes a reference either directly
 * (dst_hold()) or through a helper (ip6_hold_safe(), ip6_rt_cache_alloc(),
 * rt6_get_pcpu_route()/rt6_make_pcpu_route()); callers presumably own a
 * reference on the returned route - confirm against the helpers.
 */
struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
int oif, struct flowi6 *fl6,
const struct sk_buff *skb, int flags)
{
struct fib6_node *fn, *saved_fn;
struct rt6_info *rt, *rt_cache;
int strict = 0;
/* Only these two lookup flags are carried over from the caller */
strict |= flags & RT6_LOOKUP_F_IFACE;
strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
/* With forwarding off in devconf_all, first insist on reachable routes */
if (net->ipv6.devconf_all->forwarding == 0)
strict |= RT6_LOOKUP_F_REACHABLE;
rcu_read_lock();
fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
/* Remembered so the relaxed retry below can restart from here */
saved_fn = fn;
if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
oif = 0;
redo_rt6_select:
rt = rt6_select(net, fn, oif, strict);
/* Route has siblings: let the multipath code pick one of them */
if (rt->rt6i_nsiblings)
rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict);
if (rt == net->ipv6.ip6_null_entry) {
fn = fib6_backtrack(fn, &fl6->saddr);
if (fn)
goto redo_rt6_select;
else if (strict & RT6_LOOKUP_F_REACHABLE) {
/* also consider unreachable route */
strict &= ~RT6_LOOKUP_F_REACHABLE;
fn = saved_fn;
goto redo_rt6_select;
}
}
/*Search through exception table */
rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
if (rt_cache)
rt = rt_cache;
if (rt == net->ipv6.ip6_null_entry) {
/* Nothing usable found: hand back the null entry with a reference */
rcu_read_unlock();
dst_hold(&rt->dst);
trace_fib6_table_lookup(net, rt, table, fl6);
return rt;
} else if (rt->rt6i_flags & RTF_CACHE) {
/* Cached route: usage and metrics are refreshed only if the hold
 * succeeded. NOTE(review): ip6_hold_safe() is passed &rt and may
 * presumably substitute another route on failure - confirm.
 */
if (ip6_hold_safe(net, &rt, true)) {
dst_use_noref(&rt->dst, jiffies);
rt6_dst_from_metrics_check(rt);
}
rcu_read_unlock();
trace_fib6_table_lookup(net, rt, table, fl6);
return rt;
} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
!(rt->rt6i_flags & RTF_GATEWAY))) {
/* Create a RTF_CACHE clone which will not be
* owned by the fib6 tree. It is for the special case where
* the daddr in the skb during the neighbor look-up is different
* from the fl6->daddr used to look-up route here.
*/
struct rt6_info *uncached_rt;
if (ip6_hold_safe(net, &rt, true)) {
dst_use_noref(&rt->dst, jiffies);
} else {
/* Hold failed: return whatever ip6_hold_safe() left in rt */
rcu_read_unlock();
uncached_rt = rt;
goto uncached_rt_out;
}
rcu_read_unlock();
uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
/* Drop the reference taken on rt above; only the clone is returned */
dst_release(&rt->dst);
if (uncached_rt) {
/* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
* No need for another dst_hold()
*/
rt6_uncached_list_add(uncached_rt);
atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
} else {
/* Clone allocation failed: fall back to the null entry */
uncached_rt = net->ipv6.ip6_null_entry;
dst_hold(&uncached_rt->dst);
}
uncached_rt_out:
trace_fib6_table_lookup(net, uncached_rt, table, fl6);
return uncached_rt;
} else {
/* Get a percpu copy */
struct rt6_info *pcpu_rt;
dst_use_noref(&rt->dst, jiffies);
/* BHs stay disabled while the per-cpu copy is fetched or created */
local_bh_disable();
pcpu_rt = rt6_get_pcpu_route(rt);
if (!pcpu_rt) {
/* atomic_inc_not_zero() is needed when using rcu */
if (atomic_inc_not_zero(&rt->rt6i_ref)) {
/* No dst_hold() on rt is needed because grabbing
* rt->rt6i_ref makes sure rt can't be released.
*/
pcpu_rt = rt6_make_pcpu_route(rt);
rt6_release(rt);
} else {
/* rt is already removed from tree */
pcpu_rt = net->ipv6.ip6_null_entry;
dst_hold(&pcpu_rt->dst);
}
}
local_bh_enable();
rcu_read_unlock();
trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
return pcpu_rt;
}
}
EXPORT_SYMBOL_GPL(ip6_pol_route);
/* Table lookup callback for the input path: runs ip6_pol_route() with
 * the flow's incoming interface index as the interface argument.
 */
static struct rt6_info *ip6_pol_route_input(struct net *net,
					    struct fib6_table *table,
					    struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    int flags)
{
	int iif = fl6->flowi6_iif;

	return ip6_pol_route(net, table, iif, fl6, skb, flags);
}
/* Route an incoming flow through the FIB rules using the input lookup
 * callback. RT6_LOOKUP_F_IFACE is added when the destination requires a
 * strict lookup, unless the receiving device is of type ARPHRD_PIMREG.
 */
struct dst_entry *ip6_route_input_lookup(struct net *net,
					 struct net_device *dev,
					 struct flowi6 *fl6,
					 const struct sk_buff *skb,
					 int flags)
{
	if (rt6_need_strict(&fl6->daddr)) {
		if (dev->type != ARPHRD_PIMREG)
			flags |= RT6_LOOKUP_F_IFACE;
	}

	return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
}
EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
static void ip6_multipath_l3_keys(const struct sk_buff *skb,
struct flow_keys *keys,
struct flow_keys *flkeys)
{
const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
const struct ipv6hdr *key_iph = outer_iph;
struct flow_keys *_flkeys = flkeys;
const struct ipv6hdr *inner_iph;
const struct icmp6hdr *icmph;
struct ipv6hdr _inner_iph;
if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
goto out;
icmph = icmp6_hdr(skb);
if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
icmph->icmp6_type != ICMPV6_PARAMPROB)
goto out;
inner_iph = skb_header_pointer(skb,
skb_transport_offset(skb) + sizeof(*icmph),
sizeof(_inner_iph), &_inner_iph);
if (!inner_iph)
goto out;
key_iph = inner_iph;
_flkeys = NULL;
out:
if (_flkeys) {
keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
keys->tags.flow_label = _flkeys->tags.flow_label;
keys->basic.ip_proto = _flkeys->basic.ip_proto;
} else {
keys->addrs.v6addrs.src = key_iph->saddr;
keys->addrs.v6addrs.dst = key_iph->daddr;
keys->tags.flow_label = ip6_flowinfo(key_iph);
keys->basic.ip_proto = key_iph->nexthdr;
}
}
/* if skb is set it will be used and fl6 can be NULL */
u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
const struct sk_buff *skb, struct flow_keys *flkeys)
{
struct flow_keys hash_keys;
u32 mhash;
switch (ip6_multipath_hash_policy(net)) {
case 0:
memset(&hash_keys, 0, sizeof(hash_keys));
hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
if (skb) {
ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
} else {
hash_keys.addrs.v6addrs.src = fl6->saddr;
hash_keys.addrs.v6addrs.dst = fl6->daddr;
hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
hash_keys.basic.ip_proto = fl6->flowi6_proto;
}
break;
case 1:
if (skb) {
unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
struct flow_keys keys;
/* short-circuit if we already have L4 hash present */
if (skb->l4_hash)
return skb_get_hash_raw(skb) >> 1;
memset(&hash_keys, 0, sizeof(hash_keys));
if (!flkeys) {
skb_flow_dissect_flow_keys(skb, &keys, flag);
flkeys = &keys;
}
hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
hash_keys.ports.src = flkeys->ports.src;
hash_keys.ports.dst = flkeys->ports.dst;
hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
} else {
memset(&hash_keys, 0, sizeof(hash_keys));
hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
hash_keys.addrs.v6addrs.src = fl6->saddr;
hash_keys.addrs.v6addrs.dst = fl6->daddr;
hash_keys.ports.src = fl6->fl6_sport;
hash_keys.ports.dst = fl6->fl6_dport;
hash_keys.basic.ip_proto = fl6->flowi6_proto;
}
break;
}
mhash = flow_hash_from_keys(&hash_keys);
return mhash >> 1;
}
void ip6_route_input(struct sk_buff *skb)
{
const struct ipv6hdr *iph = ipv6_hdr(skb);
struct net *net = dev_net(skb->dev);
int flags = RT6_LOOKUP_F_HAS_SADDR;
struct ip_tunnel_info *tun_info;
struct flowi6 fl6 = {
.flowi6_iif = skb->dev->ifindex,
.daddr = iph->daddr,
.saddr = iph->saddr,
.flowlabel = ip6_flowinfo(iph),
.flowi6_mark = skb->mark,
.flowi6_proto = iph->nexthdr,
};
struct flow_keys *flkeys = NULL, _flkeys;
tun_info = skb_tunnel_info(skb);
if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
flkeys = &_flkeys;
if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
skb_dst_drop(skb);
skb_dst_set(skb,
ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
}
/* Table lookup callback for the output path: runs ip6_pol_route() with
 * the flow's outgoing interface index as the interface argument.
 */
static struct rt6_info *ip6_pol_route_output(struct net *net,
					     struct fib6_table *table,
					     struct flowi6 *fl6,
					     const struct sk_buff *skb,
					     int flags)
{
	int oif = fl6->flowi6_oif;

	return ip6_pol_route(net, table, oif, fl6, skb, flags);
}
/* Output route lookup for fl6 on behalf of sk (which may be NULL).
 *
 * Destinations needing a strict lookup are first offered to the l3mdev
 * link-scope lookup. Otherwise the flow's input interface is set to
 * loopback, the lookup flags are derived from the socket and the flow,
 * and the FIB rules are consulted with the output lookup callback.
 */
struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
					 struct flowi6 *fl6, int flags)
{
	bool no_src;

	if (rt6_need_strict(&fl6->daddr)) {
		struct dst_entry *link_dst;

		link_dst = l3mdev_link_scope_lookup(net, fl6);
		if (link_dst)
			return link_dst;
	}

	fl6->flowi6_iif = LOOPBACK_IFINDEX;
	no_src = ipv6_addr_any(&fl6->saddr);

	/* any one of these conditions ties the lookup to the interface */
	if (sk && sk->sk_bound_dev_if)
		flags |= RT6_LOOKUP_F_IFACE;
	else if (rt6_need_strict(&fl6->daddr))
		flags |= RT6_LOOKUP_F_IFACE;
	else if (fl6->flowi6_oif && no_src)
		flags |= RT6_LOOKUP_F_IFACE;

	if (!no_src)
		flags |= RT6_LOOKUP_F_HAS_SADDR;
	else if (sk)
		flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);

	return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
}
EXPORT_SYMBOL_GPL(ip6_route_output_flags);
/* Build a discard-everything route on the loopback device that copies
 * the metrics, gateway, flags (minus RTF_PCPU) and keys of dst_orig.
 *
 * The reference on dst_orig is released in all cases. Returns the new
 * dst, or ERR_PTR(-ENOMEM) when allocation fails.
 */
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rt6_info *ort = (struct rt6_info *) dst_orig;
	struct net_device *loopback_dev = net->loopback_dev;
	struct dst_entry *blackhole;
	struct rt6_info *rt;

	rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
		       DST_OBSOLETE_DEAD, 0);
	if (!rt) {
		dst_release(dst_orig);
		return ERR_PTR(-ENOMEM);
	}

	rt6_info_init(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);

	blackhole = &rt->dst;
	blackhole->__use = 1;
	/* packets using this dst are discarded in both directions */
	blackhole->input = dst_discard;
	blackhole->output = dst_discard_out;

	dst_copy_metrics(blackhole, &ort->dst);

	rt->rt6i_idev = in6_dev_get(loopback_dev);
	rt->rt6i_gateway = ort->rt6i_gateway;
	rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
	rt->rt6i_metric = 0;

	memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
#ifdef CONFIG_IPV6_SUBTREES
	memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
#endif

	dst_release(dst_orig);
	return blackhole;
}
/*
* Destination cache support functions
*/
/* If rt was derived from another route (rt->from) and does not already
 * use that route's metrics array, re-point rt's metrics at it (the third
 * dst_init_metrics() argument, true, is assumed to mean read-only).
 */
static void rt6_dst_from_metrics_check(struct rt6_info *rt)
{
	if (!rt->from)
		return;

	if (dst_metrics_ptr(&rt->dst) == dst_metrics_ptr(&rt->from->dst))
		return;

	dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
}
/* Validate rt against cookie. Returns &rt->dst when the route's cookie
 * can be read, matches cookie and the route has not expired; NULL
 * otherwise.
 */
static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
{
	u32 rt_cookie = 0;

	if (!rt6_get_cookie_safe(rt, &rt_cookie))
		return NULL;

	if (rt_cookie != cookie)
		return NULL;

	return rt6_check_expired(rt) ? NULL : &rt->dst;
}
/* Validate a route derived from rt->from. Returns &rt->dst when rt itself
 * has not expired, is still marked DST_OBSOLETE_FORCE_CHK and its parent
 * passes rt6_check() for cookie; NULL otherwise.
 */
static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
{
	if (__rt6_check_expired(rt))
		return NULL;

	if (rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK)
		return NULL;

	if (!rt6_check(rt->from, cookie))
		return NULL;

	return &rt->dst;
}
/* Revalidate dst against cookie. Per-cpu routes, and uncached routes
 * that have a parent, are validated through their parent route; all
 * others are validated directly. Returns dst when still valid, NULL
 * otherwise.
 */
static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	/* All IPV6 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 */

	rt6_dst_from_metrics_check(rt);

	if (rt->rt6i_flags & RTF_PCPU)
		return rt6_dst_from_check(rt, cookie);

	if (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from)
		return rt6_dst_from_check(rt, cookie);

	return rt6_check(rt, cookie);
}
/* Negative-advice handler for dst.
 *
 * A cached (RTF_CACHE) route is deleted only once it has expired; any
 * other route simply has the reference released. Returns dst when it is
 * kept (or was NULL to begin with), NULL when it was deleted or released.
 */
static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
{
	struct rt6_info *rt = (struct rt6_info *) dst;

	if (!rt)
		return dst;

	if (!(rt->rt6i_flags & RTF_CACHE)) {
		dst_release(dst);
		return NULL;
	}

	if (rt6_check_expired(rt)) {
		ip6_del_rt(rt);
		return NULL;
	}

	return dst;
}
/* Link-failure handler: report address-unreachable to the sender of skb
 * and invalidate the route the packet was using.
 *
 * A cached route is deleted (if a reference can still be taken). For any
 * other route flagged RTF_DEFAULT that is still attached to a FIB node,
 * the node's serial number is set to -1.
 */
static void ip6_link_failure(struct sk_buff *skb)
{
	struct fib6_node *fn;
	struct rt6_info *rt;

	icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);

	rt = (struct rt6_info *) skb_dst(skb);
	if (!rt)
		return;

	if (rt->rt6i_flags & RTF_CACHE) {
		if (dst_hold_safe(&rt->dst))
			ip6_del_rt(rt);
		return;
	}

	rcu_read_lock();
	fn = rcu_dereference(rt->rt6i_node);
	if (fn && (rt->rt6i_flags & RTF_DEFAULT))
		fn->fn_sernum = -1;
	rcu_read_unlock();
}
/* Record a new path MTU on rt: flag the route as modified, store mtu and
 * (re)arm its expiry using the netns ip6_rt_mtu_expires setting.
 */
static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
{
	struct net *net = dev_net(rt->dst.dev);
	int timeout = net->ipv6.sysctl.ip6_rt_mtu_expires;

	rt->rt6i_flags |= RTF_MODIFIED;
	rt->rt6i_pmtu = mtu;
	rt6_update_expires(rt, timeout);
}
static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
{
return !(rt->rt6i_flags & RTF_CACHE) &&
(rt->rt6i_flags & RTF_PCPU ||
rcu_access_pointer(rt->rt6i_node));
}
/* Apply a path MTU update to dst.
 *
 * Addresses are taken from iph when given, else from sk, else left NULL.
 * Local routes and routes with a locked MTU metric are left alone, and
 * only decreases (after clamping to IPV6_MIN_MTU) are applied.
 *
 * The new MTU is written to the route itself when a cached clone is not
 * allowed for it; otherwise, if a destination address is known, a cached
 * clone carrying the new MTU is inserted into the exception table.
 */
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
				 const struct ipv6hdr *iph, u32 mtu)
{
	const struct in6_addr *daddr = NULL, *saddr = NULL;
	struct rt6_info *rt6 = (struct rt6_info *)dst;
	struct rt6_info *clone;

	if (rt6->rt6i_flags & RTF_LOCAL)
		return;

	if (dst_metric_locked(dst, RTAX_MTU))
		return;

	if (iph) {
		daddr = &iph->daddr;
		saddr = &iph->saddr;
	} else if (sk) {
		daddr = &sk->sk_v6_daddr;
		saddr = &inet6_sk(sk)->saddr;
	}

	dst_confirm_neigh(dst, daddr);

	mtu = max_t(u32, mtu, IPV6_MIN_MTU);
	if (mtu >= dst_mtu(dst))
		return;

	if (!rt6_cache_allowed_for_pmtu(rt6)) {
		rt6_do_update_pmtu(rt6, mtu);
		/* update rt6_ex->stamp for cache */
		if (rt6->rt6i_flags & RTF_CACHE)
			rt6_update_exception_stamp_rt(rt6);
		return;
	}

	/* a clone cannot be keyed without a destination address */
	if (!daddr)
		return;

	clone = ip6_rt_cache_alloc(rt6, daddr, saddr);
	if (!clone)
		return;

	rt6_do_update_pmtu(clone, mtu);
	/* a failed insert leaves the clone unused: release it right away */
	if (rt6_insert_exception(clone, rt6))
		dst_release_immediate(&clone->dst);
}
/* PMTU update entry point taking an optional skb: the IPv6 header of skb
 * is used when skb is non-NULL, otherwise no header is passed on.
 */
static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			       struct sk_buff *skb, u32 mtu)
{
	const struct ipv6hdr *iph = NULL;

	if (skb)
		iph = ipv6_hdr(skb);

	__ip6_rt_update_pmtu(dst, sk, iph, mtu);
}
/* Update the path MTU for the flow described by the IPv6 header found at
 * skb->data. mtu is in network byte order. When mark is 0 the mark is
 * derived from skb->mark through IP6_REPLY_MARK().
 */
void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
		     int oif, u32 mark, kuid_t uid)
{
	const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_oif = oif;
	if (mark)
		fl6.flowi6_mark = mark;
	else
		fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark);
	fl6.daddr = iph->daddr;
	fl6.saddr = iph->saddr;
	fl6.flowlabel = ip6_flowinfo(iph);
	fl6.flowi6_uid = uid;

	dst = ip6_route_output(net, NULL, &fl6);
	/* only routes without an error get the update */
	if (!dst->error)
		__ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_update_pmtu);
/* Socket flavour of the PMTU update: apply the update using the socket's
 * bound device, mark and uid, then refresh the socket's stored dst if it
 * no longer passes its validity check.
 */
void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
{
	struct dst_entry *dst;

	ip6_update_pmtu(skb, sock_net(sk), mtu,
			sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);

	dst = __sk_dst_get(sk);
	if (!dst)
		return;
	if (!dst->obsolete)
		return;
	/* stored dst is still valid: nothing to refresh */
	if (dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
		return;

	bh_lock_sock(sk);
	/* skipped when the socket is owned by user context or the peer is
	 * a v4-mapped address
	 */
	if (!(sock_owned_by_user(sk) || ipv6_addr_v4mapped(&sk->sk_v6_daddr)))
		ip6_datagram_dst_update(sk, false);
	bh_unlock_sock(sk);
}
EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
/* Store dst on sk for the given flow. The socket's own destination
 * address is passed along only when it equals the flow's destination;
 * likewise for the source address, and only with CONFIG_IPV6_SUBTREES.
 * NULL is passed in the other cases.
 */
void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
			   const struct flowi6 *fl6)
{
	struct in6_addr *daddr = NULL;
	struct in6_addr *saddr = NULL;
#ifdef CONFIG_IPV6_SUBTREES
	struct ipv6_pinfo *np = inet6_sk(sk);
#endif

	if (ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr))
		daddr = &sk->sk_v6_daddr;

#ifdef CONFIG_IPV6_SUBTREES
	if (ipv6_addr_equal(&fl6->saddr, &np->saddr))
		saddr = &np->saddr;
#endif

	ip6_dst_store(sk, dst, daddr, saddr);
}
/* Handle redirects */
/* Flow key used for redirect lookups: a regular flowi6 followed by the
 * gateway address the redirect is checked against.
 * fl6 must remain the first member: __ip6_route_redirect() receives a
 * struct flowi6 pointer and casts it back to struct ip6rd_flowi.
 */
struct ip6rd_flowi {
struct flowi6 fl6;
struct in6_addr gateway;
};
/* Table lookup callback for redirects: find the route towards fl6->daddr
 * whose gateway equals the gateway stored behind fl6 (fl6 is really the
 * first member of a struct ip6rd_flowi).
 *
 * Routes of the matching FIB node are scanned; when none qualifies the
 * search backtracks through the tree. ip6_null_entry is used when
 * nothing matches or when a route carrying dst.error is met.
 * NOTE(review): the result is passed through ip6_hold_safe() before
 * being returned, so the caller presumably owns a reference - confirm.
 */
static struct rt6_info *__ip6_route_redirect(struct net *net,
struct fib6_table *table,
struct flowi6 *fl6,
const struct sk_buff *skb,
int flags)
{
struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
struct rt6_info *rt, *rt_cache;
struct fib6_node *fn;
/* Get the "current" route for this destination and
* check if the redirect has come from appropriate router.
*
* RFC 4861 specifies that redirects should only be
* accepted if they come from the nexthop to the target.
* Due to the way the routes are chosen, this notion
* is a bit fuzzy and one might need to check all possible
* routes.
*/
rcu_read_lock();
fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
restart:
/* NOTE(review): the iterator macro only names fn; it presumably walks
 * the node's routes through the local variable rt - confirm.
 */
for_each_fib6_node_rt_rcu(fn) {
/* Skip dead nexthops and expired routes */
if (rt->rt6i_nh_flags & RTNH_F_DEAD)
continue;
if (rt6_check_expired(rt))
continue;
/* A route carrying an error ends the scan; handled after the loop */
if (rt->dst.error)
break;
/* Only gateway routes on the flow's outgoing interface qualify */
if (!(rt->rt6i_flags & RTF_GATEWAY))
continue;
if (fl6->flowi6_oif != rt->dst.dev->ifindex)
continue;
/* rt_cache's gateway might be different from its 'parent'
* in the case of an ip redirect.
* So we keep searching in the exception table if the gateway
* is different.
*/
if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
rt_cache = rt6_find_cached_rt(rt,
&fl6->daddr,
&fl6->saddr);
if (rt_cache &&
ipv6_addr_equal(&rdfl->gateway,
&rt_cache->rt6i_gateway)) {
rt = rt_cache;
break;
}
continue;
}
break;
}
/* rt is NULL here when the scan ran off the end without a match */
if (!rt)
rt = net->ipv6.ip6_null_entry;
else if (rt->dst.error) {
/* Error route: give up without backtracking */
rt = net->ipv6.ip6_null_entry;
goto out;
}
if (rt == net->ipv6.ip6_null_entry) {
fn = fib6_backtrack(fn, &fl6->saddr);
if (fn)
goto restart;
}
out:
ip6_hold_safe(net, &rt, true);
rcu_read_unlock();
trace_fib6_table_lookup(net, rt, table, fl6);
return rt;
};
/* Run the redirect lookup for fl6 through the FIB rules. The flow is
 * copied into a struct ip6rd_flowi together with gateway so that the
 * lookup callback can compare route gateways against it.
 */
static struct dst_entry *ip6_route_redirect(struct net *net,
					    const struct flowi6 *fl6,
					    const struct sk_buff *skb,
					    const struct in6_addr *gateway)
{
	struct ip6rd_flowi rdfl;

	rdfl.fl6 = *fl6;
	rdfl.gateway = *gateway;

	return fib6_rule_lookup(net, &rdfl.fl6, skb,
				RT6_LOOKUP_F_HAS_SADDR, __ip6_route_redirect);
}
/* Process a redirect for the flow described by the IPv6 header found at
 * skb->data. The gateway to validate against is the source address of
 * the header returned by ipv6_hdr(skb).
 */
void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
		  kuid_t uid)
{
	const struct ipv6hdr *flow_hdr = (struct ipv6hdr *) skb->data;
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.flowi6_uid = uid;
	fl6.daddr = flow_hdr->daddr;
	fl6.saddr = flow_hdr->saddr;
	fl6.flowlabel = ip6_flowinfo(flow_hdr);

	dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
EXPORT_SYMBOL_GPL(ip6_redirect);
/* Process a redirect using only the redirect message itself: the flow
 * destination is the message's dest field, the flow source is the
 * destination address of the IPv6 header of skb, and the gateway to
 * validate against is that header's source address.
 */
void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
			    u32 mark)
{
	const struct rd_msg *rd = (struct rd_msg *)icmp6_hdr(skb);
	const struct ipv6hdr *hdr = ipv6_hdr(skb);
	struct dst_entry *dst;
	struct flowi6 fl6;

	memset(&fl6, 0, sizeof(fl6));
	fl6.flowi6_iif = LOOPBACK_IFINDEX;
	fl6.flowi6_oif = oif;
	fl6.flowi6_mark = mark;
	fl6.flowi6_uid = sock_net_uid(net, NULL);
	fl6.daddr = rd->dest;
	fl6.saddr = hdr->daddr;

	dst = ip6_route_redirect(net, &fl6, skb, &hdr->saddr);
	rt6_do_redirect(dst, NULL, skb);
	dst_release(dst);
}
/* Socket flavour of ip6_redirect(): uses the socket's netns, bound
 * device, mark and uid.
 */
void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	struct net *net = sock_net(sk);

	ip6_redirect(skb, net, sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
}
EXPORT_SYMBOL_GPL(ip6_sk_redirect);
/* Default advertised MSS for dst: the route MTU minus the IPv6 and TCP
 * header sizes, raised to the netns ip6_rt_min_advmss floor.
 */
static unsigned int ip6_default_advmss(const struct dst_entry *dst)
{
	struct net *net = dev_net(dst->dev);
	unsigned int advmss = dst_mtu(dst);

	advmss -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);

	if (advmss < net->ipv6.sysctl.ip6_rt_min_advmss)
		advmss = net->ipv6.sysctl.ip6_rt_min_advmss;

	/*
	 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
	 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
	 * IPV6_MAXPLEN is also valid and means: "any MSS,
	 * rely only on pmtu discovery"
	 */
	return advmss > IPV6_MAXPLEN - sizeof(struct tcphdr) ?
	       IPV6_MAXPLEN : advmss;
}
/* MTU of dst. Sources are tried in order: the route's recorded PMTU,
 * the raw RTAX_MTU metric, then the device's mtu6 setting (IPV6_MIN_MTU
 * if the device has no inet6_dev). The value is capped at IP6_MAX_MTU and
 * reduced by the lightweight-tunnel headroom.
 */
static unsigned int ip6_mtu(const struct dst_entry *dst)
{
	const struct rt6_info *rt = (const struct rt6_info *)dst;
	unsigned int mtu = rt->rt6i_pmtu;

	if (!mtu)
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (!mtu) {
		struct inet6_dev *idev;

		mtu = IPV6_MIN_MTU;

		rcu_read_lock();
		idev = __in6_dev_get(dst->dev);
		if (idev)
			mtu = idev->cnf.mtu6;
		rcu_read_unlock();
	}

	mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
/* Allocate a host route on dev towards fl6->daddr (used as both the
 * destination and the gateway), put it on the uncached list and pass it
 * through xfrm_lookup().
 *
 * Returns the dst produced by xfrm_lookup(), ERR_PTR(-ENODEV) when dev
 * has no inet6_dev, or ERR_PTR(-ENOMEM) when the route cannot be
 * allocated.
 */
struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
				  struct flowi6 *fl6)
{
	struct inet6_dev *idev = in6_dev_get(dev);
	struct net *net = dev_net(dev);
	struct rt6_info *rt;

	if (unlikely(!idev))
		return ERR_PTR(-ENODEV);

	rt = ip6_dst_alloc(net, dev, 0);
	if (unlikely(!rt)) {
		/* give back the inet6_dev reference taken above */
		in6_dev_put(idev);
		return ERR_PTR(-ENOMEM);
	}

	rt->dst.flags |= DST_HOST;
	rt->dst.input = ip6_input;
	rt->dst.output = ip6_output;
	rt->rt6i_gateway = fl6->daddr;
	rt->rt6i_dst.addr = fl6->daddr;
	rt->rt6i_dst.plen = 128;
	/* the route keeps the inet6_dev reference from in6_dev_get() */
	rt->rt6i_idev = idev;
	dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);

	/* Add this dst into uncached_list so that rt6_disable_ip() can
	 * do proper release of the net_device
	 */
	rt6_uncached_list_add(rt);
	atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);

	return xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
}
static int ip6_dst_gc(struct dst_ops *ops)
{
struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
int entries;
entries = dst_entries_get_fast(ops);
if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
entries <= rt_max_size)
goto out;
net->ipv6.ip6_rt_gc_expire++;
fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
entries = dst_entries_get_slow(ops);
if (entries < ops->gc_thresh)
net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
out:
net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
return entries > rt_max_size;
}
/* Convert the netlink metrics attributes in cfg->fc_mx into a newly
 * allocated array of RTAX_MAX u32 values stored in mxc->mx, and record
 * which metrics were supplied in mxc->mx_valid.
 *
 * Returns 0 on success (mxc is left untouched when cfg carries no
 * metrics), -ENOMEM when the array cannot be allocated, and -EINVAL for
 * a metric type above RTAX_MAX, a numeric metric whose payload is not
 * exactly a u32, a congestion-control name that
 * tcp_ca_get_key_by_name() does not resolve, or feature bits outside
 * RTAX_FEATURE_MASK. On error the array is freed and mxc->mx is not set
 * (bits already set in mxc->mx_valid are left as they are).
 */
static int ip6_convert_metrics(struct mx6_config *mxc,
			       const struct fib6_config *cfg)
{
	struct net *net = cfg->fc_nlinfo.nl_net;
	bool ecn_ca = false;
	struct nlattr *nla;
	int remaining;
	u32 *mp;

	if (!cfg->fc_mx)
		return 0;

	/* kcalloc() zeroes the array and checks the size multiplication */
	mp = kcalloc(RTAX_MAX, sizeof(u32), GFP_KERNEL);
	if (unlikely(!mp))
		return -ENOMEM;

	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
		int type = nla_type(nla);
		u32 val;

		if (!type)
			continue;
		if (unlikely(type > RTAX_MAX))
			goto err;

		if (type == RTAX_CC_ALGO) {
			char tmp[TCP_CA_NAME_MAX];

			/* the algorithm is given by name and stored as a key */
			nla_strlcpy(tmp, nla, sizeof(tmp));
			val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
			if (val == TCP_CA_UNSPEC)
				goto err;
		} else {
			/* nla_get_u32() reads four bytes unconditionally;
			 * reject attributes whose payload is not a u32 so
			 * that no bytes beyond the payload are consumed.
			 */
			if (nla_len(nla) != sizeof(u32))
				goto err;
			val = nla_get_u32(nla);
		}

		/* hop limit is silently clamped rather than rejected */
		if (type == RTAX_HOPLIMIT && val > 255)
			val = 255;
		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
			goto err;

		/* metric types are 1-based, the array is 0-based */
		mp[type - 1] = val;
		__set_bit(type - 1, mxc->mx_valid);
	}

	/* the chosen congestion control asked for ECN: reflect it in the
	 * features metric
	 */
	if (ecn_ca) {
		__set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
		mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
	}

	mxc->mx = mp;
	return 0;

err:
	kfree(mp);
	return -EINVAL;
}
/* Look up the gateway address gw_addr in FIB table tbid, on the interface
 * and with the preferred source taken from cfg.
 *
 * Returns the route found, or NULL when the table does not exist or the
 * lookup ended on ip6_null_entry (whose reference is dropped here), so
 * that the caller can fall back to another lookup.
 */
static struct rt6_info *ip6_nh_lookup_table(struct net *net,
					    struct fib6_config *cfg,
					    const struct in6_addr *gw_addr,
					    u32 tbid, int flags)
{
	struct flowi6 fl6 = {
		.flowi6_oif = cfg->fc_ifindex,
		.daddr = *gw_addr,
		.saddr = cfg->fc_prefsrc,
	};
	struct fib6_table *table = fib6_get_table(net, tbid);
	struct rt6_info *rt;

	if (!table)
		return NULL;

	flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
	if (!ipv6_addr_any(&cfg->fc_prefsrc))
		flags |= RT6_LOOKUP_F_HAS_SADDR;

	rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
	if (rt != net->ipv6.ip6_null_entry)
		return rt;

	/* if table lookup failed, fall back to full lookup */
	ip6_rt_put(rt);
	return NULL;
}
/* Validate the gateway of an onlink nexthop against the device's l3mdev
 * table (or the main table when the device has none).
 *
 * Returns -EINVAL, with an extack message, when the gateway resolves to
 * a non-error route that is local, anycast or reject, or that uses a
 * device other than dev. Returns 0 otherwise, including when the lookup
 * finds nothing.
 */
static int ip6_route_check_nh_onlink(struct net *net,
				     struct fib6_config *cfg,
				     const struct net_device *dev,
				     struct netlink_ext_ack *extack)
{
	u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	u32 bad_flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
	struct rt6_info *grt;
	int err = 0;

	grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
	if (!grt)
		return 0;

	if (!grt->dst.error &&
	    (grt->rt6i_flags & bad_flags || dev != grt->dst.dev)) {
		NL_SET_ERR_MSG(extack,
			       "Nexthop has invalid gateway or device mismatch");
		err = -EINVAL;
	}

	ip6_rt_put(grt);
	return err;
}
/* Resolve the gateway in cfg to a route and check that it is usable as
 * a nexthop.
 *
 * The configured table is tried first (its result is discarded when it
 * is itself a gateway route or uses a device other than the requested
 * one), then a general rt6_lookup().
 *
 * When no device was supplied, *_dev and *idev are set from the resolved
 * route and a reference is taken on each; this happens even when the
 * function goes on to return an error because the route is a gateway
 * route.
 *
 * Returns 0 when the gateway resolves to a non-gateway route on the
 * expected device, -EHOSTUNREACH otherwise.
 */
static int ip6_route_check_nh(struct net *net,
			      struct fib6_config *cfg,
			      struct net_device **_dev,
			      struct inet6_dev **idev)
{
	const struct in6_addr *gw_addr = &cfg->fc_gateway;
	struct net_device *dev = _dev ? *_dev : NULL;
	struct rt6_info *grt = NULL;
	int err;

	if (cfg->fc_table) {
		grt = ip6_nh_lookup_table(net, cfg, gw_addr,
					  cfg->fc_table, RT6_LOOKUP_F_IFACE);
		if (grt && (grt->rt6i_flags & RTF_GATEWAY ||
			    (dev && dev != grt->dst.dev))) {
			ip6_rt_put(grt);
			grt = NULL;
		}
	}

	if (!grt)
		grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);

	if (!grt)
		return -EHOSTUNREACH;

	if (!dev) {
		/* hand the route's device and inet6_dev to the caller */
		*_dev = dev = grt->dst.dev;
		*idev = grt->rt6i_idev;
		dev_hold(dev);
		in6_dev_hold(grt->rt6i_idev);
	} else if (dev != grt->dst.dev) {
		ip6_rt_put(grt);
		return -EHOSTUNREACH;
	}

	/* a gateway reachable only through another gateway is refused */
	err = (grt->rt6i_flags & RTF_GATEWAY) ? -EHOSTUNREACH : 0;

	ip6_rt_put(grt);
	return err;
}
static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
struct net_device **_dev, struct inet6_dev **idev,
struct netlink_ext_ack *extack)
{
const struct in6_addr *gw_addr = &cfg->fc_gateway;
int gwa_type = ipv6_addr_type(gw_addr);
bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
const struct net_device *dev = *_dev;
bool need_addr_check = !dev;
int err = -EINVAL;
/* if gw_addr is local we will fail to detect this in case
* address is still TENTATIVE (DAD in progress). rt6_lookup()
* will return already-added prefix route via interface that
* prefix route was assigned to, which might be non-loopback.
*/
if (dev &&
ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
goto out;
}
if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
/* IPv6 strictly inhibits using not link-local