| From foo@baz Mon Sep 17 13:33:56 CEST 2018 |
| From: Stephen Hemminger <stephen@networkplumber.org> |
| Date: Thu, 13 Sep 2018 07:58:41 -0700 |
| Subject: inet: frags: use rhashtables for reassembly units |
| To: davem@davemloft.net, gregkh@linuxfoundation.org |
| Cc: netdev@vger.kernel.org, stable@vger.kernel.org, edumazet@google.com, Kirill Tkhai <ktkhai@virtuozzo.com>, Herbert Xu <herbert@gondor.apana.org.au>, Florian Westphal <fw@strlen.de>, Jesper Dangaard Brouer <brouer@redhat.com>, Alexander Aring <alex.aring@gmail.com>, Stefan Schmidt <stefan@osg.samsung.com> |
| Message-ID: <20180913145902.17531-10-sthemmin@microsoft.com> |
| |
| From: Eric Dumazet <edumazet@google.com> |
| |
| Some applications still rely on IP fragmentation, and to be fair the Linux |
| reassembly unit does not work under any serious load. |
| |
| It uses static hash tables of 1024 buckets, and up to 128 items per bucket (!!!) |
| |
| A work queue is supposed to garbage collect items when the host is under memory |
| pressure, and to do a hash rebuild, changing the seed used in hash computations. |
| |
| This work queue blocks softirqs for up to 25 ms when doing a hash rebuild, |
| occurring every 5 seconds if host is under fire. |
| |
| Then there is the problem of sharing this hash table for all netns. |
| |
| It is time to switch to rhashtables, and allocate one of them per netns |
| to speed up netns dismantle, since this is a critical metric these days. |
| |
| Lookup is now using RCU. A followup patch will even remove |
| the refcount hold/release left from prior implementation and save |
| a couple of atomic operations. |
| |
| Before this patch, 16 cpus (16 RX queue NIC) could not handle more |
| than 1 Mpps frags DDOS. |
| |
| After the patch, I reach 9 Mpps without any tuning, and can use up to 2GB |
| of storage for the fragments (exact number depends on frags being evicted |
| after timeout) |
| |
| $ grep FRAG /proc/net/sockstat |
| FRAG: inuse 1966916 memory 2140004608 |
| |
| A followup patch will change the limits for 64bit arches. |
| |
| Signed-off-by: Eric Dumazet <edumazet@google.com> |
| Cc: Kirill Tkhai <ktkhai@virtuozzo.com> |
| Cc: Herbert Xu <herbert@gondor.apana.org.au> |
| Cc: Florian Westphal <fw@strlen.de> |
| Cc: Jesper Dangaard Brouer <brouer@redhat.com> |
| Cc: Alexander Aring <alex.aring@gmail.com> |
| Cc: Stefan Schmidt <stefan@osg.samsung.com> |
| Signed-off-by: David S. Miller <davem@davemloft.net> |
| (cherry picked from commit 648700f76b03b7e8149d13cc2bdb3355035258a9) |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| --- |
| Documentation/networking/ip-sysctl.txt | 7 |
| include/net/inet_frag.h | 81 +++---- |
| include/net/ipv6.h | 16 - |
| net/ieee802154/6lowpan/6lowpan_i.h | 26 -- |
| net/ieee802154/6lowpan/reassembly.c | 91 +++----- |
| net/ipv4/inet_fragment.c | 346 ++++++-------------------------- |
| net/ipv4/ip_fragment.c | 112 ++++------ |
| net/ipv6/netfilter/nf_conntrack_reasm.c | 51 +--- |
| net/ipv6/reassembly.c | 110 ++++------ |
| 9 files changed, 266 insertions(+), 574 deletions(-) |
| |
| --- a/Documentation/networking/ip-sysctl.txt |
| +++ b/Documentation/networking/ip-sysctl.txt |
| @@ -134,13 +134,10 @@ min_adv_mss - INTEGER |
| IP Fragmentation: |
| |
| ipfrag_high_thresh - INTEGER |
| - Maximum memory used to reassemble IP fragments. When |
| - ipfrag_high_thresh bytes of memory is allocated for this purpose, |
| - the fragment handler will toss packets until ipfrag_low_thresh |
| - is reached. This also serves as a maximum limit to namespaces |
| - different from the initial one. |
| + Maximum memory used to reassemble IP fragments. |
| |
| ipfrag_low_thresh - INTEGER |
| + (Obsolete since linux-4.17) |
| Maximum memory used to reassemble IP fragments before the kernel |
| begins to remove incomplete fragment queues to free up resources. |
| The kernel still accepts new fragments for defragmentation. |
| --- a/include/net/inet_frag.h |
| +++ b/include/net/inet_frag.h |
| @@ -2,7 +2,11 @@ |
| #ifndef __NET_FRAG_H__ |
| #define __NET_FRAG_H__ |
| |
| +#include <linux/rhashtable.h> |
| + |
| struct netns_frags { |
| + struct rhashtable rhashtable ____cacheline_aligned_in_smp; |
| + |
| /* Keep atomic mem on separate cachelines in structs that include it */ |
| atomic_t mem ____cacheline_aligned_in_smp; |
| /* sysctls */ |
| @@ -26,12 +30,30 @@ enum { |
| INET_FRAG_COMPLETE = BIT(2), |
| }; |
| |
| +struct frag_v4_compare_key { |
| + __be32 saddr; |
| + __be32 daddr; |
| + u32 user; |
| + u32 vif; |
| + __be16 id; |
| + u16 protocol; |
| +}; |
| + |
| +struct frag_v6_compare_key { |
| + struct in6_addr saddr; |
| + struct in6_addr daddr; |
| + u32 user; |
| + __be32 id; |
| + u32 iif; |
| +}; |
| + |
| /** |
| * struct inet_frag_queue - fragment queue |
| * |
| - * @lock: spinlock protecting the queue |
| + * @node: rhash node |
| + * @key: keys identifying this frag. |
| * @timer: queue expiration timer |
| - * @list: hash bucket list |
| + * @lock: spinlock protecting this frag |
| * @refcnt: reference count of the queue |
| * @fragments: received fragments head |
| * @fragments_tail: received fragments tail |
| @@ -41,12 +63,16 @@ enum { |
| * @flags: fragment queue flags |
| * @max_size: maximum received fragment size |
| * @net: namespace that this frag belongs to |
| - * @list_evictor: list of queues to forcefully evict (e.g. due to low memory) |
| + * @rcu: rcu head for freeing deferral |
| */ |
| struct inet_frag_queue { |
| - spinlock_t lock; |
| + struct rhash_head node; |
| + union { |
| + struct frag_v4_compare_key v4; |
| + struct frag_v6_compare_key v6; |
| + } key; |
| struct timer_list timer; |
| - struct hlist_node list; |
| + spinlock_t lock; |
| refcount_t refcnt; |
| struct sk_buff *fragments; |
| struct sk_buff *fragments_tail; |
| @@ -55,51 +81,20 @@ struct inet_frag_queue { |
| int meat; |
| __u8 flags; |
| u16 max_size; |
| - struct netns_frags *net; |
| - struct hlist_node list_evictor; |
| -}; |
| - |
| -#define INETFRAGS_HASHSZ 1024 |
| - |
| -/* averaged: |
| - * max_depth = default ipfrag_high_thresh / INETFRAGS_HASHSZ / |
| - * rounded up (SKB_TRUELEN(0) + sizeof(struct ipq or |
| - * struct frag_queue)) |
| - */ |
| -#define INETFRAGS_MAXDEPTH 128 |
| - |
| -struct inet_frag_bucket { |
| - struct hlist_head chain; |
| - spinlock_t chain_lock; |
| + struct netns_frags *net; |
| + struct rcu_head rcu; |
| }; |
| |
| struct inet_frags { |
| - struct inet_frag_bucket hash[INETFRAGS_HASHSZ]; |
| - |
| - struct work_struct frags_work; |
| - unsigned int next_bucket; |
| - unsigned long last_rebuild_jiffies; |
| - bool rebuild; |
| - |
| - /* The first call to hashfn is responsible to initialize |
| - * rnd. This is best done with net_get_random_once. |
| - * |
| - * rnd_seqlock is used to let hash insertion detect |
| - * when it needs to re-lookup the hash chain to use. |
| - */ |
| - u32 rnd; |
| - seqlock_t rnd_seqlock; |
| unsigned int qsize; |
| |
| - unsigned int (*hashfn)(const struct inet_frag_queue *); |
| - bool (*match)(const struct inet_frag_queue *q, |
| - const void *arg); |
| void (*constructor)(struct inet_frag_queue *q, |
| const void *arg); |
| void (*destructor)(struct inet_frag_queue *); |
| void (*frag_expire)(struct timer_list *t); |
| struct kmem_cache *frags_cachep; |
| const char *frags_cache_name; |
| + struct rhashtable_params rhash_params; |
| }; |
| |
| int inet_frags_init(struct inet_frags *); |
| @@ -108,15 +103,13 @@ void inet_frags_fini(struct inet_frags * |
| static inline int inet_frags_init_net(struct netns_frags *nf) |
| { |
| atomic_set(&nf->mem, 0); |
| - return 0; |
| + return rhashtable_init(&nf->rhashtable, &nf->f->rhash_params); |
| } |
| void inet_frags_exit_net(struct netns_frags *nf); |
| |
| void inet_frag_kill(struct inet_frag_queue *q); |
| void inet_frag_destroy(struct inet_frag_queue *q); |
| -struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, |
| - struct inet_frags *f, void *key, unsigned int hash); |
| - |
| +struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key); |
| void inet_frag_maybe_warn_overflow(struct inet_frag_queue *q, |
| const char *prefix); |
| |
| @@ -128,7 +121,7 @@ static inline void inet_frag_put(struct |
| |
| static inline bool inet_frag_evicting(struct inet_frag_queue *q) |
| { |
| - return !hlist_unhashed(&q->list_evictor); |
| + return false; |
| } |
| |
| /* Memory Tracking Functions. */ |
| --- a/include/net/ipv6.h |
| +++ b/include/net/ipv6.h |
| @@ -531,17 +531,8 @@ enum ip6_defrag_users { |
| __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX, |
| }; |
| |
| -struct ip6_create_arg { |
| - __be32 id; |
| - u32 user; |
| - const struct in6_addr *src; |
| - const struct in6_addr *dst; |
| - int iif; |
| - u8 ecn; |
| -}; |
| - |
| void ip6_frag_init(struct inet_frag_queue *q, const void *a); |
| -bool ip6_frag_match(const struct inet_frag_queue *q, const void *a); |
| +extern const struct rhashtable_params ip6_rhash_params; |
| |
| /* |
| * Equivalent of ipv4 struct ip |
| @@ -549,11 +540,6 @@ bool ip6_frag_match(const struct inet_fr |
| struct frag_queue { |
| struct inet_frag_queue q; |
| |
| - __be32 id; /* fragment id */ |
| - u32 user; |
| - struct in6_addr saddr; |
| - struct in6_addr daddr; |
| - |
| int iif; |
| unsigned int csum; |
| __u16 nhoffset; |
| --- a/net/ieee802154/6lowpan/6lowpan_i.h |
| +++ b/net/ieee802154/6lowpan/6lowpan_i.h |
| @@ -17,37 +17,19 @@ typedef unsigned __bitwise lowpan_rx_res |
| #define LOWPAN_DISPATCH_FRAG1 0xc0 |
| #define LOWPAN_DISPATCH_FRAGN 0xe0 |
| |
| -struct lowpan_create_arg { |
| +struct frag_lowpan_compare_key { |
| u16 tag; |
| u16 d_size; |
| - const struct ieee802154_addr *src; |
| - const struct ieee802154_addr *dst; |
| + const struct ieee802154_addr src; |
| + const struct ieee802154_addr dst; |
| }; |
| |
| -/* Equivalent of ipv4 struct ip |
| +/* Equivalent of ipv4 struct ipq |
| */ |
| struct lowpan_frag_queue { |
| struct inet_frag_queue q; |
| - |
| - u16 tag; |
| - u16 d_size; |
| - struct ieee802154_addr saddr; |
| - struct ieee802154_addr daddr; |
| }; |
| |
| -static inline u32 ieee802154_addr_hash(const struct ieee802154_addr *a) |
| -{ |
| - switch (a->mode) { |
| - case IEEE802154_ADDR_LONG: |
| - return (((__force u64)a->extended_addr) >> 32) ^ |
| - (((__force u64)a->extended_addr) & 0xffffffff); |
| - case IEEE802154_ADDR_SHORT: |
| - return (__force u32)(a->short_addr + (a->pan_id << 16)); |
| - default: |
| - return 0; |
| - } |
| -} |
| - |
| int lowpan_frag_rcv(struct sk_buff *skb, const u8 frag_type); |
| void lowpan_net_frag_exit(void); |
| int lowpan_net_frag_init(void); |
| --- a/net/ieee802154/6lowpan/reassembly.c |
| +++ b/net/ieee802154/6lowpan/reassembly.c |
| @@ -37,47 +37,15 @@ static struct inet_frags lowpan_frags; |
| static int lowpan_frag_reasm(struct lowpan_frag_queue *fq, |
| struct sk_buff *prev, struct net_device *ldev); |
| |
| -static unsigned int lowpan_hash_frag(u16 tag, u16 d_size, |
| - const struct ieee802154_addr *saddr, |
| - const struct ieee802154_addr *daddr) |
| -{ |
| - net_get_random_once(&lowpan_frags.rnd, sizeof(lowpan_frags.rnd)); |
| - return jhash_3words(ieee802154_addr_hash(saddr), |
| - ieee802154_addr_hash(daddr), |
| - (__force u32)(tag + (d_size << 16)), |
| - lowpan_frags.rnd); |
| -} |
| - |
| -static unsigned int lowpan_hashfn(const struct inet_frag_queue *q) |
| -{ |
| - const struct lowpan_frag_queue *fq; |
| - |
| - fq = container_of(q, struct lowpan_frag_queue, q); |
| - return lowpan_hash_frag(fq->tag, fq->d_size, &fq->saddr, &fq->daddr); |
| -} |
| - |
| -static bool lowpan_frag_match(const struct inet_frag_queue *q, const void *a) |
| -{ |
| - const struct lowpan_frag_queue *fq; |
| - const struct lowpan_create_arg *arg = a; |
| - |
| - fq = container_of(q, struct lowpan_frag_queue, q); |
| - return fq->tag == arg->tag && fq->d_size == arg->d_size && |
| - ieee802154_addr_equal(&fq->saddr, arg->src) && |
| - ieee802154_addr_equal(&fq->daddr, arg->dst); |
| -} |
| - |
| static void lowpan_frag_init(struct inet_frag_queue *q, const void *a) |
| { |
| - const struct lowpan_create_arg *arg = a; |
| + const struct frag_lowpan_compare_key *key = a; |
| struct lowpan_frag_queue *fq; |
| |
| fq = container_of(q, struct lowpan_frag_queue, q); |
| |
| - fq->tag = arg->tag; |
| - fq->d_size = arg->d_size; |
| - fq->saddr = *arg->src; |
| - fq->daddr = *arg->dst; |
| + BUILD_BUG_ON(sizeof(*key) > sizeof(q->key)); |
| + memcpy(&q->key, key, sizeof(*key)); |
| } |
| |
| static void lowpan_frag_expire(struct timer_list *t) |
| @@ -105,21 +73,17 @@ fq_find(struct net *net, const struct lo |
| const struct ieee802154_addr *src, |
| const struct ieee802154_addr *dst) |
| { |
| - struct inet_frag_queue *q; |
| - struct lowpan_create_arg arg; |
| - unsigned int hash; |
| struct netns_ieee802154_lowpan *ieee802154_lowpan = |
| net_ieee802154_lowpan(net); |
| + struct frag_lowpan_compare_key key = { |
| + .tag = cb->d_tag, |
| + .d_size = cb->d_size, |
| + .src = *src, |
| + .dst = *dst, |
| + }; |
| + struct inet_frag_queue *q; |
| |
| - arg.tag = cb->d_tag; |
| - arg.d_size = cb->d_size; |
| - arg.src = src; |
| - arg.dst = dst; |
| - |
| - hash = lowpan_hash_frag(cb->d_tag, cb->d_size, src, dst); |
| - |
| - q = inet_frag_find(&ieee802154_lowpan->frags, |
| - &lowpan_frags, &arg, hash); |
| + q = inet_frag_find(&ieee802154_lowpan->frags, &key); |
| if (IS_ERR_OR_NULL(q)) { |
| inet_frag_maybe_warn_overflow(q, pr_fmt()); |
| return NULL; |
| @@ -611,17 +575,46 @@ static struct pernet_operations lowpan_f |
| .exit = lowpan_frags_exit_net, |
| }; |
| |
| +static u32 lowpan_key_hashfn(const void *data, u32 len, u32 seed) |
| +{ |
| + return jhash2(data, |
| + sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed); |
| +} |
| + |
| +static u32 lowpan_obj_hashfn(const void *data, u32 len, u32 seed) |
| +{ |
| + const struct inet_frag_queue *fq = data; |
| + |
| + return jhash2((const u32 *)&fq->key, |
| + sizeof(struct frag_lowpan_compare_key) / sizeof(u32), seed); |
| +} |
| + |
| +static int lowpan_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) |
| +{ |
| + const struct frag_lowpan_compare_key *key = arg->key; |
| + const struct inet_frag_queue *fq = ptr; |
| + |
| + return !!memcmp(&fq->key, key, sizeof(*key)); |
| +} |
| + |
| +static const struct rhashtable_params lowpan_rhash_params = { |
| + .head_offset = offsetof(struct inet_frag_queue, node), |
| + .hashfn = lowpan_key_hashfn, |
| + .obj_hashfn = lowpan_obj_hashfn, |
| + .obj_cmpfn = lowpan_obj_cmpfn, |
| + .automatic_shrinking = true, |
| +}; |
| + |
| int __init lowpan_net_frag_init(void) |
| { |
| int ret; |
| |
| - lowpan_frags.hashfn = lowpan_hashfn; |
| lowpan_frags.constructor = lowpan_frag_init; |
| lowpan_frags.destructor = NULL; |
| lowpan_frags.qsize = sizeof(struct frag_queue); |
| - lowpan_frags.match = lowpan_frag_match; |
| lowpan_frags.frag_expire = lowpan_frag_expire; |
| lowpan_frags.frags_cache_name = lowpan_frags_cache_name; |
| + lowpan_frags.rhash_params = lowpan_rhash_params; |
| ret = inet_frags_init(&lowpan_frags); |
| if (ret) |
| goto out; |
| --- a/net/ipv4/inet_fragment.c |
| +++ b/net/ipv4/inet_fragment.c |
| @@ -25,12 +25,6 @@ |
| #include <net/inet_frag.h> |
| #include <net/inet_ecn.h> |
| |
| -#define INETFRAGS_EVICT_BUCKETS 128 |
| -#define INETFRAGS_EVICT_MAX 512 |
| - |
| -/* don't rebuild inetfrag table with new secret more often than this */ |
| -#define INETFRAGS_MIN_REBUILD_INTERVAL (5 * HZ) |
| - |
| /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements |
| * Value : 0xff if frame should be dropped. |
| * 0 or INET_ECN_CE value, to be ORed in to final iph->tos field |
| @@ -52,157 +46,8 @@ const u8 ip_frag_ecn_table[16] = { |
| }; |
| EXPORT_SYMBOL(ip_frag_ecn_table); |
| |
| -static unsigned int |
| -inet_frag_hashfn(const struct inet_frags *f, const struct inet_frag_queue *q) |
| -{ |
| - return f->hashfn(q) & (INETFRAGS_HASHSZ - 1); |
| -} |
| - |
| -static bool inet_frag_may_rebuild(struct inet_frags *f) |
| -{ |
| - return time_after(jiffies, |
| - f->last_rebuild_jiffies + INETFRAGS_MIN_REBUILD_INTERVAL); |
| -} |
| - |
| -static void inet_frag_secret_rebuild(struct inet_frags *f) |
| -{ |
| - int i; |
| - |
| - write_seqlock_bh(&f->rnd_seqlock); |
| - |
| - if (!inet_frag_may_rebuild(f)) |
| - goto out; |
| - |
| - get_random_bytes(&f->rnd, sizeof(u32)); |
| - |
| - for (i = 0; i < INETFRAGS_HASHSZ; i++) { |
| - struct inet_frag_bucket *hb; |
| - struct inet_frag_queue *q; |
| - struct hlist_node *n; |
| - |
| - hb = &f->hash[i]; |
| - spin_lock(&hb->chain_lock); |
| - |
| - hlist_for_each_entry_safe(q, n, &hb->chain, list) { |
| - unsigned int hval = inet_frag_hashfn(f, q); |
| - |
| - if (hval != i) { |
| - struct inet_frag_bucket *hb_dest; |
| - |
| - hlist_del(&q->list); |
| - |
| - /* Relink to new hash chain. */ |
| - hb_dest = &f->hash[hval]; |
| - |
| - /* This is the only place where we take |
| - * another chain_lock while already holding |
| - * one. As this will not run concurrently, |
| - * we cannot deadlock on hb_dest lock below, if its |
| - * already locked it will be released soon since |
| - * other caller cannot be waiting for hb lock |
| - * that we've taken above. |
| - */ |
| - spin_lock_nested(&hb_dest->chain_lock, |
| - SINGLE_DEPTH_NESTING); |
| - hlist_add_head(&q->list, &hb_dest->chain); |
| - spin_unlock(&hb_dest->chain_lock); |
| - } |
| - } |
| - spin_unlock(&hb->chain_lock); |
| - } |
| - |
| - f->rebuild = false; |
| - f->last_rebuild_jiffies = jiffies; |
| -out: |
| - write_sequnlock_bh(&f->rnd_seqlock); |
| -} |
| - |
| -static bool inet_fragq_should_evict(const struct inet_frag_queue *q) |
| -{ |
| - if (!hlist_unhashed(&q->list_evictor)) |
| - return false; |
| - |
| - return q->net->low_thresh == 0 || |
| - frag_mem_limit(q->net) >= q->net->low_thresh; |
| -} |
| - |
| -static unsigned int |
| -inet_evict_bucket(struct inet_frags *f, struct inet_frag_bucket *hb) |
| -{ |
| - struct inet_frag_queue *fq; |
| - struct hlist_node *n; |
| - unsigned int evicted = 0; |
| - HLIST_HEAD(expired); |
| - |
| - spin_lock(&hb->chain_lock); |
| - |
| - hlist_for_each_entry_safe(fq, n, &hb->chain, list) { |
| - if (!inet_fragq_should_evict(fq)) |
| - continue; |
| - |
| - if (!del_timer(&fq->timer)) |
| - continue; |
| - |
| - hlist_add_head(&fq->list_evictor, &expired); |
| - ++evicted; |
| - } |
| - |
| - spin_unlock(&hb->chain_lock); |
| - |
| - hlist_for_each_entry_safe(fq, n, &expired, list_evictor) |
| - f->frag_expire(&fq->timer); |
| - |
| - return evicted; |
| -} |
| - |
| -static void inet_frag_worker(struct work_struct *work) |
| -{ |
| - unsigned int budget = INETFRAGS_EVICT_BUCKETS; |
| - unsigned int i, evicted = 0; |
| - struct inet_frags *f; |
| - |
| - f = container_of(work, struct inet_frags, frags_work); |
| - |
| - BUILD_BUG_ON(INETFRAGS_EVICT_BUCKETS >= INETFRAGS_HASHSZ); |
| - |
| - local_bh_disable(); |
| - |
| - for (i = ACCESS_ONCE(f->next_bucket); budget; --budget) { |
| - evicted += inet_evict_bucket(f, &f->hash[i]); |
| - i = (i + 1) & (INETFRAGS_HASHSZ - 1); |
| - if (evicted > INETFRAGS_EVICT_MAX) |
| - break; |
| - } |
| - |
| - f->next_bucket = i; |
| - |
| - local_bh_enable(); |
| - |
| - if (f->rebuild && inet_frag_may_rebuild(f)) |
| - inet_frag_secret_rebuild(f); |
| -} |
| - |
| -static void inet_frag_schedule_worker(struct inet_frags *f) |
| -{ |
| - if (unlikely(!work_pending(&f->frags_work))) |
| - schedule_work(&f->frags_work); |
| -} |
| - |
| int inet_frags_init(struct inet_frags *f) |
| { |
| - int i; |
| - |
| - INIT_WORK(&f->frags_work, inet_frag_worker); |
| - |
| - for (i = 0; i < INETFRAGS_HASHSZ; i++) { |
| - struct inet_frag_bucket *hb = &f->hash[i]; |
| - |
| - spin_lock_init(&hb->chain_lock); |
| - INIT_HLIST_HEAD(&hb->chain); |
| - } |
| - |
| - seqlock_init(&f->rnd_seqlock); |
| - f->last_rebuild_jiffies = 0; |
| f->frags_cachep = kmem_cache_create(f->frags_cache_name, f->qsize, 0, 0, |
| NULL); |
| if (!f->frags_cachep) |
| @@ -214,66 +59,42 @@ EXPORT_SYMBOL(inet_frags_init); |
| |
| void inet_frags_fini(struct inet_frags *f) |
| { |
| - cancel_work_sync(&f->frags_work); |
| + /* We must wait until all inet_frag_destroy_rcu() callbacks have completed. */ |
| + rcu_barrier(); |
| + |
| kmem_cache_destroy(f->frags_cachep); |
| + f->frags_cachep = NULL; |
| } |
| EXPORT_SYMBOL(inet_frags_fini); |
| |
| -void inet_frags_exit_net(struct netns_frags *nf) |
| +static void inet_frags_free_cb(void *ptr, void *arg) |
| { |
| - struct inet_frags *f =nf->f; |
| - unsigned int seq; |
| - int i; |
| - |
| - nf->low_thresh = 0; |
| - |
| -evict_again: |
| - local_bh_disable(); |
| - seq = read_seqbegin(&f->rnd_seqlock); |
| - |
| - for (i = 0; i < INETFRAGS_HASHSZ ; i++) |
| - inet_evict_bucket(f, &f->hash[i]); |
| + struct inet_frag_queue *fq = ptr; |
| |
| - local_bh_enable(); |
| - cond_resched(); |
| - |
| - if (read_seqretry(&f->rnd_seqlock, seq) || |
| - sum_frag_mem_limit(nf)) |
| - goto evict_again; |
| -} |
| -EXPORT_SYMBOL(inet_frags_exit_net); |
| + /* If we can not cancel the timer, it means this frag_queue |
| + * is already disappearing, we have nothing to do. |
| + * Otherwise, we own a refcount until the end of this function. |
| + */ |
| + if (!del_timer(&fq->timer)) |
| + return; |
| |
| -static struct inet_frag_bucket * |
| -get_frag_bucket_locked(struct inet_frag_queue *fq, struct inet_frags *f) |
| -__acquires(hb->chain_lock) |
| -{ |
| - struct inet_frag_bucket *hb; |
| - unsigned int seq, hash; |
| - |
| - restart: |
| - seq = read_seqbegin(&f->rnd_seqlock); |
| - |
| - hash = inet_frag_hashfn(f, fq); |
| - hb = &f->hash[hash]; |
| - |
| - spin_lock(&hb->chain_lock); |
| - if (read_seqretry(&f->rnd_seqlock, seq)) { |
| - spin_unlock(&hb->chain_lock); |
| - goto restart; |
| + spin_lock_bh(&fq->lock); |
| + if (!(fq->flags & INET_FRAG_COMPLETE)) { |
| + fq->flags |= INET_FRAG_COMPLETE; |
| + refcount_dec(&fq->refcnt); |
| } |
| + spin_unlock_bh(&fq->lock); |
| |
| - return hb; |
| + inet_frag_put(fq); |
| } |
| |
| -static inline void fq_unlink(struct inet_frag_queue *fq) |
| +void inet_frags_exit_net(struct netns_frags *nf) |
| { |
| - struct inet_frag_bucket *hb; |
| + nf->low_thresh = 0; /* prevent creation of new frags */ |
| |
| - hb = get_frag_bucket_locked(fq, fq->net->f); |
| - hlist_del(&fq->list); |
| - fq->flags |= INET_FRAG_COMPLETE; |
| - spin_unlock(&hb->chain_lock); |
| + rhashtable_free_and_destroy(&nf->rhashtable, inet_frags_free_cb, NULL); |
| } |
| +EXPORT_SYMBOL(inet_frags_exit_net); |
| |
| void inet_frag_kill(struct inet_frag_queue *fq) |
| { |
| @@ -281,12 +102,26 @@ void inet_frag_kill(struct inet_frag_que |
| refcount_dec(&fq->refcnt); |
| |
| if (!(fq->flags & INET_FRAG_COMPLETE)) { |
| - fq_unlink(fq); |
| + struct netns_frags *nf = fq->net; |
| + |
| + fq->flags |= INET_FRAG_COMPLETE; |
| + rhashtable_remove_fast(&nf->rhashtable, &fq->node, nf->f->rhash_params); |
| refcount_dec(&fq->refcnt); |
| } |
| } |
| EXPORT_SYMBOL(inet_frag_kill); |
| |
| +static void inet_frag_destroy_rcu(struct rcu_head *head) |
| +{ |
| + struct inet_frag_queue *q = container_of(head, struct inet_frag_queue, |
| + rcu); |
| + struct inet_frags *f = q->net->f; |
| + |
| + if (f->destructor) |
| + f->destructor(q); |
| + kmem_cache_free(f->frags_cachep, q); |
| +} |
| + |
| void inet_frag_destroy(struct inet_frag_queue *q) |
| { |
| struct sk_buff *fp; |
| @@ -310,55 +145,21 @@ void inet_frag_destroy(struct inet_frag_ |
| } |
| sum = sum_truesize + f->qsize; |
| |
| - if (f->destructor) |
| - f->destructor(q); |
| - kmem_cache_free(f->frags_cachep, q); |
| + call_rcu(&q->rcu, inet_frag_destroy_rcu); |
| |
| sub_frag_mem_limit(nf, sum); |
| } |
| EXPORT_SYMBOL(inet_frag_destroy); |
| |
| -static struct inet_frag_queue *inet_frag_intern(struct netns_frags *nf, |
| - struct inet_frag_queue *qp_in, |
| - struct inet_frags *f, |
| - void *arg) |
| -{ |
| - struct inet_frag_bucket *hb = get_frag_bucket_locked(qp_in, f); |
| - struct inet_frag_queue *qp; |
| - |
| -#ifdef CONFIG_SMP |
| - /* With SMP race we have to recheck hash table, because |
| - * such entry could have been created on other cpu before |
| - * we acquired hash bucket lock. |
| - */ |
| - hlist_for_each_entry(qp, &hb->chain, list) { |
| - if (qp->net == nf && f->match(qp, arg)) { |
| - refcount_inc(&qp->refcnt); |
| - spin_unlock(&hb->chain_lock); |
| - qp_in->flags |= INET_FRAG_COMPLETE; |
| - inet_frag_put(qp_in); |
| - return qp; |
| - } |
| - } |
| -#endif |
| - qp = qp_in; |
| - if (!mod_timer(&qp->timer, jiffies + nf->timeout)) |
| - refcount_inc(&qp->refcnt); |
| - |
| - refcount_inc(&qp->refcnt); |
| - hlist_add_head(&qp->list, &hb->chain); |
| - |
| - spin_unlock(&hb->chain_lock); |
| - |
| - return qp; |
| -} |
| - |
| static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf, |
| struct inet_frags *f, |
| void *arg) |
| { |
| struct inet_frag_queue *q; |
| |
| + if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) |
| + return NULL; |
| + |
| q = kmem_cache_zalloc(f->frags_cachep, GFP_ATOMIC); |
| if (!q) |
| return NULL; |
| @@ -369,64 +170,52 @@ static struct inet_frag_queue *inet_frag |
| |
| timer_setup(&q->timer, f->frag_expire, 0); |
| spin_lock_init(&q->lock); |
| - refcount_set(&q->refcnt, 1); |
| + refcount_set(&q->refcnt, 3); |
| |
| return q; |
| } |
| |
| static struct inet_frag_queue *inet_frag_create(struct netns_frags *nf, |
| - struct inet_frags *f, |
| void *arg) |
| { |
| + struct inet_frags *f = nf->f; |
| struct inet_frag_queue *q; |
| + int err; |
| |
| q = inet_frag_alloc(nf, f, arg); |
| if (!q) |
| return NULL; |
| |
| - return inet_frag_intern(nf, q, f, arg); |
| -} |
| - |
| -struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, |
| - struct inet_frags *f, void *key, |
| - unsigned int hash) |
| -{ |
| - struct inet_frag_bucket *hb; |
| - struct inet_frag_queue *q; |
| - int depth = 0; |
| + mod_timer(&q->timer, jiffies + nf->timeout); |
| |
| - if (!nf->high_thresh || frag_mem_limit(nf) > nf->high_thresh) { |
| - inet_frag_schedule_worker(f); |
| + err = rhashtable_insert_fast(&nf->rhashtable, &q->node, |
| + f->rhash_params); |
| + if (err < 0) { |
| + q->flags |= INET_FRAG_COMPLETE; |
| + inet_frag_kill(q); |
| + inet_frag_destroy(q); |
| return NULL; |
| } |
| + return q; |
| +} |
| |
| - if (frag_mem_limit(nf) > nf->low_thresh) |
| - inet_frag_schedule_worker(f); |
| - |
| - hash &= (INETFRAGS_HASHSZ - 1); |
| - hb = &f->hash[hash]; |
| - |
| - spin_lock(&hb->chain_lock); |
| - hlist_for_each_entry(q, &hb->chain, list) { |
| - if (q->net == nf && f->match(q, key)) { |
| - refcount_inc(&q->refcnt); |
| - spin_unlock(&hb->chain_lock); |
| - return q; |
| - } |
| - depth++; |
| - } |
| - spin_unlock(&hb->chain_lock); |
| +/* TODO : call from rcu_read_lock() and no longer use refcount_inc_not_zero() */ |
| +struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key) |
| +{ |
| + struct inet_frag_queue *fq; |
| |
| - if (depth <= INETFRAGS_MAXDEPTH) |
| - return inet_frag_create(nf, f, key); |
| + rcu_read_lock(); |
| |
| - if (inet_frag_may_rebuild(f)) { |
| - if (!f->rebuild) |
| - f->rebuild = true; |
| - inet_frag_schedule_worker(f); |
| + fq = rhashtable_lookup(&nf->rhashtable, key, nf->f->rhash_params); |
| + if (fq) { |
| + if (!refcount_inc_not_zero(&fq->refcnt)) |
| + fq = NULL; |
| + rcu_read_unlock(); |
| + return fq; |
| } |
| + rcu_read_unlock(); |
| |
| - return ERR_PTR(-ENOBUFS); |
| + return inet_frag_create(nf, key); |
| } |
| EXPORT_SYMBOL(inet_frag_find); |
| |
| @@ -434,8 +223,7 @@ void inet_frag_maybe_warn_overflow(struc |
| const char *prefix) |
| { |
| static const char msg[] = "inet_frag_find: Fragment hash bucket" |
| - " list length grew over limit " __stringify(INETFRAGS_MAXDEPTH) |
| - ". Dropping fragment.\n"; |
| + " list length grew over limit. Dropping fragment.\n"; |
| |
| if (PTR_ERR(q) == -ENOBUFS) |
| net_dbg_ratelimited("%s%s", prefix, msg); |
| --- a/net/ipv4/ip_fragment.c |
| +++ b/net/ipv4/ip_fragment.c |
| @@ -69,15 +69,9 @@ struct ipfrag_skb_cb |
| struct ipq { |
| struct inet_frag_queue q; |
| |
| - u32 user; |
| - __be32 saddr; |
| - __be32 daddr; |
| - __be16 id; |
| - u8 protocol; |
| u8 ecn; /* RFC3168 support */ |
| u16 max_df_size; /* largest frag with DF set seen */ |
| int iif; |
| - int vif; /* L3 master device index */ |
| unsigned int rid; |
| struct inet_peer *peer; |
| }; |
| @@ -97,41 +91,6 @@ int ip_frag_mem(struct net *net) |
| static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, |
| struct net_device *dev); |
| |
| -struct ip4_create_arg { |
| - struct iphdr *iph; |
| - u32 user; |
| - int vif; |
| -}; |
| - |
| -static unsigned int ipqhashfn(__be16 id, __be32 saddr, __be32 daddr, u8 prot) |
| -{ |
| - net_get_random_once(&ip4_frags.rnd, sizeof(ip4_frags.rnd)); |
| - return jhash_3words((__force u32)id << 16 | prot, |
| - (__force u32)saddr, (__force u32)daddr, |
| - ip4_frags.rnd); |
| -} |
| - |
| -static unsigned int ip4_hashfn(const struct inet_frag_queue *q) |
| -{ |
| - const struct ipq *ipq; |
| - |
| - ipq = container_of(q, struct ipq, q); |
| - return ipqhashfn(ipq->id, ipq->saddr, ipq->daddr, ipq->protocol); |
| -} |
| - |
| -static bool ip4_frag_match(const struct inet_frag_queue *q, const void *a) |
| -{ |
| - const struct ipq *qp; |
| - const struct ip4_create_arg *arg = a; |
| - |
| - qp = container_of(q, struct ipq, q); |
| - return qp->id == arg->iph->id && |
| - qp->saddr == arg->iph->saddr && |
| - qp->daddr == arg->iph->daddr && |
| - qp->protocol == arg->iph->protocol && |
| - qp->user == arg->user && |
| - qp->vif == arg->vif; |
| -} |
| |
| static void ip4_frag_init(struct inet_frag_queue *q, const void *a) |
| { |
| @@ -140,17 +99,12 @@ static void ip4_frag_init(struct inet_fr |
| frags); |
| struct net *net = container_of(ipv4, struct net, ipv4); |
| |
| - const struct ip4_create_arg *arg = a; |
| + const struct frag_v4_compare_key *key = a; |
| |
| - qp->protocol = arg->iph->protocol; |
| - qp->id = arg->iph->id; |
| - qp->ecn = ip4_frag_ecn(arg->iph->tos); |
| - qp->saddr = arg->iph->saddr; |
| - qp->daddr = arg->iph->daddr; |
| - qp->vif = arg->vif; |
| - qp->user = arg->user; |
| + q->key.v4 = *key; |
| + qp->ecn = 0; |
| qp->peer = q->net->max_dist ? |
| - inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) : |
| + inet_getpeer_v4(net->ipv4.peers, key->saddr, key->vif, 1) : |
| NULL; |
| } |
| |
| @@ -234,7 +188,7 @@ static void ip_expire(struct timer_list |
| /* Only an end host needs to send an ICMP |
| * "Fragment Reassembly Timeout" message, per RFC792. |
| */ |
| - if (frag_expire_skip_icmp(qp->user) && |
| + if (frag_expire_skip_icmp(qp->q.key.v4.user) && |
| (skb_rtable(head)->rt_type != RTN_LOCAL)) |
| goto out; |
| |
| @@ -262,17 +216,17 @@ out_rcu_unlock: |
| static struct ipq *ip_find(struct net *net, struct iphdr *iph, |
| u32 user, int vif) |
| { |
| + struct frag_v4_compare_key key = { |
| + .saddr = iph->saddr, |
| + .daddr = iph->daddr, |
| + .user = user, |
| + .vif = vif, |
| + .id = iph->id, |
| + .protocol = iph->protocol, |
| + }; |
| struct inet_frag_queue *q; |
| - struct ip4_create_arg arg; |
| - unsigned int hash; |
| - |
| - arg.iph = iph; |
| - arg.user = user; |
| - arg.vif = vif; |
| |
| - hash = ipqhashfn(iph->id, iph->saddr, iph->daddr, iph->protocol); |
| - |
| - q = inet_frag_find(&net->ipv4.frags, &ip4_frags, &arg, hash); |
| + q = inet_frag_find(&net->ipv4.frags, &key); |
| if (IS_ERR_OR_NULL(q)) { |
| inet_frag_maybe_warn_overflow(q, pr_fmt()); |
| return NULL; |
| @@ -661,7 +615,7 @@ out_nomem: |
| err = -ENOMEM; |
| goto out_fail; |
| out_oversize: |
| - net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->saddr); |
| + net_info_ratelimited("Oversized IP packet from %pI4\n", &qp->q.key.v4.saddr); |
| out_fail: |
| __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); |
| return err; |
| @@ -899,15 +853,47 @@ static struct pernet_operations ip4_frag |
| .exit = ipv4_frags_exit_net, |
| }; |
| |
| + |
| +static u32 ip4_key_hashfn(const void *data, u32 len, u32 seed) |
| +{ |
| + return jhash2(data, |
| + sizeof(struct frag_v4_compare_key) / sizeof(u32), seed); |
| +} |
| + |
| +static u32 ip4_obj_hashfn(const void *data, u32 len, u32 seed) |
| +{ |
| + const struct inet_frag_queue *fq = data; |
| + |
| + return jhash2((const u32 *)&fq->key.v4, |
| + sizeof(struct frag_v4_compare_key) / sizeof(u32), seed); |
| +} |
| + |
| +static int ip4_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) |
| +{ |
| + const struct frag_v4_compare_key *key = arg->key; |
| + const struct inet_frag_queue *fq = ptr; |
| + |
| + return !!memcmp(&fq->key, key, sizeof(*key)); |
| +} |
| + |
| +static const struct rhashtable_params ip4_rhash_params = { |
| + .head_offset = offsetof(struct inet_frag_queue, node), |
| + .key_offset = offsetof(struct inet_frag_queue, key), |
| + .key_len = sizeof(struct frag_v4_compare_key), |
| + .hashfn = ip4_key_hashfn, |
| + .obj_hashfn = ip4_obj_hashfn, |
| + .obj_cmpfn = ip4_obj_cmpfn, |
| + .automatic_shrinking = true, |
| +}; |
| + |
| void __init ipfrag_init(void) |
| { |
| - ip4_frags.hashfn = ip4_hashfn; |
| ip4_frags.constructor = ip4_frag_init; |
| ip4_frags.destructor = ip4_frag_free; |
| ip4_frags.qsize = sizeof(struct ipq); |
| - ip4_frags.match = ip4_frag_match; |
| ip4_frags.frag_expire = ip_expire; |
| ip4_frags.frags_cache_name = ip_frag_cache_name; |
| + ip4_frags.rhash_params = ip4_rhash_params; |
| if (inet_frags_init(&ip4_frags)) |
| panic("IP: failed to allocate ip4_frags cache\n"); |
| ip4_frags_ctl_register(); |
| --- a/net/ipv6/netfilter/nf_conntrack_reasm.c |
| +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c |
| @@ -152,23 +152,6 @@ static inline u8 ip6_frag_ecn(const stru |
| return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); |
| } |
| |
| -static unsigned int nf_hash_frag(__be32 id, const struct in6_addr *saddr, |
| - const struct in6_addr *daddr) |
| -{ |
| - net_get_random_once(&nf_frags.rnd, sizeof(nf_frags.rnd)); |
| - return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), |
| - (__force u32)id, nf_frags.rnd); |
| -} |
| - |
| - |
| -static unsigned int nf_hashfn(const struct inet_frag_queue *q) |
| -{ |
| - const struct frag_queue *nq; |
| - |
| - nq = container_of(q, struct frag_queue, q); |
| - return nf_hash_frag(nq->id, &nq->saddr, &nq->daddr); |
| -} |
| - |
| static void nf_ct_frag6_expire(struct timer_list *t) |
| { |
| struct inet_frag_queue *frag = from_timer(frag, t, timer); |
| @@ -182,26 +165,19 @@ static void nf_ct_frag6_expire(struct ti |
| } |
| |
| /* Creation primitives. */ |
| -static inline struct frag_queue *fq_find(struct net *net, __be32 id, |
| - u32 user, struct in6_addr *src, |
| - struct in6_addr *dst, int iif, u8 ecn) |
| +static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, |
| + const struct ipv6hdr *hdr, int iif) |
| { |
| + struct frag_v6_compare_key key = { |
| + .id = id, |
| + .saddr = hdr->saddr, |
| + .daddr = hdr->daddr, |
| + .user = user, |
| + .iif = iif, |
| + }; |
| struct inet_frag_queue *q; |
| - struct ip6_create_arg arg; |
| - unsigned int hash; |
| - |
| - arg.id = id; |
| - arg.user = user; |
| - arg.src = src; |
| - arg.dst = dst; |
| - arg.iif = iif; |
| - arg.ecn = ecn; |
| - |
| - local_bh_disable(); |
| - hash = nf_hash_frag(id, src, dst); |
| |
| - q = inet_frag_find(&net->nf_frag.frags, &nf_frags, &arg, hash); |
| - local_bh_enable(); |
| + q = inet_frag_find(&net->nf_frag.frags, &key); |
| if (IS_ERR_OR_NULL(q)) { |
| inet_frag_maybe_warn_overflow(q, pr_fmt()); |
| return NULL; |
| @@ -593,8 +569,8 @@ int nf_ct_frag6_gather(struct net *net, |
| fhdr = (struct frag_hdr *)skb_transport_header(skb); |
| |
| skb_orphan(skb); |
| - fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr, |
| - skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); |
| + fq = fq_find(net, fhdr->identification, user, hdr, |
| + skb->dev ? skb->dev->ifindex : 0); |
| if (fq == NULL) { |
| pr_debug("Can't find and can't create new queue\n"); |
| return -ENOMEM; |
| @@ -662,13 +638,12 @@ int nf_ct_frag6_init(void) |
| { |
| int ret = 0; |
| |
| - nf_frags.hashfn = nf_hashfn; |
| nf_frags.constructor = ip6_frag_init; |
| nf_frags.destructor = NULL; |
| nf_frags.qsize = sizeof(struct frag_queue); |
| - nf_frags.match = ip6_frag_match; |
| nf_frags.frag_expire = nf_ct_frag6_expire; |
| nf_frags.frags_cache_name = nf_frags_cache_name; |
| + nf_frags.rhash_params = ip6_rhash_params; |
| ret = inet_frags_init(&nf_frags); |
| if (ret) |
| goto out; |
| --- a/net/ipv6/reassembly.c |
| +++ b/net/ipv6/reassembly.c |
| @@ -79,52 +79,13 @@ static struct inet_frags ip6_frags; |
| static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, |
| struct net_device *dev); |
| |
| -/* |
| - * callers should be careful not to use the hash value outside the ipfrag_lock |
| - * as doing so could race with ipfrag_hash_rnd being recalculated. |
| - */ |
| -static unsigned int inet6_hash_frag(__be32 id, const struct in6_addr *saddr, |
| - const struct in6_addr *daddr) |
| -{ |
| - net_get_random_once(&ip6_frags.rnd, sizeof(ip6_frags.rnd)); |
| - return jhash_3words(ipv6_addr_hash(saddr), ipv6_addr_hash(daddr), |
| - (__force u32)id, ip6_frags.rnd); |
| -} |
| - |
| -static unsigned int ip6_hashfn(const struct inet_frag_queue *q) |
| -{ |
| - const struct frag_queue *fq; |
| - |
| - fq = container_of(q, struct frag_queue, q); |
| - return inet6_hash_frag(fq->id, &fq->saddr, &fq->daddr); |
| -} |
| - |
| -bool ip6_frag_match(const struct inet_frag_queue *q, const void *a) |
| -{ |
| - const struct frag_queue *fq; |
| - const struct ip6_create_arg *arg = a; |
| - |
| - fq = container_of(q, struct frag_queue, q); |
| - return fq->id == arg->id && |
| - fq->user == arg->user && |
| - ipv6_addr_equal(&fq->saddr, arg->src) && |
| - ipv6_addr_equal(&fq->daddr, arg->dst) && |
| - (arg->iif == fq->iif || |
| - !(ipv6_addr_type(arg->dst) & (IPV6_ADDR_MULTICAST | |
| - IPV6_ADDR_LINKLOCAL))); |
| -} |
| -EXPORT_SYMBOL(ip6_frag_match); |
| - |
| void ip6_frag_init(struct inet_frag_queue *q, const void *a) |
| { |
| struct frag_queue *fq = container_of(q, struct frag_queue, q); |
| - const struct ip6_create_arg *arg = a; |
| + const struct frag_v6_compare_key *key = a; |
| |
| - fq->id = arg->id; |
| - fq->user = arg->user; |
| - fq->saddr = *arg->src; |
| - fq->daddr = *arg->dst; |
| - fq->ecn = arg->ecn; |
| + q->key.v6 = *key; |
| + fq->ecn = 0; |
| } |
| EXPORT_SYMBOL(ip6_frag_init); |
| |
| @@ -182,23 +143,22 @@ static void ip6_frag_expire(struct timer |
| } |
| |
| static struct frag_queue * |
| -fq_find(struct net *net, __be32 id, const struct in6_addr *src, |
| - const struct in6_addr *dst, int iif, u8 ecn) |
| +fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif) |
| { |
| + struct frag_v6_compare_key key = { |
| + .id = id, |
| + .saddr = hdr->saddr, |
| + .daddr = hdr->daddr, |
| + .user = IP6_DEFRAG_LOCAL_DELIVER, |
| + .iif = iif, |
| + }; |
| struct inet_frag_queue *q; |
| - struct ip6_create_arg arg; |
| - unsigned int hash; |
| - |
| - arg.id = id; |
| - arg.user = IP6_DEFRAG_LOCAL_DELIVER; |
| - arg.src = src; |
| - arg.dst = dst; |
| - arg.iif = iif; |
| - arg.ecn = ecn; |
| |
| - hash = inet6_hash_frag(id, src, dst); |
| + if (!(ipv6_addr_type(&hdr->daddr) & (IPV6_ADDR_MULTICAST | |
| + IPV6_ADDR_LINKLOCAL))) |
| + key.iif = 0; |
| |
| - q = inet_frag_find(&net->ipv6.frags, &ip6_frags, &arg, hash); |
| + q = inet_frag_find(&net->ipv6.frags, &key); |
| if (IS_ERR_OR_NULL(q)) { |
| inet_frag_maybe_warn_overflow(q, pr_fmt()); |
| return NULL; |
| @@ -530,6 +490,7 @@ static int ipv6_frag_rcv(struct sk_buff |
| struct frag_queue *fq; |
| const struct ipv6hdr *hdr = ipv6_hdr(skb); |
| struct net *net = dev_net(skb_dst(skb)->dev); |
| + int iif; |
| |
| if (IP6CB(skb)->flags & IP6SKB_FRAGMENTED) |
| goto fail_hdr; |
| @@ -558,13 +519,14 @@ static int ipv6_frag_rcv(struct sk_buff |
| return 1; |
| } |
| |
| - fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr, |
| - skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr)); |
| + iif = skb->dev ? skb->dev->ifindex : 0; |
| + fq = fq_find(net, fhdr->identification, hdr, iif); |
| if (fq) { |
| int ret; |
| |
| spin_lock(&fq->q.lock); |
| |
| + fq->iif = iif; |
| ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff); |
| |
| spin_unlock(&fq->q.lock); |
| @@ -738,17 +700,47 @@ static struct pernet_operations ip6_frag |
| .exit = ipv6_frags_exit_net, |
| }; |
| |
| +static u32 ip6_key_hashfn(const void *data, u32 len, u32 seed) |
| +{ |
| + return jhash2(data, |
| + sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); |
| +} |
| + |
| +static u32 ip6_obj_hashfn(const void *data, u32 len, u32 seed) |
| +{ |
| + const struct inet_frag_queue *fq = data; |
| + |
| + return jhash2((const u32 *)&fq->key.v6, |
| + sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); |
| +} |
| + |
| +static int ip6_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) |
| +{ |
| + const struct frag_v6_compare_key *key = arg->key; |
| + const struct inet_frag_queue *fq = ptr; |
| + |
| + return !!memcmp(&fq->key, key, sizeof(*key)); |
| +} |
| + |
| +const struct rhashtable_params ip6_rhash_params = { |
| + .head_offset = offsetof(struct inet_frag_queue, node), |
| + .hashfn = ip6_key_hashfn, |
| + .obj_hashfn = ip6_obj_hashfn, |
| + .obj_cmpfn = ip6_obj_cmpfn, |
| + .automatic_shrinking = true, |
| +}; |
| +EXPORT_SYMBOL(ip6_rhash_params); |
| + |
| int __init ipv6_frag_init(void) |
| { |
| int ret; |
| |
| - ip6_frags.hashfn = ip6_hashfn; |
| ip6_frags.constructor = ip6_frag_init; |
| ip6_frags.destructor = NULL; |
| ip6_frags.qsize = sizeof(struct frag_queue); |
| - ip6_frags.match = ip6_frag_match; |
| ip6_frags.frag_expire = ip6_frag_expire; |
| ip6_frags.frags_cache_name = ip6_frag_cache_name; |
| + ip6_frags.rhash_params = ip6_rhash_params; |
| ret = inet_frags_init(&ip6_frags); |
| if (ret) |
| goto out; |