From foo@baz Mon Sep 17 13:33:56 CEST 2018
From: Stephen Hemminger <stephen@networkplumber.org>
Date: Thu, 13 Sep 2018 07:58:58 -0700
Subject: net: sk_buff rbnode reorg
To: davem@davemloft.net, gregkh@linuxfoundation.org
Cc: netdev@vger.kernel.org, stable@vger.kernel.org, edumazet@google.com, Soheil Hassas Yeganeh <soheil@google.com>, Wei Wang <weiwan@google.com>, Willem de Bruijn <willemb@google.com>
Message-ID: <20180913145902.17531-27-sthemmin@microsoft.com>

From: Eric Dumazet <edumazet@google.com>

commit bffa72cf7f9df842f0016ba03586039296b4caaf upstream

skb->rbnode shares space with skb->next, skb->prev and skb->tstamp

Current uses (TCP receive ofo queue and netem) need to save/restore
tstamp, while skb->dev is either NULL (TCP) or a constant for a given
queue (netem).
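
For context, a simplified sketch of the pre-patch layout (unrelated
members elided; the skbuff.h hunk below has the real definition). On
64-bit, struct rb_node is three pointers wide, so it overlays exactly
next + prev + tstamp, and queueing an skb on an rb tree clobbers its
timestamp:

	struct sk_buff {
		union {
			struct {
				struct sk_buff	*next;
				struct sk_buff	*prev;
				union {
					ktime_t	tstamp;	/* lost while on a tree */
					u64	skb_mstamp;
				};
			};
			struct rb_node	rbnode;	/* netem & tcp stack */
		};
		/* ... */
	};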

Since we plan to use an RB tree for the TCP retransmit queue to speed up
SACK processing with large BDP, this patch exchanges skb->dev and
skb->tstamp.

This saves some overhead in both TCP and netem.
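
After the swap, the field an rb-tree user can clobber is skb->dev
rather than skb->tstamp: TCP keeps dev NULL and netem's is constant for
a given queue, so neither needs a save/restore. The IPv4 defrag path
converted below simply latches dev into a local before linking the skb
into the tree, as in the ip_frag_queue() hunk:

	/* Note : skb->rbnode and skb->dev share the same location. */
	dev = skb->dev;
	/* Makes sure compiler wont do silly aliasing games */
	barrier();
	/* ... rb_link_node()/rb_insert_color() overwrite skb->dev ... */
	if (dev)
		qp->iif = dev->ifindex;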

v2: removes the swtstamp field from struct tcp_skb_cb

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Cc: Wei Wang <weiwan@google.com>
Cc: Willem de Bruijn <willemb@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/skbuff.h                  |   24 ++--
 include/net/inet_frag.h                 |    3 
 net/ipv4/inet_fragment.c                |   14 +-
 net/ipv4/ip_fragment.c                  |  182 +++++++++++++++++---------
 net/ipv6/netfilter/nf_conntrack_reasm.c |    1 
 net/ipv6/reassembly.c                   |    1 
 6 files changed, 128 insertions(+), 97 deletions(-)

--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -663,23 +663,27 @@ struct sk_buff {
 			struct sk_buff		*prev;
 
 			union {
-				ktime_t		tstamp;
-				u64		skb_mstamp;
+				struct net_device	*dev;
+				/* Some protocols might use this space to store information,
+				 * while device pointer would be NULL.
+				 * UDP receive path is one user.
+				 */
+				unsigned long		dev_scratch;
 			};
 		};
-		struct rb_node	rbnode; /* used in netem & tcp stack */
+		struct rb_node		rbnode; /* used in netem, ip4 defrag, and tcp stack */
+		struct list_head	list;
 	};
-	struct sock		*sk;
 
 	union {
-		struct net_device	*dev;
-		/* Some protocols might use this space to store information,
-		 * while device pointer would be NULL.
-		 * UDP receive path is one user.
-		 */
-		unsigned long		dev_scratch;
+		struct sock		*sk;
 		int			ip_defrag_offset;
 	};
+
+	union {
+		ktime_t		tstamp;
+		u64		skb_mstamp;
+	};
 	/*
 	 * This is the control buffer. It is free to use for every
 	 * layer. Please put your private variables there. If you
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -75,7 +75,8 @@ struct inet_frag_queue {
 	struct timer_list	timer;
 	spinlock_t		lock;
 	refcount_t		refcnt;
-	struct sk_buff		*fragments;
+	struct sk_buff		*fragments;  /* Used in IPv6. */
+	struct rb_root		rb_fragments; /* Used in IPv4. */
 	struct sk_buff		*fragments_tail;
 	ktime_t			stamp;
 	int			len;
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -136,12 +136,16 @@ void inet_frag_destroy(struct inet_frag_
 	fp = q->fragments;
 	nf = q->net;
 	f = nf->f;
-	while (fp) {
-		struct sk_buff *xp = fp->next;
+	if (fp) {
+		do {
+			struct sk_buff *xp = fp->next;
 
-		sum_truesize += fp->truesize;
-		kfree_skb(fp);
-		fp = xp;
+			sum_truesize += fp->truesize;
+			kfree_skb(fp);
+			fp = xp;
+		} while (fp);
+	} else {
+		sum_truesize = skb_rbtree_purge(&q->rb_fragments);
 	}
 	sum = sum_truesize + f->qsize;
 
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -136,7 +136,7 @@ static void ip_expire(struct timer_list
 {
 	struct inet_frag_queue *frag = from_timer(frag, t, timer);
 	const struct iphdr *iph;
-	struct sk_buff *head;
+	struct sk_buff *head = NULL;
 	struct net *net;
 	struct ipq *qp;
 	int err;
@@ -152,14 +152,31 @@ static void ip_expire(struct timer_list
 
 	ipq_kill(qp);
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS);
-
-	head = qp->q.fragments;
-
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMTIMEOUT);
 
-	if (!(qp->q.flags & INET_FRAG_FIRST_IN) || !head)
+	if (!(qp->q.flags & INET_FRAG_FIRST_IN))
 		goto out;
 
+	/* sk_buff::dev and sk_buff::rbnode are unionized. So we
+	 * pull the head out of the tree in order to be able to
+	 * deal with head->dev.
+	 */
+	if (qp->q.fragments) {
+		head = qp->q.fragments;
+		qp->q.fragments = head->next;
+	} else {
+		head = skb_rb_first(&qp->q.rb_fragments);
+		if (!head)
+			goto out;
+		rb_erase(&head->rbnode, &qp->q.rb_fragments);
+		memset(&head->rbnode, 0, sizeof(head->rbnode));
+		barrier();
+	}
+	if (head == qp->q.fragments_tail)
+		qp->q.fragments_tail = NULL;
+
+	sub_frag_mem_limit(qp->q.net, head->truesize);
+
 	head->dev = dev_get_by_index_rcu(net, qp->iif);
 	if (!head->dev)
 		goto out;
@@ -179,16 +196,16 @@ static void ip_expire(struct timer_list
 	    (skb_rtable(head)->rt_type != RTN_LOCAL))
 		goto out;
 
-	skb_get(head);
 	spin_unlock(&qp->q.lock);
 	icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0);
-	kfree_skb(head);
 	goto out_rcu_unlock;
 
 out:
 	spin_unlock(&qp->q.lock);
 out_rcu_unlock:
 	rcu_read_unlock();
+	if (head)
+		kfree_skb(head);
 	ipq_put(qp);
 }
 
@@ -231,7 +248,7 @@ static int ip_frag_too_far(struct ipq *q
 	end = atomic_inc_return(&peer->rid);
 	qp->rid = end;
 
-	rc = qp->q.fragments && (end - start) > max;
+	rc = qp->q.fragments_tail && (end - start) > max;
 
 	if (rc) {
 		struct net *net;
@@ -245,7 +262,6 @@ static int ip_frag_too_far(struct ipq *q
 
 static int ip_frag_reinit(struct ipq *qp)
 {
-	struct sk_buff *fp;
 	unsigned int sum_truesize = 0;
 
 	if (!mod_timer(&qp->q.timer, jiffies + qp->q.net->timeout)) {
@@ -253,20 +269,14 @@ static int ip_frag_reinit(struct ipq *qp
 		return -ETIMEDOUT;
 	}
 
-	fp = qp->q.fragments;
-	do {
-		struct sk_buff *xp = fp->next;
-
-		sum_truesize += fp->truesize;
-		kfree_skb(fp);
-		fp = xp;
-	} while (fp);
+	sum_truesize = skb_rbtree_purge(&qp->q.rb_fragments);
 	sub_frag_mem_limit(qp->q.net, sum_truesize);
 
 	qp->q.flags = 0;
 	qp->q.len = 0;
 	qp->q.meat = 0;
 	qp->q.fragments = NULL;
+	qp->q.rb_fragments = RB_ROOT;
 	qp->q.fragments_tail = NULL;
 	qp->iif = 0;
 	qp->ecn = 0;
@@ -278,7 +288,8 @@ static int ip_frag_reinit(struct ipq *qp
 static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
 {
 	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
-	struct sk_buff *prev, *next;
+	struct rb_node **rbn, *parent;
+	struct sk_buff *skb1;
 	struct net_device *dev;
 	unsigned int fragsize;
 	int flags, offset;
@@ -341,58 +352,58 @@ static int ip_frag_queue(struct ipq *qp,
 	if (err)
 		goto err;
 
-	/* Find out which fragments are in front and at the back of us
-	 * in the chain of fragments so far.  We must know where to put
-	 * this fragment, right?
-	 */
-	prev = qp->q.fragments_tail;
-	if (!prev || prev->ip_defrag_offset < offset) {
-		next = NULL;
-		goto found;
-	}
-	prev = NULL;
-	for (next = qp->q.fragments; next != NULL; next = next->next) {
-		if (next->ip_defrag_offset >= offset)
-			break;	/* bingo! */
-		prev = next;
-	}
+	/* Note : skb->rbnode and skb->dev share the same location. */
+	dev = skb->dev;
+	/* Makes sure compiler wont do silly aliasing games */
+	barrier();
 
-found:
 	/* RFC5722, Section 4, amended by Errata ID : 3089
 	 *                          When reassembling an IPv6 datagram, if
 	 *   one or more its constituent fragments is determined to be an
 	 *   overlapping fragment, the entire datagram (and any constituent
 	 *   fragments) MUST be silently discarded.
 	 *
-	 * We do the same here for IPv4.
+	 * We do the same here for IPv4 (and increment an snmp counter).
 	 */
 
-	/* Is there an overlap with the previous fragment? */
-	if (prev &&
-	    (prev->ip_defrag_offset + prev->len) > offset)
-		goto discard_qp;
-
-	/* Is there an overlap with the next fragment? */
-	if (next && next->ip_defrag_offset < end)
-		goto discard_qp;
+	/* Find out where to put this fragment.  */
+	skb1 = qp->q.fragments_tail;
+	if (!skb1) {
+		/* This is the first fragment we've received. */
+		rb_link_node(&skb->rbnode, NULL, &qp->q.rb_fragments.rb_node);
+		qp->q.fragments_tail = skb;
+	} else if ((skb1->ip_defrag_offset + skb1->len) < end) {
+		/* This is the common/special case: skb goes to the end. */
+		/* Detect and discard overlaps. */
+		if (offset < (skb1->ip_defrag_offset + skb1->len))
+			goto discard_qp;
+		/* Insert after skb1. */
+		rb_link_node(&skb->rbnode, &skb1->rbnode, &skb1->rbnode.rb_right);
+		qp->q.fragments_tail = skb;
+	} else {
+		/* Binary search. Note that skb can become the first fragment, but
+		 * not the last (covered above). */
+		rbn = &qp->q.rb_fragments.rb_node;
+		do {
+			parent = *rbn;
+			skb1 = rb_to_skb(parent);
+			if (end <= skb1->ip_defrag_offset)
+				rbn = &parent->rb_left;
+			else if (offset >= skb1->ip_defrag_offset + skb1->len)
+				rbn = &parent->rb_right;
+			else /* Found an overlap with skb1. */
+				goto discard_qp;
+		} while (*rbn);
+		/* Here we have parent properly set, and rbn pointing to
+		 * one of its NULL left/right children. Insert skb. */
+		rb_link_node(&skb->rbnode, parent, rbn);
+	}
+	rb_insert_color(&skb->rbnode, &qp->q.rb_fragments);
 
-	/* Note : skb->ip_defrag_offset and skb->dev share the same location */
-	dev = skb->dev;
 	if (dev)
 		qp->iif = dev->ifindex;
-	/* Makes sure compiler wont do silly aliasing games */
-	barrier();
 	skb->ip_defrag_offset = offset;
 
-	/* Insert this fragment in the chain of fragments. */
-	skb->next = next;
-	if (!next)
-		qp->q.fragments_tail = skb;
-	if (prev)
-		prev->next = skb;
-	else
-		qp->q.fragments = skb;
-
 	qp->q.stamp = skb->tstamp;
 	qp->q.meat += skb->len;
 	qp->ecn |= ecn;
@@ -414,7 +425,7 @@ found:
 		unsigned long orefdst = skb->_skb_refdst;
 
 		skb->_skb_refdst = 0UL;
-		err = ip_frag_reasm(qp, prev, dev);
+		err = ip_frag_reasm(qp, skb, dev);
 		skb->_skb_refdst = orefdst;
 		return err;
 	}
@@ -431,15 +442,15 @@ err:
 	return err;
 }
 
-
 /* Build a new IP datagram from all its fragments. */
-
-static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev,
+static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb,
 			 struct net_device *dev)
 {
 	struct net *net = container_of(qp->q.net, struct net, ipv4.frags);
 	struct iphdr *iph;
-	struct sk_buff *fp, *head = qp->q.fragments;
+	struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments);
+	struct sk_buff **nextp; /* To build frag_list. */
+	struct rb_node *rbn;
 	int len;
 	int ihlen;
 	int err;
@@ -453,25 +464,20 @@ static int ip_frag_reasm(struct ipq *qp,
 		goto out_fail;
 	}
 	/* Make the one we just received the head. */
-	if (prev) {
-		head = prev->next;
-		fp = skb_clone(head, GFP_ATOMIC);
+	if (head != skb) {
+		fp = skb_clone(skb, GFP_ATOMIC);
 		if (!fp)
 			goto out_nomem;
-
-		fp->next = head->next;
-		if (!fp->next)
+		rb_replace_node(&skb->rbnode, &fp->rbnode, &qp->q.rb_fragments);
+		if (qp->q.fragments_tail == skb)
 			qp->q.fragments_tail = fp;
-		prev->next = fp;
-
-		skb_morph(head, qp->q.fragments);
-		head->next = qp->q.fragments->next;
-
-		consume_skb(qp->q.fragments);
-		qp->q.fragments = head;
+		skb_morph(skb, head);
+		rb_replace_node(&head->rbnode, &skb->rbnode,
+				&qp->q.rb_fragments);
+		consume_skb(head);
+		head = skb;
 	}
 
-	WARN_ON(!head);
 	WARN_ON(head->ip_defrag_offset != 0);
 
 	/* Allocate a new buffer for the datagram. */
@@ -496,24 +502,35 @@ static int ip_frag_reasm(struct ipq *qp,
 		clone = alloc_skb(0, GFP_ATOMIC);
 		if (!clone)
 			goto out_nomem;
-		clone->next = head->next;
-		head->next = clone;
 		skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
 		skb_frag_list_init(head);
 		for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
 			plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
 		clone->len = clone->data_len = head->data_len - plen;
-		head->data_len -= clone->len;
-		head->len -= clone->len;
+		skb->truesize += clone->truesize;
 		clone->csum = 0;
 		clone->ip_summed = head->ip_summed;
 		add_frag_mem_limit(qp->q.net, clone->truesize);
+		skb_shinfo(head)->frag_list = clone;
+		nextp = &clone->next;
+	} else {
+		nextp = &skb_shinfo(head)->frag_list;
 	}
 
-	skb_shinfo(head)->frag_list = head->next;
 	skb_push(head, head->data - skb_network_header(head));
 
-	for (fp=head->next; fp; fp = fp->next) {
+	/* Traverse the tree in order, to build frag_list. */
+	rbn = rb_next(&head->rbnode);
+	rb_erase(&head->rbnode, &qp->q.rb_fragments);
+	while (rbn) {
+		struct rb_node *rbnext = rb_next(rbn);
+		fp = rb_to_skb(rbn);
+		rb_erase(rbn, &qp->q.rb_fragments);
+		rbn = rbnext;
+		*nextp = fp;
+		nextp = &fp->next;
+		fp->prev = NULL;
+		memset(&fp->rbnode, 0, sizeof(fp->rbnode));
 		head->data_len += fp->len;
 		head->len += fp->len;
 		if (head->ip_summed != fp->ip_summed)
@@ -524,7 +541,9 @@ static int ip_frag_reasm(struct ipq *qp,
 	}
 	sub_frag_mem_limit(qp->q.net, head->truesize);
 
+	*nextp = NULL;
 	head->next = NULL;
+	head->prev = NULL;
 	head->dev = dev;
 	head->tstamp = qp->q.stamp;
 	IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size);
@@ -552,6 +571,7 @@ static int ip_frag_reasm(struct ipq *qp,
 
 	__IP_INC_STATS(net, IPSTATS_MIB_REASMOKS);
 	qp->q.fragments = NULL;
+	qp->q.rb_fragments = RB_ROOT;
 	qp->q.fragments_tail = NULL;
 	return 0;
 
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -471,6 +471,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq,
 				  head->csum);
 
 	fq->q.fragments = NULL;
+	fq->q.rb_fragments = RB_ROOT;
 	fq->q.fragments_tail = NULL;
 
 	return true;
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -472,6 +472,7 @@ static int ip6_frag_reasm(struct frag_qu
 	__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
 	rcu_read_unlock();
 	fq->q.fragments = NULL;
+	fq->q.rb_fragments = RB_ROOT;
 	fq->q.fragments_tail = NULL;
 	return 1;
 
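
A toy userspace model of the insertion logic ip_frag_queue() now uses
(illustration only, not kernel code: a plain unbalanced binary search
tree stands in for the kernel's rb tree, the names are invented, and
freeing on failure is elided for brevity). Fragments are keyed by
[offset, offset + len); any overlap fails, mirroring the goto
discard_qp path above:

	#include <stdio.h>
	#include <stdlib.h>

	struct frag {
		unsigned int offset, len;
		struct frag *left, *right;
	};

	/* 0 on success, -1 on overlap (the kernel drops the whole queue). */
	static int frag_insert(struct frag **root, struct frag *f)
	{
		while (*root) {
			struct frag *cur = *root;

			if (f->offset + f->len <= cur->offset)
				root = &cur->left;	/* entirely before cur */
			else if (f->offset >= cur->offset + cur->len)
				root = &cur->right;	/* entirely after cur */
			else
				return -1;		/* overlap */
		}
		*root = f;
		return 0;
	}

	static struct frag *frag_new(unsigned int offset, unsigned int len)
	{
		struct frag *f = calloc(1, sizeof(*f));

		if (!f)
			abort();
		f->offset = offset;
		f->len = len;
		return f;
	}

	int main(void)
	{
		struct frag *root = NULL;

		printf("%d\n", frag_insert(&root, frag_new(0, 1000)));    /* 0 */
		printf("%d\n", frag_insert(&root, frag_new(2000, 1000))); /* 0 */
		printf("%d\n", frag_insert(&root, frag_new(1000, 1000))); /* 0 */
		printf("%d\n", frag_insert(&root, frag_new(500, 1000)));  /* -1 */
		return 0;
	}

The kernel version additionally keeps qp->q.fragments_tail so the
common in-order case links the new fragment at the tail without walking
the tree, and rb_insert_color() rebalances after each link.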