| From foo@baz Mon Sep 17 13:33:56 CEST 2018 |
| From: Stephen Hemminger <stephen@networkplumber.org> |
| Date: Thu, 13 Sep 2018 07:59:00 -0700 |
| Subject: ip: add helpers to process in-order fragments faster. |
| To: davem@davemloft.net, gregkh@linuxfoundation.org |
| Cc: netdev@vger.kernel.org, stable@vger.kernel.org, edumazet@google.com, Peter Oskolkov <posk@google.com>, Florian Westphal <fw@strlen.de> |
| Message-ID: <20180913145902.17531-29-sthemmin@microsoft.com> |
| |
| From: Peter Oskolkov <posk@google.com> |
| |
| This patch introduces several helper functions/macros that will be |
| used in the follow-up patch. No runtime changes yet. |
| |
| The new logic (fully implemented in the second patch) is as follows: |
| |
| * Nodes in the rb-tree will now contain not single fragments, but lists |
| of consecutive fragments ("runs"). |
| |
| * At each point in time, the current "active" run at the tail is |
| maintained/tracked. Fragments that arrive in-order, adjacent |
| to the previous tail fragment, are added to this tail run without |
| triggering the re-balancing of the rb-tree. |
| |
| * If a fragment arrives out of order with the offset _before_ the tail run, |
| it is inserted into the rb-tree as a single fragment. |
| |
| * If a fragment arrives after the current tail fragment (with a gap), |
| it starts a new "tail" run, and is inserted into the rb-tree |
| at the end as the head of the new run. |
| |
| skb->cb is used to store additional information |
| needed here (suggested by Eric Dumazet). |
| |
| Reported-by: Willem de Bruijn <willemb@google.com> |
| Signed-off-by: Peter Oskolkov <posk@google.com> |
| Cc: Eric Dumazet <edumazet@google.com> |
| Cc: Florian Westphal <fw@strlen.de> |
| Signed-off-by: David S. Miller <davem@davemloft.net> |
| (cherry picked from commit 353c9cb360874e737fb000545f783df756c06f9a) |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| --- |
| include/net/inet_frag.h | 6 +++ |
| net/ipv4/ip_fragment.c | 73 ++++++++++++++++++++++++++++++++++++++++++++++++ |
| 2 files changed, 79 insertions(+) |
| |
| --- a/include/net/inet_frag.h |
| +++ b/include/net/inet_frag.h |
| @@ -57,7 +57,9 @@ struct frag_v6_compare_key { |
| * @lock: spinlock protecting this frag |
| * @refcnt: reference count of the queue |
| * @fragments: received fragments head |
| + * @rb_fragments: received fragments rb-tree root |
| * @fragments_tail: received fragments tail |
| + * @last_run_head: the head of the last "run". See ip_fragment.c |
| * @stamp: timestamp of the last received fragment |
| * @len: total length of the original datagram |
| * @meat: length of received fragments so far |
| @@ -78,6 +80,7 @@ struct inet_frag_queue { |
| struct sk_buff *fragments; /* Used in IPv6. */ |
| struct rb_root rb_fragments; /* Used in IPv4. */ |
| struct sk_buff *fragments_tail; |
| + struct sk_buff *last_run_head; |
| ktime_t stamp; |
| int len; |
| int meat; |
| @@ -113,6 +116,9 @@ void inet_frag_kill(struct inet_frag_que |
| void inet_frag_destroy(struct inet_frag_queue *q); |
| struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key); |
| |
| +/* Free all skbs in the rb-tree, including every fragment chained to a node; return the sum of their truesizes. */ |
| +unsigned int inet_frag_rbtree_purge(struct rb_root *root); |
| + |
| static inline void inet_frag_put(struct inet_frag_queue *q) |
| { |
| if (refcount_dec_and_test(&q->refcnt)) |
| --- a/net/ipv4/ip_fragment.c |
| +++ b/net/ipv4/ip_fragment.c |
| @@ -57,6 +57,57 @@ |
| */ |
| static const char ip_frag_cache_name[] = "ip4-frags"; |
| |
| +/* Use skb->cb to track consecutive/adjacent fragments coming at the end of |
| + * the queue. Nodes in the rb-tree queue will contain "runs" of one or more |
| + * adjacent fragments; FRAG_CB(skb) casts skb->cb to this layout. |
| + * |
| + * Invariants: |
| + * - next_frag is NULL at the tail of a "run"; |
| + * - the head of a "run" has the sum of all fragment lengths in frag_run_len. |
| + */ |
| +struct ipfrag_skb_cb { |
| + struct inet_skb_parm h; /* NOTE(review): presumably the regular IP control block, kept first - confirm */ |
| + struct sk_buff *next_frag; /* next fragment in the same run; NULL at the run's tail */ |
| + int frag_run_len; /* sum of skb->len over the run; maintained on the run head */ |
| +}; |
| + |
| +#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) |
| + |
| +static void ip4_frag_init_run(struct sk_buff *skb) |
| +{ |
| + BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); |
| + /* Make skb a run of its own: no next fragment, run length == skb->len. */ |
| + FRAG_CB(skb)->next_frag = NULL; |
| + FRAG_CB(skb)->frag_run_len = skb->len; |
| +} |
| + |
| +/* Append skb to the last "run"; q->last_run_head and q->fragments_tail are dereferenced unchecked, so a run must exist. */ |
| +static void ip4_frag_append_to_last_run(struct inet_frag_queue *q, |
| + struct sk_buff *skb) |
| +{ |
| + RB_CLEAR_NODE(&skb->rbnode); /* skb is chained via next_frag, not linked into the rb-tree here */ |
| + FRAG_CB(skb)->next_frag = NULL; /* skb becomes the new tail of the run */ |
| + |
| + FRAG_CB(q->last_run_head)->frag_run_len += skb->len; /* the run total lives on the run head */ |
| + FRAG_CB(q->fragments_tail)->next_frag = skb; /* chain skb after the previous tail */ |
| + q->fragments_tail = skb; |
| +} |
| + |
| +/* Create a new "run" with the skb and make it the last run (and tail fragment) of q. */ |
| +static void ip4_frag_create_run(struct inet_frag_queue *q, struct sk_buff *skb) |
| +{ |
| + if (q->last_run_head) |
| + rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, |
| + &q->last_run_head->rbnode.rb_right); /* right child of the current last run head */ |
| + else |
| + rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); /* no last run: link at the root slot (assumes an empty tree - confirm against callers) */ |
| + rb_insert_color(&skb->rbnode, &q->rb_fragments); /* recolor/rebalance after linking */ |
| + |
| + ip4_frag_init_run(skb); |
| + q->fragments_tail = skb; /* skb is the only fragment of the new run, hence also the tail */ |
| + q->last_run_head = skb; |
| +} |
| + |
| /* Describe an entry in the "incomplete datagrams" queue. */ |
| struct ipq { |
| struct inet_frag_queue q; |
| @@ -654,6 +705,28 @@ struct sk_buff *ip_check_defrag(struct n |
| } |
| EXPORT_SYMBOL(ip_check_defrag); |
| |
| +unsigned int inet_frag_rbtree_purge(struct rb_root *root) |
| +{ |
| + struct rb_node *p = rb_first(root); /* start the walk at the first node of the tree */ |
| + unsigned int sum = 0; /* accumulated truesize of every freed skb */ |
| + |
| + while (p) { |
| + struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); /* head of this node's run */ |
| + |
| + p = rb_next(p); /* advance before skb's node is erased and skb is freed */ |
| + rb_erase(&skb->rbnode, root); |
| + while (skb) { /* free the head and every fragment chained through next_frag */ |
| + struct sk_buff *next = FRAG_CB(skb)->next_frag; /* read skb->cb before kfree_skb() */ |
| + |
| + sum += skb->truesize; |
| + kfree_skb(skb); |
| + skb = next; |
| + } |
| + } |
| + return sum; |
| +} |
| +EXPORT_SYMBOL(inet_frag_rbtree_purge); |
| + |
| #ifdef CONFIG_SYSCTL |
| static int dist_min; |
| |