From 4eee57b42ebfae3524ab752b33b3b263b052eaee Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <willemb@google.com>
Date: Fri, 13 Mar 2020 12:18:09 -0400
Subject: [PATCH] net/packet: tpacket_rcv: avoid a producer race condition

commit 61fad6816fc10fb8793a925d5c1256d1c3db0cd2 upstream.

PACKET_RX_RING can cause multiple writers to access the same slot if a
fast writer wraps the ring while a slow writer is still copying. This
is particularly likely with few, large slots (e.g., with GSO packets).

Synchronize kernel thread ownership of rx ring slots with a bitmap.

Writers acquire a slot race-free by testing that tp_status is
TP_STATUS_KERNEL while holding the sk receive queue lock. They release
this lock before copying and set tp_status to TP_STATUS_USER to
release to userspace when done. During the copy, another writer may
take the lock, also see TP_STATUS_KERNEL, and start writing to the
same slot.
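
For illustration, a problematic interleaving looks roughly like this
(a simplified sketch, not verbatim kernel behavior):

  slow writer                       fast writer(s) + userspace
  -----------                       --------------------------
  lock; slot N is TP_STATUS_KERNEL
  advance head; unlock
  copy packet into slot N ...
                                    fill slots N+1 .. N-1; userspace
                                    consumes them, setting each back
                                    to TP_STATUS_KERNEL; head wraps
                                    lock; slot N is TP_STATUS_KERNEL (!)
                                    advance head; unlock
                                    copy into slot N  <-- second writer
  set slot N to TP_STATUS_USER
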
Introduce a new rx_owner_map bitmap with a bit per slot. To acquire a
slot, test and set the bit with the lock held. To release race-free,
update tp_status and the owner bit as one transaction, so take the
lock again.
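
In outline, the acquire/release pairing then becomes (simplified from
the diff below; surrounding context and error paths trimmed):

	/* acquire: caller holds sk->sk_receive_queue.lock */
	slot_id = po->rx_ring.head;
	if (test_bit(slot_id, po->rx_ring.rx_owner_map))
		goto drop_n_account;	/* still owned by a slow writer */
	__set_bit(slot_id, po->rx_ring.rx_owner_map);

	/* release: retake the lock so that setting tp_status to
	 * TP_STATUS_USER and clearing the owner bit appear as one
	 * transaction to other writers
	 */
	spin_lock(&sk->sk_receive_queue.lock);
	__packet_set_status(po, h.raw, status);
	__clear_bit(slot_id, po->rx_ring.rx_owner_map);
	spin_unlock(&sk->sk_receive_queue.lock);
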
This is one of a variety of options discussed (see Link below):

* instead of a shadow ring, embed the data in the slot itself, such as
in tp_padding. But any test for this field may match a value left by
userspace, causing deadlock.

* avoid the lock on release. This leaves a small race if the shadow
slot is released before setting TP_STATUS_USER. The reproducer below
showed that this race is not academic. If the slot is released after
setting tp_status, the race is more subtle. See the first link for
details.

* add a new tp_status flag TP_KERNEL_OWNED to avoid the transactional
store of two fields. But legacy applications may interpret any
non-zero tp_status as owned by the user, as libpcap does. So this is
possible only as an opt-in for newer processes. It can be added as an
optional mode.

* embed the struct at the tail of pg_vec to avoid extra allocation.
The implementation proved no less complex than a separate field.

The additional locking cost on release adds contention, no different
from scaling on multicore or multiqueue hardware. In practice, neither
the reproducer below nor small-packet tcpdump showed a noticeable
change in the cycles spent in spinlocks in a perf report. Where
contention is problematic, packet sockets support mitigation through
PACKET_FANOUT, sketched below. And we can consider adding the opt-in
state TP_KERNEL_OWNED.
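
For reference, a fanout group is joined from userspace with a
setsockopt; a minimal sketch (the socket setup is omitted and the
group id 42 is an arbitrary example):

	/* 'fd' is an already-bound AF_PACKET socket; the low 16 bits
	 * carry the group id, the high 16 bits the fanout mode
	 */
	int val = 42 | (PACKET_FANOUT_HASH << 16);
	if (setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &val, sizeof(val)))
		perror("setsockopt PACKET_FANOUT");
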
This is easy to reproduce by running multiple netperf or similar
TCP_STREAM flows concurrently with `tcpdump -B 129 -n greater 60000`.

Based on an earlier patchset by Jon Rosen. See links below.

I believe this issue goes back to the introduction of tpacket_rcv,
which predates git history.

Link: https://www.mail-archive.com/netdev@vger.kernel.org/msg237222.html
Suggested-by: Jon Rosen <jrosen@cisco.com>
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: Jon Rosen <jrosen@cisco.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com>

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 19b88199871b..3fc423b63d9b 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2167,6 +2167,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	struct timespec ts;
 	__u32 ts_status;
 	bool is_drop_n_account = false;
+	unsigned int slot_id = 0;
 	bool do_vnet = false;
 
 	/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
@@ -2263,6 +2264,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 	if (!h.raw)
 		goto drop_n_account;
 
+	if (po->tp_version <= TPACKET_V2) {
+		slot_id = po->rx_ring.head;
+		if (test_bit(slot_id, po->rx_ring.rx_owner_map))
+			goto drop_n_account;
+		__set_bit(slot_id, po->rx_ring.rx_owner_map);
+	}
+
 	if (do_vnet &&
 	    virtio_net_hdr_from_skb(skb, h.raw + macoff -
 				    sizeof(struct virtio_net_hdr),
@@ -2368,7 +2376,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 #endif
 
 	if (po->tp_version <= TPACKET_V2) {
+		spin_lock(&sk->sk_receive_queue.lock);
 		__packet_set_status(po, h.raw, status);
+		__clear_bit(slot_id, po->rx_ring.rx_owner_map);
+		spin_unlock(&sk->sk_receive_queue.lock);
 		sk->sk_data_ready(sk);
 	} else {
 		prb_clear_blk_fill_status(&po->rx_ring);
@@ -4263,6 +4274,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 {
 	struct pgv *pg_vec = NULL;
 	struct packet_sock *po = pkt_sk(sk);
+	unsigned long *rx_owner_map = NULL;
 	int was_running, order = 0;
 	struct packet_ring_buffer *rb;
 	struct sk_buff_head *rb_queue;
@@ -4348,6 +4360,12 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 			}
 			break;
 		default:
+			if (!tx_ring) {
+				rx_owner_map = bitmap_alloc(req->tp_frame_nr,
+					GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO);
+				if (!rx_owner_map)
+					goto out_free_pg_vec;
+			}
 			break;
 		}
 	}
@@ -4377,6 +4395,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 	err = 0;
 	spin_lock_bh(&rb_queue->lock);
 	swap(rb->pg_vec, pg_vec);
+	if (po->tp_version <= TPACKET_V2)
+		swap(rb->rx_owner_map, rx_owner_map);
 	rb->frame_max = (req->tp_frame_nr - 1);
 	rb->head = 0;
 	rb->frame_size = req->tp_frame_size;
@@ -4408,6 +4428,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 	}
 
 out_free_pg_vec:
+	bitmap_free(rx_owner_map);
 	if (pg_vec)
 		free_pg_vec(pg_vec, order, req->tp_block_nr);
 out:
diff --git a/net/packet/internal.h b/net/packet/internal.h
index c70a2794456f..f10294800aaf 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -70,7 +70,10 @@ struct packet_ring_buffer {
 
 	unsigned int __percpu *pending_refcnt;
 
-	struct tpacket_kbdq_core prb_bdqc;
+	union {
+		unsigned long *rx_owner_map;
+		struct tpacket_kbdq_core prb_bdqc;
+	};
 };
 
 extern struct mutex fanout_mutex;
-- 
2.7.4