blob: 63394a38528ebea292655c801e68127a53177f77 [file] [log] [blame]
From 4eee57b42ebfae3524ab752b33b3b263b052eaee Mon Sep 17 00:00:00 2001
From: Willem de Bruijn <>
Date: Fri, 13 Mar 2020 12:18:09 -0400
Subject: [PATCH] net/packet: tpacket_rcv: avoid a producer race condition
commit 61fad6816fc10fb8793a925d5c1256d1c3db0cd2 upstream.
PACKET_RX_RING can cause multiple writers to access the same slot if a
fast writer wraps the ring while a slow writer is still copying. This
is particularly likely with few, large, slots (e.g., GSO packets).
Synchronize kernel thread ownership of rx ring slots with a bitmap.
Writers acquire a slot race-free by testing tp_status TP_STATUS_KERNEL
while holding the sk receive queue lock. They release this lock before
copying and set tp_status to TP_STATUS_USER to release to userspace
when done. During copying, another writer may take the lock, also see
TP_STATUS_KERNEL, and start writing to the same slot.
Introduce a new rx_owner_map bitmap with a bit per slot. To acquire a
slot, test and set with the lock held. To release race-free, update
tp_status and owner bit as a transaction, so take the lock again.
This is the one of a variety of discussed options (see Link below):
* instead of a shadow ring, embed the data in the slot itself, such as
in tp_padding. But any test for this field may match a value left by
userspace, causing deadlock.
* avoid the lock on release. This leaves a small race if releasing the
shadow slot before setting TP_STATUS_USER. The below reproducer showed
that this race is not academic. If releasing the slot after tp_status,
the race is more subtle. See the first link for details.
* add a new tp_status TP_KERNEL_OWNED to avoid the transactional store
of two fields. But, legacy applications may interpret all non-zero
tp_status as owned by the user. As libpcap does. So this is possible
only opt-in by newer processes. It can be added as an optional mode.
* embed the struct at the tail of pg_vec to avoid extra allocation.
The implementation proved no less complex than a separate field.
The additional locking cost on release adds contention, no different
than scaling on multicore or multiqueue h/w. In practice, below
reproducer nor small packet tcpdump showed a noticeable change in
perf report in cycles spent in spinlock. Where contention is
problematic, packet sockets support mitigation through PACKET_FANOUT.
And we can consider adding opt-in state TP_KERNEL_OWNED.
Easy to reproduce by running multiple netperf or similar TCP_STREAM
flows concurrently with `tcpdump -B 129 -n greater 60000`.
Based on an earlier patchset by Jon Rosen. See links below.
I believe this issue goes back to the introduction of tpacket_rcv,
which predates git history.
Suggested-by: Jon Rosen <>
Signed-off-by: Willem de Bruijn <>
Signed-off-by: Jon Rosen <>
Signed-off-by: David S. Miller <>
Signed-off-by: Paul Gortmaker <>
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 19b88199871b..3fc423b63d9b 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2167,6 +2167,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
struct timespec ts;
__u32 ts_status;
bool is_drop_n_account = false;
+ unsigned int slot_id = 0;
bool do_vnet = false;
/* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
@@ -2263,6 +2264,13 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
if (!h.raw)
goto drop_n_account;
+ if (po->tp_version <= TPACKET_V2) {
+ slot_id = po->rx_ring.head;
+ if (test_bit(slot_id, po->rx_ring.rx_owner_map))
+ goto drop_n_account;
+ __set_bit(slot_id, po->rx_ring.rx_owner_map);
+ }
if (do_vnet &&
virtio_net_hdr_from_skb(skb, h.raw + macoff -
sizeof(struct virtio_net_hdr),
@@ -2368,7 +2376,10 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
if (po->tp_version <= TPACKET_V2) {
+ spin_lock(&sk->sk_receive_queue.lock);
__packet_set_status(po, h.raw, status);
+ __clear_bit(slot_id, po->rx_ring.rx_owner_map);
+ spin_unlock(&sk->sk_receive_queue.lock);
} else {
@@ -4263,6 +4274,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
struct pgv *pg_vec = NULL;
struct packet_sock *po = pkt_sk(sk);
+ unsigned long *rx_owner_map = NULL;
int was_running, order = 0;
struct packet_ring_buffer *rb;
struct sk_buff_head *rb_queue;
@@ -4348,6 +4360,12 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
+ if (!tx_ring) {
+ rx_owner_map = bitmap_alloc(req->tp_frame_nr,
+ if (!rx_owner_map)
+ goto out_free_pg_vec;
+ }
@@ -4377,6 +4395,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
err = 0;
swap(rb->pg_vec, pg_vec);
+ if (po->tp_version <= TPACKET_V2)
+ swap(rb->rx_owner_map, rx_owner_map);
rb->frame_max = (req->tp_frame_nr - 1);
rb->head = 0;
rb->frame_size = req->tp_frame_size;
@@ -4408,6 +4428,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
+ bitmap_free(rx_owner_map);
if (pg_vec)
free_pg_vec(pg_vec, order, req->tp_block_nr);
diff --git a/net/packet/internal.h b/net/packet/internal.h
index c70a2794456f..f10294800aaf 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -70,7 +70,10 @@ struct packet_ring_buffer {
unsigned int __percpu *pending_refcnt;
- struct tpacket_kbdq_core prb_bdqc;
+ union {
+ unsigned long *rx_owner_map;
+ struct tpacket_kbdq_core prb_bdqc;
+ };
extern struct mutex fanout_mutex;