| From: Florian Westphal <fw@strlen.de> |
| Date: Thu, 18 Feb 2016 15:03:24 +0100 |
| Subject: netlink: remove mmapped netlink support |
| |
| commit d1b4c689d4130bcfd3532680b64db562300716b6 upstream. |
| |
| mmapped netlink has a number of unresolved issues: |
| |
| - TX zerocopy support had to be disabled more than a year ago via |
| commit 4682a0358639b29cf ("netlink: Always copy on mmap TX.") |
| because the content of the mmapped area can change after netlink |
| attribute validation but before message processing. |
| |
| - RX support was implemented mainly to speed up nfqueue dumping packet |
| payload to userspace. However, since commit ae08ce0021087a5d812d2 |
| ("netfilter: nfnetlink_queue: zero copy support") we avoid one copy |
| with the socket-based interface too (via the skb_zerocopy helper). |
| |
| The other problem is that skbs attached to an mmaped netlink socket |
| behave differently from normal skbs: |
| |
| - they don't have a shinfo area, so all functions that use skb_shinfo() |
| (e.g. skb_clone) cannot be used. |
| |
| - reserving headroom prevents userspace from seeing the content as |
| it expects message to start at skb->head. |
| See for instance |
| commit aa3a022094fa ("netlink: not trim skb for mmaped socket when dump"). |
| |
| - skbs handed e.g. to netlink_ack must have non-NULL skb->sk, else we |
| crash because it needs the sk to check if a tx ring is attached. |
| |
| This is also not obvious and leads to non-intuitive bug fixes such as 7c7bdf359 |
| ("netfilter: nfnetlink: use original skbuff when acking batches"). |
| |
| mmaped netlink also didn't play nicely with the skb_zerocopy helper |
| used by nfqueue and openvswitch. Daniel Borkmann fixed this via |
| commit 6bb0fef489f6 ("netlink, mmap: fix edge-case leakages in nf queue |
| zero-copy") but at the cost of also needing to provide remaining |
| length to the allocation function. |
| |
| nfqueue also has problems when used with mmaped rx netlink: |
| - mmaped netlink doesn't allow use of nfqueue batch verdict messages. |
| Problem is that in the mmap case, the allocation time also determines |
| the ordering in which the frame will be seen by userspace (A |
| allocating before B means that A is located in an earlier ring slot, |
| but this also means that B might get a lower sequence number than A |
| since seqno is decided later). To fix this we would need to extend the |
| spinlocked region to also cover the allocation and message setup which |
| isn't desirable. |
| - nfqueue can now be configured to queue large (GSO) skbs to userspace. |
| Queueing GSO packets is faster than having to force a software segmentation |
| in the kernel, so this is a desirable option. However, with a mmap based |
| ring one has to use 64kb per ring slot element, else mmap has to fall back |
| to the socket path (NL_MMAP_STATUS_COPY) for all large packets. |
| |
| To use the mmap interface, userspace not only has to probe for mmap netlink |
| support, it also has to implement a recv/socket receive path in order to |
| handle messages that exceed the size of an rx ring element. |
| |
| Cc: Daniel Borkmann <daniel@iogearbox.net> |
| Cc: Ken-ichirou MATSUZAWA <chamaken@gmail.com> |
| Cc: Pablo Neira Ayuso <pablo@netfilter.org> |
| Cc: Patrick McHardy <kaber@trash.net> |
| Cc: Thomas Graf <tgraf@suug.ch> |
| Signed-off-by: Florian Westphal <fw@strlen.de> |
| Signed-off-by: David S. Miller <davem@davemloft.net> |
| [bwh: Backported to 3.16: deleted code and documentation is different in places] |
| Signed-off-by: Ben Hutchings <ben@decadent.org.uk> |
| Cc: Shi Yuejie <shiyuejie@outlook.com> |
| --- |
| --- a/Documentation/networking/netlink_mmap.txt |
| +++ /dev/null |
| @@ -1,339 +0,0 @@ |
| -This file documents how to use memory mapped I/O with netlink. |
| - |
| -Author: Patrick McHardy <kaber@trash.net> |
| - |
| -Overview |
| --------- |
| - |
| -Memory mapped netlink I/O can be used to increase throughput and decrease |
| -overhead of unicast receive and transmit operations. Some netlink subsystems |
| -require high throughput, these are mainly the netfilter subsystems |
| -nfnetlink_queue and nfnetlink_log, but it can also help speed up large |
| -dump operations of f.i. the routing database. |
| - |
| -Memory mapped netlink I/O used two circular ring buffers for RX and TX which |
| -are mapped into the processes address space. |
| - |
| -The RX ring is used by the kernel to directly construct netlink messages into |
| -user-space memory without copying them as done with regular socket I/O, |
| -additionally as long as the ring contains messages no recvmsg() or poll() |
| -syscalls have to be issued by user-space to get more message. |
| - |
| -The TX ring is used to process messages directly from user-space memory, the |
| -kernel processes all messages contained in the ring using a single sendmsg() |
| -call. |
| - |
| -Usage overview |
| --------------- |
| - |
| -In order to use memory mapped netlink I/O, user-space needs three main changes: |
| - |
| -- ring setup |
| -- conversion of the RX path to get messages from the ring instead of recvmsg() |
| -- conversion of the TX path to construct messages into the ring |
| - |
| -Ring setup is done using setsockopt() to provide the ring parameters to the |
| -kernel, then a call to mmap() to map the ring into the processes address space: |
| - |
| -- setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, ¶ms, sizeof(params)); |
| -- setsockopt(fd, SOL_NETLINK, NETLINK_TX_RING, ¶ms, sizeof(params)); |
| -- ring = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0) |
| - |
| -Usage of either ring is optional, but even if only the RX ring is used the |
| -mapping still needs to be writable in order to update the frame status after |
| -processing. |
| - |
| -Conversion of the reception path involves calling poll() on the file |
| -descriptor, once the socket is readable the frames from the ring are |
| -processed in order until no more messages are available, as indicated by |
| -a status word in the frame header. |
| - |
| -On kernel side, in order to make use of memory mapped I/O on receive, the |
| -originating netlink subsystem needs to support memory mapped I/O, otherwise |
| -it will use an allocated socket buffer as usual and the contents will be |
| - copied to the ring on transmission, nullifying most of the performance gains. |
| -Dumps of kernel databases automatically support memory mapped I/O. |
| - |
| -Conversion of the transmit path involves changing message construction to |
| -use memory from the TX ring instead of (usually) a buffer declared on the |
| -stack and setting up the frame header appropriately. Optionally poll() can |
| -be used to wait for free frames in the TX ring. |
| - |
| -Structured and definitions for using memory mapped I/O are contained in |
| -<linux/netlink.h>. |
| - |
| -RX and TX rings |
| ----------------- |
| - |
| -Each ring contains a number of continuous memory blocks, containing frames of |
| -fixed size dependent on the parameters used for ring setup. |
| - |
| -Ring: [ block 0 ] |
| - [ frame 0 ] |
| - [ frame 1 ] |
| - [ block 1 ] |
| - [ frame 2 ] |
| - [ frame 3 ] |
| - ... |
| - [ block n ] |
| - [ frame 2 * n ] |
| - [ frame 2 * n + 1 ] |
| - |
| -The blocks are only visible to the kernel, from the point of view of user-space |
| -the ring just contains the frames in a continuous memory zone. |
| - |
| -The ring parameters used for setting up the ring are defined as follows: |
| - |
| -struct nl_mmap_req { |
| - unsigned int nm_block_size; |
| - unsigned int nm_block_nr; |
| - unsigned int nm_frame_size; |
| - unsigned int nm_frame_nr; |
| -}; |
| - |
| -Frames are grouped into blocks, where each block is a continuous region of memory |
| -and holds nm_block_size / nm_frame_size frames. The total number of frames in |
| -the ring is nm_frame_nr. The following invariants hold: |
| - |
| -- frames_per_block = nm_block_size / nm_frame_size |
| - |
| -- nm_frame_nr = frames_per_block * nm_block_nr |
| - |
| -Some parameters are constrained, specifically: |
| - |
| -- nm_block_size must be a multiple of the architectures memory page size. |
| - The getpagesize() function can be used to get the page size. |
| - |
| -- nm_frame_size must be equal or larger to NL_MMAP_HDRLEN, IOW a frame must be |
| - able to hold at least the frame header |
| - |
| -- nm_frame_size must be smaller or equal to nm_block_size |
| - |
| -- nm_frame_size must be a multiple of NL_MMAP_MSG_ALIGNMENT |
| - |
| -- nm_frame_nr must equal the actual number of frames as specified above. |
| - |
| -When the kernel can't allocate physically continuous memory for a ring block, |
| -it will fall back to use physically discontinuous memory. This might affect |
| -performance negatively, in order to avoid this the nm_frame_size parameter |
| -should be chosen to be as small as possible for the required frame size and |
| -the number of blocks should be increased instead. |
| - |
| -Ring frames |
| ------------- |
| - |
| -Each frames contain a frame header, consisting of a synchronization word and some |
| -meta-data, and the message itself. |
| - |
| -Frame: [ header message ] |
| - |
| -The frame header is defined as follows: |
| - |
| -struct nl_mmap_hdr { |
| - unsigned int nm_status; |
| - unsigned int nm_len; |
| - __u32 nm_group; |
| - /* credentials */ |
| - __u32 nm_pid; |
| - __u32 nm_uid; |
| - __u32 nm_gid; |
| -}; |
| - |
| -- nm_status is used for synchronizing processing between the kernel and user- |
| - space and specifies ownership of the frame as well as the operation to perform |
| - |
| -- nm_len contains the length of the message contained in the data area |
| - |
| -- nm_group specified the destination multicast group of message |
| - |
| -- nm_pid, nm_uid and nm_gid contain the netlink pid, UID and GID of the sending |
| - process. These values correspond to the data available using SOCK_PASSCRED in |
| - the SCM_CREDENTIALS cmsg. |
| - |
| -The possible values in the status word are: |
| - |
| -- NL_MMAP_STATUS_UNUSED: |
| - RX ring: frame belongs to the kernel and contains no message |
| - for user-space. Approriate action is to invoke poll() |
| - to wait for new messages. |
| - |
| - TX ring: frame belongs to user-space and can be used for |
| - message construction. |
| - |
| -- NL_MMAP_STATUS_RESERVED: |
| - RX ring only: frame is currently used by the kernel for message |
| - construction and contains no valid message yet. |
| - Appropriate action is to invoke poll() to wait for |
| - new messages. |
| - |
| -- NL_MMAP_STATUS_VALID: |
| - RX ring: frame contains a valid message. Approriate action is |
| - to process the message and release the frame back to |
| - the kernel by setting the status to |
| - NL_MMAP_STATUS_UNUSED or queue the frame by setting the |
| - status to NL_MMAP_STATUS_SKIP. |
| - |
| - TX ring: the frame contains a valid message from user-space to |
| - be processed by the kernel. After completing processing |
| - the kernel will release the frame back to user-space by |
| - setting the status to NL_MMAP_STATUS_UNUSED. |
| - |
| -- NL_MMAP_STATUS_COPY: |
| - RX ring only: a message is ready to be processed but could not be |
| - stored in the ring, either because it exceeded the |
| - frame size or because the originating subsystem does |
| - not support memory mapped I/O. Appropriate action is |
| - to invoke recvmsg() to receive the message and release |
| - the frame back to the kernel by setting the status to |
| - NL_MMAP_STATUS_UNUSED. |
| - |
| -- NL_MMAP_STATUS_SKIP: |
| - RX ring only: user-space queued the message for later processing, but |
| - processed some messages following it in the ring. The |
| - kernel should skip this frame when looking for unused |
| - frames. |
| - |
| -The data area of a frame begins at a offset of NL_MMAP_HDRLEN relative to the |
| -frame header. |
| - |
| -TX limitations |
| --------------- |
| - |
| -Kernel processing usually involves validation of the message received by |
| -user-space, then processing its contents. The kernel must assure that |
| -userspace is not able to modify the message contents after they have been |
| -validated. In order to do so, the message is copied from the ring frame |
| -to an allocated buffer if either of these conditions is false: |
| - |
| -- only a single mapping of the ring exists |
| -- the file descriptor is not shared between processes |
| - |
| -This means that for threaded programs, the kernel will fall back to copying. |
| - |
| -Example |
| -------- |
| - |
| -Ring setup: |
| - |
| - unsigned int block_size = 16 * getpagesize(); |
| - struct nl_mmap_req req = { |
| - .nm_block_size = block_size, |
| - .nm_block_nr = 64, |
| - .nm_frame_size = 16384, |
| - .nm_frame_nr = 64 * block_size / 16384, |
| - }; |
| - unsigned int ring_size; |
| - void *rx_ring, *tx_ring; |
| - |
| - /* Configure ring parameters */ |
| - if (setsockopt(fd, SOL_NETLINK, NETLINK_RX_RING, &req, sizeof(req)) < 0) |
| - exit(1); |
| - if (setsockopt(fd, SOL_NETLINK, NETLINK_TX_RING, &req, sizeof(req)) < 0) |
| - exit(1) |
| - |
| - /* Calculate size of each individual ring */ |
| - ring_size = req.nm_block_nr * req.nm_block_size; |
| - |
| - /* Map RX/TX rings. The TX ring is located after the RX ring */ |
| - rx_ring = mmap(NULL, 2 * ring_size, PROT_READ | PROT_WRITE, |
| - MAP_SHARED, fd, 0); |
| - if ((long)rx_ring == -1L) |
| - exit(1); |
| - tx_ring = rx_ring + ring_size: |
| - |
| -Message reception: |
| - |
| -This example assumes some ring parameters of the ring setup are available. |
| - |
| - unsigned int frame_offset = 0; |
| - struct nl_mmap_hdr *hdr; |
| - struct nlmsghdr *nlh; |
| - unsigned char buf[16384]; |
| - ssize_t len; |
| - |
| - while (1) { |
| - struct pollfd pfds[1]; |
| - |
| - pfds[0].fd = fd; |
| - pfds[0].events = POLLIN | POLLERR; |
| - pfds[0].revents = 0; |
| - |
| - if (poll(pfds, 1, -1) < 0 && errno != -EINTR) |
| - exit(1); |
| - |
| - /* Check for errors. Error handling omitted */ |
| - if (pfds[0].revents & POLLERR) |
| - <handle error> |
| - |
| - /* If no new messages, poll again */ |
| - if (!(pfds[0].revents & POLLIN)) |
| - continue; |
| - |
| - /* Process all frames */ |
| - while (1) { |
| - /* Get next frame header */ |
| - hdr = rx_ring + frame_offset; |
| - |
| - if (hdr->nm_status == NL_MMAP_STATUS_VALID) { |
| - /* Regular memory mapped frame */ |
| - nlh = (void *)hdr + NL_MMAP_HDRLEN; |
| - len = hdr->nm_len; |
| - |
| - /* Release empty message immediately. May happen |
| - * on error during message construction. |
| - */ |
| - if (len == 0) |
| - goto release; |
| - } else if (hdr->nm_status == NL_MMAP_STATUS_COPY) { |
| - /* Frame queued to socket receive queue */ |
| - len = recv(fd, buf, sizeof(buf), MSG_DONTWAIT); |
| - if (len <= 0) |
| - break; |
| - nlh = buf; |
| - } else |
| - /* No more messages to process, continue polling */ |
| - break; |
| - |
| - process_msg(nlh); |
| -release: |
| - /* Release frame back to the kernel */ |
| - hdr->nm_status = NL_MMAP_STATUS_UNUSED; |
| - |
| - /* Advance frame offset to next frame */ |
| - frame_offset = (frame_offset + frame_size) % ring_size; |
| - } |
| - } |
| - |
| -Message transmission: |
| - |
| -This example assumes some ring parameters of the ring setup are available. |
| -A single message is constructed and transmitted, to send multiple messages |
| -at once they would be constructed in consecutive frames before a final call |
| -to sendto(). |
| - |
| - unsigned int frame_offset = 0; |
| - struct nl_mmap_hdr *hdr; |
| - struct nlmsghdr *nlh; |
| - struct sockaddr_nl addr = { |
| - .nl_family = AF_NETLINK, |
| - }; |
| - |
| - hdr = tx_ring + frame_offset; |
| - if (hdr->nm_status != NL_MMAP_STATUS_UNUSED) |
| - /* No frame available. Use poll() to avoid. */ |
| - exit(1); |
| - |
| - nlh = (void *)hdr + NL_MMAP_HDRLEN; |
| - |
| - /* Build message */ |
| - build_message(nlh); |
| - |
| - /* Fill frame header: length and status need to be set */ |
| - hdr->nm_len = nlh->nlmsg_len; |
| - hdr->nm_status = NL_MMAP_STATUS_VALID; |
| - |
| - if (sendto(fd, NULL, 0, 0, &addr, sizeof(addr)) < 0) |
| - exit(1); |
| - |
| - /* Advance frame offset to next frame */ |
| - frame_offset = (frame_offset + frame_size) % ring_size; |
| --- a/include/uapi/linux/netlink.h |
| +++ b/include/uapi/linux/netlink.h |
| @@ -106,8 +106,10 @@ struct nlmsgerr { |
| #define NETLINK_PKTINFO 3 |
| #define NETLINK_BROADCAST_ERROR 4 |
| #define NETLINK_NO_ENOBUFS 5 |
| +#ifndef __KERNEL__ |
| #define NETLINK_RX_RING 6 |
| #define NETLINK_TX_RING 7 |
| +#endif |
| |
| struct nl_pktinfo { |
| __u32 group; |
| @@ -130,6 +132,7 @@ struct nl_mmap_hdr { |
| __u32 nm_gid; |
| }; |
| |
| +#ifndef __KERNEL__ |
| enum nl_mmap_status { |
| NL_MMAP_STATUS_UNUSED, |
| NL_MMAP_STATUS_RESERVED, |
| @@ -141,6 +144,7 @@ enum nl_mmap_status { |
| #define NL_MMAP_MSG_ALIGNMENT NLMSG_ALIGNTO |
| #define NL_MMAP_MSG_ALIGN(sz) __ALIGN_KERNEL(sz, NL_MMAP_MSG_ALIGNMENT) |
| #define NL_MMAP_HDRLEN NL_MMAP_MSG_ALIGN(sizeof(struct nl_mmap_hdr)) |
| +#endif |
| |
| #define NET_MAJOR 36 /* Major 36 is reserved for networking */ |
| |
| --- a/include/uapi/linux/netlink_diag.h |
| +++ b/include/uapi/linux/netlink_diag.h |
| @@ -48,6 +48,8 @@ enum { |
| |
| #define NDIAG_SHOW_MEMINFO 0x00000001 /* show memory info of a socket */ |
| #define NDIAG_SHOW_GROUPS 0x00000002 /* show groups of a netlink socket */ |
| +#ifndef __KERNEL__ |
| #define NDIAG_SHOW_RING_CFG 0x00000004 /* show ring configuration */ |
| +#endif |
| |
| #endif |
| --- a/net/netlink/Kconfig |
| +++ b/net/netlink/Kconfig |
| @@ -2,15 +2,6 @@ |
| # Netlink Sockets |
| # |
| |
| -config NETLINK_MMAP |
| - bool "NETLINK: mmaped IO" |
| - ---help--- |
| - This option enables support for memory mapped netlink IO. This |
| - reduces overhead by avoiding copying data between kernel- and |
| - userspace. |
| - |
| - If unsure, say N. |
| - |
| config NETLINK_DIAG |
| tristate "NETLINK: socket monitoring interface" |
| default n |
| --- a/net/netlink/af_netlink.c |
| +++ b/net/netlink/af_netlink.c |
| @@ -218,7 +218,7 @@ static int __netlink_deliver_tap_skb(str |
| |
| dev_hold(dev); |
| |
| - if (netlink_skb_is_mmaped(skb) || is_vmalloc_addr(skb->head)) |
| + if (is_vmalloc_addr(skb->head)) |
| nskb = netlink_to_full_skb(skb, GFP_ATOMIC); |
| else |
| nskb = skb_clone(skb, GFP_ATOMIC); |
| @@ -292,599 +292,8 @@ static void netlink_rcv_wake(struct sock |
| wake_up_interruptible(&nlk->wait); |
| } |
| |
| -#ifdef CONFIG_NETLINK_MMAP |
| -static bool netlink_rx_is_mmaped(struct sock *sk) |
| -{ |
| - return nlk_sk(sk)->rx_ring.pg_vec != NULL; |
| -} |
| - |
| -static bool netlink_tx_is_mmaped(struct sock *sk) |
| -{ |
| - return nlk_sk(sk)->tx_ring.pg_vec != NULL; |
| -} |
| - |
| -static __pure struct page *pgvec_to_page(const void *addr) |
| -{ |
| - if (is_vmalloc_addr(addr)) |
| - return vmalloc_to_page(addr); |
| - else |
| - return virt_to_page(addr); |
| -} |
| - |
| -static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len) |
| -{ |
| - unsigned int i; |
| - |
| - for (i = 0; i < len; i++) { |
| - if (pg_vec[i] != NULL) { |
| - if (is_vmalloc_addr(pg_vec[i])) |
| - vfree(pg_vec[i]); |
| - else |
| - free_pages((unsigned long)pg_vec[i], order); |
| - } |
| - } |
| - kfree(pg_vec); |
| -} |
| - |
| -static void *alloc_one_pg_vec_page(unsigned long order) |
| -{ |
| - void *buffer; |
| - gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO | |
| - __GFP_NOWARN | __GFP_NORETRY; |
| - |
| - buffer = (void *)__get_free_pages(gfp_flags, order); |
| - if (buffer != NULL) |
| - return buffer; |
| - |
| - buffer = vzalloc((1 << order) * PAGE_SIZE); |
| - if (buffer != NULL) |
| - return buffer; |
| - |
| - gfp_flags &= ~__GFP_NORETRY; |
| - return (void *)__get_free_pages(gfp_flags, order); |
| -} |
| - |
| -static void **alloc_pg_vec(struct netlink_sock *nlk, |
| - struct nl_mmap_req *req, unsigned int order) |
| -{ |
| - unsigned int block_nr = req->nm_block_nr; |
| - unsigned int i; |
| - void **pg_vec; |
| - |
| - pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL); |
| - if (pg_vec == NULL) |
| - return NULL; |
| - |
| - for (i = 0; i < block_nr; i++) { |
| - pg_vec[i] = alloc_one_pg_vec_page(order); |
| - if (pg_vec[i] == NULL) |
| - goto err1; |
| - } |
| - |
| - return pg_vec; |
| -err1: |
| - free_pg_vec(pg_vec, order, block_nr); |
| - return NULL; |
| -} |
| - |
| - |
| -static void |
| -__netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, bool tx_ring, void **pg_vec, |
| - unsigned int order) |
| -{ |
| - struct netlink_sock *nlk = nlk_sk(sk); |
| - struct sk_buff_head *queue; |
| - struct netlink_ring *ring; |
| - |
| - queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue; |
| - ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring; |
| - |
| - spin_lock_bh(&queue->lock); |
| - |
| - ring->frame_max = req->nm_frame_nr - 1; |
| - ring->head = 0; |
| - ring->frame_size = req->nm_frame_size; |
| - ring->pg_vec_pages = req->nm_block_size / PAGE_SIZE; |
| - |
| - swap(ring->pg_vec_len, req->nm_block_nr); |
| - swap(ring->pg_vec_order, order); |
| - swap(ring->pg_vec, pg_vec); |
| - |
| - __skb_queue_purge(queue); |
| - spin_unlock_bh(&queue->lock); |
| - |
| - WARN_ON(atomic_read(&nlk->mapped)); |
| - |
| - if (pg_vec) |
| - free_pg_vec(pg_vec, order, req->nm_block_nr); |
| -} |
| - |
| -static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, |
| - bool tx_ring) |
| -{ |
| - struct netlink_sock *nlk = nlk_sk(sk); |
| - struct netlink_ring *ring; |
| - void **pg_vec = NULL; |
| - unsigned int order = 0; |
| - |
| - ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring; |
| - |
| - if (atomic_read(&nlk->mapped)) |
| - return -EBUSY; |
| - if (atomic_read(&ring->pending)) |
| - return -EBUSY; |
| - |
| - if (req->nm_block_nr) { |
| - if (ring->pg_vec != NULL) |
| - return -EBUSY; |
| - |
| - if ((int)req->nm_block_size <= 0) |
| - return -EINVAL; |
| - if (!IS_ALIGNED(req->nm_block_size, PAGE_SIZE)) |
| - return -EINVAL; |
| - if (req->nm_frame_size < NL_MMAP_HDRLEN) |
| - return -EINVAL; |
| - if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT)) |
| - return -EINVAL; |
| - |
| - ring->frames_per_block = req->nm_block_size / |
| - req->nm_frame_size; |
| - if (ring->frames_per_block == 0) |
| - return -EINVAL; |
| - if (ring->frames_per_block * req->nm_block_nr != |
| - req->nm_frame_nr) |
| - return -EINVAL; |
| - |
| - order = get_order(req->nm_block_size); |
| - pg_vec = alloc_pg_vec(nlk, req, order); |
| - if (pg_vec == NULL) |
| - return -ENOMEM; |
| - } else { |
| - if (req->nm_frame_nr) |
| - return -EINVAL; |
| - } |
| - |
| - mutex_lock(&nlk->pg_vec_lock); |
| - if (atomic_read(&nlk->mapped) == 0) { |
| - __netlink_set_ring(sk, req, tx_ring, pg_vec, order); |
| - mutex_unlock(&nlk->pg_vec_lock); |
| - return 0; |
| - } |
| - |
| - mutex_unlock(&nlk->pg_vec_lock); |
| - |
| - if (pg_vec) |
| - free_pg_vec(pg_vec, order, req->nm_block_nr); |
| - |
| - return -EBUSY; |
| -} |
| - |
| -static void netlink_mm_open(struct vm_area_struct *vma) |
| -{ |
| - struct file *file = vma->vm_file; |
| - struct socket *sock = file->private_data; |
| - struct sock *sk = sock->sk; |
| - |
| - if (sk) |
| - atomic_inc(&nlk_sk(sk)->mapped); |
| -} |
| - |
| -static void netlink_mm_close(struct vm_area_struct *vma) |
| -{ |
| - struct file *file = vma->vm_file; |
| - struct socket *sock = file->private_data; |
| - struct sock *sk = sock->sk; |
| - |
| - if (sk) |
| - atomic_dec(&nlk_sk(sk)->mapped); |
| -} |
| - |
| -static const struct vm_operations_struct netlink_mmap_ops = { |
| - .open = netlink_mm_open, |
| - .close = netlink_mm_close, |
| -}; |
| - |
| -static int netlink_mmap(struct file *file, struct socket *sock, |
| - struct vm_area_struct *vma) |
| -{ |
| - struct sock *sk = sock->sk; |
| - struct netlink_sock *nlk = nlk_sk(sk); |
| - struct netlink_ring *ring; |
| - unsigned long start, size, expected; |
| - unsigned int i; |
| - int err = -EINVAL; |
| - |
| - if (vma->vm_pgoff) |
| - return -EINVAL; |
| - |
| - mutex_lock(&nlk->pg_vec_lock); |
| - |
| - expected = 0; |
| - for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) { |
| - if (ring->pg_vec == NULL) |
| - continue; |
| - expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE; |
| - } |
| - |
| - if (expected == 0) |
| - goto out; |
| - |
| - size = vma->vm_end - vma->vm_start; |
| - if (size != expected) |
| - goto out; |
| - |
| - start = vma->vm_start; |
| - for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) { |
| - if (ring->pg_vec == NULL) |
| - continue; |
| - |
| - for (i = 0; i < ring->pg_vec_len; i++) { |
| - struct page *page; |
| - void *kaddr = ring->pg_vec[i]; |
| - unsigned int pg_num; |
| - |
| - for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) { |
| - page = pgvec_to_page(kaddr); |
| - err = vm_insert_page(vma, start, page); |
| - if (err < 0) |
| - goto out; |
| - start += PAGE_SIZE; |
| - kaddr += PAGE_SIZE; |
| - } |
| - } |
| - } |
| - |
| - atomic_inc(&nlk->mapped); |
| - vma->vm_ops = &netlink_mmap_ops; |
| - err = 0; |
| -out: |
| - mutex_unlock(&nlk->pg_vec_lock); |
| - return err; |
| -} |
| - |
| -static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr, unsigned int nm_len) |
| -{ |
| -#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1 |
| - struct page *p_start, *p_end; |
| - |
| - /* First page is flushed through netlink_{get,set}_status */ |
| - p_start = pgvec_to_page(hdr + PAGE_SIZE); |
| - p_end = pgvec_to_page((void *)hdr + NL_MMAP_HDRLEN + nm_len - 1); |
| - while (p_start <= p_end) { |
| - flush_dcache_page(p_start); |
| - p_start++; |
| - } |
| -#endif |
| -} |
| - |
| -static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr) |
| -{ |
| - smp_rmb(); |
| - flush_dcache_page(pgvec_to_page(hdr)); |
| - return hdr->nm_status; |
| -} |
| - |
| -static void netlink_set_status(struct nl_mmap_hdr *hdr, |
| - enum nl_mmap_status status) |
| -{ |
| - smp_mb(); |
| - hdr->nm_status = status; |
| - flush_dcache_page(pgvec_to_page(hdr)); |
| -} |
| - |
| -static struct nl_mmap_hdr * |
| -__netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos) |
| -{ |
| - unsigned int pg_vec_pos, frame_off; |
| - |
| - pg_vec_pos = pos / ring->frames_per_block; |
| - frame_off = pos % ring->frames_per_block; |
| - |
| - return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size); |
| -} |
| - |
| -static struct nl_mmap_hdr * |
| -netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos, |
| - enum nl_mmap_status status) |
| -{ |
| - struct nl_mmap_hdr *hdr; |
| - |
| - hdr = __netlink_lookup_frame(ring, pos); |
| - if (netlink_get_status(hdr) != status) |
| - return NULL; |
| - |
| - return hdr; |
| -} |
| - |
| -static struct nl_mmap_hdr * |
| -netlink_current_frame(const struct netlink_ring *ring, |
| - enum nl_mmap_status status) |
| -{ |
| - return netlink_lookup_frame(ring, ring->head, status); |
| -} |
| - |
| -static struct nl_mmap_hdr * |
| -netlink_previous_frame(const struct netlink_ring *ring, |
| - enum nl_mmap_status status) |
| -{ |
| - unsigned int prev; |
| - |
| - prev = ring->head ? ring->head - 1 : ring->frame_max; |
| - return netlink_lookup_frame(ring, prev, status); |
| -} |
| - |
| -static void netlink_increment_head(struct netlink_ring *ring) |
| -{ |
| - ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0; |
| -} |
| - |
| -static void netlink_forward_ring(struct netlink_ring *ring) |
| -{ |
| - unsigned int head = ring->head, pos = head; |
| - const struct nl_mmap_hdr *hdr; |
| - |
| - do { |
| - hdr = __netlink_lookup_frame(ring, pos); |
| - if (hdr->nm_status == NL_MMAP_STATUS_UNUSED) |
| - break; |
| - if (hdr->nm_status != NL_MMAP_STATUS_SKIP) |
| - break; |
| - netlink_increment_head(ring); |
| - } while (ring->head != head); |
| -} |
| - |
| -static bool netlink_dump_space(struct netlink_sock *nlk) |
| -{ |
| - struct netlink_ring *ring = &nlk->rx_ring; |
| - struct nl_mmap_hdr *hdr; |
| - unsigned int n; |
| - |
| - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); |
| - if (hdr == NULL) |
| - return false; |
| - |
| - n = ring->head + ring->frame_max / 2; |
| - if (n > ring->frame_max) |
| - n -= ring->frame_max; |
| - |
| - hdr = __netlink_lookup_frame(ring, n); |
| - |
| - return hdr->nm_status == NL_MMAP_STATUS_UNUSED; |
| -} |
| - |
| -static unsigned int netlink_poll(struct file *file, struct socket *sock, |
| - poll_table *wait) |
| -{ |
| - struct sock *sk = sock->sk; |
| - struct netlink_sock *nlk = nlk_sk(sk); |
| - unsigned int mask; |
| - int err; |
| - |
| - if (nlk->rx_ring.pg_vec != NULL) { |
| - /* Memory mapped sockets don't call recvmsg(), so flow control |
| - * for dumps is performed here. A dump is allowed to continue |
| - * if at least half the ring is unused. |
| - */ |
| - while (nlk->cb_running && netlink_dump_space(nlk)) { |
| - err = netlink_dump(sk); |
| - if (err < 0) { |
| - sk->sk_err = -err; |
| - sk->sk_error_report(sk); |
| - break; |
| - } |
| - } |
| - netlink_rcv_wake(sk); |
| - } |
| - |
| - mask = datagram_poll(file, sock, wait); |
| - |
| - spin_lock_bh(&sk->sk_receive_queue.lock); |
| - if (nlk->rx_ring.pg_vec) { |
| - netlink_forward_ring(&nlk->rx_ring); |
| - if (!netlink_previous_frame(&nlk->rx_ring, NL_MMAP_STATUS_UNUSED)) |
| - mask |= POLLIN | POLLRDNORM; |
| - } |
| - spin_unlock_bh(&sk->sk_receive_queue.lock); |
| - |
| - spin_lock_bh(&sk->sk_write_queue.lock); |
| - if (nlk->tx_ring.pg_vec) { |
| - if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED)) |
| - mask |= POLLOUT | POLLWRNORM; |
| - } |
| - spin_unlock_bh(&sk->sk_write_queue.lock); |
| - |
| - return mask; |
| -} |
| - |
| -static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb) |
| -{ |
| - return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN); |
| -} |
| - |
| -static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk, |
| - struct netlink_ring *ring, |
| - struct nl_mmap_hdr *hdr) |
| -{ |
| - unsigned int size; |
| - void *data; |
| - |
| - size = ring->frame_size - NL_MMAP_HDRLEN; |
| - data = (void *)hdr + NL_MMAP_HDRLEN; |
| - |
| - skb->head = data; |
| - skb->data = data; |
| - skb_reset_tail_pointer(skb); |
| - skb->end = skb->tail + size; |
| - skb->len = 0; |
| - |
| - skb->destructor = netlink_skb_destructor; |
| - NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED; |
| - NETLINK_CB(skb).sk = sk; |
| -} |
| - |
| -static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg, |
| - u32 dst_portid, u32 dst_group, |
| - struct sock_iocb *siocb) |
| -{ |
| - struct netlink_sock *nlk = nlk_sk(sk); |
| - struct netlink_ring *ring; |
| - struct nl_mmap_hdr *hdr; |
| - struct sk_buff *skb; |
| - unsigned int maxlen; |
| - int err = 0, len = 0; |
| - |
| - mutex_lock(&nlk->pg_vec_lock); |
| - |
| - ring = &nlk->tx_ring; |
| - maxlen = ring->frame_size - NL_MMAP_HDRLEN; |
| - |
| - do { |
| - unsigned int nm_len; |
| - |
| - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID); |
| - if (hdr == NULL) { |
| - if (!(msg->msg_flags & MSG_DONTWAIT) && |
| - atomic_read(&nlk->tx_ring.pending)) |
| - schedule(); |
| - continue; |
| - } |
| - |
| - nm_len = ACCESS_ONCE(hdr->nm_len); |
| - if (nm_len > maxlen) { |
| - err = -EINVAL; |
| - goto out; |
| - } |
| - |
| - netlink_frame_flush_dcache(hdr, nm_len); |
| - |
| - skb = alloc_skb(nm_len, GFP_KERNEL); |
| - if (skb == NULL) { |
| - err = -ENOBUFS; |
| - goto out; |
| - } |
| - __skb_put(skb, nm_len); |
| - memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, nm_len); |
| - netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED); |
| - |
| - netlink_increment_head(ring); |
| - |
| - NETLINK_CB(skb).portid = nlk->portid; |
| - NETLINK_CB(skb).dst_group = dst_group; |
| - NETLINK_CB(skb).creds = siocb->scm->creds; |
| - |
| - err = security_netlink_send(sk, skb); |
| - if (err) { |
| - kfree_skb(skb); |
| - goto out; |
| - } |
| - |
| - if (unlikely(dst_group)) { |
| - atomic_inc(&skb->users); |
| - netlink_broadcast(sk, skb, dst_portid, dst_group, |
| - GFP_KERNEL); |
| - } |
| - err = netlink_unicast(sk, skb, dst_portid, |
| - msg->msg_flags & MSG_DONTWAIT); |
| - if (err < 0) |
| - goto out; |
| - len += err; |
| - |
| - } while (hdr != NULL || |
| - (!(msg->msg_flags & MSG_DONTWAIT) && |
| - atomic_read(&nlk->tx_ring.pending))); |
| - |
| - if (len > 0) |
| - err = len; |
| -out: |
| - mutex_unlock(&nlk->pg_vec_lock); |
| - return err; |
| -} |
| - |
| -static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb) |
| -{ |
| - struct nl_mmap_hdr *hdr; |
| - |
| - hdr = netlink_mmap_hdr(skb); |
| - hdr->nm_len = skb->len; |
| - hdr->nm_group = NETLINK_CB(skb).dst_group; |
| - hdr->nm_pid = NETLINK_CB(skb).creds.pid; |
| - hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid); |
| - hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid); |
| - netlink_frame_flush_dcache(hdr, hdr->nm_len); |
| - netlink_set_status(hdr, NL_MMAP_STATUS_VALID); |
| - |
| - NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED; |
| - kfree_skb(skb); |
| -} |
| - |
| -static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb) |
| -{ |
| - struct netlink_sock *nlk = nlk_sk(sk); |
| - struct netlink_ring *ring = &nlk->rx_ring; |
| - struct nl_mmap_hdr *hdr; |
| - |
| - spin_lock_bh(&sk->sk_receive_queue.lock); |
| - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); |
| - if (hdr == NULL) { |
| - spin_unlock_bh(&sk->sk_receive_queue.lock); |
| - kfree_skb(skb); |
| - netlink_overrun(sk); |
| - return; |
| - } |
| - netlink_increment_head(ring); |
| - __skb_queue_tail(&sk->sk_receive_queue, skb); |
| - spin_unlock_bh(&sk->sk_receive_queue.lock); |
| - |
| - hdr->nm_len = skb->len; |
| - hdr->nm_group = NETLINK_CB(skb).dst_group; |
| - hdr->nm_pid = NETLINK_CB(skb).creds.pid; |
| - hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid); |
| - hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid); |
| - netlink_set_status(hdr, NL_MMAP_STATUS_COPY); |
| -} |
| - |
| -#else /* CONFIG_NETLINK_MMAP */ |
| -#define netlink_rx_is_mmaped(sk) false |
| -#define netlink_tx_is_mmaped(sk) false |
| -#define netlink_mmap sock_no_mmap |
| -#define netlink_poll datagram_poll |
| -#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, siocb) 0 |
| -#endif /* CONFIG_NETLINK_MMAP */ |
| - |
| static void netlink_skb_destructor(struct sk_buff *skb) |
| { |
| -#ifdef CONFIG_NETLINK_MMAP |
| - struct nl_mmap_hdr *hdr; |
| - struct netlink_ring *ring; |
| - struct sock *sk; |
| - |
| - /* If a packet from the kernel to userspace was freed because of an |
| - * error without being delivered to userspace, the kernel must reset |
| - * the status. In the direction userspace to kernel, the status is |
| - * always reset here after the packet was processed and freed. |
| - */ |
| - if (netlink_skb_is_mmaped(skb)) { |
| - hdr = netlink_mmap_hdr(skb); |
| - sk = NETLINK_CB(skb).sk; |
| - |
| - if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) { |
| - netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED); |
| - ring = &nlk_sk(sk)->tx_ring; |
| - } else { |
| - if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) { |
| - hdr->nm_len = 0; |
| - netlink_set_status(hdr, NL_MMAP_STATUS_VALID); |
| - } |
| - ring = &nlk_sk(sk)->rx_ring; |
| - } |
| - |
| - WARN_ON(atomic_read(&ring->pending) == 0); |
| - atomic_dec(&ring->pending); |
| - sock_put(sk); |
| - |
| - skb->head = NULL; |
| - } |
| -#endif |
| if (is_vmalloc_addr(skb->head)) { |
| if (!skb->cloned || |
| !atomic_dec_return(&(skb_shinfo(skb)->dataref))) |
| @@ -918,18 +327,6 @@ static void netlink_sock_destruct(struct |
| } |
| |
| skb_queue_purge(&sk->sk_receive_queue); |
| -#ifdef CONFIG_NETLINK_MMAP |
| - if (1) { |
| - struct nl_mmap_req req; |
| - |
| - memset(&req, 0, sizeof(req)); |
| - if (nlk->rx_ring.pg_vec) |
| - __netlink_set_ring(sk, &req, false, NULL, 0); |
| - memset(&req, 0, sizeof(req)); |
| - if (nlk->tx_ring.pg_vec) |
| - __netlink_set_ring(sk, &req, true, NULL, 0); |
| - } |
| -#endif /* CONFIG_NETLINK_MMAP */ |
| |
| if (!sock_flag(sk, SOCK_DEAD)) { |
| printk(KERN_ERR "Freeing alive netlink socket %p\n", sk); |
| @@ -1202,9 +599,6 @@ static int __netlink_create(struct net * |
| mutex_init(nlk->cb_mutex); |
| } |
| init_waitqueue_head(&nlk->wait); |
| -#ifdef CONFIG_NETLINK_MMAP |
| - mutex_init(&nlk->pg_vec_lock); |
| -#endif |
| |
| sk->sk_destruct = netlink_sock_destruct; |
| sk->sk_protocol = protocol; |
| @@ -1708,8 +1102,7 @@ int netlink_attachskb(struct sock *sk, s |
| nlk = nlk_sk(sk); |
| |
| if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf || |
| - test_bit(NETLINK_CONGESTED, &nlk->state)) && |
| - !netlink_skb_is_mmaped(skb)) { |
| + test_bit(NETLINK_CONGESTED, &nlk->state))) { |
| DECLARE_WAITQUEUE(wait, current); |
| if (!*timeo) { |
| if (!ssk || netlink_is_kernel(ssk)) |
| @@ -1747,14 +1140,7 @@ static int __netlink_sendskb(struct sock |
| |
| netlink_deliver_tap(skb); |
| |
| -#ifdef CONFIG_NETLINK_MMAP |
| - if (netlink_skb_is_mmaped(skb)) |
| - netlink_queue_mmaped_skb(sk, skb); |
| - else if (netlink_rx_is_mmaped(sk)) |
| - netlink_ring_set_copied(sk, skb); |
| - else |
| -#endif /* CONFIG_NETLINK_MMAP */ |
| - skb_queue_tail(&sk->sk_receive_queue, skb); |
| + skb_queue_tail(&sk->sk_receive_queue, skb); |
| sk->sk_data_ready(sk); |
| return len; |
| } |
| @@ -1778,9 +1164,6 @@ static struct sk_buff *netlink_trim(stru |
| int delta; |
| |
| WARN_ON(skb->sk != NULL); |
| - if (netlink_skb_is_mmaped(skb)) |
| - return skb; |
| - |
| delta = skb->end - skb->tail; |
| if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize) |
| return skb; |
| @@ -1860,71 +1243,6 @@ struct sk_buff *__netlink_alloc_skb(stru |
| unsigned int ldiff, u32 dst_portid, |
| gfp_t gfp_mask) |
| { |
| -#ifdef CONFIG_NETLINK_MMAP |
| - unsigned int maxlen, linear_size; |
| - struct sock *sk = NULL; |
| - struct sk_buff *skb; |
| - struct netlink_ring *ring; |
| - struct nl_mmap_hdr *hdr; |
| - |
| - sk = netlink_getsockbyportid(ssk, dst_portid); |
| - if (IS_ERR(sk)) |
| - goto out; |
| - |
| - ring = &nlk_sk(sk)->rx_ring; |
| - /* fast-path without atomic ops for common case: non-mmaped receiver */ |
| - if (ring->pg_vec == NULL) |
| - goto out_put; |
| - |
| - /* We need to account the full linear size needed as a ring |
| - * slot cannot have non-linear parts. |
| - */ |
| - linear_size = size + ldiff; |
| - if (ring->frame_size - NL_MMAP_HDRLEN < linear_size) |
| - goto out_put; |
| - |
| - skb = alloc_skb_head(gfp_mask); |
| - if (skb == NULL) |
| - goto err1; |
| - |
| - spin_lock_bh(&sk->sk_receive_queue.lock); |
| - /* check again under lock */ |
| - if (ring->pg_vec == NULL) |
| - goto out_free; |
| - |
| - /* check again under lock */ |
| - maxlen = ring->frame_size - NL_MMAP_HDRLEN; |
| - if (maxlen < linear_size) |
| - goto out_free; |
| - |
| - netlink_forward_ring(ring); |
| - hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED); |
| - if (hdr == NULL) |
| - goto err2; |
| - |
| - netlink_ring_setup_skb(skb, sk, ring, hdr); |
| - netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED); |
| - atomic_inc(&ring->pending); |
| - netlink_increment_head(ring); |
| - |
| - spin_unlock_bh(&sk->sk_receive_queue.lock); |
| - return skb; |
| - |
| -err2: |
| - kfree_skb(skb); |
| - spin_unlock_bh(&sk->sk_receive_queue.lock); |
| - netlink_overrun(sk); |
| -err1: |
| - sock_put(sk); |
| - return NULL; |
| - |
| -out_free: |
| - kfree_skb(skb); |
| - spin_unlock_bh(&sk->sk_receive_queue.lock); |
| -out_put: |
| - sock_put(sk); |
| -out: |
| -#endif |
| return alloc_skb(size, gfp_mask); |
| } |
| EXPORT_SYMBOL_GPL(__netlink_alloc_skb); |
| @@ -2189,8 +1507,7 @@ static int netlink_setsockopt(struct soc |
| if (level != SOL_NETLINK) |
| return -ENOPROTOOPT; |
| |
| - if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING && |
| - optlen >= sizeof(int) && |
| + if (optlen >= sizeof(int) && |
| get_user(val, (unsigned int __user *)optval)) |
| return -EFAULT; |
| |
| @@ -2243,25 +1560,6 @@ static int netlink_setsockopt(struct soc |
| } |
| err = 0; |
| break; |
| -#ifdef CONFIG_NETLINK_MMAP |
| - case NETLINK_RX_RING: |
| - case NETLINK_TX_RING: { |
| - struct nl_mmap_req req; |
| - |
| - /* Rings might consume more memory than queue limits, require |
| - * CAP_NET_ADMIN. |
| - */ |
| - if (!capable(CAP_NET_ADMIN)) |
| - return -EPERM; |
| - if (optlen < sizeof(req)) |
| - return -EINVAL; |
| - if (copy_from_user(&req, optval, sizeof(req))) |
| - return -EFAULT; |
| - err = netlink_set_ring(sk, &req, |
| - optname == NETLINK_TX_RING); |
| - break; |
| - } |
| -#endif /* CONFIG_NETLINK_MMAP */ |
| default: |
| err = -ENOPROTOOPT; |
| } |
| @@ -2374,13 +1672,6 @@ static int netlink_sendmsg(struct kiocb |
| goto out; |
| } |
| |
| - if (netlink_tx_is_mmaped(sk) && |
| - msg->msg_iov->iov_base == NULL) { |
| - err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, |
| - siocb); |
| - goto out; |
| - } |
| - |
| err = -EMSGSIZE; |
| if (len > sk->sk_sndbuf - 32) |
| goto out; |
| @@ -2704,8 +1995,7 @@ static int netlink_dump(struct sock *sk) |
| goto errout_skb; |
| } |
| |
| - if (!netlink_rx_is_mmaped(sk) && |
| - atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) |
| + if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) |
| goto errout_skb; |
| |
| /* NLMSG_GOODSIZE is small to avoid high order allocations being |
| @@ -2740,8 +2030,7 @@ static int netlink_dump(struct sock *sk) |
| * reasonable static buffer based on the expected largest dump of a |
| * single netdev. The outcome is MSG_TRUNC error. |
| */ |
| - if (!netlink_rx_is_mmaped(sk)) |
| - skb_reserve(skb, skb_tailroom(skb) - alloc_size); |
| + skb_reserve(skb, skb_tailroom(skb) - alloc_size); |
| netlink_skb_set_owner_r(skb, sk); |
| |
| len = cb->dump(skb, cb); |
| @@ -2795,16 +2084,7 @@ int __netlink_dump_start(struct sock *ss |
| struct netlink_sock *nlk; |
| int ret; |
| |
| - /* Memory mapped dump requests need to be copied to avoid looping |
| - * on the pending state in netlink_mmap_sendmsg() while the CB hold |
| - * a reference to the skb. |
| - */ |
| - if (netlink_skb_is_mmaped(skb)) { |
| - skb = skb_copy(skb, GFP_KERNEL); |
| - if (skb == NULL) |
| - return -ENOBUFS; |
| - } else |
| - atomic_inc(&skb->users); |
| + atomic_inc(&skb->users); |
| |
| sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid); |
| if (sk == NULL) { |
| @@ -3140,7 +2420,7 @@ static const struct proto_ops netlink_op |
| .socketpair = sock_no_socketpair, |
| .accept = sock_no_accept, |
| .getname = netlink_getname, |
| - .poll = netlink_poll, |
| + .poll = datagram_poll, |
| .ioctl = sock_no_ioctl, |
| .listen = sock_no_listen, |
| .shutdown = sock_no_shutdown, |
| @@ -3148,7 +2428,7 @@ static const struct proto_ops netlink_op |
| .getsockopt = netlink_getsockopt, |
| .sendmsg = netlink_sendmsg, |
| .recvmsg = netlink_recvmsg, |
| - .mmap = netlink_mmap, |
| + .mmap = sock_no_mmap, |
| .sendpage = sock_no_sendpage, |
| }; |
| |
| --- a/net/netlink/af_netlink.h |
| +++ b/net/netlink/af_netlink.h |
| @@ -41,12 +41,6 @@ struct netlink_sock { |
| int (*netlink_bind)(int group); |
| void (*netlink_unbind)(int group); |
| struct module *module; |
| -#ifdef CONFIG_NETLINK_MMAP |
| - struct mutex pg_vec_lock; |
| - struct netlink_ring rx_ring; |
| - struct netlink_ring tx_ring; |
| - atomic_t mapped; |
| -#endif /* CONFIG_NETLINK_MMAP */ |
| }; |
| |
| static inline struct netlink_sock *nlk_sk(struct sock *sk) |
| @@ -67,15 +61,6 @@ struct nl_portid_hash { |
| u32 rnd; |
| }; |
| |
| -static inline bool netlink_skb_is_mmaped(const struct sk_buff *skb) |
| -{ |
| -#ifdef CONFIG_NETLINK_MMAP |
| - return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED; |
| -#else |
| - return false; |
| -#endif /* CONFIG_NETLINK_MMAP */ |
| -} |
| - |
| struct netlink_table { |
| struct nl_portid_hash hash; |
| struct hlist_head mc_list; |
| --- a/net/netlink/diag.c |
| +++ b/net/netlink/diag.c |
| @@ -7,41 +7,6 @@ |
| |
| #include "af_netlink.h" |
| |
| -#ifdef CONFIG_NETLINK_MMAP |
| -static int sk_diag_put_ring(struct netlink_ring *ring, int nl_type, |
| - struct sk_buff *nlskb) |
| -{ |
| - struct netlink_diag_ring ndr; |
| - |
| - ndr.ndr_block_size = ring->pg_vec_pages << PAGE_SHIFT; |
| - ndr.ndr_block_nr = ring->pg_vec_len; |
| - ndr.ndr_frame_size = ring->frame_size; |
| - ndr.ndr_frame_nr = ring->frame_max + 1; |
| - |
| - return nla_put(nlskb, nl_type, sizeof(ndr), &ndr); |
| -} |
| - |
| -static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb) |
| -{ |
| - struct netlink_sock *nlk = nlk_sk(sk); |
| - int ret; |
| - |
| - mutex_lock(&nlk->pg_vec_lock); |
| - ret = sk_diag_put_ring(&nlk->rx_ring, NETLINK_DIAG_RX_RING, nlskb); |
| - if (!ret) |
| - ret = sk_diag_put_ring(&nlk->tx_ring, NETLINK_DIAG_TX_RING, |
| - nlskb); |
| - mutex_unlock(&nlk->pg_vec_lock); |
| - |
| - return ret; |
| -} |
| -#else |
| -static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb) |
| -{ |
| - return 0; |
| -} |
| -#endif |
| - |
| static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb) |
| { |
| struct netlink_sock *nlk = nlk_sk(sk); |
| @@ -86,10 +51,6 @@ static int sk_diag_fill(struct sock *sk, |
| sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO)) |
| goto out_nlmsg_trim; |
| |
| - if ((req->ndiag_show & NDIAG_SHOW_RING_CFG) && |
| - sk_diag_put_rings_cfg(sk, skb)) |
| - goto out_nlmsg_trim; |
| - |
| return nlmsg_end(skb, nlh); |
| |
| out_nlmsg_trim: |