pipe: Use a linked list instead of a ring buffer

Use a linked list of pipe_buffers rather than a ring, allocating them as
we need them.  We cache one on the pipe_inode_info struct for fast reuse
by ordinary pipe writes, just as we can cache a spare page.

Doing this allows the pipe_buffer to have an integral, variable-sized
bio_vec array pointing to the content of the buffer, allowing a buffer to
point to multiple folios.

Having a pipe_buffer that can point to multiple pages then allows splice
to append an entire splice segment consisting of multiple pages as a
single pipe buffer.  The pipe buffer's bvec can then be passed directly
to, say, sendmsg() with MSG_SPLICE_PAGES when splicing from the pipe,
allowing the socket code to be a bit more efficient.

Signed-off-by: David Howells <dhowells@redhat.com>
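[Editor's note: for orientation, here is a minimal userspace model of the
buffer layout this patch introduces.  It is not part of the patch; kernel
types (struct folio, the real bio_vec) are stubbed out, and alloc_buf() and
consume() are hypothetical stand-ins for pipe_alloc_buffer() and
pipe_consume().  The point is the trailing flexible bvec[] array and the
index/nr cursor that consumption advances across multiple folios.]

```c
#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

struct folio;				/* opaque stand-in for the kernel type */

struct bio_vec {
	struct folio *bv_folio;
	unsigned int bv_len;
	unsigned int bv_offset;
};

struct pipe_buffer {
	size_t size;			/* total bytes across all bvecs */
	unsigned short index;		/* next bvec to consume */
	unsigned short nr;		/* bvecs in use */
	unsigned short max;		/* capacity of bvec[] */
	struct bio_vec bvec[];		/* flexible array, as in the patch */
};

static struct pipe_buffer *alloc_buf(unsigned short max)
{
	/* The kernel uses struct_size(); offsetof + array size is equivalent. */
	struct pipe_buffer *buf = calloc(1, offsetof(struct pipe_buffer, bvec) +
					 max * sizeof(struct bio_vec));
	if (buf)
		buf->max = max;
	return buf;
}

/* Model of pipe_consume(): retire 'consumed' bytes, advancing the cursor. */
static int consume(struct pipe_buffer *buf, size_t consumed)
{
	buf->size -= consumed;
	while (consumed) {
		struct bio_vec *bv = &buf->bvec[buf->index];
		size_t part = consumed < bv->bv_len ? consumed : bv->bv_len;

		bv->bv_len -= part;
		bv->bv_offset += part;
		consumed -= part;
		if (bv->bv_len)
			break;
		buf->index++;		/* this folio is fully used up */
	}
	return buf->size == 0;
}

int main(void)
{
	struct pipe_buffer *buf = alloc_buf(2);

	buf->bvec[0] = (struct bio_vec){ .bv_len = 4096 };
	buf->bvec[1] = (struct bio_vec){ .bv_len = 4096 };
	buf->nr = 2;
	buf->size = 8192;

	consume(buf, 5000);	/* crosses the first folio into the second */
	printf("index=%u remaining=%zu\n", (unsigned)buf->index, buf->size);
	free(buf);
	return 0;
}
```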
diff --git a/fs/internal.h b/fs/internal.h
index f7a74cc..8508f85 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -178,6 +178,7 @@ extern void shrink_dentry_list(struct list_head *);
 extern const struct file_operations pipefifo_fops;
 long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg);
 struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice);
+void wakeup_pipe_readers(struct pipe_inode_info *pipe);
 
 /*
  * fs_pin.c
diff --git a/fs/pipe.c b/fs/pipe.c
index 4427903..5473528 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -129,31 +129,43 @@ void pipe_double_lock(struct pipe_inode_info *pipe1,
 	}
 }
 
+void wakeup_pipe_readers(struct pipe_inode_info *pipe)
+{
+	smp_mb();
+	if (waitqueue_active(&pipe->rd_wait))
+		wake_up_interruptible(&pipe->rd_wait);
+	kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
+}
+
 static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
 				  struct pipe_buffer *buf)
 {
-	struct page *page = buf->page;
+	unsigned int i;
 
-	/*
-	 * If nobody else uses this page, and we don't already have a
-	 * temporary page, let's keep track of it as a one-deep
-	 * allocation cache. (Otherwise just release our reference to it)
-	 */
-	if (page_count(page) == 1 && !pipe->tmp_page)
-		pipe->tmp_page = page;
-	else
-		put_page(page);
+	for (i = 0; i < buf->nr; i++) {
+		struct folio *folio = buf->bvec[i].bv_folio;
+
+		/*
+		 * If nobody else uses this folio, and we don't already have a
+		 * spare folio, let's keep track of it as a one-deep
+		 * allocation cache. (Otherwise just release our reference to it)
+		 */
+		if (folio_ref_count(folio) == 1 && !pipe->spare_folio)
+			pipe->spare_folio = folio;
+		else
+			folio_put(folio);
+	}
 }
 
 static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe,
 				    struct pipe_buffer *buf)
 {
-	struct page *page = buf->page;
+	struct folio *folio = buf->bvec[buf->index].bv_folio;
 
-	if (page_count(page) != 1)
+	if (folio_ref_count(folio) != 1)
 		return false;
-	memcg_kmem_uncharge_page(page, 0);
-	__SetPageLocked(page);
+	memcg_kmem_uncharge_page(folio_page(folio, 0), 0);
+	__folio_lock(folio);
 	return true;
 }
 
@@ -172,15 +184,15 @@ static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe,
 bool generic_pipe_buf_try_steal(struct pipe_inode_info *pipe,
 				struct pipe_buffer *buf)
 {
-	struct page *page = buf->page;
+	struct folio *folio = buf->bvec[buf->index].bv_folio;
 
 	/*
 	 * A reference of one is golden, that means that the owner of this
 	 * page is the only one holding a reference to it. lock the page
 	 * and return OK.
 	 */
-	if (page_count(page) == 1) {
-		lock_page(page);
+	if (folio_ref_count(folio) == 1) {
+		__folio_lock(folio);
 		return true;
 	}
 	return false;
@@ -199,7 +211,7 @@ EXPORT_SYMBOL(generic_pipe_buf_try_steal);
  */
 bool generic_pipe_buf_get(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
 {
-	return try_get_page(buf->page);
+	unsigned int i;
+
+	for (i = buf->index; i < buf->nr; i++) {
+		if (!folio_try_get(buf->bvec[i].bv_folio))
+			goto undo;
+	}
+	return true;
+
+undo:
+	while (i-- > buf->index)
+		folio_put(buf->bvec[i].bv_folio);
+	return false;
 }
 EXPORT_SYMBOL(generic_pipe_buf_get);
 
@@ -214,7 +226,10 @@ EXPORT_SYMBOL(generic_pipe_buf_get);
 void generic_pipe_buf_release(struct pipe_inode_info *pipe,
 			      struct pipe_buffer *buf)
 {
-	put_page(buf->page);
+	unsigned int i;
+
+	for (i = 0; i < buf->nr; i++)
+		folio_put(buf->bvec[i].bv_folio);
 }
 EXPORT_SYMBOL(generic_pipe_buf_release);
 
@@ -238,17 +253,19 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = {
  */
size_t pipe_query_space(struct pipe_inode_info *pipe, size_t *len, int *error)
 {
-	size_t used = pipe_occupancy(pipe->head, pipe->tail);
-	size_t npages = max_t(ssize_t, pipe->max_usage - used, 0);
+	size_t npages;
 
 	if (unlikely(!pipe->readers)) {
 		send_sig(SIGPIPE, current, 0);
 		*error = -EPIPE;
 		return 0;
 	}
-
-	if (npages == 0)
+	if (pipe->footprint >= pipe->max_footprint) {
 		*error = -EAGAIN;
+		return 0;
+	}
+
+	npages = pipe->max_footprint - pipe->footprint;
 	*len = min_t(size_t, *len, npages * PAGE_SIZE);
 	return npages;
 }
@@ -264,17 +281,8 @@ EXPORT_SYMBOL(pipe_query_space);
  */
 size_t pipe_query_content(struct pipe_inode_info *pipe, size_t *len)
 {
-	unsigned int head = pipe->head;
-	unsigned int tail = pipe->tail;
-	size_t size = 0, used = pipe_occupancy(head, tail);
-
-	while (!pipe_empty(head, tail)) {
-		size += pipe_buf(pipe, tail)->len;
-		tail++;
-	}
-
-	*len = size;
-	return used;
+	*len = pipe->content;
+	return pipe->footprint;
 }
 EXPORT_SYMBOL(pipe_query_content);
 
@@ -296,12 +304,41 @@ struct pipe_buffer *pipe_alloc_buffer(struct pipe_inode_info *pipe,
 				      const struct pipe_buf_operations *ops,
 				      size_t bvcount, gfp_t gfp, int *error)
 {
 	struct pipe_buffer *buf;
+	size_t size = struct_size(buf, bvec, bvcount);
 
-	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+	if (pipe_full(pipe))
 		return NULL;
-	buf = pipe_head_buf(pipe);
-	memset(buf, 0, sizeof(*buf));
-	buf->ops = ops;
+
+	if (bvcount < 1)
+		bvcount = 1;
+
+	if (pipe->spare_buffer) {
+		spin_lock_irq(&pipe->rd_wait.lock);
+		buf = pipe->spare_buffer;
+		if (buf) {
+			if (buf->max >= bvcount)
+				pipe->spare_buffer = NULL;
+			else
+				buf = NULL;
+		}
+		spin_unlock_irq(&pipe->rd_wait.lock);
+		if (buf) {
+			bvcount = buf->max;
+			memset(buf, 0, struct_size(buf, bvec, bvcount));
+			INIT_LIST_HEAD(&buf->queue_link);
+			buf->ops = ops;
+			buf->max = bvcount;
+			return buf;
+		}
+	}
+
+	buf = kzalloc(size, gfp);
+	if (!buf) {
+		*error = -ENOMEM;
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&buf->queue_link);
+	buf->ops = ops;
+	buf->max = bvcount;
 	return buf;
 }
 EXPORT_SYMBOL(pipe_alloc_buffer);
@@ -323,23 +360,43 @@ EXPORT_SYMBOL(pipe_alloc_buffer);
 ssize_t pipe_add(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
		 bool *full)
 {
-	unsigned int head = pipe->head;
-	unsigned int tail = pipe->tail;
+	if (buf->size == 0 || WARN_ON(pipe_full(pipe)))
+		goto discard;
 
-	if (WARN_ON(pipe_full(head, tail, pipe->max_usage)))
-		goto error;
+	spin_lock_irq(&pipe->rd_wait.lock);
+	list_add_tail(&buf->queue_link, &pipe->queue);
+	pipe->footprint += buf->footprint;
+	pipe->content += buf->size;
+	*full = pipe_full(pipe);
+	spin_unlock_irq(&pipe->rd_wait.lock);
+	return buf->size;
 
-	pipe->head = head + 1;
-	*full = pipe_full(head, tail, pipe->max_usage);
-	return buf->len;
-
-error:
+discard:
 	pipe_buf_release(pipe, buf);
-	*full = true;
-	return -EAGAIN;
+	*full = pipe_full(pipe);
+	return 0;
 }
 EXPORT_SYMBOL(pipe_add);
 
+/**
+ * pipe_buf_release - put a reference to a pipe_buffer
+ * @pipe: the pipe that the buffer belongs to
+ * @buf: the buffer to put a reference to
+ */
+void pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf)
+{
+	const struct pipe_buf_operations *ops = buf->ops;
+
+	if (ops)
+		ops->release(pipe, buf);
+	if (buf->index >= buf->nr) {
+		spin_lock_irq(&pipe->rd_wait.lock);
+		if (!list_empty(&buf->queue_link)) {
+			pipe->footprint -= buf->footprint;
+			pipe->content -= buf->size;
+			list_del_init(&buf->queue_link);
+		}
+		if (!pipe->spare_buffer) {
+			pipe->spare_buffer = buf;
+			buf = NULL;
+		}
+		spin_unlock_irq(&pipe->rd_wait.lock);
+		kfree(buf);
+	}
+}
+
 #ifdef CONFIG_WATCH_QUEUE
 /**
  * pipe_set_lost_mark - Mark the pipe as having lost some data
@@ -356,39 +413,96 @@ void pipe_set_lost_mark(struct pipe_inode_info *pipe)
 {
 	struct pipe_buffer *buf;
 
-	if (pipe_empty(pipe->head, pipe->tail)) {
+	spin_lock_irq(&pipe->rd_wait.lock);
+	if (pipe_empty(pipe)) {
 		pipe->note_loss = true;
 	} else {
-		buf = pipe_buf(pipe, pipe->head - 1);
+		buf = list_last_entry(&pipe->queue, struct pipe_buffer, queue_link);
 		buf->flags |= PIPE_BUF_FLAG_LOSS;
 	}
+	spin_unlock_irq(&pipe->rd_wait.lock);
 }
 #endif
 
 /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
 static inline bool pipe_readable(const struct pipe_inode_info *pipe)
 {
-	unsigned int head = READ_ONCE(pipe->head);
-	unsigned int tail = READ_ONCE(pipe->tail);
-	unsigned int writers = READ_ONCE(pipe->writers);
-
-	return !pipe_empty(head, tail) || !writers;
+	return !pipe_empty(pipe) || !READ_ONCE(pipe->writers);
 }
 
-static ssize_t
-pipe_read(struct kiocb *iocb, struct iov_iter *to)
+/*
+ * Deal with the consumption of some data from a pipe buffer.  Returns true if
+ * we've consumed all the data.
+ */
+bool pipe_consume(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
+		  size_t consumed)
 {
-	size_t total_len = iov_iter_count(to);
+	if (WARN_ON_ONCE(consumed > buf->size))
+		consumed = buf->size;
+	buf->size -= consumed;
+	pipe->content -= consumed;
+
+	while (consumed > 0) {
+		struct bio_vec *bv = &buf->bvec[buf->index];
+		size_t part = min_t(size_t, consumed, bv->bv_len);
+
+		bv->bv_len -= part;
+		bv->bv_offset += part;
+		consumed -= part;
+
+		if (bv->bv_len > 0)
+			break;
+
+		buf->ops->release(pipe, buf);
+		buf->index++;
+	}
+
+	return buf->size == 0;
+}
+
+/*
+ * Copy data from a pipe buffer into an iterator, confirming the folios in the
+ * buffer as we use them and releasing them when we've used them.
+ */
+static ssize_t pipe_copy_buf_to_iter(struct pipe_inode_info *pipe,
+				     struct pipe_buffer *buf,
+				     struct iov_iter *iter)
+{
+	size_t part, n, copied = 0;
+	int ret = 0;
+
+	while (buf->size && iov_iter_count(iter)) {
+		struct bio_vec *bv = &buf->bvec[buf->index];
+
+		if (buf->nr_confirmed <= buf->index) {
+			ret = pipe_buf_confirm(pipe, buf);
+			if (ret < 0)
+				break;
+		}
+
+		part = min_t(size_t, bv->bv_len, iov_iter_count(iter));
+		n = copy_folio_to_iter(bv->bv_folio, bv->bv_offset, part, iter);
+		if (unlikely(n < part)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		copied += n;
+		pipe_consume(pipe, buf, n);
+	}
+
+	return copied ?: ret;
+}
+
+static ssize_t pipe_read(struct kiocb *iocb, struct iov_iter *iter)
+{
 	struct file *filp = iocb->ki_filp;
 	struct pipe_inode_info *pipe = filp->private_data;
-	bool was_full, wake_next_reader = false;
-	ssize_t ret;
+	bool was_full, wake_next_reader = false, stop;
+	ssize_t copied = 0, ret = 0;
 
 	/* Null read succeeds. */
-	if (unlikely(total_len == 0))
+	if (unlikely(!iov_iter_count(iter)))
 		return 0;
 
-	ret = 0;
 	__pipe_lock(pipe);
 
 	/*
@@ -399,90 +513,57 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
 	 * (WF_SYNC), because we want them to get going and generate more
 	 * data for us.
 	 */
-	was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
+	was_full = pipe_full(pipe);
 	for (;;) {
-		/* Read ->head with a barrier vs post_one_notification() */
-		unsigned int head = smp_load_acquire(&pipe->head);
-		unsigned int tail = pipe->tail;
-		unsigned int mask = pipe->ring_size - 1;
+		struct pipe_buffer *buf;
 
 #ifdef CONFIG_WATCH_QUEUE
 		if (pipe->note_loss) {
 			struct watch_notification n;
 
-			if (total_len < 8) {
-				if (ret == 0)
-					ret = -ENOBUFS;
+			if (iov_iter_count(iter) < 8) {
+				ret = -ENOBUFS;
 				break;
 			}
 
 			n.type = WATCH_TYPE_META;
 			n.subtype = WATCH_META_LOSS_NOTIFICATION;
 			n.info = watch_sizeof(n);
-			if (copy_to_iter(&n, sizeof(n), to) != sizeof(n)) {
+			if (copy_to_iter(&n, sizeof(n), iter) != sizeof(n)) {
 				if (ret == 0)
 					ret = -EFAULT;
 				break;
 			}
-			ret += sizeof(n);
-			total_len -= sizeof(n);
+			copied += sizeof(n);
 			pipe->note_loss = false;
 		}
 #endif
 
-		if (!pipe_empty(head, tail)) {
-			struct pipe_buffer *buf = &pipe->bufs[tail & mask];
-			size_t chars = buf->len;
-			size_t written;
-			int error;
-
-			if (chars > total_len) {
-				if (buf->flags & PIPE_BUF_FLAG_WHOLE) {
-					if (ret == 0)
-						ret = -ENOBUFS;
-					break;
-				}
-				chars = total_len;
-			}
-
-			error = pipe_buf_confirm(pipe, buf);
-			if (error) {
-				if (!ret)
-					ret = error;
-				break;
-			}
-
-			written = copy_page_to_iter(buf->page, buf->offset, chars, to);
-			if (unlikely(written < chars)) {
-				if (!ret)
-					ret = -EFAULT;
-				break;
-			}
-			ret += chars;
-			buf->offset += chars;
-			buf->len -= chars;
+		buf = pipe_head_buf(pipe);
+		if (buf) {
+			if (buf->ops->copy_to_iter)
+				ret = buf->ops->copy_to_iter(pipe, buf, iter);
+			else
+				ret = pipe_copy_buf_to_iter(pipe, buf, iter);
+			if (ret > 0)
+				copied += ret;
 
 			/* Was it a packet buffer? Clean up and exit */
-			if (buf->flags & PIPE_BUF_FLAG_PACKET) {
-				total_len = chars;
-				buf->len = 0;
-			}
+			stop = buf->flags & PIPE_BUF_FLAG_PACKET;
+			if (stop)
+				pipe_consume(pipe, buf, buf->size);
 
-			if (!buf->len) {
-				pipe_buf_release(pipe, buf);
-				spin_lock_irq(&pipe->rd_wait.lock);
+			if (!buf->size) {
 #ifdef CONFIG_WATCH_QUEUE
 				if (buf->flags & PIPE_BUF_FLAG_LOSS)
 					pipe->note_loss = true;
 #endif
-				tail++;
-				pipe->tail = tail;
-				spin_unlock_irq(&pipe->rd_wait.lock);
+				pipe_buf_release(pipe, buf);
 			}
-			total_len -= chars;
-			if (!total_len)
+
+			if (stop || !iov_iter_count(iter))
 				break;	/* common path: read succeeded */
-			if (!pipe_empty(head, tail))	/* More to do? */
+			if (!pipe_empty(pipe))	/* More to do? */
 				continue;
 		}
 
@@ -527,10 +608,10 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
 			return -ERESTARTSYS;
 
 		__pipe_lock(pipe);
-		was_full = pipe_full(pipe->head, pipe->tail, pipe->max_usage);
+		was_full = pipe_full(pipe);
 		wake_next_reader = true;
 	}
-	if (pipe_empty(pipe->head, pipe->tail))
+	if (pipe_empty(pipe))
 		wake_next_reader = false;
 	__pipe_unlock(pipe);
 
@@ -541,7 +622,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to)
 		kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT);
 	if (ret > 0)
 		file_accessed(filp);
-	return ret;
+	return copied ?: ret;
 }
 
 static inline int is_packetized(struct file *file)
@@ -552,25 +633,47 @@ static inline int is_packetized(struct file *file)
 /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */
 static inline bool pipe_writable(const struct pipe_inode_info *pipe)
 {
-	unsigned int head = READ_ONCE(pipe->head);
-	unsigned int tail = READ_ONCE(pipe->tail);
-	unsigned int max_usage = READ_ONCE(pipe->max_usage);
-
-	return !pipe_full(head, tail, max_usage) ||
-	       !READ_ONCE(pipe->readers);
+	return !pipe_full(pipe) || !READ_ONCE(pipe->readers);
 }
 
-static ssize_t
-pipe_write(struct kiocb *iocb, struct iov_iter *from)
+/*
+ * copy_iter_to_folio - Copy data from an iterator into a folio
+ * @iter: Source iterator
+ * @folio: Destination folio
+ * @offset: Offset within the folio to start writing
+ * @len: Amount to copy
+ */
+static ssize_t copy_iter_to_folio(struct iov_iter *iter, struct folio *folio,
+				  size_t offset, size_t len)
+{
+	size_t copied = 0;
+
+	while (len > 0 && iov_iter_count(iter) > 0) {
+		size_t pnum = offset / PAGE_SIZE;
+		size_t poff = offset & ~PAGE_MASK;
+		size_t part = min3(len, PAGE_SIZE - poff, iov_iter_count(iter));
+		size_t n;
+
+		n = copy_page_from_iter(folio_page(folio, pnum), poff, part, iter);
+		offset += n;
+		copied += n;
+		len -= n;
+		if (n < part)
+			return copied ?: -EFAULT;
+	}
+
+	return copied;
+}
+
+static ssize_t pipe_write(struct kiocb *iocb, struct iov_iter *from)
 {
 	struct file *filp = iocb->ki_filp;
 	struct pipe_inode_info *pipe = filp->private_data;
-	unsigned int head;
-	ssize_t ret = 0;
 	size_t total_len = iov_iter_count(from);
-	ssize_t chars;
+	ssize_t written = 0, chars;
 	bool was_empty = false;
 	bool wake_next_writer = false;
+	bool full = pipe_full(pipe);
+	int ret = 0;
 
 	/* Null write succeeds. */
 	if (unlikely(total_len == 0))
@@ -599,27 +702,28 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
 	 * page-aligns the rest of the writes for large writes
 	 * spanning multiple pages.
 	 */
-	head = pipe->head;
-	was_empty = pipe_empty(head, pipe->tail);
+	was_empty = pipe_empty(pipe);
 	chars = total_len & (PAGE_SIZE-1);
 	if (chars && !was_empty) {
-		unsigned int mask = pipe->ring_size - 1;
-		struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask];
-		int offset = buf->offset + buf->len;
+		struct pipe_buffer *buf =
			list_last_entry(&pipe->queue,
					struct pipe_buffer, queue_link);
+		struct bio_vec *bv = &buf->bvec[buf->nr - 1];
+		size_t offset = bv->bv_offset + bv->bv_len;
 
 		if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) &&
-		    offset + chars <= PAGE_SIZE) {
+		    offset + chars <= folio_size(bv->bv_folio)) {
 			ret = pipe_buf_confirm(pipe, buf);
 			if (ret)
 				goto out;
 
-			ret = copy_page_from_iter(buf->page, offset, chars, from);
+			ret = copy_iter_to_folio(from, bv->bv_folio, offset, chars);
 			if (unlikely(ret < chars)) {
 				ret = -EFAULT;
 				goto out;
 			}
 
-			buf->len += ret;
+			bv->bv_len += ret;
+			buf->size += ret;
+			pipe->content += ret;
+			written += ret;
 			if (!iov_iter_count(from))
 				goto out;
 		}
	}
 
 	for (;;) {
 		if (!pipe->readers) {
 			send_sig(SIGPIPE, current, 0);
-			if (!ret)
-				ret = -EPIPE;
+			ret = -EPIPE;
 			break;
 		}
 
-		head = pipe->head;
-		if (!pipe_full(head, pipe->tail, pipe->max_usage)) {
-			unsigned int mask = pipe->ring_size - 1;
-			struct pipe_buffer *buf = &pipe->bufs[head & mask];
-			struct page *page = pipe->tmp_page;
-			int copied;
+		if (!full) {
+			struct pipe_buffer *buf;
+			struct folio *folio;
+			ssize_t copied;
+			size_t part;
 
-			if (!page) {
-				page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT);
-				if (unlikely(!page)) {
-					ret = ret ? : -ENOMEM;
+			buf = pipe_alloc_buffer(pipe, &anon_pipe_buf_ops,
+						1, GFP_KERNEL, &ret);
+			if (!buf)
+				break;
+
+			folio = pipe->spare_folio;
+			if (!folio) {
+				folio = folio_alloc(GFP_HIGHUSER | __GFP_ACCOUNT, 0);
+				if (unlikely(!folio)) {
+					ret = -ENOMEM;
 					break;
 				}
-				pipe->tmp_page = page;
+			} else {
+				pipe->spare_folio = NULL;
 			}
 
-			/* Allocate a slot in the ring in advance and attach an
-			 * empty buffer.  If we fault or otherwise fail to use
-			 * it, either the reader will consume it or it'll still
-			 * be there for the next write.
-			 */
-			spin_lock_irq(&pipe->rd_wait.lock);
+			buf->bvec[0].bv_folio = folio;
+			buf->bvec[0].bv_offset = 0;
+			buf->bvec[0].bv_len = 0;
+			buf->nr = 1;
+			buf->footprint += folio_nr_pages(folio);
 
-			head = pipe->head;
-			if (pipe_full(head, pipe->tail, pipe->max_usage)) {
-				spin_unlock_irq(&pipe->rd_wait.lock);
-				continue;
-			}
-
-			pipe->head = head + 1;
-			spin_unlock_irq(&pipe->rd_wait.lock);
-
-			/* Insert it into the buffer array */
-			buf = &pipe->bufs[head & mask];
-			buf->page = page;
-			buf->ops = &anon_pipe_buf_ops;
-			buf->offset = 0;
-			buf->len = 0;
 			if (is_packetized(filp))
 				buf->flags = PIPE_BUF_FLAG_PACKET;
 			else
 				buf->flags = PIPE_BUF_FLAG_CAN_MERGE;
-			pipe->tmp_page = NULL;
 
-			copied = copy_page_from_iter(page, 0, PAGE_SIZE, from);
-			if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) {
+			part = min(iov_iter_count(from), folio_size(folio));
+			copied = copy_iter_to_folio(from, folio, 0, part);
+			if (unlikely(copied < part)) {
 				if (!ret)
 					ret = -EFAULT;
+				buf->index = buf->nr;
+				pipe_buf_release(pipe, buf);
 				break;
 			}
-			ret += copied;
-			buf->offset = 0;
-			buf->len = copied;
+			buf->bvec[0].bv_len += copied;
+			buf->size += copied;
+			written += copied;
+			ret = pipe_add(pipe, buf, &full);
 
 			if (!iov_iter_count(from))
 				break;
 		}
 
-		if (!pipe_full(head, pipe->tail, pipe->max_usage))
+		if (!full)
 			continue;
 
 		/* Wait for buffer space to become available.
 		 */
 		if (filp->f_flags & O_NONBLOCK) {
-			if (!ret)
-				ret = -EAGAIN;
+			ret = -EAGAIN;
 			break;
 		}
 		if (signal_pending(current)) {
-			if (!ret)
-				ret = -ERESTARTSYS;
+			ret = -ERESTARTSYS;
 			break;
 		}
 
@@ -718,11 +810,12 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
 			kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 		wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe));
 		__pipe_lock(pipe);
-		was_empty = pipe_empty(pipe->head, pipe->tail);
+		was_empty = pipe_empty(pipe);
 		wake_next_writer = true;
+		full = pipe_full(pipe);
 	}
 out:
-	if (pipe_full(pipe->head, pipe->tail, pipe->max_usage))
+	if (pipe_full(pipe))
 		wake_next_writer = false;
 	__pipe_unlock(pipe);
 
@@ -743,44 +836,35 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from)
 		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
 	if (wake_next_writer)
 		wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM);
-	if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) {
-		int err = file_update_time(filp);
-		if (err)
-			ret = err;
+	if (written && sb_start_write_trylock(file_inode(filp)->i_sb)) {
+		ret = file_update_time(filp);
+		if (ret)
+			written = ret;
 		sb_end_write(file_inode(filp)->i_sb);
 	}
-	return ret;
+	return written ?: ret;
 }
 
 static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct pipe_inode_info *pipe = filp->private_data;
-	unsigned int count, head, tail, mask;
+	struct pipe_buffer *buf;
+	unsigned int count;
 
 	switch (cmd) {
 	case FIONREAD:
 		__pipe_lock(pipe);
 		count = 0;
-		head = pipe->head;
-		tail = pipe->tail;
-		mask = pipe->ring_size - 1;
-
-		while (tail != head) {
-			count += pipe->bufs[tail & mask].len;
-			tail++;
-		}
+		list_for_each_entry(buf, &pipe->queue, queue_link)
+			count += buf->size;
 		__pipe_unlock(pipe);
 
 		return put_user(count, (int __user *)arg);
 
 #ifdef CONFIG_WATCH_QUEUE
-	case IOC_WATCH_QUEUE_SET_SIZE: {
-		int ret;
-		__pipe_lock(pipe);
-		ret = watch_queue_set_size(pipe, arg);
-		__pipe_unlock(pipe);
-		return ret;
-	}
+	case IOC_WATCH_QUEUE_SET_SIZE:
+		return 0;	/* Does nothing for the moment. */
 
 	case IOC_WATCH_QUEUE_SET_FILTER:
 		return watch_queue_set_filter(
@@ -798,7 +882,6 @@ pipe_poll(struct file *filp, poll_table *wait)
 {
 	__poll_t mask;
 	struct pipe_inode_info *pipe = filp->private_data;
-	unsigned int head, tail;
 
 	/* Epoll has some historical nasty semantics, this enables them */
 	WRITE_ONCE(pipe->poll_usage, true);
@@ -819,19 +902,16 @@ pipe_poll(struct file *filp, poll_table *wait)
 	 * if something changes and you got it wrong, the poll
 	 * table entry will wake you up and fix it.
 	 */
-	head = READ_ONCE(pipe->head);
-	tail = READ_ONCE(pipe->tail);
-
 	mask = 0;
 	if (filp->f_mode & FMODE_READ) {
-		if (!pipe_empty(head, tail))
+		if (!pipe_empty(pipe))
 			mask |= EPOLLIN | EPOLLRDNORM;
 		if (!pipe->writers && filp->f_version != pipe->w_counter)
 			mask |= EPOLLHUP;
 	}
 
 	if (filp->f_mode & FMODE_WRITE) {
-		if (!pipe_full(head, tail, pipe->max_usage))
+		if (!pipe_full(pipe))
 			mask |= EPOLLOUT | EPOLLWRNORM;
 		/*
 		 * Most Unices do not set EPOLLERR for FIFOs but on Linux they
@@ -902,27 +982,27 @@ pipe_fasync(int fd, struct file *filp, int on)
 	return retval;
 }
 
-unsigned long account_pipe_buffers(struct user_struct *user,
-                                   unsigned long old, unsigned long new)
+static unsigned long account_pipe_buffers(struct user_struct *user,
+					  unsigned long old, unsigned long new)
 {
 	return atomic_long_add_return(new - old, &user->pipe_bufs);
 }
 
-bool too_many_pipe_buffers_soft(unsigned long user_bufs)
+static bool too_many_pipe_buffers_soft(unsigned long user_bufs)
 {
 	unsigned long soft_limit = READ_ONCE(pipe_user_pages_soft);
 
 	return soft_limit && user_bufs > soft_limit;
 }
 
-bool too_many_pipe_buffers_hard(unsigned long user_bufs)
+static bool too_many_pipe_buffers_hard(unsigned long user_bufs)
 {
 	unsigned long hard_limit = READ_ONCE(pipe_user_pages_hard);
 
 	return hard_limit && user_bufs > hard_limit;
 }
 
-bool pipe_is_unprivileged_user(void)
+static bool pipe_is_unprivileged_user(void)
 {
 	return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
 }
@@ -930,45 +1010,38 @@ bool pipe_is_unprivileged_user(void)
 struct pipe_inode_info *alloc_pipe_info(void)
 {
 	struct pipe_inode_info *pipe;
-	unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
 	struct user_struct *user = get_current_user();
-	unsigned long user_bufs;
-	unsigned int max_size = READ_ONCE(pipe_max_size);
+	size_t limit = PIPE_DEF_BUFFERS, user_bufs;
+	size_t sys = max_t(size_t,
			   DIV_ROUND_UP(READ_ONCE(pipe_max_size), PAGE_SIZE), 1);
 
 	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
 	if (pipe == NULL)
 		goto out_free_uid;
 
-	if (pipe_bufs * PAGE_SIZE > max_size && !capable(CAP_SYS_RESOURCE))
-		pipe_bufs = max_size >> PAGE_SHIFT;
+	if (limit > sys && !capable(CAP_SYS_RESOURCE))
+		limit = sys;
 
-	user_bufs = account_pipe_buffers(user, 0, pipe_bufs);
+	user_bufs = account_pipe_buffers(user, 0, limit);
 
 	if (too_many_pipe_buffers_soft(user_bufs) && pipe_is_unprivileged_user()) {
-		user_bufs = account_pipe_buffers(user, pipe_bufs, PIPE_MIN_DEF_BUFFERS);
-		pipe_bufs = PIPE_MIN_DEF_BUFFERS;
+		user_bufs = account_pipe_buffers(user, limit, PIPE_MIN_DEF_BUFFERS);
+		limit = PIPE_MIN_DEF_BUFFERS;
 	}
 
 	if (too_many_pipe_buffers_hard(user_bufs) && pipe_is_unprivileged_user())
 		goto out_revert_acct;
 
-	pipe->bufs = kcalloc(pipe_bufs, sizeof(struct pipe_buffer),
-			     GFP_KERNEL_ACCOUNT);
-
-	if (pipe->bufs) {
-		init_waitqueue_head(&pipe->rd_wait);
-		init_waitqueue_head(&pipe->wr_wait);
-		pipe->r_counter = pipe->w_counter = 1;
-		pipe->max_usage = pipe_bufs;
-		pipe->ring_size = pipe_bufs;
-		pipe->nr_accounted = pipe_bufs;
-		pipe->user = user;
-		mutex_init(&pipe->mutex);
-		return pipe;
-	}
+	INIT_LIST_HEAD(&pipe->queue);
+	init_waitqueue_head(&pipe->rd_wait);
+	init_waitqueue_head(&pipe->wr_wait);
+	pipe->r_counter = pipe->w_counter = 1;
+	pipe->max_footprint = limit;
+	pipe->user = user;
+	mutex_init(&pipe->mutex);
+	return pipe;
 
 out_revert_acct:
-	(void) account_pipe_buffers(user, pipe_bufs, 0);
+	(void) account_pipe_buffers(user, limit, 0);
 	kfree(pipe);
 out_free_uid:
 	free_uid(user);
@@ -977,27 +1050,26 @@ struct pipe_inode_info *alloc_pipe_info(void)
 
 void
 free_pipe_info(struct pipe_inode_info *pipe)
 {
-	unsigned int i;
+	struct pipe_buffer *buf;
 
 #ifdef CONFIG_WATCH_QUEUE
 	if (pipe->watch_queue)
 		watch_queue_clear(pipe->watch_queue);
 #endif
 
-	(void) account_pipe_buffers(pipe->user, pipe->nr_accounted, 0);
+	(void) account_pipe_buffers(pipe->user, pipe->max_footprint, 0);
 	free_uid(pipe->user);
-	for (i = 0; i < pipe->ring_size; i++) {
-		struct pipe_buffer *buf = pipe->bufs + i;
-		if (buf->ops)
-			pipe_buf_release(pipe, buf);
+	while ((buf = list_first_entry_or_null(
			&pipe->queue, struct pipe_buffer, queue_link))) {
+		buf->index = buf->nr;
+		pipe_buf_release(pipe, buf);
 	}
 #ifdef CONFIG_WATCH_QUEUE
 	if (pipe->watch_queue)
 		put_watch_queue(pipe->watch_queue);
 #endif
-	if (pipe->tmp_page)
-		__free_page(pipe->tmp_page);
-	kfree(pipe->bufs);
+	if (pipe->spare_folio)
		folio_put(pipe->spare_folio);
+	kfree(pipe->spare_buffer);
 	kfree(pipe);
 }
 
@@ -1376,96 +1448,14 @@ const struct file_operations pipefifo_fops = {
 };
 
 /*
- * Currently we rely on the pipe array holding a power-of-2 number
- * of pages. Returns 0 on error.
- */
-static unsigned int round_pipe_size(unsigned long size)
-{
-	if (size > (1U << 31))
-		return 0;
-
-	/* Minimum pipe size, as required by POSIX */
-	if (size < PAGE_SIZE)
-		return PAGE_SIZE;
-
-	return roundup_pow_of_two(size);
-}
-
-/*
- * Resize the pipe ring to a number of slots.
- *
- * Note the pipe can be reduced in capacity, but only if the current
- * occupancy doesn't exceed nr_slots; if it does, EBUSY will be
- * returned instead.
- */
-int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
-{
-	struct pipe_buffer *bufs;
-	unsigned int head, tail, mask, n;
-
-	bufs = kcalloc(nr_slots, sizeof(*bufs),
-		       GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
-	if (unlikely(!bufs))
-		return -ENOMEM;
-
-	spin_lock_irq(&pipe->rd_wait.lock);
-	mask = pipe->ring_size - 1;
-	head = pipe->head;
-	tail = pipe->tail;
-
-	n = pipe_occupancy(head, tail);
-	if (nr_slots < n) {
-		spin_unlock_irq(&pipe->rd_wait.lock);
-		kfree(bufs);
-		return -EBUSY;
-	}
-
-	/*
-	 * The pipe array wraps around, so just start the new one at zero
-	 * and adjust the indices.
-	 */
-	if (n > 0) {
-		unsigned int h = head & mask;
-		unsigned int t = tail & mask;
-		if (h > t) {
-			memcpy(bufs, pipe->bufs + t,
-			       n * sizeof(struct pipe_buffer));
-		} else {
-			unsigned int tsize = pipe->ring_size - t;
-			if (h > 0)
-				memcpy(bufs + tsize, pipe->bufs,
-				       h * sizeof(struct pipe_buffer));
-			memcpy(bufs, pipe->bufs + t,
-			       tsize * sizeof(struct pipe_buffer));
-		}
-	}
-
-	head = n;
-	tail = 0;
-
-	kfree(pipe->bufs);
-	pipe->bufs = bufs;
-	pipe->ring_size = nr_slots;
-	if (pipe->max_usage > nr_slots)
-		pipe->max_usage = nr_slots;
-	pipe->tail = tail;
-	pipe->head = head;
-
-	spin_unlock_irq(&pipe->rd_wait.lock);
-
-	/* This might have made more room for writers */
-	wake_up_interruptible(&pipe->wr_wait);
-	return 0;
-}
-
-/*
- * Allocate a new array of pipe buffers and copy the info over. Returns the
- * pipe size if successful, or return -ERROR on error.
+ * Change the limit on the amount of data allowed into a pipe.  Returns the
+ * pipe size if successful, or -ERROR on error.
 */
 static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
 {
 	unsigned long user_bufs;
-	unsigned int nr_slots, size;
+	size_t limit;
+	size_t sys = max_t(size_t, DIV_ROUND_UP(pipe_max_size, PAGE_SIZE), 1);
 	long ret = 0;
 
 #ifdef CONFIG_WATCH_QUEUE
@@ -1473,43 +1463,34 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long arg)
		return -EBUSY;
 #endif
 
-	size = round_pipe_size(arg);
-	nr_slots = size >> PAGE_SHIFT;
-
-	if (!nr_slots)
-		return -EINVAL;
+	limit = DIV_ROUND_UP(arg, PAGE_SIZE);
+	limit = max_t(size_t, limit, 1);
 
 	/*
-	 * If trying to increase the pipe capacity, check that an
-	 * unprivileged user is not trying to exceed various limits
-	 * (soft limit check here, hard limit check just below).
-	 * Decreasing the pipe capacity is always permitted, even
-	 * if the user is currently over a limit.
+	 * If trying to increase the pipe capacity, check that an unprivileged
+	 * user is not trying to exceed various limits (soft limit check here,
+	 * hard limit check just below).  Decreasing the pipe capacity is
+	 * always permitted, even if the user is currently over a limit.
 	 */
-	if (nr_slots > pipe->max_usage &&
-	    size > pipe_max_size && !capable(CAP_SYS_RESOURCE))
+	if (limit > pipe->max_footprint &&
+	    limit > sys && !capable(CAP_SYS_RESOURCE))
		return -EPERM;
 
-	user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_slots);
+	user_bufs = account_pipe_buffers(pipe->user, pipe->max_footprint, limit);
 
-	if (nr_slots > pipe->max_usage &&
-	    (too_many_pipe_buffers_hard(user_bufs) ||
-	     too_many_pipe_buffers_soft(user_bufs)) &&
-	    pipe_is_unprivileged_user()) {
+	if (limit > pipe->max_footprint &&
+	    (too_many_pipe_buffers_hard(user_bufs) ||
+	     too_many_pipe_buffers_soft(user_bufs)) &&
+	    pipe_is_unprivileged_user()) {
		ret = -EPERM;
		goto out_revert_acct;
	}
 
-	ret = pipe_resize_ring(pipe, nr_slots);
-	if (ret < 0)
-		goto out_revert_acct;
-
-	pipe->max_usage = nr_slots;
-	pipe->nr_accounted = nr_slots;
-	return pipe->max_usage * PAGE_SIZE;
+	pipe->max_footprint = limit;
+	return pipe->max_footprint * PAGE_SIZE;
 
 out_revert_acct:
-	(void) account_pipe_buffers(pipe->user, nr_slots, pipe->nr_accounted);
+	(void) account_pipe_buffers(pipe->user, limit, pipe->max_footprint);
	return ret;
 }
 
@@ -1546,7 +1527,7 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg)
		ret = pipe_set_size(pipe, arg);
		break;
	case F_GETPIPE_SZ:
-		ret = pipe->max_usage * PAGE_SIZE;
+		ret = pipe->max_footprint * PAGE_SIZE;
		break;
	default:
		ret = -EINVAL;
@@ -1593,7 +1574,7 @@ static int do_proc_dopipe_max_size_conv(unsigned long *lvalp,
	if (write) {
		unsigned int val;
 
-		val = round_pipe_size(*lvalp);
+		val = round_up(*lvalp, PAGE_SIZE);
		if (val == 0)
			return -EINVAL;
diff --git a/fs/pipe.h b/fs/pipe.h
index 0d749bf..ef11dff 100644
--- a/fs/pipe.h
+++ b/fs/pipe.h
@@ -3,15 +3,15 @@
 /**
  * struct pipe_inode_info - a linux kernel pipe
  * @mutex: mutex protecting the whole thing
+ * @queue: The queue of pipe buffers
  * @rd_wait: reader wait point in case of empty pipe
  * @wr_wait: writer wait point in case of full pipe
- * @head: The point of buffer production
- * @tail: The point of buffer consumption
  * @note_loss: The next read() should insert a data-lost message
- * @max_usage: The maximum number of slots that may be used in the ring
- * @ring_size: total number of buffers (should be a power of 2)
- * @nr_accounted: The amount this pipe accounts for in user->pipe_bufs
- * @tmp_page: cached released page
+ * @footprint: The amount of space pinned by the pipe (in pages)
+ * @max_footprint: The maximum amount of space that may be pinned (in pages)
+ * @content: The amount of content (in bytes)
+ * @spare_folio: Cached released folio
+ * @spare_buffer: Cached released buffer
  * @readers: number of current readers of this pipe
  * @writers: number of current writers of this pipe
  * @files: number of struct file referring this pipe (protected by ->i_lock)
@@ -20,87 +20,70 @@
  * @poll_usage: is this pipe used for epoll, which has crazy wakeups?
  * @fasync_readers: reader side fasync
  * @fasync_writers: writer side fasync
- * @bufs: the circular array of pipe buffers
  * @user: the user who created this pipe
  * @watch_queue: If this pipe is a watch_queue, this is the stuff for that
 **/
 struct pipe_inode_info {
-	struct mutex mutex;
-	wait_queue_head_t rd_wait, wr_wait;
-	unsigned int head;
-	unsigned int tail;
-	unsigned int max_usage;
-	unsigned int ring_size;
+	struct mutex		mutex;
+	struct list_head	queue;
+	wait_queue_head_t	rd_wait, wr_wait;
 #ifdef CONFIG_WATCH_QUEUE
-	bool note_loss;
+	bool			note_loss;
 #endif
-	unsigned int nr_accounted;
-	unsigned int readers;
-	unsigned int writers;
-	unsigned int files;
-	unsigned int r_counter;
-	unsigned int w_counter;
-	bool poll_usage;
-	struct page *tmp_page;
-	struct fasync_struct *fasync_readers;
-	struct fasync_struct *fasync_writers;
-	struct pipe_buffer *bufs;
-	struct user_struct *user;
+	size_t			footprint;
+	size_t			max_footprint;
+	size_t			content;
+	unsigned int		readers;
+	unsigned int		writers;
+	unsigned int		files;
+	unsigned int		r_counter;
+	unsigned int		w_counter;
+	bool			poll_usage;
+	struct folio		*spare_folio;
+	struct pipe_buffer	*spare_buffer;
+	struct fasync_struct	*fasync_readers;
+	struct fasync_struct	*fasync_writers;
+	struct user_struct	*user;
 #ifdef CONFIG_WATCH_QUEUE
-	struct watch_queue *watch_queue;
+	struct watch_queue	*watch_queue;
 #endif
 };
 
 /**
  * pipe_empty - Return true if the pipe is empty
- * @head: The pipe ring head pointer
- * @tail: The pipe ring tail pointer
+ * @pipe: The pipe to query
  */
-static inline bool pipe_empty(unsigned int head, unsigned int tail)
+static inline bool pipe_empty(const struct pipe_inode_info *pipe)
 {
-	return head == tail;
-}
-
-/**
- * pipe_occupancy - Return number of slots used in the pipe
- * @head: The pipe ring head pointer
- * @tail: The pipe ring tail pointer
- */
-static inline unsigned int pipe_occupancy(unsigned int head, unsigned int tail)
-{
-	return head - tail;
+	return list_empty(&pipe->queue);
 }
 
 /**
  * pipe_full - Return true if the pipe is full
- * @head: The pipe ring head pointer
- * @tail: The pipe ring tail pointer
- * @limit: The maximum amount of slots available.
+ * @pipe: The pipe to query
  */
-static inline bool pipe_full(unsigned int head, unsigned int tail,
-			     unsigned int limit)
+static inline bool pipe_full(const struct pipe_inode_info *pipe)
 {
-	return pipe_occupancy(head, tail) >= limit;
+	return pipe->footprint >= pipe->max_footprint;
 }
 
 /**
- * pipe_buf - Return the pipe buffer for the specified slot in the pipe ring
- * @pipe: The pipe to access
- * @slot: The slot of interest
+ * pipe_occupancy - Return the number of free pages remaining in a pipe
+ * @pipe: The pipe to query
  */
-static inline struct pipe_buffer *pipe_buf(const struct pipe_inode_info *pipe,
-					   unsigned int slot)
+static inline size_t pipe_occupancy(const struct pipe_inode_info *pipe)
 {
-	return &pipe->bufs[slot & (pipe->ring_size - 1)];
+	return max_t(ssize_t, pipe->max_footprint - pipe->footprint, 0);
 }
 
 /**
- * pipe_head_buf - Return the pipe buffer at the head of the pipe ring
+ * pipe_head_buf - Return the head pipe buffer or NULL
  * @pipe: The pipe to access
 */
-static inline struct pipe_buffer *pipe_head_buf(const struct pipe_inode_info *pipe)
+static inline struct pipe_buffer *pipe_head_buf(struct pipe_inode_info *pipe)
 {
-	return pipe_buf(pipe, pipe->head);
+	return list_first_entry_or_null(&pipe->queue,
					struct pipe_buffer, queue_link);
 }
 
 /* Wait for a pipe to be readable/writable while dropping the pipe lock */
@@ -108,16 +91,4 @@ void pipe_wait_readable(struct pipe_inode_info *);
 void pipe_wait_writable(struct pipe_inode_info *);
 
 struct pipe_inode_info *alloc_pipe_info(void);
-
-#ifdef CONFIG_WATCH_QUEUE
-unsigned long account_pipe_buffers(struct user_struct *user,
-				   unsigned long old, unsigned long new);
-bool too_many_pipe_buffers_soft(unsigned long user_bufs);
-bool too_many_pipe_buffers_hard(unsigned long user_bufs);
-bool pipe_is_unprivileged_user(void);
-#endif
-
-/* for F_SETPIPE_SZ and F_GETPIPE_SZ */
-#ifdef CONFIG_WATCH_QUEUE
-int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots);
-#endif
+bool pipe_consume(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
		  size_t consumed);
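[Editor's note: as a sanity check on the new footprint arithmetic, here is a
small userspace model — hypothetical, not part of the patch — of the reworked
helpers.  It shows the key behavioural change: the pipe counts "fullness" in
pinned pages rather than ring slots, so a 1-byte write that pins a whole page
consumes a full page of capacity.]

```c
#include <stdbool.h>
#include <stdio.h>

/* Model of the accounting fields in struct pipe_inode_info. */
struct pipe_model {
	size_t footprint;	/* pages currently pinned by queued buffers */
	size_t max_footprint;	/* F_SETPIPE_SZ limit, in pages */
	size_t content;		/* bytes readable */
};

/* Mirrors pipe_full(): full once pinned pages reach the limit. */
static bool model_pipe_full(const struct pipe_model *p)
{
	return p->footprint >= p->max_footprint;
}

/* Mirrors pipe_occupancy(): free pages, clamped at zero. */
static size_t model_pipe_space(const struct pipe_model *p)
{
	return p->footprint >= p->max_footprint ?
		0 : p->max_footprint - p->footprint;
}

int main(void)
{
	struct pipe_model p = { .max_footprint = 16 };

	p.footprint += 1;	/* a 1-byte write still pins one page */
	p.content += 1;
	printf("full=%d free=%zu pages content=%zu bytes\n",
	       model_pipe_full(&p), model_pipe_space(&p), p.content);
	return 0;
}
```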
diff --git a/fs/splice.c b/fs/splice.c
index 8bbbb19..1889b86 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -48,7 +48,7 @@ static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - struct folio *folio = page_folio(buf->page); + struct folio *folio = page_folio(buf->bvec[buf->index].bv_page); struct address_space *mapping; folio_lock(folio); @@ -93,8 +93,9 @@ static bool page_cache_pipe_buf_try_steal(struct pipe_inode_info *pipe, static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - put_page(buf->page); - buf->flags &= ~PIPE_BUF_FLAG_LRU; + put_page(buf->bvec[buf->index++].bv_page); + if (buf->index == buf->nr) + buf->flags &= ~PIPE_BUF_FLAG_LRU; } /* @@ -104,38 +105,34 @@ static void page_cache_pipe_buf_release(struct pipe_inode_info *pipe, static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - struct page *page = buf->page; + struct folio *folio = page_folio(buf->bvec[buf->index].bv_page); int err; - if (!PageUptodate(page)) { - lock_page(page); + if (!folio_test_uptodate(folio)) { + folio_lock(folio); /* - * Page got truncated/unhashed. This will cause a 0-byte + * Folio got truncated/unhashed. This will cause a 0-byte * splice, if this is the first page. */ - if (!page->mapping) { + if (!folio->mapping) { err = -ENODATA; goto error; } - /* - * Uh oh, read-error from disk. - */ - if (!PageUptodate(page)) { + /* Uh oh, read-error from disk. */ + if (!folio_test_uptodate(folio)) { err = -EIO; goto error; } - /* - * Page is ok afterall, we are done. - */ - unlock_page(page); + /* Folio is ok afterall, we are done. */ + folio_unlock(folio); } return 0; error: - unlock_page(page); + folio_unlock(folio); return err; } @@ -162,116 +159,25 @@ static const struct pipe_buf_operations user_page_pipe_buf_ops = { .get = generic_pipe_buf_get, }; -static void wakeup_pipe_readers(struct pipe_inode_info *pipe) -{ - smp_mb(); - if (waitqueue_active(&pipe->rd_wait)) - wake_up_interruptible(&pipe->rd_wait); - kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); -} - -/** - * splice_to_pipe - fill passed data into a pipe - * @pipe: pipe to fill - * @spd: data to fill - * - * Description: - * @spd contains a map of pages and len/offset tuples, along with - * the struct pipe_buf_operations associated with these pages. This - * function will link that data to the pipe. 
- * - */ -ssize_t splice_to_pipe(struct pipe_inode_info *pipe, - struct splice_pipe_desc *spd) -{ - struct pipe_buffer *buf; - unsigned int spd_pages = spd->nr_pages; - size_t len = INT_MAX, spliced = 0; - bool full = false; - int ret = -EAGAIN, page_nr = 0; - - if (!spd_pages) - return 0; - - if (!pipe_query_space(pipe, &len, &ret)) - goto out; - - do { - buf = pipe_alloc_buffer(pipe, spd->ops, 1, GFP_KERNEL, &ret); - if (!buf) - goto out; - - buf->page = spd->pages[page_nr]; - buf->offset = spd->partial[page_nr].offset; - buf->len = spd->partial[page_nr].len; - buf->private = spd->partial[page_nr].private; - page_nr++; - spd->nr_pages--; - - spliced += pipe_add(pipe, buf, &full); - } while (!full && spd->nr_pages); - -out: - while (page_nr < spd_pages) - spd->spd_release(spd, page_nr++); - - return spliced ?: ret; -} -EXPORT_SYMBOL_GPL(splice_to_pipe); - -ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) -{ - unsigned int head = pipe->head; - unsigned int tail = pipe->tail; - unsigned int mask = pipe->ring_size - 1; - int ret; - - if (unlikely(!pipe->readers)) { - send_sig(SIGPIPE, current, 0); - ret = -EPIPE; - } else if (pipe_full(head, tail, pipe->max_usage)) { - ret = -EAGAIN; - } else { - pipe->bufs[head & mask] = *buf; - pipe->head = head + 1; - return buf->len; - } - pipe_buf_release(pipe, buf); - return ret; -} -EXPORT_SYMBOL(add_to_pipe); - /* * Check if we need to grow the arrays holding pages and partial page * descriptions. */ -int splice_grow_spd(const struct pipe_inode_info *pipe, struct splice_pipe_desc *spd) +int splice_grow_buf(const struct pipe_inode_info *pipe, struct pipe_buffer *buf) { - unsigned int max_usage = READ_ONCE(pipe->max_usage); + size_t was = struct_size(buf, bvec, buf->nr); + size_t to = struct_size(buf, bvec, buf->nr + 1); - spd->nr_pages_max = max_usage; - if (max_usage <= PIPE_DEF_BUFFERS) - return 0; + buf = krealloc(buf, to, GFP_KERNEL); + if (!buf) + return -ENOMEM; - spd->pages = kmalloc_array(max_usage, sizeof(struct page *), GFP_KERNEL); - spd->partial = kmalloc_array(max_usage, sizeof(struct partial_page), - GFP_KERNEL); - - if (spd->pages && spd->partial) - return 0; - - kfree(spd->pages); - kfree(spd->partial); - return -ENOMEM; + memset((void *)buf + was, 0, to - was); + return 0; } -void splice_shrink_spd(struct splice_pipe_desc *spd) +void splice_shrink_buf(struct pipe_buffer *buf) { - if (spd->nr_pages_max <= PIPE_DEF_BUFFERS) - return; - - kfree(spd->pages); - kfree(spd->partial); } /* @@ -282,29 +188,35 @@ ssize_t direct_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { + struct pipe_buffer *buf; struct iov_iter to; - struct bio_vec *bv; struct kiocb kiocb; struct page **pages; ssize_t ret; - size_t used, npages, chunk, remain, keep = 0; - int i; + size_t npages, chunk, remain, keep; + bool full = false; + int i, error = -EAGAIN; /* Work out how much data we can actually add into the pipe */ - used = pipe_occupancy(pipe->head, pipe->tail); - npages = max_t(ssize_t, pipe->max_usage - used, 0); - len = min_t(size_t, len, npages * PAGE_SIZE); - npages = DIV_ROUND_UP(len, PAGE_SIZE); + npages = pipe_query_space(pipe, &len, &error); + if (!npages) + return error; - bv = kzalloc(array_size(npages, sizeof(bv[0])) + - array_size(npages, sizeof(struct page *)), GFP_KERNEL); - if (!bv) + buf = pipe_alloc_buffer(pipe, &page_cache_pipe_buf_ops, npages, + GFP_KERNEL, &error); + if (!buf) + return error; + + pages = kzalloc(array_size(npages, sizeof(struct page *)), 
GFP_KERNEL); + if (!pages) { + kfree(buf); return -ENOMEM; + } - pages = (struct page **)(bv + npages); npages = alloc_pages_bulk_array(GFP_USER, npages, pages); if (!npages) { - kfree(bv); + kfree(buf); + kfree(pages); return -ENOMEM; } @@ -312,14 +224,14 @@ ssize_t direct_splice_read(struct file *in, loff_t *ppos, for (i = 0; i < npages; i++) { chunk = min_t(size_t, PAGE_SIZE, remain); - bv[i].bv_page = pages[i]; - bv[i].bv_offset = 0; - bv[i].bv_len = chunk; + buf->bvec[i].bv_page = pages[i]; + buf->bvec[i].bv_offset = 0; + buf->bvec[i].bv_len = chunk; remain -= chunk; } /* Do the I/O */ - iov_iter_bvec(&to, ITER_DEST, bv, npages, len); + iov_iter_bvec(&to, ITER_DEST, buf->bvec, npages, len); init_sync_kiocb(&kiocb, in); kiocb.ki_pos = *ppos; ret = call_read_iter(in, &kiocb, &to); @@ -340,24 +252,12 @@ ssize_t direct_splice_read(struct file *in, loff_t *ppos, /* Free any pages that didn't get touched at all. */ if (keep < npages) release_pages(pages + keep, npages - keep); + buf->nr = npages; + kfree(pages); - /* Push the remaining pages into the pipe. */ - remain = ret; - for (i = 0; i < keep; i++) { - struct pipe_buffer *buf = pipe_head_buf(pipe); - - chunk = min_t(size_t, remain, PAGE_SIZE); - *buf = (struct pipe_buffer) { - .ops = &default_pipe_buf_ops, - .page = bv[i].bv_page, - .offset = 0, - .len = chunk, - }; - pipe->head++; - remain -= chunk; - } - - kfree(bv); + /* Push the remaining pages into the pipe (will discard the + * buf if empty). */ + pipe_add(pipe, buf, &full); return ret; } EXPORT_SYMBOL(direct_splice_read); @@ -418,9 +318,8 @@ static int pipe_to_sendmsg(struct pipe_inode_info *pipe, struct splice_desc *sd, if (sd->flags & SPLICE_F_MORE) msg.msg_flags |= MSG_MORE; - if (sd->len < sd->total_len && - pipe_occupancy(pipe->head, pipe->tail) > 1) - msg.msg_flags |= MSG_MORE; + if (sd->len < sd->total_len) + msg.msg_flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST; iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, bv, nr_bv, sd->len); return sock_sendmsg(sock, &msg); @@ -436,44 +335,55 @@ static void wakeup_pipe_writers(struct pipe_inode_info *pipe) } /* - * Try to steal the page from a pipe buffer and if we fail, copy the page and - * replace the pipe buffer with one that points to the copy. + * Try to steal the page from a pipe buffer and if we fail, copy the folio and + * update the pipe buffer to point to the copy. */ static int splice_steal_or_copy(struct pipe_inode_info *pipe, struct pipe_buffer *buf, struct splice_desc *sd) { - if (!pipe_buf_try_steal(pipe, buf)) { - /* Fall back to replacing the buffer page with a copy. */ - struct page *page; - size_t offset = buf->offset, len = buf->len; - void *src, *dst; + struct bio_vec *bv = &buf->bvec[buf->index]; - page = alloc_page(GFP_KERNEL); - if (!page) + if (!pipe_buf_try_steal(pipe, buf)) { + /* Fall back to replacing the buffer folio with a copy. 
*/ + struct folio *folio; + size_t offset_d = 0; + size_t offset_s = bv->bv_offset, len = bv->bv_len; + size_t size = roundup_pow_of_two(len); + size_t order = ilog2(size); + + WARN_ON(order > folio_order(bv->bv_folio)); + + folio = folio_alloc(GFP_KERNEL, order); + if (!folio) return -ENOMEM; - src = kmap_local_page(buf->page); - dst = kmap_local_page(page); - memcpy(dst + offset, src + offset, len); - kunmap_local(src); - kunmap_local(dst); + do { + void *src, *dst; + size_t part = min3(len, + PAGE_SIZE - (offset_s & ~PAGE_MASK), + PAGE_SIZE - (offset_d & ~PAGE_MASK)); + + src = kmap_local_folio(bv->bv_folio, offset_s); + dst = kmap_local_folio(folio, offset_d); + memcpy(dst, src, part); + kunmap_local(src); + kunmap_local(dst); + offset_s += part; + offset_d += part; + len -= part; + } while (len > 0); pipe_buf_release(pipe, buf); - *buf = (struct pipe_buffer) { - .page = page, - .offset = offset, - .len = len, - .ops = &default_pipe_buf_ops, - }; + bv->bv_folio = folio; + bv->bv_offset = 0; } else { /* Need to unlock the page */ - unlock_page(buf->page); - buf->ops = &default_pipe_buf_ops; - buf->private = 0; + folio_unlock(bv->bv_folio); } + buf->flags |= PIPE_BUF_FLAG_IX_STOLEN; return 0; } @@ -500,18 +410,11 @@ static int splice_steal_or_copy(struct pipe_inode_info *pipe, static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_desc *sd, splice_actor *actor) { - unsigned int head = pipe->head; - unsigned int tail = pipe->tail; - unsigned int mask = pipe->ring_size - 1; + struct pipe_buffer *buf; int ret; - while (!pipe_empty(head, tail)) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; - struct bio_vec bv; - - sd->len = buf->len; - if (sd->len > sd->total_len) - sd->len = sd->total_len; + while ((buf = pipe_head_buf(pipe))) { + sd->len = min(buf->size, sd->total_len); ret = pipe_buf_confirm(pipe, buf); if (unlikely(ret)) { @@ -526,23 +429,20 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des return ret; } - bvec_set_page(&bv, buf->page, buf->offset, buf->len); - ret = actor(pipe, sd, 1, &bv); + ret = actor(pipe, sd, buf->nr - buf->index, + buf->bvec + buf->index); if (ret <= 0) return ret; - buf->offset += ret; - buf->len -= ret; + buf->size -= ret; sd->num_spliced += ret; sd->len -= ret; sd->pos += ret; sd->total_len -= ret; - if (!buf->len) { + if (!buf->size) { pipe_buf_release(pipe, buf); - tail++; - pipe->tail = tail; if (pipe->files) sd->need_wakeup = true; } @@ -557,13 +457,10 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des /* We know we have a pipe buffer, but maybe it's empty? 
*/ static inline bool eat_empty_buffer(struct pipe_inode_info *pipe) { - unsigned int tail = pipe->tail; - unsigned int mask = pipe->ring_size - 1; - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_head_buf(pipe); - if (unlikely(!buf->len)) { + if (buf && unlikely(!buf->size)) { pipe_buf_release(pipe, buf); - pipe->tail = tail+1; return true; } @@ -590,7 +487,7 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des return -ERESTARTSYS; repeat: - while (pipe_empty(pipe->head, pipe->tail)) { + while (pipe_empty(pipe)) { if (!pipe->writers) return 0; @@ -736,67 +633,31 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, .pos = *ppos, .u.file = out, }; - int nbufs = pipe->max_usage; - struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec), - GFP_KERNEL); ssize_t ret; - if (unlikely(!array)) - return -ENOMEM; - pipe_lock(pipe); - splice_from_pipe_begin(&sd); + while (sd.total_len) { + struct pipe_buffer *buf; struct iov_iter from; - unsigned int head, tail, mask; - size_t left; - int n; ret = splice_from_pipe_next(pipe, &sd); if (ret <= 0) break; - if (unlikely(nbufs < pipe->max_usage)) { - kfree(array); - nbufs = pipe->max_usage; - array = kcalloc(nbufs, sizeof(struct bio_vec), - GFP_KERNEL); - if (!array) { - ret = -ENOMEM; - break; - } + buf = pipe_head_buf(pipe); + + ret = pipe_buf_confirm(pipe, buf); + if (unlikely(ret)) { + if (ret == -ENODATA) + ret = 0; + break; } - head = pipe->head; - tail = pipe->tail; - mask = pipe->ring_size - 1; - - /* build the vector */ - left = sd.total_len; - for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; - size_t this_len = buf->len; - - /* zero-length bvecs are not supported, skip them */ - if (!this_len) - continue; - this_len = min(this_len, left); - - ret = pipe_buf_confirm(pipe, buf); - if (unlikely(ret)) { - if (ret == -ENODATA) - ret = 0; - goto done; - } - - bvec_set_page(&array[n], buf->page, this_len, - buf->offset); - left -= this_len; - n++; - } - - iov_iter_bvec(&from, ITER_SOURCE, array, n, sd.total_len - left); + iov_iter_bvec(&from, ITER_SOURCE, + buf->bvec + buf->index, buf->nr - buf->index, + min(sd.total_len, buf->size)); ret = vfs_iter_write(out, &from, &sd.pos, 0); if (ret <= 0) break; @@ -805,35 +666,12 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, sd.total_len -= ret; *ppos = sd.pos; - /* dismiss the fully eaten buffers, adjust the partial one */ - tail = pipe->tail; - while (ret) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; - if (ret >= buf->len) { - ret -= buf->len; - buf->len = 0; - pipe_buf_release(pipe, buf); - tail++; - pipe->tail = tail; - if (pipe->files) - sd.need_wakeup = true; - } else { - buf->offset += ret; - buf->len -= ret; - ret = 0; - } - } + pipe_consume(pipe, buf, ret); } -done: - kfree(array); + splice_from_pipe_end(pipe, &sd); - pipe_unlock(pipe); - - if (sd.num_spliced) - ret = sd.num_spliced; - - return ret; + return sd.num_spliced ?: ret; } EXPORT_SYMBOL(iter_file_splice_write); @@ -904,7 +742,7 @@ long vfs_splice_read(struct file *in, loff_t *ppos, return -EBADF; /* Don't try to read more the pipe has space for. 
*/ - p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail); + p_space = pipe->max_footprint - pipe_occupancy(pipe); len = min_t(size_t, len, p_space << PAGE_SHIFT); ret = rw_verify_area(READ, in, ppos, len); @@ -937,9 +775,10 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, splice_direct_actor *actor) { struct pipe_inode_info *pipe; + struct pipe_buffer *buf; long ret, bytes; size_t len; - int i, flags, more; + int flags, more; /* * We require the input to be seekable, as we don't want to randomly @@ -983,7 +822,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, sd->flags &= ~SPLICE_F_NONBLOCK; more = sd->flags & SPLICE_F_MORE; - WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail)); + WARN_ON_ONCE(!pipe_empty(pipe)); while (len) { size_t read_len; @@ -1027,7 +866,6 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, } done: - pipe->tail = pipe->head = 0; file_accessed(in); return bytes; @@ -1036,11 +874,9 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, * If we did an incomplete transfer we must release * the pipe buffers in question: */ - for (i = 0; i < pipe->ring_size; i++) { - struct pipe_buffer *buf = &pipe->bufs[i]; - - if (buf->ops) - pipe_buf_release(pipe, buf); + while ((buf = pipe_head_buf(pipe))) { + buf->index = buf->nr; + pipe_buf_release(pipe, buf); } if (!bytes) @@ -1113,7 +949,7 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) send_sig(SIGPIPE, current, 0); return -EPIPE; } - if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (!pipe_full(pipe)) return 0; if (flags & SPLICE_F_NONBLOCK) return -EAGAIN; @@ -1275,22 +1111,19 @@ static long __do_splice(struct file *in, loff_t __user *off_in, return ret; } -static int iter_to_pipe(struct iov_iter *from, - struct pipe_inode_info *pipe, - unsigned flags) +static int iter_to_pipe(struct iov_iter *from, struct pipe_inode_info *pipe, + unsigned int flags) { - struct pipe_buffer buf = { - .ops = &user_page_pipe_buf_ops, - .flags = flags - }; - size_t total = 0; + size_t spliced = 0; + bool full = false; int ret = 0; while (iov_iter_count(from)) { + struct pipe_buffer *buf; struct page *pages[16]; ssize_t left; size_t start; - int i, n; + int i; left = iov_iter_get_pages2(from, pages, ~0UL, 16, &start); if (left <= 0) { @@ -1298,28 +1131,29 @@ static int iter_to_pipe(struct iov_iter *from, break; } - n = DIV_ROUND_UP(left + start, PAGE_SIZE); - for (i = 0; i < n; i++) { - int size = min_t(int, left, PAGE_SIZE - start); + buf = pipe_alloc_buffer(pipe, &user_page_pipe_buf_ops, + DIV_ROUND_UP(left + start, PAGE_SIZE), + GFP_KERNEL, &ret); + if (!buf) + break; + buf->flags |= flags; - buf.page = pages[i]; - buf.offset = start; - buf.len = size; - ret = add_to_pipe(pipe, &buf); - if (unlikely(ret < 0)) { - iov_iter_revert(from, left); - // this one got dropped by add_to_pipe() - while (++i < n) - put_page(pages[i]); - goto out; - } - total += ret; + for (i = 0; i < buf->max; i++) { + size_t size = min_t(size_t, left, PAGE_SIZE - start); + + bvec_set_page(&buf->bvec[i], pages[i], size, start); + buf->size += size; left -= size; start = 0; } + + buf->nr = i; + spliced += pipe_add(pipe, buf, &full); + if (full) + break; } -out: - return total ? 
total : ret; + + return spliced ?: ret; } static int pipe_to_user(struct pipe_inode_info *pipe, struct splice_desc *sd, @@ -1491,13 +1325,13 @@ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) * Check the pipe occupancy without the inode lock first. This function * is speculative anyways, so missing one is ok. */ - if (!pipe_empty(pipe->head, pipe->tail)) + if (!pipe_empty(pipe)) return 0; ret = 0; pipe_lock(pipe); - while (pipe_empty(pipe->head, pipe->tail)) { + while (pipe_empty(pipe)) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; @@ -1527,13 +1361,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) * Check pipe occupancy without the inode lock first. This function * is speculative anyways, so missing one is ok. */ - if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (!pipe_full(pipe)) return 0; ret = 0; pipe_lock(pipe); - while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { + while (pipe_full(pipe)) { if (!pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; @@ -1555,28 +1389,79 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) } /* + * Split the front off of a buffer and paste it into another buffer. + */ +static void splice_split_buffer(struct pipe_buffer *ibuf, + struct pipe_buffer *obuf, + size_t len) +{ + unsigned int i = ibuf->ix, o = 0; + + obuf->ops = ibuf->ops; + obuf->private = ibuf->private; + obuf->private_2 = ibuf->private_2; + obuf->size = len; + obuf->footprint = 0; + obuf->nr = ibuf->nr - ibuf->ix; + obuf->confirmed = ibuf->confirmed - ibuf->ix; + + /* + * Don't inherit the gift and merge flags, we need to prevent multiple + * steals of this page. + */ + obuf->flags = ibuf->flags & + ~(PIPE_BUF_FLAG_GIFT | PIPE_BUF_FLAG_CAN_MERGE); + + do { + size_t part = min_t(size_t, ibuf->bvec[i].bv_len, len); + + obuf->bvec[o] = ibuf->bvec[i]; + obuf->bvec[o].bv_len = part; + obuf->footprint += folio_nr_pages(obuf->bvec[o].bv_folio); + + ibuf->bvec[i].bv_offset += part; + ibuf->bvec[i].bv_len -= part; + len -= part; + o++; + if (ibuf->bvec[i].bv_len) + break; + i++; + if (j >= obuf->max) + break; + } while (len > 0); + + ibuf->ix = i; + obuf->ix = o; + +#error need to do the page getting thing + obuf->ops->get_pages(obuf); +} + +/* * Splice contents of ipipe to opipe. 
*/ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, size_t len, unsigned int flags) { - struct pipe_buffer *ibuf, *obuf; - unsigned int i_head, o_head; - unsigned int i_tail, o_tail; - unsigned int i_mask, o_mask; - int ret = 0; - bool input_wakeup = false; + struct pipe_buffer *ibuf, *spare; + size_t spliced = 0; + int ret = -EAGAIN; + bool input_wakeup = false, full; + /* We may need to split a buffer */ + spare = pipe_alloc_buffer(opipe, NULL, 16, GFP_KERNEL, &ret); + if (!spare) + return ret; retry: ret = ipipe_prep(ipipe, flags); if (ret) - return ret; + goto out; ret = opipe_prep(opipe, flags); if (ret) - return ret; + goto out; /* * Potential ABBA deadlock, work around it by ordering lock @@ -1585,41 +1470,29 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, */ pipe_double_lock(ipipe, opipe); - i_tail = ipipe->tail; - i_mask = ipipe->ring_size - 1; - o_head = opipe->head; - o_mask = opipe->ring_size - 1; - + full = pipe_full(opipe); do { - size_t o_len; - if (!opipe->readers) { send_sig(SIGPIPE, current, 0); - if (!ret) - ret = -EPIPE; + ret = -EPIPE; break; } - i_head = ipipe->head; - o_tail = opipe->tail; - - if (pipe_empty(i_head, i_tail) && !ipipe->writers) + if (pipe_empty(ipipe) && !ipipe->writers) break; /* * Cannot make any progress, because either the input * pipe is empty or the output pipe is full. */ - if (pipe_empty(i_head, i_tail) || - pipe_full(o_head, o_tail, opipe->max_usage)) { + if (pipe_empty(ipipe) || full) { /* Already processed some buffers, break */ - if (ret) + if (spliced) break; - if (flags & SPLICE_F_NONBLOCK) { - ret = -EAGAIN; + ret = -EAGAIN; + if (flags & SPLICE_F_NONBLOCK) break; - } /* * We raced with another reader/writer and haven't @@ -1631,50 +1504,26 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, goto retry; } - ibuf = &ipipe->bufs[i_tail & i_mask]; - obuf = &opipe->bufs[o_head & o_mask]; + ibuf = pipe_head_buf(ipipe); + if (ibuf->size <= len - spliced) { + /* Simply move the whole buffer from ipipe to opipe */ + spin_lock_irq(&ipipe->rd_wait.lock); + ipipe->footprint -= ibuf->footprint; + list_del(&ibuf->queue_link); + spin_unlock_irq(&ipipe->rd_wait.lock); - if (len >= ibuf->len) { - /* - * Simply move the whole buffer from ipipe to opipe - */ - *obuf = *ibuf; - ibuf->ops = NULL; - i_tail++; - ipipe->tail = i_tail; - input_wakeup = true; - o_len = obuf->len; - o_head++; - opipe->head = o_head; + spliced += pipe_add(opipe, ibuf, &full); } else { /* - * Get a reference to this pipe buffer, - * so we can copy the contents over. + * Need to split the pipe buffer. Multiple folios may + * be involved. */ - if (!pipe_buf_get(ipipe, ibuf)) { - if (ret == 0) - ret = -EFAULT; - break; - } - *obuf = *ibuf; + splice_split_buffer(ibuf, spare, len - spliced); - /* - * Don't inherit the gift and merge flags, we need to - * prevent multiple steals of this page. - */ - obuf->flags &= ~PIPE_BUF_FLAG_GIFT; - obuf->flags &= ~PIPE_BUF_FLAG_CAN_MERGE; - - obuf->len = len; - ibuf->offset += len; - ibuf->len -= len; - o_len = len; - o_head++; - opipe->head = o_head; + spliced += pipe_add(opipe, spare, &full); + spare = NULL; } - ret += o_len; - len -= o_len; - } while (len); + } while (spliced < len); pipe_unlock(ipipe); pipe_unlock(opipe); @@ -1682,13 +1531,16 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, /* * If we put data in the output pipe, wakeup any potential readers. 
	 */
-	if (ret > 0)
+	if (spliced)
		wakeup_pipe_readers(opipe);
 
	if (input_wakeup)
		wakeup_pipe_writers(ipipe);
 
-	return ret;
+out:
+	if (spare)
+		pipe_buf_release(opipe, spare);
+	return spliced ?: ret;
 }
 
 /*
@@ -1731,8 +1583,8 @@ static int link_pipe(struct pipe_inode_info *ipipe,
		 * If we have iterated all input buffers or run out of
		 * output room, break.
		 */
-		if (pipe_empty(i_head, i_tail) ||
-		    pipe_full(o_head, o_tail, opipe->max_usage))
+		if (pipe_empty(ipipe) ||
+		    pipe_full(opipe))
			break;
 
		ibuf = &ipipe->bufs[i_tail & i_mask];
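To illustrate the consume side of the queue-based pipe: a reader takes the
head buffer, lets the op copy data out, and unlinks the buffer once buf->size
hits zero.  This is only a sketch - example_consume_head() is a hypothetical
helper - but the fields and calls it uses (queue_link, footprint,
->copy_to_iter(), pipe_buf_release()) follow the definitions in this patch:

	static ssize_t example_consume_head(struct pipe_inode_info *pipe,
					    struct iov_iter *to)
	{
		struct pipe_buffer *buf = pipe_head_buf(pipe);
		ssize_t n;

		/* The op copies out data and advances buf->index/bvec[] */
		n = buf->ops->copy_to_iter(pipe, buf, to);
		if (n > 0 && buf->size == 0) {
			/* Fully consumed: unlink and discard the buffer */
			spin_lock_irq(&pipe->rd_wait.lock);
			list_del(&buf->queue_link);
			pipe->footprint -= buf->footprint;
			spin_unlock_irq(&pipe->rd_wait.lock);
			pipe_buf_release(pipe, buf);
		}
		return n;
	}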
diff --git a/include/linux/bvec.h b/include/linux/bvec.h index 555aae54..6c030a2 100644 --- a/include/linux/bvec.h +++ b/include/linux/bvec.h
@@ -18,6 +18,7 @@ struct page; /** * struct bio_vec - a contiguous range of physical memory addresses + * @bv_folio: First folio associated with the address range. * @bv_page: First page associated with the address range. * @bv_len: Number of bytes in the address range. * @bv_offset: Start of the address range relative to the start of @bv_page. @@ -29,7 +30,10 @@ struct page; * This holds because page_is_mergeable() checks the above property. */ struct bio_vec { - struct page *bv_page; + union { + struct folio *bv_folio; + struct page *bv_page; + }; unsigned int bv_len; unsigned int bv_offset; };
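The union leaves the structure layout untouched, so existing bv_page users
compile unchanged while pipe code can store the folio directly.  A minimal
sketch (fill_from_folio() is hypothetical):

	static void fill_from_folio(struct bio_vec *bv, struct folio *folio,
				    unsigned int len, unsigned int offset)
	{
		bv->bv_folio = folio;	/* same storage as bv_page */
		bv->bv_len = len;
		bv->bv_offset = offset;
	}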
diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index 569483e..2e33c66d 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h
@@ -12,25 +12,38 @@
 #define PIPE_BUF_FLAG_PACKET		0x08	/* read() as a packet */
 #define PIPE_BUF_FLAG_CAN_MERGE	0x10	/* can merge buffers */
 #define PIPE_BUF_FLAG_WHOLE		0x20	/* read() must return entire buffer or error */
+#define PIPE_BUF_FLAG_IX_STOLEN	0x40	/* The folio at bvec[index] has been stolen/copied */
 #ifdef CONFIG_WATCH_QUEUE
-#define PIPE_BUF_FLAG_LOSS		0x40	/* Message loss happened after this buffer */
+#define PIPE_BUF_FLAG_LOSS		0x80	/* Message loss happened after this buffer */
 #endif
 
 /**
  * struct pipe_buffer - a linux kernel pipe buffer
- * @page: the page containing the data for the pipe buffer
- * @offset: offset of data inside the @page
- * @len: length of data inside the @page
+ * @queue_link: Link in pipe_inode_info::queue
  * @ops: operations associated with this buffer. See @pipe_buf_operations.
  * @flags: pipe buffer flags. See above.
  * @private: private data owned by the ops.
+ * @private_2: Additional private data owned by the ops.
+ * @size: Size of the buffer in bytes
+ * @footprint: Amount of memory pinned by this buffer in pages
+ * @max: The size of bvec[]
+ * @index: Current element in bvec[] to consume.
+ * @nr: Number of bvec[] elements in use.
+ * @nr_confirmed: Number of bvec[] elements that have been confirmed.
+ * @bvec: List of buffer folios
 **/
 struct pipe_buffer {
-	struct page *page;
-	unsigned int offset, len;
 	const struct pipe_buf_operations *ops;
-	unsigned int flags;
-	unsigned long private;
+	struct list_head queue_link;
+	size_t size;
+	size_t footprint;
+	void *private;
+	unsigned long private_2;
+	unsigned int flags;
+	unsigned short index;
+	unsigned short max;
+	unsigned short nr;
+	unsigned short nr_confirmed;
+	struct bio_vec bvec[];
 };
 
 /*
@@ -46,10 +59,10 @@ struct pipe_buffer {
 struct pipe_buf_operations {
	/*
	 * ->confirm() verifies that the data in the pipe buffer is there
-	 * and that the contents are good. If the pages in the pipe belong
+	 * and that the contents are good. If the folios in the pipe belong
	 * to a file system, we may need to wait for IO completion in this
	 * hook. Returns 0 for good, or a negative error value in case of
-	 * error. If not present all pages are considered good.
+	 * error. If not present all folios are considered good.
	 */
	int (*confirm)(struct pipe_inode_info *, struct pipe_buffer *);
 
@@ -62,17 +75,25 @@ struct pipe_buf_operations {
	/*
	 * Attempt to take ownership of the pipe buffer and its contents.
	 * ->try_steal() returns %true for success, in which case the contents
-	 * of the pipe (the buf->page) is locked and now completely owned by the
-	 * caller. The page may then be transferred to a different mapping, the
-	 * most often used case is insertion into different file address space
-	 * cache.
+	 * of the pipe (the bvec[]) is locked and now completely owned by the
+	 * caller. The folios may then be transferred to a different mapping,
+	 * the most often used case is insertion into different file address
+	 * space cache.
	 */
	bool (*try_steal)(struct pipe_inode_info *, struct pipe_buffer *);
 
	/*
	 * Get a reference to the pipe buffer.
	 */
-	bool (*get)(struct pipe_inode_info *, struct pipe_buffer *);
+	bool (*get_pages)(struct pipe_inode_info *pipe, struct pipe_buffer *buf);
+
+	/*
+	 * Copy data out of the pipe buffer, performing any confirmatory step
+	 * necessary beforehand and releasing any used up bufferage. The
+	 * caller will dispose of the buffer when buf->size reduces to zero.
+ */ + ssize_t (*copy_to_iter)(struct pipe_inode_info *pipe, + struct pipe_buffer *buf, struct iov_iter *iter); }; /** @@ -89,20 +110,6 @@ static inline __must_check bool pipe_buf_get(struct pipe_inode_info *pipe, } /** - * pipe_buf_release - put a reference to a pipe_buffer - * @pipe: the pipe that the buffer belongs to - * @buf: the buffer to put a reference to - */ -static inline void pipe_buf_release(struct pipe_inode_info *pipe, - struct pipe_buffer *buf) -{ - const struct pipe_buf_operations *ops = buf->ops; - - buf->ops = NULL; - ops->release(pipe, buf); -} - -/** * pipe_buf_confirm - verify contents of the pipe buffer * @pipe: the pipe that the buffer belongs to * @buf: the buffer to confirm @@ -137,6 +144,7 @@ ssize_t pipe_add(struct pipe_inode_info *pipe, struct pipe_buffer *buf, bool *fu #ifdef CONFIG_WATCH_QUEUE void pipe_set_lost_mark(struct pipe_inode_info *pipe); #endif +void pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf); /* Get data from a pipe */ size_t pipe_query_content(struct pipe_inode_info *pipe, size_t *len);
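For a buffer whose data lives entirely in its bvec[], a minimal
->copy_to_iter() could look like the sketch below.  Assumptions:
copy_folio_to_iter() from <linux/uio.h> is usable here and no confirm step is
needed; the caller disposes of the buffer once buf->size reaches zero, per
the kernel-doc above:

	static ssize_t example_copy_to_iter(struct pipe_inode_info *pipe,
					    struct pipe_buffer *buf,
					    struct iov_iter *iter)
	{
		struct bio_vec *bv = &buf->bvec[buf->index];
		size_t n;

		n = copy_folio_to_iter(bv->bv_folio, bv->bv_offset,
				       bv->bv_len, iter);
		bv->bv_offset += n;
		bv->bv_len -= n;
		buf->size -= n;
		if (!bv->bv_len && buf->index < buf->nr)
			buf->index++;	/* move on to the next segment */
		return n;
	}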
diff --git a/include/linux/splice.h b/include/linux/splice.h index 9ed729e..9f6f5ef 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h
@@ -45,24 +45,9 @@ struct splice_desc { bool steal_or_copy; /* Want the pages pre-stealing or copying */ }; -struct partial_page { - unsigned int offset; - unsigned int len; - unsigned long private; -}; - /* * Passed to splice_to_pipe */ -struct splice_pipe_desc { - struct page **pages; /* page map */ - struct partial_page *partial; /* pages[] may not be contig */ - int nr_pages; /* number of populated pages in map */ - unsigned int nr_pages_max; /* pages[] & partial[] arrays size */ - const struct pipe_buf_operations *ops;/* ops associated with output pipe */ - void (*spd_release)(struct splice_pipe_desc *, unsigned int); -}; - typedef int (splice_actor)(struct pipe_inode_info *pipe, struct splice_desc *sd, unsigned int nr_bv, struct bio_vec *bv); typedef int (splice_direct_actor)(struct pipe_inode_info *, @@ -72,11 +57,7 @@ extern ssize_t splice_from_pipe(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int, splice_actor); extern ssize_t __splice_from_pipe(struct pipe_inode_info *, - struct splice_desc *, splice_actor); -extern ssize_t splice_to_pipe(struct pipe_inode_info *, - struct splice_pipe_desc *); -extern ssize_t add_to_pipe(struct pipe_inode_info *, - struct pipe_buffer *); + struct splice_desc *, splice_actor *); long vfs_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags); @@ -94,8 +75,8 @@ extern ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, /* * for dynamic pipe sizing */ -extern int splice_grow_spd(const struct pipe_inode_info *, struct splice_pipe_desc *); -extern void splice_shrink_spd(struct splice_pipe_desc *); +int splice_grow_buf(const struct pipe_inode_info *pipe, struct pipe_buffer *buf); +void splice_shrink_buf(struct pipe_buffer *buf); extern const struct pipe_buf_operations page_cache_pipe_buf_ops; extern const struct pipe_buf_operations default_pipe_buf_ops;
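The replacement for a removed splice_pipe_desc user is to fill a pipe_buffer
directly and queue it once with pipe_add().  A sketch under stated
assumptions - my_ops, pages[] and nr_pages are illustrative, not part of this
patch:

	struct pipe_buffer *buf;
	bool full = false;
	int err = 0;
	unsigned int j;

	buf = pipe_alloc_buffer(pipe, &my_ops, nr_pages, GFP_KERNEL, &err);
	if (!buf)
		return err;
	for (j = 0; j < nr_pages; j++) {
		bvec_set_page(&buf->bvec[buf->nr], pages[j], PAGE_SIZE, 0);
		buf->nr++;
		buf->size += PAGE_SIZE;	/* bytes readable */
		buf->footprint++;	/* pages pinned */
	}
	return pipe_add(pipe, buf, &full);	/* returns bytes queued */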
diff --git a/include/linux/watch_queue.h b/include/linux/watch_queue.h index fc6bba2..cef9383 100644 --- a/include/linux/watch_queue.h +++ b/include/linux/watch_queue.h
@@ -77,16 +77,16 @@ struct watch_list { spinlock_t lock; }; -extern void __post_watch_notification(struct watch_list *, - struct watch_notification *, - const struct cred *, - u64); +extern void __post_watch_notification(struct watch_list *wlist, + struct watch_notification *n, + const struct cred *cred, + gfp_t gfp, + u64 id); extern struct watch_queue *get_watch_queue(int); extern void put_watch_queue(struct watch_queue *); extern void init_watch(struct watch *, struct watch_queue *); extern int add_watch_to_object(struct watch *, struct watch_list *); extern int remove_watch_from_object(struct watch_list *, struct watch_queue *, u64, bool); -extern long watch_queue_set_size(struct pipe_inode_info *, unsigned int); extern long watch_queue_set_filter(struct pipe_inode_info *, struct watch_notification_filter __user *); extern int watch_queue_init(struct pipe_inode_info *); @@ -103,10 +103,11 @@ static inline void init_watch_list(struct watch_list *wlist, static inline void post_watch_notification(struct watch_list *wlist, struct watch_notification *n, const struct cred *cred, + gfp_t gfp, u64 id) { if (unlikely(wlist)) - __post_watch_notification(wlist, n, cred, id); + __post_watch_notification(wlist, n, cred, gfp, id); } static inline void remove_watch_list(struct watch_list *wlist, u64 id)
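Converted callers pass the allocation mode between the credentials and the
ID, e.g. (sketch; wlist, n and id stand for the caller's own variables):

	post_watch_notification(wlist, &n->watch, current_cred(),
				GFP_KERNEL, id);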
diff --git a/kernel/relay.c b/kernel/relay.c index 9aa70ae..5e48d03 100644 --- a/kernel/relay.c +++ b/kernel/relay.c
@@ -1086,9 +1086,12 @@ static void relay_pipe_buf_release(struct pipe_inode_info *pipe,
				   struct pipe_buffer *buf)
 {
	struct rchan_buf *rbuf;
+	unsigned int size = buf->bvec[buf->index].bv_len;
 
-	rbuf = (struct rchan_buf *)page_private(buf->page);
-	relay_consume_bytes(rbuf, buf->private);
+	rbuf = buf->private;
+	if (buf->index == buf->nr - 1)
+		size += buf->private_2;	/* Account for end padding */
+	relay_consume_bytes(rbuf, size);
 }
 
 static const struct pipe_buf_operations relay_pipe_buf_ops = {
@@ -1097,10 +1100,6 @@ static const struct pipe_buf_operations relay_pipe_buf_ops = {
-	.get = generic_pipe_buf_get,
+	.get_pages = generic_pipe_buf_get,
 };
 
-static void relay_page_release(struct splice_pipe_desc *spd, unsigned int i)
-{
-}
-
 /*
  * subbuf_splice_actor - splice up to one subbuf's worth of data
  */
@@ -1112,6 +1111,7 @@ static ssize_t subbuf_splice_actor(struct file *in,
				   int *nonpad_ret)
 {
	unsigned int pidx, poff, total_len, subbuf_pages, nr_pages;
+	struct pipe_buffer *buf;
	struct rchan_buf *rbuf = in->private_data;
	unsigned int subbuf_size = rbuf->chan->subbuf_size;
	uint64_t pos = (uint64_t) *ppos;
@@ -1120,22 +1120,12 @@ static ssize_t subbuf_splice_actor(struct file *in,
	size_t read_subbuf = read_start / subbuf_size;
	size_t padding = rbuf->padding[read_subbuf];
	size_t nonpad_end = read_subbuf * subbuf_size + subbuf_size - padding;
-	struct page *pages[PIPE_DEF_BUFFERS];
-	struct partial_page partial[PIPE_DEF_BUFFERS];
-	struct splice_pipe_desc spd = {
-		.pages = pages,
-		.nr_pages = 0,
-		.nr_pages_max = PIPE_DEF_BUFFERS,
-		.partial = partial,
-		.ops = &relay_pipe_buf_ops,
-		.spd_release = relay_page_release,
-	};
-	ssize_t ret;
+	ssize_t spliced = 0;
+	bool full = false;
+	int ret = 0;
 
	if (rbuf->subbufs_produced == rbuf->subbufs_consumed)
		return 0;
-	if (splice_grow_spd(pipe, &spd))
-		return -ENOMEM;
 
	/*
	 * Adjust read len, if longer than what is available
@@ -1146,54 +1136,57 @@ static ssize_t subbuf_splice_actor(struct file *in,
	subbuf_pages = rbuf->chan->alloc_size >> PAGE_SHIFT;
	pidx = (read_start / PAGE_SIZE) % subbuf_pages;
	poff = read_start & ~PAGE_MASK;
-	nr_pages = min_t(unsigned int, subbuf_pages, spd.nr_pages_max);
+	nr_pages = min_t(unsigned int, subbuf_pages, PIPE_DEF_BUFFERS);
 
-	for (total_len = 0; spd.nr_pages < nr_pages; spd.nr_pages++) {
-		unsigned int this_len, this_end, private;
+	buf = pipe_alloc_buffer(pipe, &relay_pipe_buf_ops, nr_pages,
+				GFP_KERNEL, &ret);
+	if (!buf)
+		return ret;
+
+	buf->private = rbuf;
+	/* buf->private_2 = 0; -- The amount of padding after the last segment */
+
+	for (total_len = 0; buf->nr < nr_pages;) {
+		struct folio *folio;
+		unsigned int this_len, this_end;
		unsigned int cur_pos = read_start + total_len;
 
		if (!len)
			break;
 
		this_len = min_t(unsigned long, len, PAGE_SIZE - poff);
-		private = this_len;
-
-		spd.pages[spd.nr_pages] = rbuf->page_array[pidx];
-		spd.partial[spd.nr_pages].offset = poff;
 
		this_end = cur_pos + this_len;
		if (this_end >= nonpad_end) {
			this_len = nonpad_end - cur_pos;
-			private = this_len + padding;
+			buf->private_2 = padding;
		}
-		spd.partial[spd.nr_pages].len = this_len;
-		spd.partial[spd.nr_pages].private = private;
+
+		folio = page_folio(rbuf->page_array[pidx]);
+
+		bvec_set_folio(&buf->bvec[buf->nr], folio, this_len, poff);
+		buf->size += this_len;
+		buf->footprint += folio_nr_pages(folio);
+		// TODO: Take page ref?
len -= this_len; total_len += this_len; poff = 0; pidx = (pidx + 1) % subbuf_pages; + buf->nr++; - if (this_end >= nonpad_end) { - spd.nr_pages++; + if (this_end >= nonpad_end) break; - } } - ret = 0; - if (!spd.nr_pages) + spliced = *nonpad_ret = pipe_add(pipe, buf, &full); + if (spliced < total_len) goto out; - ret = *nonpad_ret = splice_to_pipe(pipe, &spd); - if (ret < 0 || ret < total_len) - goto out; - - if (read_start + ret == nonpad_end) - ret += padding; + if (read_start + spliced == nonpad_end) + spliced += padding; out: - splice_shrink_spd(&spd); - return ret; + return spliced; } static ssize_t relay_file_splice_read(struct file *in, @@ -1228,10 +1221,7 @@ static ssize_t relay_file_splice_read(struct file *in, nonpad_ret = 0; } - if (spliced) - return spliced; - - return ret; + return spliced ?: ret; } const struct file_operations relay_file_operations = {
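Each producer converted away from splice_pipe_desc must keep two counters in
step: buf->size (bytes readable) and buf->footprint (pages pinned), since
pipe_add() returns the former and charges the latter against
pipe->max_footprint.  A hypothetical helper that the call sites in this patch
effectively open-code:

	static void example_buf_add_folio(struct pipe_buffer *buf,
					  struct folio *folio,
					  unsigned int len, unsigned int offset)
	{
		bvec_set_folio(&buf->bvec[buf->nr], folio, len, offset);
		buf->nr++;
		buf->size += len;
		buf->footprint += folio_nr_pages(folio);
	}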
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 45551c7..3833744 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c
@@ -6399,7 +6399,7 @@ static void tracing_set_nop(struct trace_array *tr)
 {
	if (tr->current_trace == &nop_trace)
		return;
- 
+
	tr->current_trace->enabled--;
 
	if (tr->current_trace->reset)
@@ -6901,12 +6901,6 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
	return sret;
 }
 
-static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
-				     unsigned int idx)
-{
-	__free_page(spd->pages[idx]);
-}
-
 static size_t
 tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
 {
@@ -6960,23 +6954,17 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
					size_t len,
					unsigned int flags)
 {
-	struct page *pages_def[PIPE_DEF_BUFFERS];
-	struct partial_page partial_def[PIPE_DEF_BUFFERS];
+	struct pipe_buffer *buf;
	struct trace_iterator *iter = filp->private_data;
-	struct splice_pipe_desc spd = {
-		.pages		= pages_def,
-		.partial	= partial_def,
-		.nr_pages	= 0, /* This gets updated below. */
-		.nr_pages_max	= PIPE_DEF_BUFFERS,
-		.ops		= &default_pipe_buf_ops,
-		.spd_release	= tracing_spd_release_pipe,
-	};
	ssize_t ret;
	size_t rem;
+	bool full = false;
	unsigned int i;
 
-	if (splice_grow_spd(pipe, &spd))
+	buf = kzalloc(struct_size(buf, bvec, PIPE_DEF_BUFFERS), GFP_KERNEL);
+	if (!buf)
		return -ENOMEM;
+	buf->ops = &default_pipe_buf_ops;
+	buf->max = PIPE_DEF_BUFFERS;
 
	mutex_lock(&iter->mutex);
 
@@ -7000,42 +6988,41 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
	trace_access_lock(iter->cpu_file);
 
	/* Fill as many pages as possible. */
-	for (i = 0, rem = len; i < spd.nr_pages_max && rem; i++) {
-		spd.pages[i] = alloc_page(GFP_KERNEL);
-		if (!spd.pages[i])
+	for (i = 0, rem = len; i < PIPE_DEF_BUFFERS && rem; i++) {
+		struct folio *folio;
+		void *p;
+
+		folio = folio_alloc(GFP_KERNEL, 0);
+		if (!folio)
			break;
 
		rem = tracing_fill_pipe_page(rem, iter);
 
		/* Copy the data into the page, so we can start over. */
-		ret = trace_seq_to_buffer(&iter->seq,
-					  page_address(spd.pages[i]),
+		p = kmap_local_folio(folio, 0);
+		ret = trace_seq_to_buffer(&iter->seq, p,
					  trace_seq_used(&iter->seq));
+		kunmap_local(p);
		if (ret < 0) {
-			__free_page(spd.pages[i]);
+			folio_put(folio);
			break;
		}
-		spd.partial[i].offset = 0;
-		spd.partial[i].len = trace_seq_used(&iter->seq);
+		bvec_set_folio(&buf->bvec[i], folio, trace_seq_used(&iter->seq), 0);
+		buf->size += trace_seq_used(&iter->seq);
+		buf->footprint++;
 
		trace_seq_init(&iter->seq);
	}
+	buf->nr = i;
 
	trace_access_unlock(iter->cpu_file);
	trace_event_read_unlock();
	mutex_unlock(&iter->mutex);
 
-	spd.nr_pages = i;
-
-	if (i)
-		ret = splice_to_pipe(pipe, &spd);
-	else
-		ret = 0;
+	ret = pipe_add(pipe, buf, &full);
 out:
-	splice_shrink_spd(&spd);
	return ret;
 
 out_err:
+	pipe_add(pipe, buf, &full);	/* buf->size is 0 here, so this just discards it */
	mutex_unlock(&iter->mutex);
	goto out;
 }
@@ -8297,19 +8284,6 @@ static const struct pipe_buf_operations buffer_pipe_buf_ops = {
-	.get			= buffer_pipe_buf_get,
+	.get_pages		= buffer_pipe_buf_get,
 };
 
-/*
- * Callback from splice_to_pipe(), if we need to release some pages
- * at the end of the spd in case we error'ed out in filling the pipe.
- */
-static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i)
-{
-	struct buffer_ref *ref =
-		(struct buffer_ref *)spd->partial[i].private;
-
-	buffer_ref_release(ref);
-	spd->partial[i].private = 0;
-}
-
 static ssize_t
 tracing_buffers_splice_read(struct file *file, loff_t *ppos,
			    struct pipe_inode_info *pipe, size_t len,
@@ -8317,23 +8291,11 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
 {
	struct ftrace_buffer_info *info = file->private_data;
	struct trace_iterator *iter = &info->iter;
-	struct partial_page partial_def[PIPE_DEF_BUFFERS];
-	struct page *pages_def[PIPE_DEF_BUFFERS];
-	struct splice_pipe_desc spd = {
-		.pages		= pages_def,
-		.partial	= partial_def,
-		.nr_pages_max	= PIPE_DEF_BUFFERS,
-		.ops		= &buffer_pipe_buf_ops,
-		.spd_release	= buffer_spd_release,
-	};
-	struct buffer_ref *ref;
-	int entries, i;
-	ssize_t ret = 0;
-
-#ifdef CONFIG_TRACER_MAX_TRACE
-	if (iter->snapshot && iter->tr->current_trace->use_max_tr)
-		return -EBUSY;
-#endif
+	struct pipe_buffer *buf;
+	struct buffer_ref **refs;
+	ssize_t spliced;
+	bool full = false;
+	int ret = 0, entries;
 
	if (*ppos & (PAGE_SIZE - 1))
		return -EINVAL;
@@ -8344,15 +8306,31 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
		len &= PAGE_MASK;
	}
 
-	if (splice_grow_spd(pipe, &spd))
+#ifdef CONFIG_TRACER_MAX_TRACE
+	if (iter->snapshot && iter->tr->current_trace->use_max_tr)
+		return -EBUSY;
+#endif
+
+	buf = pipe_alloc_buffer(pipe, &buffer_pipe_buf_ops, PIPE_DEF_BUFFERS,
+				GFP_KERNEL, &ret);
+	if (!buf)
+		return ret;
+
+	refs = kcalloc(PIPE_DEF_BUFFERS, sizeof(*refs), GFP_KERNEL);
+	if (!refs) {
+		pipe_add(pipe, buf, &full);
		return -ENOMEM;
+	}
+
+	buf->private_2 = (unsigned long)refs;
 
 again:
	trace_access_lock(iter->cpu_file);
-	entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);
 
-	for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) {
-		struct page *page;
+	while (len && buf->nr < buf->max &&
+	       (entries = ring_buffer_entries_cpu(iter->array_buffer->buffer,
						  iter->cpu_file))) {
+		struct buffer_ref *ref;
		int r;
 
		ref = kzalloc(sizeof(*ref), GFP_KERNEL);
@@ -8381,28 +8359,20 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
			break;
		}
 
-		page = virt_to_page(ref->page);
-
-		spd.pages[i] = page;
-		spd.partial[i].len = PAGE_SIZE;
-		spd.partial[i].offset = 0;
-		spd.partial[i].private = (unsigned long)ref;
-		spd.nr_pages++;
+		bvec_set_page(&buf->bvec[buf->nr], virt_to_page(ref->page),
+			      PAGE_SIZE, 0);
+		refs[buf->nr] = ref;	// TODO: Use page->private?
+		buf->nr++;
+		buf->size += PAGE_SIZE;
+		buf->footprint++;
 
		*ppos += PAGE_SIZE;
-
-		entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);
+		len -= PAGE_SIZE;
	}
 
	trace_access_unlock(iter->cpu_file);
-	spd.nr_pages = i;
 
	/* did we read anything? */
-	if (!spd.nr_pages) {
+	if (!ret && !buf->nr) {
		long wait_index;
 
-		if (ret)
-			goto out;
-
		ret = -EAGAIN;
		if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK))
			goto out;
@@ -8425,11 +8395,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
		goto again;
	}
 
-	ret = splice_to_pipe(pipe, &spd);
 out:
-	splice_shrink_spd(&spd);
-
-	return ret;
+	spliced = pipe_add(pipe, buf, &full);
+	return spliced ?: ret;
 }
 
 /* An ioctl call with cmd 0 to the ring buffer file will wake up all waiters */
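With partial[].private gone, per-page cookies need a home of their own; here
the buffer_ref pointers are parked in a side array hung off buf->private_2.
A sketch of the matching release side (hypothetical - as the TODO above
notes, the real implementation may move the cookie to page->private instead):

	static void example_buffer_release(struct pipe_inode_info *pipe,
					   struct pipe_buffer *buf)
	{
		struct buffer_ref **refs = (struct buffer_ref **)buf->private_2;
		unsigned int j;

		for (j = 0; j < buf->nr; j++)
			buffer_ref_release(refs[j]);
		kfree(refs);
	}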
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index df89779..0541e3e 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c
@@ -26,6 +26,7 @@
 #include <linux/sched/signal.h>
 #include <linux/watch_queue.h>
 #include <linux/pipe_fs_i.h>
+#include <linux/uio.h>
 #include "../fs/pipe.h"
 #include "../fs/internal.h"
@@ -33,9 +34,6 @@
 MODULE_DESCRIPTION("Watch queue");
 MODULE_AUTHOR("Red Hat, Inc.");
 MODULE_LICENSE("GPL");
 
-#define WATCH_QUEUE_NOTE_SIZE 128
-#define WATCH_QUEUE_NOTES_PER_PAGE (PAGE_SIZE / WATCH_QUEUE_NOTE_SIZE)
-
 /*
  * This must be called under the RCU read-lock, which makes
  * sure that the wqueue still exists. It can then take the lock,
@@ -57,37 +55,45 @@ static inline void unlock_wqueue(struct watch_queue *wqueue)
	spin_unlock_bh(&wqueue->lock);
 }
 
+static ssize_t watchqueue_copy_buf_to_iter(struct pipe_inode_info *pipe,
+					   struct pipe_buffer *buf,
+					   struct iov_iter *iter)
+{
+	const struct watch_notification *n = buf->private;
+	struct watch_notification hdr;
+	size_t size = buf->size, body = size - sizeof(hdr);
+	__u32 id = buf->private_2 & 0xff;
+
+	/* Substitute the ID at the point of copying so the notification buffer
+	 * can be shared
+	 */
+	hdr = *n;
+	hdr.info &= ~WATCH_INFO_ID;
+	hdr.info |= id << WATCH_INFO_ID__SHIFT;
+
+	if (size > iov_iter_count(iter))
+		return -ENOBUFS; /* All or nothing */
+	if (copy_to_iter(&hdr, sizeof(hdr), iter) != sizeof(hdr))
+		return -EFAULT;
+	if (size > sizeof(hdr) &&
+	    copy_to_iter(buf->private + sizeof(hdr), body, iter) != body)
+		return -EFAULT;
+	kfree(buf->private);
+	buf->private = NULL;	/* ->release() would otherwise free it again */
+	buf->size = 0;
+	return size;
+}
+
 static void watch_queue_pipe_buf_release(struct pipe_inode_info *pipe,
					 struct pipe_buffer *buf)
 {
-	struct watch_queue *wqueue = (struct watch_queue *)buf->private;
-	struct page *page;
-	unsigned int bit;
-
-	/* We need to work out which note within the page this refers to, but
-	 * the note might have been maximum size, so merely ANDing the offset
-	 * off doesn't work. OTOH, the note must've been more than zero size.
-	 */
-	bit = buf->offset + buf->len;
-	if ((bit & (WATCH_QUEUE_NOTE_SIZE - 1)) == 0)
-		bit -= WATCH_QUEUE_NOTE_SIZE;
-	bit /= WATCH_QUEUE_NOTE_SIZE;
-
-	page = buf->page;
-	bit += page->index;
-
-	set_bit(bit, wqueue->notes_bitmap);
-	generic_pipe_buf_release(pipe, buf);
+	kfree(buf->private);
 }
 
-// No try_steal function => no stealing
-#define watch_queue_pipe_buf_try_steal NULL
-
 /* New data written to a pipe may be appended to a buffer with this type. */
 static const struct pipe_buf_operations watch_queue_pipe_buf_ops = {
	.release	= watch_queue_pipe_buf_release,
-	.try_steal	= watch_queue_pipe_buf_try_steal,
-	.get		= generic_pipe_buf_get,
+	.get_pages	= generic_pipe_buf_get,
+	.copy_to_iter	= watchqueue_copy_buf_to_iter,
 };
 
 /*
@@ -97,62 +103,52 @@ static const struct pipe_buf_operations watch_queue_pipe_buf_ops = {
 * watch_queue lock held, which guarantees that the pipe
 * hasn't been released.
 */
-static bool post_one_notification(struct watch_queue *wqueue,
-				  struct watch_notification *n)
+static void post_one_notification(struct watch_queue *wqueue,
+				  struct watch_notification *n,
+				  unsigned int id)
 {
-	void *p;
+	struct watch_notification *buf2;
	struct pipe_inode_info *pipe = wqueue->pipe;
	struct pipe_buffer *buf;
-	struct page *page;
-	unsigned int note, offset, len;
-	bool done = false, full = false;
+	unsigned int len = n->info & WATCH_INFO_LENGTH;
+	bool wake = false, full = false;
	int error = 0;
 
	if (!pipe)
-		return false;
-
-	spin_lock_irq(&pipe->rd_wait.lock);
+		return;
 
	buf = pipe_alloc_buffer(pipe, &watch_queue_pipe_buf_ops, 1,
				GFP_ATOMIC, &error);
-	if (IS_ERR_OR_NULL(buf))
+	if (!buf)
+		goto lost;
+	buf2 = kmemdup(n, len, GFP_ATOMIC);
+	if (!buf2)
		goto lost;
 
-	note = find_first_bit(wqueue->notes_bitmap, wqueue->nr_notes);
-	if (note >= wqueue->nr_notes)
-		goto lost;
+	buf->flags = PIPE_BUF_FLAG_WHOLE;
+	buf->ops = &watch_queue_pipe_buf_ops;
+	buf->private = buf2;
+	buf->private_2 = id;
+	buf->size = len;
+	buf->footprint += len;
 
-	page = wqueue->notes[note / WATCH_QUEUE_NOTES_PER_PAGE];
-	offset = note % WATCH_QUEUE_NOTES_PER_PAGE * WATCH_QUEUE_NOTE_SIZE;
-	get_page(page);
-	len = n->info & WATCH_INFO_LENGTH;
-	p = kmap_atomic(page);
-	memcpy(p + offset, n, len);
-	kunmap_atomic(p);
-
-	buf->page = page;
-	buf->private = (unsigned long)wqueue;
-	buf->offset = offset;
-	buf->len = len;
-	buf->flags = PIPE_BUF_FLAG_WHOLE;
	pipe_add(pipe, buf, &full);
-
-	if (!test_and_clear_bit(note, wqueue->notes_bitmap)) {
-		spin_unlock_irq(&pipe->rd_wait.lock);
-		BUG();
-	}
-	wake_up_interruptible_sync_poll_locked(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
-	done = true;
-
-out:
-	spin_unlock_irq(&pipe->rd_wait.lock);
-	if (done)
+	wake = true;
+
+wake:
+	if (wake) {
+		/* pipe_add() dropped rd_wait.lock, so use the unlocked helper */
+		wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM);
		kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
-	return done;
+	}
+	return;
 
 lost:
	pipe_set_lost_mark(pipe);
-	goto out;
+	if (buf)
		pipe_buf_release(pipe, buf);
+	wake = true;
+	goto wake;
 }
 
 /*
@@ -186,6 +182,7 @@ static bool filter_watch_notification(const struct watch_filter *wf,
 * @wlist: The watch list to post the event to.
 * @n: The notification record to post.
 * @cred: The creds of the process that triggered the notification.
+ * @gfp: Allocation flags for notification and pipe buf.
 * @id: The ID to match on the watch.
 *
 * Post a notification of an event into a set of watch queues and let the users
@@ -197,6 +194,7 @@ static bool filter_watch_notification(const struct watch_filter *wf,
 void __post_watch_notification(struct watch_list *wlist,
			       struct watch_notification *n,
			       const struct cred *cred,
+			       gfp_t gfp,
			       u64 id)
 {
	const struct watch_filter *wf;
@@ -225,7 +223,7 @@ void __post_watch_notification(struct watch_list *wlist,
			continue;
 
		if (lock_wqueue(wqueue)) {
-			post_one_notification(wqueue, n);
+			post_one_notification(wqueue, n, watch->info_id);
			unlock_wqueue(wqueue);
		}
	}
@@ -235,75 +233,6 @@ void __post_watch_notification(struct watch_list *wlist,
 EXPORT_SYMBOL(__post_watch_notification);
 
 /*
- * Allocate sufficient pages to preallocation for the requested number of
- * notifications.
- */ -long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) -{ - struct watch_queue *wqueue = pipe->watch_queue; - struct page **pages; - unsigned long *bitmap; - unsigned long user_bufs; - int ret, i, nr_pages; - - if (!wqueue) - return -ENODEV; - if (wqueue->notes) - return -EBUSY; - - if (nr_notes < 1 || - nr_notes > 512) /* TODO: choose a better hard limit */ - return -EINVAL; - - nr_pages = (nr_notes + WATCH_QUEUE_NOTES_PER_PAGE - 1); - nr_pages /= WATCH_QUEUE_NOTES_PER_PAGE; - user_bufs = account_pipe_buffers(pipe->user, pipe->nr_accounted, nr_pages); - - if (nr_pages > pipe->max_usage && - (too_many_pipe_buffers_hard(user_bufs) || - too_many_pipe_buffers_soft(user_bufs)) && - pipe_is_unprivileged_user()) { - ret = -EPERM; - goto error; - } - - nr_notes = nr_pages * WATCH_QUEUE_NOTES_PER_PAGE; - ret = pipe_resize_ring(pipe, roundup_pow_of_two(nr_notes)); - if (ret < 0) - goto error; - - pages = kcalloc(sizeof(struct page *), nr_pages, GFP_KERNEL); - if (!pages) - goto error; - - for (i = 0; i < nr_pages; i++) { - pages[i] = alloc_page(GFP_KERNEL); - if (!pages[i]) - goto error_p; - pages[i]->index = i * WATCH_QUEUE_NOTES_PER_PAGE; - } - - bitmap = bitmap_alloc(nr_notes, GFP_KERNEL); - if (!bitmap) - goto error_p; - - bitmap_fill(bitmap, nr_notes); - wqueue->notes = pages; - wqueue->notes_bitmap = bitmap; - wqueue->nr_pages = nr_pages; - wqueue->nr_notes = nr_notes; - return 0; - -error_p: - while (--i >= 0) - __free_page(pages[i]); - kfree(pages); -error: - (void) account_pipe_buffers(pipe->user, nr_pages, pipe->nr_accounted); - return ret; -} - -/* * Set the filter on a watch queue. */ long watch_queue_set_filter(struct pipe_inode_info *pipe, @@ -560,7 +489,7 @@ int remove_watch_from_object(struct watch_list *wlist, struct watch_queue *wq, wqueue = rcu_dereference(watch->queue); if (lock_wqueue(wqueue)) { - post_one_notification(wqueue, &n.watch); + post_one_notification(wqueue, &n.watch, watch->info_id); if (!hlist_unhashed(&watch->queue_node)) { hlist_del_init_rcu(&watch->queue_node);
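In summary, a posted notification now travels as a kmalloc'd copy rather than
a slot in preallocated note pages: the record is kmemdup()'d into
buf->private with the watch ID stashed in buf->private_2, pipe_add() queues
the one-segment buffer, and watchqueue_copy_buf_to_iter() substitutes
WATCH_INFO_ID into the header as it copies to the reader, freeing the copy
once it has been fully delivered.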
diff --git a/mm/filemap.c b/mm/filemap.c index b75a4b7..8f34ab4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c
@@ -2841,39 +2841,6 @@ generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
 EXPORT_SYMBOL(generic_file_read_iter);
 
 /*
- * Splice subpages from a folio into a pipe.
- */
-size_t splice_folio_into_pipe(struct pipe_inode_info *pipe,
-			      struct folio *folio, loff_t fpos, size_t size,
-			      bool *full, int *error)
-{
-	struct pipe_buffer *buf;
-	struct page *page;
-	size_t spliced = 0, offset = offset_in_folio(folio, fpos);
-
-	page = folio_page(folio, offset / PAGE_SIZE);
-	size = min(size, folio_size(folio) - offset);
-	offset %= PAGE_SIZE;
-
-	while (spliced < size &&
-	       (buf = pipe_alloc_buffer(pipe, &page_cache_pipe_buf_ops, 1,
-					GFP_KERNEL, error))) {
-		size_t part = min_t(size_t, PAGE_SIZE - offset, size - spliced);
-
-		buf->page = page++;
-		buf->offset = offset;
-		buf->len = part;
-		folio_get(folio);
-		spliced += pipe_add(pipe, buf, full);
-		offset = 0;
-		if (*full)
-			break;
-	}
-
-	return spliced;
-}
-
-/*
 * Splice folios from the pagecache of a buffered (ie. non-O_DIRECT) file into
 * a pipe.
 */
@@ -2881,9 +2848,10 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
			    struct pipe_inode_info *pipe, size_t len,
			    unsigned int flags)
 {
+	struct pipe_buffer *buf = NULL;
	struct folio_batch fbatch;
	struct kiocb iocb;
-	size_t total_spliced = 0;
+	size_t total_spliced = 0, max_pages;
	loff_t isize, end_offset;
	bool writably_mapped, full = false;
	int i, error = 0;
 
	iocb.ki_pos = *ppos;
 
	/* Work out how much data we can actually add into the pipe */
-	if (!pipe_query_space(pipe, &len, &error))
+	max_pages = pipe_query_space(pipe, &len, &error);
+	if (!max_pages)
		return error;
 
	folio_batch_init(&fbatch);
 
+	buf = pipe_alloc_buffer(pipe, &page_cache_pipe_buf_ops, max_pages,
+				GFP_KERNEL, &error);
+	if (!buf)
+		goto out;
+
	do {
		cond_resched();
@@ -2929,10 +2903,11 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
		for (i = 0; i < folio_batch_count(&fbatch); i++) {
			struct folio *folio = fbatch.folios[i];
-			size_t n;
+			struct bio_vec *bv = buf->bvec;
+			size_t n, o;
 
			if (folio_pos(folio) >= end_offset)
-				goto out;
+				break;
 
			folio_mark_accessed(folio);
 
			/*
@@ -2943,22 +2918,32 @@ ssize_t filemap_splice_read(struct file *in, loff_t *ppos,
			if (writably_mapped)
				flush_dcache_folio(folio);
 
+			o = offset_in_folio(folio, *ppos);
			n = min_t(loff_t, len, isize - *ppos);
-			n = splice_folio_into_pipe(pipe, folio, *ppos, n,
-						   &full, &error);
-			if (!n)
-				goto out;
+			n = min_t(size_t, n, folio_size(folio) - o);
+
+			bv[buf->nr].bv_folio = folio;
+			bv[buf->nr].bv_offset = o;
+			bv[buf->nr].bv_len = n;
+			buf->nr++;
+			buf->size += n;
+			buf->footprint += folio_nr_pages(folio);
+
+			folio_get(folio); //fbatch.folios[i] = NULL;
+
			len -= n;
			total_spliced += n;
			*ppos += n;
			in->f_ra.prev_pos = *ppos;
+
+			if (buf->footprint >= max_pages)
+				goto add;
		}
 
		folio_batch_release(&fbatch);
	} while (len);
 
+add:
+	pipe_add(pipe, buf, &full);
 out:
	folio_batch_release(&fbatch);
	file_accessed(in);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9fa333e..bc736cb 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c
@@ -2744,15 +2744,6 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
 }
 EXPORT_SYMBOL(skb_copy_bits);
 
-/*
- * Callback from splice_to_pipe(), if we need to release some pages
- * at the end of the spd in case we error'ed out in filling the pipe.
- */
-static void sock_spd_release(struct splice_pipe_desc *spd, unsigned int i)
-{
-	put_page(spd->pages[i]);
-}
-
 static struct page *linear_to_page(struct page *page, unsigned int *len,
				   unsigned int *offset,
				   struct sock *sk)
@@ -2772,26 +2763,27 @@ static struct page *linear_to_page(struct page *page, unsigned int *len,
	return pfrag->page;
 }
 
-static bool spd_can_coalesce(const struct splice_pipe_desc *spd,
+static bool spd_can_coalesce(const struct pipe_buffer *buf,
			     struct page *page,
			     unsigned int offset)
 {
-	return	spd->nr_pages &&
-		spd->pages[spd->nr_pages - 1] == page &&
-		(spd->partial[spd->nr_pages - 1].offset +
-		 spd->partial[spd->nr_pages - 1].len == offset);
+	const struct bio_vec *p;
+
+	if (!buf->nr)
+		return false;
+	p = &buf->bvec[buf->nr - 1];
+	return p->bv_page == page &&
+	       p->bv_offset + p->bv_len == offset;
 }
 
 /*
 * Fill page/offset/length into spd, if it can hold more pages.
 */
-static bool spd_fill_page(struct splice_pipe_desc *spd,
+static bool spd_fill_page(struct pipe_buffer *buf,
			  struct pipe_inode_info *pipe, struct page *page,
			  unsigned int *len, unsigned int offset,
			  bool linear,
			  struct sock *sk)
 {
-	if (unlikely(spd->nr_pages == MAX_SKB_FRAGS))
+	if (unlikely(buf->nr == MAX_SKB_FRAGS))
		return true;
 
	if (linear) {
@@ -2799,23 +2791,22 @@ static bool spd_fill_page(struct splice_pipe_desc *spd,
		if (!page)
			return true;
	}
-	if (spd_can_coalesce(spd, page, offset)) {
-		spd->partial[spd->nr_pages - 1].len += *len;
+	if (spd_can_coalesce(buf, page, offset)) {
+		buf->bvec[buf->nr - 1].bv_len += *len;
+		buf->size += *len;
		return false;
	}
	get_page(page);
-	spd->pages[spd->nr_pages] = page;
-	spd->partial[spd->nr_pages].len = *len;
-	spd->partial[spd->nr_pages].offset = offset;
-	spd->nr_pages++;
-
+	buf->bvec[buf->nr].bv_page = page;
+	buf->bvec[buf->nr].bv_len = *len;
+	buf->bvec[buf->nr].bv_offset = offset;
+	buf->nr++;
+	buf->size += *len;
+	buf->footprint++;
	return false;
 }
 
 static bool __splice_segment(struct page *page, unsigned int poff,
			     unsigned int plen, unsigned int *off,
			     unsigned int *len,
-			     struct splice_pipe_desc *spd, bool linear,
+			     struct pipe_buffer *buf, bool linear,
			     struct sock *sk,
			     struct pipe_inode_info *pipe)
 {
@@ -2836,8 +2827,7 @@ static bool __splice_segment(struct page *page, unsigned int poff,
	do {
		unsigned int flen = min(*len, plen);
 
-		if (spd_fill_page(spd, pipe, page, &flen, poff,
-				  linear, sk))
+		if (spd_fill_page(buf, pipe, page, &flen, poff, linear, sk))
			return true;
		poff += flen;
		plen -= flen;
@@ -2853,7 +2843,7 @@ static bool __splice_segment(struct page *page, unsigned int poff,
 */
 static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
			      unsigned int *offset, unsigned int *len,
-			      struct splice_pipe_desc *spd, struct sock *sk)
+			      struct pipe_buffer *buf, struct sock *sk)
 {
	int seg;
	struct sk_buff *iter;
@@ -2866,7 +2856,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
	if (__splice_segment(virt_to_page(skb->data),
			     (unsigned long) skb->data & (PAGE_SIZE - 1),
			     skb_headlen(skb),
-			     offset, len, spd,
+			     offset, len, buf,
			     skb_head_is_locked(skb), sk, pipe))
		return true;
 
@@ -2879,7 +2869,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
		if (__splice_segment(skb_frag_page(f),
				     skb_frag_off(f), skb_frag_size(f),
-				     offset, len, spd, false, sk, pipe))
+				     offset, len, buf, false,
sk, pipe)) return true; } @@ -2892,7 +2882,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe, * left, so no point in going over the frag_list for the error * case. */ - if (__skb_splice_bits(iter, pipe, offset, len, spd, sk)) + if (__skb_splice_bits(iter, pipe, offset, len, buf, sk)) return true; } @@ -2907,23 +2897,17 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, struct pipe_inode_info *pipe, unsigned int tlen, unsigned int flags) { - struct partial_page partial[MAX_SKB_FRAGS]; - struct page *pages[MAX_SKB_FRAGS]; - struct splice_pipe_desc spd = { - .pages = pages, - .partial = partial, - .nr_pages_max = MAX_SKB_FRAGS, - .ops = &nosteal_pipe_buf_ops, - .spd_release = sock_spd_release, - }; + struct pipe_buffer *buf; + bool full = false; int ret = 0; - __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk); + buf = pipe_alloc_buffer(pipe, &nosteal_pipe_buf_ops, MAX_SKB_FRAGS, + GFP_KERNEL, &ret); + if (!buf) + return ret; - if (spd.nr_pages) - ret = splice_to_pipe(pipe, &spd); - - return ret; + __skb_splice_bits(skb, pipe, &offset, &tlen, buf, sk); + return pipe_add(pipe, buf, &full); } EXPORT_SYMBOL_GPL(skb_splice_bits);
diff --git a/security/keys/internal.h b/security/keys/internal.h index 3c1e712..990edf7 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h
@@ -195,7 +195,7 @@ static inline void notify_key(struct key *key,
	};
 
	post_watch_notification(key->watchers, &n.watch, current_cred(),
-				n.key_id);
+				GFP_KERNEL, n.key_id);
 #endif
 }