| // SPDX-License-Identifier: GPL-2.0 |
| /* |
| * FUSE: Filesystem in Userspace |
| * Copyright (c) 2023-2024 DataDirect Networks. |
| */ |
| |
| #include "fuse_i.h" |
| #include "dev_uring_i.h" |
| #include "fuse_dev_i.h" |
| |
| #include <linux/fs.h> |
| #include <linux/io_uring/cmd.h> |
| |
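| /* |
| * enable_uring is also writable at run time (permissions 0644 below), |
| * typically via /sys/module/fuse/parameters/enable_uring. |
| */ |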
| static bool __read_mostly enable_uring; |
| module_param(enable_uring, bool, 0644); |
| MODULE_PARM_DESC(enable_uring, |
| "Enable userspace communication through io-uring"); |
| |
| #define FUSE_URING_IOV_SEGS 2 /* header and payload */ |
| |
| bool fuse_uring_enabled(void) |
| { |
| return enable_uring; |
| } |
| |
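| /* |
| * Private data stored in the io_uring_cmd pdu area, linking the |
| * command back to its ring entry. |
| */ |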
| struct fuse_uring_pdu { |
| struct fuse_ring_ent *ent; |
| }; |
| |
| static const struct fuse_iqueue_ops fuse_io_uring_ops; |
| |
| static void uring_cmd_set_ring_ent(struct io_uring_cmd *cmd, |
| struct fuse_ring_ent *ring_ent) |
| { |
| struct fuse_uring_pdu *pdu = |
| io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu); |
| |
| pdu->ent = ring_ent; |
| } |
| |
| static struct fuse_ring_ent *uring_cmd_to_ring_ent(struct io_uring_cmd *cmd) |
| { |
| struct fuse_uring_pdu *pdu = |
| io_uring_cmd_to_pdu(cmd, struct fuse_uring_pdu); |
| |
| return pdu->ent; |
| } |
| |
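| /* |
| * Move queued background requests to the active request queue, within |
| * the per-queue and global background limits. |
| */ |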
| static void fuse_uring_flush_bg(struct fuse_ring_queue *queue) |
| { |
| struct fuse_ring *ring = queue->ring; |
| struct fuse_conn *fc = ring->fc; |
| |
| lockdep_assert_held(&queue->lock); |
| lockdep_assert_held(&fc->bg_lock); |
| |
| /* |
| * Allow one bg request per queue, ignoring global fc limits. |
| * This prevents a single queue from consuming all resources and |
| * eliminates the need for remote queue wake-ups when global |
| * limits are met but this queue has no more waiting requests. |
| */ |
| while ((fc->active_background < fc->max_background || |
| !queue->active_background) && |
| (!list_empty(&queue->fuse_req_bg_queue))) { |
| struct fuse_req *req; |
| |
| req = list_first_entry(&queue->fuse_req_bg_queue, |
| struct fuse_req, list); |
| fc->active_background++; |
| queue->active_background++; |
| |
| list_move_tail(&req->list, &queue->fuse_req_queue); |
| } |
| } |
| |
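| /* |
| * End a request that was handled through the ring: detach it from the |
| * entry, update background accounting and complete it. |
| */ |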
| static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, |
| int error) |
| { |
| struct fuse_ring_queue *queue = ent->queue; |
| struct fuse_ring *ring = queue->ring; |
| struct fuse_conn *fc = ring->fc; |
| |
| lockdep_assert_not_held(&queue->lock); |
| spin_lock(&queue->lock); |
| ent->fuse_req = NULL; |
| if (test_bit(FR_BACKGROUND, &req->flags)) { |
| queue->active_background--; |
| spin_lock(&fc->bg_lock); |
| fuse_uring_flush_bg(queue); |
| spin_unlock(&fc->bg_lock); |
| } |
| |
| spin_unlock(&queue->lock); |
| |
| if (error) |
| req->out.h.error = error; |
| |
| clear_bit(FR_SENT, &req->flags); |
| fuse_request_end(req); |
| } |
| |
| /* Abort all requests queued on the given ring queue */ |
| static void fuse_uring_abort_end_queue_requests(struct fuse_ring_queue *queue) |
| { |
| struct fuse_req *req; |
| LIST_HEAD(req_list); |
| |
| spin_lock(&queue->lock); |
| list_for_each_entry(req, &queue->fuse_req_queue, list) |
| clear_bit(FR_PENDING, &req->flags); |
| list_splice_init(&queue->fuse_req_queue, &req_list); |
| spin_unlock(&queue->lock); |
| |
| /* must not hold queue lock to avoid order issues with fi->lock */ |
| fuse_dev_end_requests(&req_list); |
| } |
| |
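| /* Abort and end queued requests on all ring queues */ |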
| void fuse_uring_abort_end_requests(struct fuse_ring *ring) |
| { |
| int qid; |
| struct fuse_ring_queue *queue; |
| struct fuse_conn *fc = ring->fc; |
| |
| for (qid = 0; qid < ring->nr_queues; qid++) { |
| queue = READ_ONCE(ring->queues[qid]); |
| if (!queue) |
| continue; |
| |
| queue->stopped = true; |
| |
| WARN_ON_ONCE(ring->fc->max_background != UINT_MAX); |
| spin_lock(&queue->lock); |
| spin_lock(&fc->bg_lock); |
| fuse_uring_flush_bg(queue); |
| spin_unlock(&fc->bg_lock); |
| spin_unlock(&queue->lock); |
| fuse_uring_abort_end_queue_requests(queue); |
| } |
| } |
| |
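| /* |
| * Check whether the oldest request assigned to an entry on the given |
| * list has exceeded the configured request timeout. |
| */ |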
| static bool ent_list_request_expired(struct fuse_conn *fc, struct list_head *list) |
| { |
| struct fuse_ring_ent *ent; |
| struct fuse_req *req; |
| |
| ent = list_first_entry_or_null(list, struct fuse_ring_ent, list); |
| if (!ent) |
| return false; |
| |
| req = ent->fuse_req; |
| |
| return time_is_before_jiffies(req->create_time + |
| fc->timeout.req_timeout); |
| } |
| |
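| /* Check all ring queues for an expired request */ |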
| bool fuse_uring_request_expired(struct fuse_conn *fc) |
| { |
| struct fuse_ring *ring = fc->ring; |
| struct fuse_ring_queue *queue; |
| int qid; |
| |
| if (!ring) |
| return false; |
| |
| for (qid = 0; qid < ring->nr_queues; qid++) { |
| queue = READ_ONCE(ring->queues[qid]); |
| if (!queue) |
| continue; |
| |
| spin_lock(&queue->lock); |
| if (fuse_request_expired(fc, &queue->fuse_req_queue) || |
| fuse_request_expired(fc, &queue->fuse_req_bg_queue) || |
| ent_list_request_expired(fc, &queue->ent_w_req_queue) || |
| ent_list_request_expired(fc, &queue->ent_in_userspace)) { |
| spin_unlock(&queue->lock); |
| return true; |
| } |
| spin_unlock(&queue->lock); |
| } |
| |
| return false; |
| } |
| |
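| /* |
| * Release all queue and ring memory on connection teardown. |
| */ |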
| void fuse_uring_destruct(struct fuse_conn *fc) |
| { |
| struct fuse_ring *ring = fc->ring; |
| int qid; |
| |
| if (!ring) |
| return; |
| |
| for (qid = 0; qid < ring->nr_queues; qid++) { |
| struct fuse_ring_queue *queue = ring->queues[qid]; |
| struct fuse_ring_ent *ent, *next; |
| |
| if (!queue) |
| continue; |
| |
| WARN_ON(!list_empty(&queue->ent_avail_queue)); |
| WARN_ON(!list_empty(&queue->ent_w_req_queue)); |
| WARN_ON(!list_empty(&queue->ent_commit_queue)); |
| WARN_ON(!list_empty(&queue->ent_in_userspace)); |
| |
| list_for_each_entry_safe(ent, next, &queue->ent_released, |
| list) { |
| list_del_init(&ent->list); |
| kfree(ent); |
| } |
| |
| kfree(queue->fpq.processing); |
| kfree(queue); |
| ring->queues[qid] = NULL; |
| } |
| |
| kfree(ring->queues); |
| kfree(ring); |
| fc->ring = NULL; |
| } |
| |
| /* |
| * Basic ring setup for this connection based on the provided configuration |
| */ |
| static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) |
| { |
| struct fuse_ring *ring; |
| size_t nr_queues = num_possible_cpus(); |
| struct fuse_ring *res = NULL; |
| size_t max_payload_size; |
| |
| ring = kzalloc(sizeof(*fc->ring), GFP_KERNEL_ACCOUNT); |
| if (!ring) |
| return NULL; |
| |
| ring->queues = kcalloc(nr_queues, sizeof(struct fuse_ring_queue *), |
| GFP_KERNEL_ACCOUNT); |
| if (!ring->queues) |
| goto out_err; |
| |
| max_payload_size = max(FUSE_MIN_READ_BUFFER, fc->max_write); |
| max_payload_size = max(max_payload_size, fc->max_pages * PAGE_SIZE); |
| |
| spin_lock(&fc->lock); |
| if (fc->ring) { |
| /* race, another thread created the ring in the meantime */ |
| spin_unlock(&fc->lock); |
| res = fc->ring; |
| goto out_err; |
| } |
| |
| init_waitqueue_head(&ring->stop_waitq); |
| |
| ring->nr_queues = nr_queues; |
| ring->fc = fc; |
| ring->max_payload_sz = max_payload_size; |
| smp_store_release(&fc->ring, ring); |
| |
| spin_unlock(&fc->lock); |
| return ring; |
| |
| out_err: |
| kfree(ring->queues); |
| kfree(ring); |
| return res; |
| } |
| |
| static struct fuse_ring_queue *fuse_uring_create_queue(struct fuse_ring *ring, |
| int qid) |
| { |
| struct fuse_conn *fc = ring->fc; |
| struct fuse_ring_queue *queue; |
| struct list_head *pq; |
| |
| queue = kzalloc(sizeof(*queue), GFP_KERNEL_ACCOUNT); |
| if (!queue) |
| return NULL; |
| pq = kcalloc(FUSE_PQ_HASH_SIZE, sizeof(struct list_head), GFP_KERNEL); |
| if (!pq) { |
| kfree(queue); |
| return NULL; |
| } |
| |
| queue->qid = qid; |
| queue->ring = ring; |
| spin_lock_init(&queue->lock); |
| |
| INIT_LIST_HEAD(&queue->ent_avail_queue); |
| INIT_LIST_HEAD(&queue->ent_commit_queue); |
| INIT_LIST_HEAD(&queue->ent_w_req_queue); |
| INIT_LIST_HEAD(&queue->ent_in_userspace); |
| INIT_LIST_HEAD(&queue->fuse_req_queue); |
| INIT_LIST_HEAD(&queue->fuse_req_bg_queue); |
| INIT_LIST_HEAD(&queue->ent_released); |
| |
| queue->fpq.processing = pq; |
| fuse_pqueue_init(&queue->fpq); |
| |
| spin_lock(&fc->lock); |
| if (ring->queues[qid]) { |
| spin_unlock(&fc->lock); |
| kfree(queue->fpq.processing); |
| kfree(queue); |
| return ring->queues[qid]; |
| } |
| |
| /* |
| * WRITE_ONCE and lock, as readers of ring->queues[] mostly don't |
| * take the lock at all. |
| */ |
| WRITE_ONCE(ring->queues[qid], queue); |
| spin_unlock(&fc->lock); |
| |
| return queue; |
| } |
| |
| static void fuse_uring_stop_fuse_req_end(struct fuse_req *req) |
| { |
| clear_bit(FR_SENT, &req->flags); |
| req->out.h.error = -ECONNABORTED; |
| fuse_request_end(req); |
| } |
| |
| /* |
| * Release a request/entry on connection tear down |
| */ |
| static void fuse_uring_entry_teardown(struct fuse_ring_ent *ent) |
| { |
| struct fuse_req *req; |
| struct io_uring_cmd *cmd; |
| |
| struct fuse_ring_queue *queue = ent->queue; |
| |
| spin_lock(&queue->lock); |
| cmd = ent->cmd; |
| ent->cmd = NULL; |
| req = ent->fuse_req; |
| ent->fuse_req = NULL; |
| if (req) { |
| /* remove entry from queue->fpq->processing */ |
| list_del_init(&req->list); |
| } |
| |
| /* |
| * The entry must not be freed immediately, as IO_URING_F_CANCEL |
| * accesses entries through direct pointers - there is a risk of a |
| * race with daemon termination (which triggers IO_URING_F_CANCEL |
| * and accesses entries without checking the list state first). |
| */ |
| list_move(&ent->list, &queue->ent_released); |
| ent->state = FRRS_RELEASED; |
| spin_unlock(&queue->lock); |
| |
| if (cmd) |
| io_uring_cmd_done(cmd, -ENOTCONN, 0, IO_URING_F_UNLOCKED); |
| |
| if (req) |
| fuse_uring_stop_fuse_req_end(req); |
| } |
| |
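| /* |
| * Move all entries in the expected state from the given list to a |
| * local teardown list, then release them without holding the queue |
| * lock. |
| */ |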
| static void fuse_uring_stop_list_entries(struct list_head *head, |
| struct fuse_ring_queue *queue, |
| enum fuse_ring_req_state exp_state) |
| { |
| struct fuse_ring *ring = queue->ring; |
| struct fuse_ring_ent *ent, *next; |
| ssize_t queue_refs = SSIZE_MAX; |
| LIST_HEAD(to_teardown); |
| |
| spin_lock(&queue->lock); |
| list_for_each_entry_safe(ent, next, head, list) { |
| if (ent->state != exp_state) { |
| pr_warn("entry teardown qid=%d state=%d expected=%d", |
| queue->qid, ent->state, exp_state); |
| continue; |
| } |
| |
| ent->state = FRRS_TEARDOWN; |
| list_move(&ent->list, &to_teardown); |
| } |
| spin_unlock(&queue->lock); |
| |
| /* no queue lock to avoid lock order issues */ |
| list_for_each_entry_safe(ent, next, &to_teardown, list) { |
| fuse_uring_entry_teardown(ent); |
| queue_refs = atomic_dec_return(&ring->queue_refs); |
| WARN_ON_ONCE(queue_refs < 0); |
| } |
| } |
| |
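| /* Tear down the entries that are available or in userspace */ |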
| static void fuse_uring_teardown_entries(struct fuse_ring_queue *queue) |
| { |
| fuse_uring_stop_list_entries(&queue->ent_in_userspace, queue, |
| FRRS_USERSPACE); |
| fuse_uring_stop_list_entries(&queue->ent_avail_queue, queue, |
| FRRS_AVAILABLE); |
| } |
| |
| /* |
| * Log state debug info |
| */ |
| static void fuse_uring_log_ent_state(struct fuse_ring *ring) |
| { |
| int qid; |
| struct fuse_ring_ent *ent; |
| |
| for (qid = 0; qid < ring->nr_queues; qid++) { |
| struct fuse_ring_queue *queue = ring->queues[qid]; |
| |
| if (!queue) |
| continue; |
| |
| spin_lock(&queue->lock); |
| /* |
| * Log entries from the intermediate queues; the other queues |
| * should be empty. |
| */ |
| list_for_each_entry(ent, &queue->ent_w_req_queue, list) { |
| pr_info(" ent-req-queue ring=%p qid=%d ent=%p state=%d\n", |
| ring, qid, ent, ent->state); |
| } |
| list_for_each_entry(ent, &queue->ent_commit_queue, list) { |
| pr_info(" ent-commit-queue ring=%p qid=%d ent=%p state=%d\n", |
| ring, qid, ent, ent->state); |
| } |
| spin_unlock(&queue->lock); |
| } |
| ring->stop_debug_log = 1; |
| } |
| |
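| /* |
| * Delayed work that retries the queue teardown until all entry |
| * references are gone. |
| */ |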
| static void fuse_uring_async_stop_queues(struct work_struct *work) |
| { |
| int qid; |
| struct fuse_ring *ring = |
| container_of(work, struct fuse_ring, async_teardown_work.work); |
| |
| /* XXX code dup */ |
| for (qid = 0; qid < ring->nr_queues; qid++) { |
| struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); |
| |
| if (!queue) |
| continue; |
| |
| fuse_uring_teardown_entries(queue); |
| } |
| |
| /* |
| * Some ring entries might be in the middle of IO operations, |
| * i.e. about to be handled by file_operations::uring_cmd or on |
| * the way to userspace - we could handle that with conditions in |
| * run time code, but it is easier/cleaner to have an async teardown |
| * handler that reschedules itself while queue references are left. |
| */ |
| if (atomic_read(&ring->queue_refs) > 0) { |
| if (time_after(jiffies, |
| ring->teardown_time + FUSE_URING_TEARDOWN_TIMEOUT)) |
| fuse_uring_log_ent_state(ring); |
| |
| schedule_delayed_work(&ring->async_teardown_work, |
| FUSE_URING_TEARDOWN_INTERVAL); |
| } else { |
| wake_up_all(&ring->stop_waitq); |
| } |
| } |
| |
| /* |
| * Stop the ring queues |
| */ |
| void fuse_uring_stop_queues(struct fuse_ring *ring) |
| { |
| int qid; |
| |
| for (qid = 0; qid < ring->nr_queues; qid++) { |
| struct fuse_ring_queue *queue = READ_ONCE(ring->queues[qid]); |
| |
| if (!queue) |
| continue; |
| |
| fuse_uring_teardown_entries(queue); |
| } |
| |
| if (atomic_read(&ring->queue_refs) > 0) { |
| ring->teardown_time = jiffies; |
| INIT_DELAYED_WORK(&ring->async_teardown_work, |
| fuse_uring_async_stop_queues); |
| schedule_delayed_work(&ring->async_teardown_work, |
| FUSE_URING_TEARDOWN_INTERVAL); |
| } else { |
| wake_up_all(&ring->stop_waitq); |
| } |
| } |
| |
| /* |
| * Handle IO_URING_F_CANCEL, which typically comes on daemon termination. |
| * |
| * Releasing the last entry should trigger fuse_dev_release() if |
| * the daemon was terminated. |
| */ |
| static void fuse_uring_cancel(struct io_uring_cmd *cmd, |
| unsigned int issue_flags) |
| { |
| struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); |
| struct fuse_ring_queue *queue; |
| bool need_cmd_done = false; |
| |
| /* |
| * Direct access on ent - it must not be freed as long as |
| * IO_URING_F_CANCEL might still come up. |
| */ |
| queue = ent->queue; |
| spin_lock(&queue->lock); |
| if (ent->state == FRRS_AVAILABLE) { |
| ent->state = FRRS_USERSPACE; |
| list_move_tail(&ent->list, &queue->ent_in_userspace); |
| need_cmd_done = true; |
| ent->cmd = NULL; |
| } |
| spin_unlock(&queue->lock); |
| |
| if (need_cmd_done) { |
| /* no queue lock to avoid lock order issues */ |
| io_uring_cmd_done(cmd, -ENOTCONN, 0, issue_flags); |
| } |
| } |
| |
| static void fuse_uring_prepare_cancel(struct io_uring_cmd *cmd, int issue_flags, |
| struct fuse_ring_ent *ring_ent) |
| { |
| uring_cmd_set_ring_ent(cmd, ring_ent); |
| io_uring_cmd_mark_cancelable(cmd, issue_flags); |
| } |
| |
| /* |
| * Check for errors and store them in the request |
| */ |
| static int fuse_uring_out_header_has_err(struct fuse_out_header *oh, |
| struct fuse_req *req, |
| struct fuse_conn *fc) |
| { |
| int err; |
| |
| err = -EINVAL; |
| if (oh->unique == 0) { |
| /* Not supported through io-uring yet */ |
| pr_warn_once("notify through fuse-io-uring not supported\n"); |
| goto err; |
| } |
| |
| if (oh->error <= -ERESTARTSYS || oh->error > 0) |
| goto err; |
| |
| if (oh->error) { |
| err = oh->error; |
| goto err; |
| } |
| |
| err = -ENOENT; |
| if ((oh->unique & ~FUSE_INT_REQ_BIT) != req->in.h.unique) { |
| pr_warn_ratelimited("unique mismatch, expected: %llu got %llu\n", |
| req->in.h.unique, |
| oh->unique & ~FUSE_INT_REQ_BIT); |
| goto err; |
| } |
| |
| /* |
| * Is it an interrupt reply ID? |
| * XXX: Not supported through fuse-io-uring yet, it should not even |
| * find the request - should not happen. |
| */ |
| WARN_ON_ONCE(oh->unique & FUSE_INT_REQ_BIT); |
| |
| err = 0; |
| err: |
| return err; |
| } |
| |
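| /* |
| * Copy the reply payload userspace wrote into the ring buffer back |
| * into the request's output arguments. |
| */ |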
| static int fuse_uring_copy_from_ring(struct fuse_ring *ring, |
| struct fuse_req *req, |
| struct fuse_ring_ent *ent) |
| { |
| struct fuse_copy_state cs; |
| struct fuse_args *args = req->args; |
| struct iov_iter iter; |
| int err; |
| struct fuse_uring_ent_in_out ring_in_out; |
| |
| err = copy_from_user(&ring_in_out, &ent->headers->ring_ent_in_out, |
| sizeof(ring_in_out)); |
| if (err) |
| return -EFAULT; |
| |
| err = import_ubuf(ITER_SOURCE, ent->payload, ring->max_payload_sz, |
| &iter); |
| if (err) |
| return err; |
| |
| fuse_copy_init(&cs, false, &iter); |
| cs.is_uring = true; |
| cs.req = req; |
| |
| return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); |
| } |
| |
| /* |
| * Copy data from the req to the ring buffer |
| */ |
| static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, |
| struct fuse_ring_ent *ent) |
| { |
| struct fuse_copy_state cs; |
| struct fuse_args *args = req->args; |
| struct fuse_in_arg *in_args = args->in_args; |
| int num_args = args->in_numargs; |
| int err; |
| struct iov_iter iter; |
| struct fuse_uring_ent_in_out ent_in_out = { |
| .flags = 0, |
| .commit_id = req->in.h.unique, |
| }; |
| |
| err = import_ubuf(ITER_DEST, ent->payload, ring->max_payload_sz, &iter); |
| if (err) { |
| pr_info_ratelimited("fuse: Import of user buffer failed\n"); |
| return err; |
| } |
| |
| fuse_copy_init(&cs, true, &iter); |
| cs.is_uring = true; |
| cs.req = req; |
| |
| if (num_args > 0) { |
| /* |
| * The expectation is that the first argument is the per-op header. |
| * Some opcodes have that as zero size. |
| */ |
| if (args->in_args[0].size > 0) { |
| err = copy_to_user(&ent->headers->op_in, in_args->value, |
| in_args->size); |
| if (err) { |
| pr_info_ratelimited( |
| "Copying the header failed.\n"); |
| return -EFAULT; |
| } |
| } |
| in_args++; |
| num_args--; |
| } |
| |
| /* copy the payload */ |
| err = fuse_copy_args(&cs, num_args, args->in_pages, |
| (struct fuse_arg *)in_args, 0); |
| if (err) { |
| pr_info_ratelimited("%s fuse_copy_args failed\n", __func__); |
| return err; |
| } |
| |
| ent_in_out.payload_sz = cs.ring.copied_sz; |
| err = copy_to_user(&ent->headers->ring_ent_in_out, &ent_in_out, |
| sizeof(ent_in_out)); |
| return err ? -EFAULT : 0; |
| } |
| |
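| /* Copy the request arguments and the fuse_in_header to the ring buffer */ |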
| static int fuse_uring_copy_to_ring(struct fuse_ring_ent *ent, |
| struct fuse_req *req) |
| { |
| struct fuse_ring_queue *queue = ent->queue; |
| struct fuse_ring *ring = queue->ring; |
| int err; |
| |
| err = -EIO; |
| if (WARN_ON(ent->state != FRRS_FUSE_REQ)) { |
| pr_err("qid=%d ring-req=%p invalid state %d on send\n", |
| queue->qid, ent, ent->state); |
| return err; |
| } |
| |
| err = -EINVAL; |
| if (WARN_ON(req->in.h.unique == 0)) |
| return err; |
| |
| /* copy the request */ |
| err = fuse_uring_args_to_ring(ring, req, ent); |
| if (unlikely(err)) { |
| pr_info_ratelimited("Copy to ring failed: %d\n", err); |
| return err; |
| } |
| |
| /* copy fuse_in_header */ |
| err = copy_to_user(&ent->headers->in_out, &req->in.h, |
| sizeof(req->in.h)); |
| if (err) { |
| err = -EFAULT; |
| return err; |
| } |
| |
| return 0; |
| } |
| |
| static int fuse_uring_prepare_send(struct fuse_ring_ent *ent, |
| struct fuse_req *req) |
| { |
| int err; |
| |
| err = fuse_uring_copy_to_ring(ent, req); |
| if (!err) |
| set_bit(FR_SENT, &req->flags); |
| else |
| fuse_uring_req_end(ent, req, err); |
| |
| return err; |
| } |
| |
| /* |
| * Write data to the ring buffer and send the request to userspace, |
| * which will read it. |
| * This is comparable to a classical read(/dev/fuse). |
| */ |
| static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent, |
| struct fuse_req *req, |
| unsigned int issue_flags) |
| { |
| struct fuse_ring_queue *queue = ent->queue; |
| int err; |
| struct io_uring_cmd *cmd; |
| |
| err = fuse_uring_prepare_send(ent, req); |
| if (err) |
| return err; |
| |
| spin_lock(&queue->lock); |
| cmd = ent->cmd; |
| ent->cmd = NULL; |
| ent->state = FRRS_USERSPACE; |
| list_move_tail(&ent->list, &queue->ent_in_userspace); |
| spin_unlock(&queue->lock); |
| |
| io_uring_cmd_done(cmd, 0, 0, issue_flags); |
| return 0; |
| } |
| |
| /* |
| * Make a ring entry available for fuse_req assignment |
| */ |
| static void fuse_uring_ent_avail(struct fuse_ring_ent *ent, |
| struct fuse_ring_queue *queue) |
| { |
| WARN_ON_ONCE(!ent->cmd); |
| list_move(&ent->list, &queue->ent_avail_queue); |
| ent->state = FRRS_AVAILABLE; |
| } |
| |
| /* Used to find the request on SQE commit */ |
| static void fuse_uring_add_to_pq(struct fuse_ring_ent *ent, |
| struct fuse_req *req) |
| { |
| struct fuse_ring_queue *queue = ent->queue; |
| struct fuse_pqueue *fpq = &queue->fpq; |
| unsigned int hash; |
| |
| req->ring_entry = ent; |
| hash = fuse_req_hash(req->in.h.unique); |
| list_move_tail(&req->list, &fpq->processing[hash]); |
| } |
| |
| /* |
| * Assign a fuse request to the given ring entry |
| */ |
| static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent, |
| struct fuse_req *req) |
| { |
| struct fuse_ring_queue *queue = ent->queue; |
| |
| lockdep_assert_held(&queue->lock); |
| |
| if (WARN_ON_ONCE(ent->state != FRRS_AVAILABLE && |
| ent->state != FRRS_COMMIT)) { |
| pr_warn("%s qid=%d state=%d\n", __func__, ent->queue->qid, |
| ent->state); |
| } |
| |
| clear_bit(FR_PENDING, &req->flags); |
| ent->fuse_req = req; |
| ent->state = FRRS_FUSE_REQ; |
| list_move_tail(&ent->list, &queue->ent_w_req_queue); |
| fuse_uring_add_to_pq(ent, req); |
| } |
| |
| /* Fetch the next fuse request if available */ |
| static struct fuse_req *fuse_uring_ent_assign_req(struct fuse_ring_ent *ent) |
| __must_hold(&queue->lock) |
| { |
| struct fuse_req *req; |
| struct fuse_ring_queue *queue = ent->queue; |
| struct list_head *req_queue = &queue->fuse_req_queue; |
| |
| lockdep_assert_held(&queue->lock); |
| |
| /* get and assign the next request while still holding the lock */ |
| req = list_first_entry_or_null(req_queue, struct fuse_req, list); |
| if (req) |
| fuse_uring_add_req_to_ring_ent(ent, req); |
| |
| return req; |
| } |
| |
| /* |
| * Read data from the ring buffer, which user space has written to. |
| * This is comparable to the handling of a classical write(/dev/fuse). |
| * Also make the ring entry available again for new fuse requests. |
| */ |
| static void fuse_uring_commit(struct fuse_ring_ent *ent, struct fuse_req *req, |
| unsigned int issue_flags) |
| { |
| struct fuse_ring *ring = ent->queue->ring; |
| struct fuse_conn *fc = ring->fc; |
| ssize_t err = 0; |
| |
| err = copy_from_user(&req->out.h, &ent->headers->in_out, |
| sizeof(req->out.h)); |
| if (err) { |
| req->out.h.error = -EFAULT; |
| goto out; |
| } |
| |
| err = fuse_uring_out_header_has_err(&req->out.h, req, fc); |
| if (err) { |
| /* req->out.h.error already set */ |
| goto out; |
| } |
| |
| err = fuse_uring_copy_from_ring(ring, req, ent); |
| out: |
| fuse_uring_req_end(ent, req, err); |
| } |
| |
| /* |
| * Get the next fuse req and send it |
| */ |
| static void fuse_uring_next_fuse_req(struct fuse_ring_ent *ent, |
| struct fuse_ring_queue *queue, |
| unsigned int issue_flags) |
| { |
| int err; |
| struct fuse_req *req; |
| |
| retry: |
| spin_lock(&queue->lock); |
| fuse_uring_ent_avail(ent, queue); |
| req = fuse_uring_ent_assign_req(ent); |
| spin_unlock(&queue->lock); |
| |
| if (req) { |
| err = fuse_uring_send_next_to_ring(ent, req, issue_flags); |
| if (err) |
| goto retry; |
| } |
| } |
| |
| static int fuse_ring_ent_set_commit(struct fuse_ring_ent *ent) |
| { |
| struct fuse_ring_queue *queue = ent->queue; |
| |
| lockdep_assert_held(&queue->lock); |
| |
| if (WARN_ON_ONCE(ent->state != FRRS_USERSPACE)) |
| return -EIO; |
| |
| ent->state = FRRS_COMMIT; |
| list_move(&ent->list, &queue->ent_commit_queue); |
| |
| return 0; |
| } |
| |
| /* FUSE_URING_CMD_COMMIT_AND_FETCH handler */ |
| static int fuse_uring_commit_fetch(struct io_uring_cmd *cmd, int issue_flags, |
| struct fuse_conn *fc) |
| { |
| const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe); |
| struct fuse_ring_ent *ent; |
| int err; |
| struct fuse_ring *ring = fc->ring; |
| struct fuse_ring_queue *queue; |
| uint64_t commit_id = READ_ONCE(cmd_req->commit_id); |
| unsigned int qid = READ_ONCE(cmd_req->qid); |
| struct fuse_pqueue *fpq; |
| struct fuse_req *req; |
| |
| err = -ENOTCONN; |
| if (!ring) |
| return err; |
| |
| if (qid >= ring->nr_queues) |
| return -EINVAL; |
| |
| queue = ring->queues[qid]; |
| if (!queue) |
| return err; |
| fpq = &queue->fpq; |
| |
| if (!READ_ONCE(fc->connected) || READ_ONCE(queue->stopped)) |
| return err; |
| |
| spin_lock(&queue->lock); |
| /* |
| * Find the request based on the unique ID of the fuse request. |
| * This should get revised, as it needs a hash calculation and list |
| * search, and the full struct fuse_pqueue is needed (memory overhead), |
| * as well as the link from req to ring_ent. |
| */ |
| req = fuse_request_find(fpq, commit_id); |
| err = -ENOENT; |
| if (!req) { |
| pr_info("qid=%d commit_id %llu not found\n", queue->qid, |
| commit_id); |
| spin_unlock(&queue->lock); |
| return err; |
| } |
| list_del_init(&req->list); |
| ent = req->ring_entry; |
| req->ring_entry = NULL; |
| |
| err = fuse_ring_ent_set_commit(ent); |
| if (err != 0) { |
| pr_info_ratelimited("qid=%d commit_id %llu state %d", |
| queue->qid, commit_id, ent->state); |
| spin_unlock(&queue->lock); |
| req->out.h.error = err; |
| clear_bit(FR_SENT, &req->flags); |
| fuse_request_end(req); |
| return err; |
| } |
| |
| ent->cmd = cmd; |
| spin_unlock(&queue->lock); |
| |
| /* without the queue lock, as other locks are taken */ |
| fuse_uring_prepare_cancel(cmd, issue_flags, ent); |
| fuse_uring_commit(ent, req, issue_flags); |
| |
| /* |
| * Fetching the next request is absolutely required as queued |
| * fuse requests would otherwise not get processed - committing |
| * and fetching is done in one step vs legacy fuse, which has separate |
| * read (fetch request) and write (commit result) steps. |
| */ |
| fuse_uring_next_fuse_req(ent, queue, issue_flags); |
| return 0; |
| } |
| |
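| /* |
| * The ring is considered ready once all queues exist and each queue |
| * has at least one entry available. |
| */ |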
| static bool is_ring_ready(struct fuse_ring *ring, int current_qid) |
| { |
| int qid; |
| struct fuse_ring_queue *queue; |
| bool ready = true; |
| |
| for (qid = 0; qid < ring->nr_queues && ready; qid++) { |
| if (current_qid == qid) |
| continue; |
| |
| queue = ring->queues[qid]; |
| if (!queue) { |
| ready = false; |
| break; |
| } |
| |
| spin_lock(&queue->lock); |
| if (list_empty(&queue->ent_avail_queue)) |
| ready = false; |
| spin_unlock(&queue->lock); |
| } |
| |
| return ready; |
| } |
| |
| /* |
| * FUSE_IO_URING_CMD_REGISTER handling |
| */ |
| static void fuse_uring_do_register(struct fuse_ring_ent *ent, |
| struct io_uring_cmd *cmd, |
| unsigned int issue_flags) |
| { |
| struct fuse_ring_queue *queue = ent->queue; |
| struct fuse_ring *ring = queue->ring; |
| struct fuse_conn *fc = ring->fc; |
| struct fuse_iqueue *fiq = &fc->iq; |
| |
| fuse_uring_prepare_cancel(cmd, issue_flags, ent); |
| |
| spin_lock(&queue->lock); |
| ent->cmd = cmd; |
| fuse_uring_ent_avail(ent, queue); |
| spin_unlock(&queue->lock); |
| |
| if (!ring->ready) { |
| bool ready = is_ring_ready(ring, queue->qid); |
| |
| if (ready) { |
| WRITE_ONCE(fiq->ops, &fuse_io_uring_ops); |
| WRITE_ONCE(ring->ready, true); |
| wake_up_all(&fc->blocked_waitq); |
| } |
| } |
| } |
| |
| /* |
| * sqe->addr is a ptr to an iovec array, iov[0] has the headers, iov[1] |
| * the payload |
| */ |
| static int fuse_uring_get_iovec_from_sqe(const struct io_uring_sqe *sqe, |
| struct iovec iov[FUSE_URING_IOV_SEGS]) |
| { |
| struct iovec __user *uiov = u64_to_user_ptr(READ_ONCE(sqe->addr)); |
| struct iov_iter iter; |
| ssize_t ret; |
| |
| if (sqe->len != FUSE_URING_IOV_SEGS) |
| return -EINVAL; |
| |
| /* |
| * Direction for buffer access will actually be READ and WRITE; |
| * using WRITE for the import should include READ access as well. |
| */ |
| ret = import_iovec(WRITE, uiov, FUSE_URING_IOV_SEGS, |
| FUSE_URING_IOV_SEGS, &iov, &iter); |
| if (ret < 0) |
| return ret; |
| |
| return 0; |
| } |
| |
| static struct fuse_ring_ent * |
| fuse_uring_create_ring_ent(struct io_uring_cmd *cmd, |
| struct fuse_ring_queue *queue) |
| { |
| struct fuse_ring *ring = queue->ring; |
| struct fuse_ring_ent *ent; |
| size_t payload_size; |
| struct iovec iov[FUSE_URING_IOV_SEGS]; |
| int err; |
| |
| err = fuse_uring_get_iovec_from_sqe(cmd->sqe, iov); |
| if (err) { |
| pr_info_ratelimited("Failed to get iovec from sqe, err=%d\n", |
| err); |
| return ERR_PTR(err); |
| } |
| |
| err = -EINVAL; |
| if (iov[0].iov_len < sizeof(struct fuse_uring_req_header)) { |
| pr_info_ratelimited("Invalid header len %zu\n", iov[0].iov_len); |
| return ERR_PTR(err); |
| } |
| |
| payload_size = iov[1].iov_len; |
| if (payload_size < ring->max_payload_sz) { |
| pr_info_ratelimited("Invalid req payload len %zu\n", |
| payload_size); |
| return ERR_PTR(err); |
| } |
| |
| err = -ENOMEM; |
| ent = kzalloc(sizeof(*ent), GFP_KERNEL_ACCOUNT); |
| if (!ent) |
| return ERR_PTR(err); |
| |
| INIT_LIST_HEAD(&ent->list); |
| |
| ent->queue = queue; |
| ent->headers = iov[0].iov_base; |
| ent->payload = iov[1].iov_base; |
| |
| atomic_inc(&ring->queue_refs); |
| return ent; |
| } |
| |
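| /* |
| * Rough userspace flow, as a sketch (not a normative description of |
| * the daemon side): the daemon submits one FUSE_IO_URING_CMD_REGISTER |
| * SQE per queue entry, with sqe->addr pointing to an iovec[2] array - |
| * iov[0] the struct fuse_uring_req_header buffer, iov[1] a payload |
| * buffer of at least ring->max_payload_sz bytes. Results are later |
| * sent back with FUSE_IO_URING_CMD_COMMIT_AND_FETCH on the same entry. |
| */ |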
| /* |
| * Register the header and payload buffer with the kernel and put the |
| * entry onto the queue as "ready to get fuse requests". |
| */ |
| static int fuse_uring_register(struct io_uring_cmd *cmd, |
| unsigned int issue_flags, struct fuse_conn *fc) |
| { |
| const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe); |
| struct fuse_ring *ring = smp_load_acquire(&fc->ring); |
| struct fuse_ring_queue *queue; |
| struct fuse_ring_ent *ent; |
| int err; |
| unsigned int qid = READ_ONCE(cmd_req->qid); |
| |
| err = -ENOMEM; |
| if (!ring) { |
| ring = fuse_uring_create(fc); |
| if (!ring) |
| return err; |
| } |
| |
| if (qid >= ring->nr_queues) { |
| pr_info_ratelimited("fuse: Invalid ring qid %u\n", qid); |
| return -EINVAL; |
| } |
| |
| queue = ring->queues[qid]; |
| if (!queue) { |
| queue = fuse_uring_create_queue(ring, qid); |
| if (!queue) |
| return err; |
| } |
| |
| /* |
| * The queue created above does not need to be destructed in case of |
| * entry errors below; that will be done at ring destruction time. |
| */ |
| |
| ent = fuse_uring_create_ring_ent(cmd, queue); |
| if (IS_ERR(ent)) |
| return PTR_ERR(ent); |
| |
| fuse_uring_do_register(ent, cmd, issue_flags); |
| |
| return 0; |
| } |
| |
| /* |
| * Entry function from io_uring to handle the given passthrough command |
| * (op code IORING_OP_URING_CMD) |
| */ |
| int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) |
| { |
| struct fuse_dev *fud; |
| struct fuse_conn *fc; |
| u32 cmd_op = cmd->cmd_op; |
| int err; |
| |
| if ((unlikely(issue_flags & IO_URING_F_CANCEL))) { |
| fuse_uring_cancel(cmd, issue_flags); |
| return 0; |
| } |
| |
| /* This extra SQE size holds struct fuse_uring_cmd_req */ |
| if (!(issue_flags & IO_URING_F_SQE128)) |
| return -EINVAL; |
| |
| fud = fuse_get_dev(cmd->file); |
| if (!fud) { |
| pr_info_ratelimited("No fuse device found\n"); |
| return -ENOTCONN; |
| } |
| fc = fud->fc; |
| |
| /* Once a connection has io-uring enabled on it, it can't be disabled */ |
| if (!enable_uring && !fc->io_uring) { |
| pr_info_ratelimited("fuse-io-uring is disabled\n"); |
| return -EOPNOTSUPP; |
| } |
| |
| if (fc->aborted) |
| return -ECONNABORTED; |
| if (!fc->connected) |
| return -ENOTCONN; |
| |
| /* |
| * fuse_uring_register() needs the ring to be initialized, |
| * as we need to know the max payload size. |
| */ |
| if (!fc->initialized) |
| return -EAGAIN; |
| |
| switch (cmd_op) { |
| case FUSE_IO_URING_CMD_REGISTER: |
| err = fuse_uring_register(cmd, issue_flags, fc); |
| if (err) { |
| pr_info_once("FUSE_IO_URING_CMD_REGISTER failed err=%d\n", |
| err); |
| fc->io_uring = 0; |
| wake_up_all(&fc->blocked_waitq); |
| return err; |
| } |
| break; |
| case FUSE_IO_URING_CMD_COMMIT_AND_FETCH: |
| err = fuse_uring_commit_fetch(cmd, issue_flags, fc); |
| if (err) { |
| pr_info_once("FUSE_IO_URING_COMMIT_AND_FETCH failed err=%d\n", |
| err); |
| return err; |
| } |
| break; |
| default: |
| return -EINVAL; |
| } |
| |
| return -EIOCBQUEUED; |
| } |
| |
| static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, |
| ssize_t ret, unsigned int issue_flags) |
| { |
| struct fuse_ring_queue *queue = ent->queue; |
| |
| spin_lock(&queue->lock); |
| ent->state = FRRS_USERSPACE; |
| list_move_tail(&ent->list, &queue->ent_in_userspace); |
| ent->cmd = NULL; |
| spin_unlock(&queue->lock); |
| |
| io_uring_cmd_done(cmd, ret, 0, issue_flags); |
| } |
| |
| /* |
| * This prepares and sends the ring request in fuse-uring task context. |
| * The user buffers are not mapped yet - the application does not have |
| * permission to write to them - so this has to be executed in ring |
| * task context. |
| */ |
| static void fuse_uring_send_in_task(struct io_uring_cmd *cmd, |
| unsigned int issue_flags) |
| { |
| struct fuse_ring_ent *ent = uring_cmd_to_ring_ent(cmd); |
| struct fuse_ring_queue *queue = ent->queue; |
| int err; |
| |
| if (!(issue_flags & IO_URING_F_TASK_DEAD)) { |
| err = fuse_uring_prepare_send(ent, ent->fuse_req); |
| if (err) { |
| fuse_uring_next_fuse_req(ent, queue, issue_flags); |
| return; |
| } |
| } else { |
| err = -ECANCELED; |
| } |
| |
| fuse_uring_send(ent, cmd, err, issue_flags); |
| } |
| |
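| /* |
| * Map the submitting task to a ring queue: one queue per possible CPU, |
| * selected by the CPU the task is currently running on. |
| */ |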
| static struct fuse_ring_queue *fuse_uring_task_to_queue(struct fuse_ring *ring) |
| { |
| unsigned int qid; |
| struct fuse_ring_queue *queue; |
| |
| qid = task_cpu(current); |
| |
| if (WARN_ONCE(qid >= ring->nr_queues, |
| "Core number (%u) exceeds nr queues (%zu)\n", qid, |
| ring->nr_queues)) |
| qid = 0; |
| |
| queue = ring->queues[qid]; |
| WARN_ONCE(!queue, "Missing queue for qid %d\n", qid); |
| |
| return queue; |
| } |
| |
| static void fuse_uring_dispatch_ent(struct fuse_ring_ent *ent) |
| { |
| struct io_uring_cmd *cmd = ent->cmd; |
| |
| uring_cmd_set_ring_ent(cmd, ent); |
| io_uring_cmd_complete_in_task(cmd, fuse_uring_send_in_task); |
| } |
| |
| /* queue a fuse request and send it if a ring entry is available */ |
| void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) |
| { |
| struct fuse_conn *fc = req->fm->fc; |
| struct fuse_ring *ring = fc->ring; |
| struct fuse_ring_queue *queue; |
| struct fuse_ring_ent *ent = NULL; |
| int err; |
| |
| err = -EINVAL; |
| queue = fuse_uring_task_to_queue(ring); |
| if (!queue) |
| goto err; |
| |
| if (req->in.h.opcode != FUSE_NOTIFY_REPLY) |
| req->in.h.unique = fuse_get_unique(fiq); |
| |
| spin_lock(&queue->lock); |
| err = -ENOTCONN; |
| if (unlikely(queue->stopped)) |
| goto err_unlock; |
| |
| set_bit(FR_URING, &req->flags); |
| req->ring_queue = queue; |
| ent = list_first_entry_or_null(&queue->ent_avail_queue, |
| struct fuse_ring_ent, list); |
| if (ent) |
| fuse_uring_add_req_to_ring_ent(ent, req); |
| else |
| list_add_tail(&req->list, &queue->fuse_req_queue); |
| spin_unlock(&queue->lock); |
| |
| if (ent) |
| fuse_uring_dispatch_ent(ent); |
| |
| return; |
| |
| err_unlock: |
| spin_unlock(&queue->lock); |
| err: |
| req->out.h.error = err; |
| clear_bit(FR_PENDING, &req->flags); |
| fuse_request_end(req); |
| } |
| |
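| /* |
| * Queue a background request through the ring. Returns false if the |
| * selected queue is not available or already stopped. |
| */ |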
| bool fuse_uring_queue_bq_req(struct fuse_req *req) |
| { |
| struct fuse_conn *fc = req->fm->fc; |
| struct fuse_ring *ring = fc->ring; |
| struct fuse_ring_queue *queue; |
| struct fuse_ring_ent *ent = NULL; |
| |
| queue = fuse_uring_task_to_queue(ring); |
| if (!queue) |
| return false; |
| |
| spin_lock(&queue->lock); |
| if (unlikely(queue->stopped)) { |
| spin_unlock(&queue->lock); |
| return false; |
| } |
| |
| set_bit(FR_URING, &req->flags); |
| req->ring_queue = queue; |
| list_add_tail(&req->list, &queue->fuse_req_bg_queue); |
| |
| ent = list_first_entry_or_null(&queue->ent_avail_queue, |
| struct fuse_ring_ent, list); |
| spin_lock(&fc->bg_lock); |
| fc->num_background++; |
| if (fc->num_background == fc->max_background) |
| fc->blocked = 1; |
| fuse_uring_flush_bg(queue); |
| spin_unlock(&fc->bg_lock); |
| |
| /* |
| * Due to bg_queue flush limits there might be other bg requests |
| * in the queue that need to be handled first. Or no further req |
| * might be available. |
| */ |
| req = list_first_entry_or_null(&queue->fuse_req_queue, struct fuse_req, |
| list); |
| if (ent && req) { |
| fuse_uring_add_req_to_ring_ent(ent, req); |
| spin_unlock(&queue->lock); |
| |
| fuse_uring_dispatch_ent(ent); |
| } else { |
| spin_unlock(&queue->lock); |
| } |
| |
| return true; |
| } |
| |
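| /* Remove a request that is still pending on its ring queue */ |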
| bool fuse_uring_remove_pending_req(struct fuse_req *req) |
| { |
| struct fuse_ring_queue *queue = req->ring_queue; |
| |
| return fuse_remove_pending_req(req, &queue->lock); |
| } |
| |
| static const struct fuse_iqueue_ops fuse_io_uring_ops = { |
| /* should be sent over io-uring as an enhancement */ |
| .send_forget = fuse_dev_queue_forget, |
| |
| /* |
| * could be sent over io-uring, but interrupts should be rare, |
| * no need to make the code complex |
| */ |
| .send_interrupt = fuse_dev_queue_interrupt, |
| .send_req = fuse_uring_queue_fuse_req, |
| }; |