Merge tag 'io_uring-5.13-2021-05-07' of git://git.kernel.dk/linux-block

Pull io_uring fixes from Jens Axboe:
 "Mostly fixes for merge window merged code. In detail:

   - Error case memory leak fixes (Colin, Zqiang)

   - Add tools/io_uring/ to the list of maintained files (Lukas)

   - Set of fixes for the modified buffer registration API (Pavel)

   - Sanitize io thread setup on x86 (Stefan)

   - Ensure we truncate transfer count for provided buffers (Thadeu)"

* tag 'io_uring-5.13-2021-05-07' of git://git.kernel.dk/linux-block:
  x86/process: setup io_threads more like normal user space threads
  MAINTAINERS: add io_uring tool to IO_URING
  io_uring: truncate lengths larger than MAX_RW_COUNT on provide buffers
  io_uring: Fix memory leak in io_sqe_buffers_register()
  io_uring: Fix premature return from loop and memory leak
  io_uring: fix unchecked error in switch_start()
  io_uring: allow empty slots for reg buffers
  io_uring: add more build check for uapi
  io_uring: don't overlap internal and user req flags
  io_uring: fix drain with rsrc CQEs
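
Several of the fixes above deal with the reworked buffer registration API: a
table slot whose iovec is {NULL, 0} is now kept as an empty placeholder
(backed by the new ctx->dummy_ubuf) rather than rejected, while an empty slot
paired with a tag is refused. A minimal userspace sketch of that behaviour,
assuming liburing is available (the file path and sizes are arbitrary; kernels
without these fixes reject the empty slot with -EFAULT):

/*
 * Sketch: register a buffer table containing an empty slot, then use the
 * populated slot for a fixed-buffer read. Error handling is abbreviated.
 */
#include <liburing.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	struct io_uring ring;
	static char buf[4096];
	struct iovec iovs[2] = {
		{ .iov_base = NULL, .iov_len = 0 },	/* empty slot */
		{ .iov_base = buf,  .iov_len = sizeof(buf) },
	};
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	int ret, fd;

	if (io_uring_queue_init(8, &ring, 0) < 0)
		return 1;

	/* Slot 0 stays as a placeholder; using it in a fixed op fails later. */
	ret = io_uring_register_buffers(&ring, iovs, 2);
	if (ret < 0) {
		fprintf(stderr, "register_buffers: %s\n", strerror(-ret));
		return 1;
	}

	fd = open("/etc/hostname", O_RDONLY);	/* any readable file */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read_fixed(sqe, fd, buf, sizeof(buf), 0, 1 /* slot 1 */);
	io_uring_submit(&ring);
	if (io_uring_wait_cqe(&ring, &cqe) == 0) {
		printf("read_fixed: res=%d\n", cqe->res);
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}
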
diff --git a/MAINTAINERS b/MAINTAINERS
index e845f05..ad0e9be 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -9552,6 +9552,7 @@
 F:	fs/io_uring.c
 F:	include/linux/io_uring.h
 F:	include/uapi/linux/io_uring.h
+F:	tools/io_uring/
 
 IPMI SUBSYSTEM
 M:	Corey Minyard <minyard@acm.org>
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 43cbfc8..5e1f381 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -156,7 +156,7 @@
 #endif
 
 	/* Kernel thread ? */
-	if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) {
+	if (unlikely(p->flags & PF_KTHREAD)) {
 		memset(childregs, 0, sizeof(struct pt_regs));
 		kthread_frame_init(frame, sp, arg);
 		return 0;
@@ -172,6 +172,23 @@
 	task_user_gs(p) = get_user_gs(current_pt_regs());
 #endif
 
+	if (unlikely(p->flags & PF_IO_WORKER)) {
+		/*
+		 * An IO thread is a user space thread, but it doesn't
+		 * return to ret_after_fork().
+		 *
+		 * In order to indicate that to tools like gdb,
+		 * we reset the stack and instruction pointers.
+		 *
+		 * It does the same kernel frame setup to return to a kernel
+		 * function that a kernel thread does.
+		 */
+		childregs->sp = 0;
+		childregs->ip = 0;
+		kthread_frame_init(frame, sp, arg);
+		return 0;
+	}
+
 	/* Set a new TLS for the child thread? */
 	if (clone_flags & CLONE_SETTLS)
 		ret = set_new_tls(p, tls);
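
The hunk above affects the kernel-created io_uring helper threads (the
PF_IO_WORKER "iou-*" threads): they are set up like user-space threads, but
since they never return to user mode their saved stack and instruction
pointers are zeroed so tools such as gdb see that clearly. Purely as an
illustration, a sketch along these lines shows such a thread appearing under
the owning process (assumes liburing; IORING_SETUP_SQPOLL may need elevated
privileges on some kernels):

/* Sketch: create an SQPOLL ring and list this process's threads. */
#include <liburing.h>
#include <dirent.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	struct io_uring ring;
	struct io_uring_params p;
	struct dirent *de;
	DIR *d;

	memset(&p, 0, sizeof(p));
	p.flags = IORING_SETUP_SQPOLL;
	if (io_uring_queue_init_params(8, &ring, &p) < 0) {
		perror("io_uring_queue_init_params");
		return 1;
	}
	usleep(100000);		/* give the SQ thread time to appear */

	d = opendir("/proc/self/task");
	while (d && (de = readdir(d)) != NULL) {
		char path[256], comm[32] = "";
		FILE *f;

		if (de->d_name[0] == '.')
			continue;
		snprintf(path, sizeof(path), "/proc/self/task/%s/comm", de->d_name);
		f = fopen(path, "r");
		if (f && fgets(comm, sizeof(comm), f))
			printf("tid %s: %s", de->d_name, comm);	/* e.g. "iou-sqp-<pid>" */
		if (f)
			fclose(f);
	}
	if (d)
		closedir(d);
	io_uring_queue_exit(&ring);
	return 0;
}
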
diff --git a/fs/io_uring.c b/fs/io_uring.c
index 360f813..f46acbb 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -251,7 +251,7 @@
 struct io_buffer {
 	struct list_head list;
 	__u64 addr;
-	__s32 len;
+	__u32 len;
 	__u16 bid;
 };
 
@@ -456,6 +456,7 @@
 	spinlock_t			rsrc_ref_lock;
 	struct io_rsrc_node		*rsrc_node;
 	struct io_rsrc_node		*rsrc_backup_node;
+	struct io_mapped_ubuf		*dummy_ubuf;
 
 	struct io_restriction		restrictions;
 
@@ -702,7 +703,8 @@
 	REQ_F_FORCE_ASYNC_BIT	= IOSQE_ASYNC_BIT,
 	REQ_F_BUFFER_SELECT_BIT	= IOSQE_BUFFER_SELECT_BIT,
 
-	REQ_F_FAIL_LINK_BIT,
+	/* first byte is taken by user flags, shift it to not overlap */
+	REQ_F_FAIL_LINK_BIT	= 8,
 	REQ_F_INFLIGHT_BIT,
 	REQ_F_CUR_POS_BIT,
 	REQ_F_NOWAIT_BIT,
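
The shift above moves the first internal request flag to bit 8 because the low
byte of req->flags is shared with the user-visible IOSQE_* bits (a later hunk
adds BUILD_BUG_ON() checks for the same invariant). The idea can be expressed
outside the kernel with _Static_assert; the IOSQE_* values below mirror the
uapi, the REQ_F_* ones are purely illustrative:

/* Sketch: user flags occupy the low byte, internal flags start at bit 8. */
enum {
	IOSQE_FIXED_FILE	= 1U << 0,
	IOSQE_IO_DRAIN		= 1U << 1,
	IOSQE_IO_LINK		= 1U << 2,
	IOSQE_IO_HARDLINK	= 1U << 3,
	IOSQE_ASYNC		= 1U << 4,
	IOSQE_BUFFER_SELECT	= 1U << 5,
};

#define SQE_VALID_FLAGS	(IOSQE_FIXED_FILE | IOSQE_IO_DRAIN | IOSQE_IO_LINK | \
			 IOSQE_IO_HARDLINK | IOSQE_ASYNC | IOSQE_BUFFER_SELECT)

enum {
	REQ_F_FAIL_LINK	= 1U << 8,	/* first internal bit, as in the patch */
	REQ_F_INFLIGHT	= 1U << 9,
};

/* Compile-time checks analogous to the new BUILD_BUG_ON()s. */
_Static_assert(SQE_VALID_FLAGS < (1 << 8), "user flags must fit in one byte");
_Static_assert((SQE_VALID_FLAGS & (REQ_F_FAIL_LINK | REQ_F_INFLIGHT)) == 0,
	       "internal flags must not overlap user flags");

int main(void) { return 0; }
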
@@ -1157,6 +1159,12 @@
 		goto err;
 	__hash_init(ctx->cancel_hash, 1U << hash_bits);
 
+	ctx->dummy_ubuf = kzalloc(sizeof(*ctx->dummy_ubuf), GFP_KERNEL);
+	if (!ctx->dummy_ubuf)
+		goto err;
+	/* set invalid range, so io_import_fixed() fails meeting it */
+	ctx->dummy_ubuf->ubuf = -1UL;
+
 	if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free,
 			    PERCPU_REF_ALLOW_REINIT, GFP_KERNEL))
 		goto err;
@@ -1184,6 +1192,7 @@
 	INIT_LIST_HEAD(&ctx->submit_state.comp.locked_free_list);
 	return ctx;
 err:
+	kfree(ctx->dummy_ubuf);
 	kfree(ctx->cancel_hash);
 	kfree(ctx);
 	return NULL;
@@ -3977,7 +3986,7 @@
 			break;
 
 		buf->addr = addr;
-		buf->len = pbuf->len;
+		buf->len = min_t(__u32, pbuf->len, MAX_RW_COUNT);
 		buf->bid = bid;
 		addr += pbuf->len;
 		bid++;
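
The min_t() clamp above caps each provided buffer at MAX_RW_COUNT
(INT_MAX & PAGE_MASK), the limit the read/write paths already assume for a
single transfer. For context, such buffers are handed to the kernel with
IORING_OP_PROVIDE_BUFFERS and later picked by requests carrying
IOSQE_BUFFER_SELECT; a hedged liburing sketch (group id, buffer count and
sizes are arbitrary):

/* Sketch: provide a buffer group, then let the kernel pick one for a read. */
#include <liburing.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_BUFS	4
#define BUF_LEN	4096
#define BGID	7

int main(void)
{
	struct io_uring ring;
	struct io_uring_sqe *sqe;
	struct io_uring_cqe *cqe;
	char *pool = malloc(NR_BUFS * BUF_LEN);
	int fd;

	if (!pool || io_uring_queue_init(8, &ring, 0) < 0)
		return 1;

	/* Each buffer's length is now clamped to MAX_RW_COUNT by the kernel. */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_provide_buffers(sqe, pool, BUF_LEN, NR_BUFS, BGID, 0);
	io_uring_submit(&ring);
	io_uring_wait_cqe(&ring, &cqe);
	io_uring_cqe_seen(&ring, cqe);

	/* A read that lets the kernel select one of the buffers above. */
	fd = open("/etc/hostname", O_RDONLY);	/* any readable file */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fd, NULL, BUF_LEN, 0);
	sqe->flags |= IOSQE_BUFFER_SELECT;
	sqe->buf_group = BGID;
	io_uring_submit(&ring);
	if (io_uring_wait_cqe(&ring, &cqe) == 0) {
		printf("res=%d bid=%u\n", cqe->res,
		       (unsigned)(cqe->flags >> IORING_CQE_BUFFER_SHIFT));
		io_uring_cqe_seen(&ring, cqe);
	}
	io_uring_queue_exit(&ring);
	return 0;
}
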
@@ -6503,14 +6512,10 @@
 	req->work.creds = NULL;
 
 	/* enforce forwards compatibility on users */
-	if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) {
-		req->flags = 0;
+	if (unlikely(sqe_flags & ~SQE_VALID_FLAGS))
 		return -EINVAL;
-	}
-
 	if (unlikely(req->opcode >= IORING_OP_LAST))
 		return -EINVAL;
-
 	if (unlikely(!io_check_restriction(ctx, req, sqe_flags)))
 		return -EACCES;
 
@@ -7539,6 +7544,7 @@
 			io_ring_submit_lock(ctx, lock_ring);
 			spin_lock_irqsave(&ctx->completion_lock, flags);
 			io_cqring_fill_event(ctx, prsrc->tag, 0, 0);
+			ctx->cq_extra++;
 			io_commit_cqring(ctx);
 			spin_unlock_irqrestore(&ctx->completion_lock, flags);
 			io_cqring_ev_posted(ctx);
@@ -8111,11 +8117,13 @@
 	struct io_mapped_ubuf *imu = *slot;
 	unsigned int i;
 
-	for (i = 0; i < imu->nr_bvecs; i++)
-		unpin_user_page(imu->bvec[i].bv_page);
-	if (imu->acct_pages)
-		io_unaccount_mem(ctx, imu->acct_pages);
-	kvfree(imu);
+	if (imu != ctx->dummy_ubuf) {
+		for (i = 0; i < imu->nr_bvecs; i++)
+			unpin_user_page(imu->bvec[i].bv_page);
+		if (imu->acct_pages)
+			io_unaccount_mem(ctx, imu->acct_pages);
+		kvfree(imu);
+	}
 	*slot = NULL;
 }
 
@@ -8132,7 +8140,7 @@
 	for (i = 0; i < ctx->nr_user_bufs; i++)
 		io_buffer_unmap(ctx, &ctx->user_bufs[i]);
 	kfree(ctx->user_bufs);
-	kfree(ctx->buf_data);
+	io_rsrc_data_free(ctx->buf_data);
 	ctx->user_bufs = NULL;
 	ctx->buf_data = NULL;
 	ctx->nr_user_bufs = 0;
@@ -8255,6 +8263,11 @@
 	size_t size;
 	int ret, pret, nr_pages, i;
 
+	if (!iov->iov_base) {
+		*pimu = ctx->dummy_ubuf;
+		return 0;
+	}
+
 	ubuf = (unsigned long) iov->iov_base;
 	end = (ubuf + iov->iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
 	start = ubuf >> PAGE_SHIFT;
@@ -8352,7 +8365,9 @@
 	 * constraints here, we'll -EINVAL later when IO is
 	 * submitted if they are wrong.
 	 */
-	if (!iov->iov_base || !iov->iov_len)
+	if (!iov->iov_base)
+		return iov->iov_len ? -EFAULT : 0;
+	if (!iov->iov_len)
 		return -EFAULT;
 
 	/* arbitrary limit, but we need something */
@@ -8385,7 +8400,7 @@
 		return -ENOMEM;
 	ret = io_buffers_map_alloc(ctx, nr_args);
 	if (ret) {
-		kfree(data);
+		io_rsrc_data_free(data);
 		return ret;
 	}
 
@@ -8402,6 +8417,10 @@
 		ret = io_buffer_validate(&iov);
 		if (ret)
 			break;
+		if (!iov.iov_base && tag) {
+			ret = -EINVAL;
+			break;
+		}
 
 		ret = io_sqe_buffer_register(ctx, &iov, &ctx->user_bufs[i],
 					     &last_hpage);
@@ -8451,12 +8470,16 @@
 		err = io_buffer_validate(&iov);
 		if (err)
 			break;
+		if (!iov.iov_base && tag) {
+			err = -EINVAL;
+			break;
+		}
 		err = io_sqe_buffer_register(ctx, &iov, &imu, &last_hpage);
 		if (err)
 			break;
 
 		i = array_index_nospec(offset, ctx->nr_user_bufs);
-		if (ctx->user_bufs[i]) {
+		if (ctx->user_bufs[i] != ctx->dummy_ubuf) {
 			err = io_queue_rsrc_removal(ctx->buf_data, offset,
 						    ctx->rsrc_node, ctx->user_bufs[i]);
 			if (unlikely(err)) {
@@ -8604,6 +8627,7 @@
 	if (ctx->hash_map)
 		io_wq_put_hash(ctx->hash_map);
 	kfree(ctx->cancel_hash);
+	kfree(ctx->dummy_ubuf);
 	kfree(ctx);
 }
 
@@ -9607,7 +9631,9 @@
 	if (ret)
 		goto err;
 	/* always set a rsrc node */
-	io_rsrc_node_switch_start(ctx);
+	ret = io_rsrc_node_switch_start(ctx);
+	if (ret)
+		goto err;
 	io_rsrc_node_switch(ctx, NULL);
 
 	memset(&p->sq_off, 0, sizeof(p->sq_off));
@@ -10136,6 +10162,13 @@
 	BUILD_BUG_SQE_ELEM(42, __u16,  personality);
 	BUILD_BUG_SQE_ELEM(44, __s32,  splice_fd_in);
 
+	BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
+		     sizeof(struct io_uring_rsrc_update));
+	BUILD_BUG_ON(sizeof(struct io_uring_rsrc_update) >
+		     sizeof(struct io_uring_rsrc_update2));
+	/* should fit into one byte */
+	BUILD_BUG_ON(SQE_VALID_FLAGS >= (1 << 8));
+
 	BUILD_BUG_ON(ARRAY_SIZE(io_op_defs) != IORING_OP_LAST);
 	BUILD_BUG_ON(__REQ_F_LAST_BIT >= 8 * sizeof(int));
 	req_cachep = KMEM_CACHE(io_kiocb, SLAB_HWCACHE_ALIGN | SLAB_PANIC |