io_uring: punt final io_ring_ctx wait-and-free to workqueue

We can't reliably wait in io_ring_ctx_wait_and_kill(), since the task_works list isn't ordered (in fact it's LIFO ordered). We could either fix this with a separate task_works list for io_uring work, or just punt the wait-and-free to async context. This ensures that task_work that comes in while we're shutting down is processed correctly. If we don't go async, we could have work past the fput() work for the ring that depends on work that won't be executed until after we're done with the wait-and-free. But as this operation is blocking, it'll never get a chance to run.

This was reproduced with hundreds of thousands of sockets running memcached; we haven't been able to reproduce this synthetically.

Reported-by: Dan Melnic <dmm@fb.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>

commit: 85faa7b8346ebef0606d2d0df6d3f8c76acb3654 [log] [tgz]
author: Jens Axboe <axboe@kernel.dk> Thu Apr 09 18:14:00 2020 -0600
committer: Jens Axboe <axboe@kernel.dk> Thu Apr 09 18:45:27 2020 -0600
tree: 5889b4560c5d4c6be271181d7d48dd350e88dd4a
parent: c398ecb3d611925e4a5411afdf7489914a5c0460 [diff]
diff --git a/fs/io_uring.c b/fs/io_uring.c
index be65eda..5190bfb 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c

@@ -326,6 +326,8 @@
 		spinlock_t		inflight_lock;
 		struct list_head	inflight_list;
 	} ____cacheline_aligned_in_smp;
+
+	struct work_struct		exit_work;
 };
 
 /*
@@ -7271,6 +7273,18 @@
 	return 0;
 }
 
+static void io_ring_exit_work(struct work_struct *work)
+{
+	struct io_ring_ctx *ctx;
+
+	ctx = container_of(work, struct io_ring_ctx, exit_work);
+	if (ctx->rings)
+		io_cqring_overflow_flush(ctx, true);
+
+	wait_for_completion(&ctx->completions[0]);
+	io_ring_ctx_free(ctx);
+}
+
 static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx)
 {
 	mutex_lock(&ctx->uring_lock);
@@ -7298,8 +7312,8 @@
 	if (ctx->rings)
 		io_cqring_overflow_flush(ctx, true);
 	idr_for_each(&ctx->personality_idr, io_remove_personalities, ctx);
-	wait_for_completion(&ctx->completions[0]);
-	io_ring_ctx_free(ctx);
+	INIT_WORK(&ctx->exit_work, io_ring_exit_work);
+	queue_work(system_wq, &ctx->exit_work);
 }
 
 static int io_uring_release(struct inode *inode, struct file *file)
commit	85faa7b8346ebef0606d2d0df6d3f8c76acb3654	[log] [tgz]
author	Jens Axboe <axboe@kernel.dk>	Thu Apr 09 18:14:00 2020 -0600
committer	Jens Axboe <axboe@kernel.dk>	Thu Apr 09 18:45:27 2020 -0600
tree	5889b4560c5d4c6be271181d7d48dd350e88dd4a
parent	c398ecb3d611925e4a5411afdf7489914a5c0460 [diff]