io_uring: add blkcg accounting to offloaded operations

There are a few operations that are offloaded to io-wq worker threads.
When that happens we lose the submitting process's context and run in
kthread context, so the resulting ios are not accounted to the issuing
cgroup and instead show up as issued by root. Just as io-wq already
adopts the originating task's mm, creds, and fs, have it adopt the
task's blkcg as well when issuing via the workqueues.

The SQPOLL thread is long-lived: it attaches to the blkcg of the task
that initialized the ring and issues ios in that cgroup's context for
its lifetime.

Signed-off-by: Dennis Zhou <dennis@kernel.org>
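---
A reviewer-facing sketch of the pattern this patch applies (not part of
the patch; grab_submitter_blkcg() and issue_offloaded() are illustrative
names, the rest is the existing kernel API):

	#include <linux/blk-cgroup.h>
	#include <linux/cgroup.h>
	#include <linux/kthread.h>

	/* At prepare time: pin the submitter's io-controller css. */
	static struct cgroup_subsys_state *grab_submitter_blkcg(void)
	{
		struct cgroup_subsys_state *css;

		rcu_read_lock();
		css = blkcg_css();	/* current task's blkcg css */
		/* cgroup dying or task migrating: punt to root */
		if (!css_tryget_online(css))
			css = NULL;
		rcu_read_unlock();
		return css;
	}

	/* At issue time, in the kthread: adopt, issue, detach. */
	static void issue_offloaded(struct cgroup_subsys_state *css)
	{
		kthread_associate_blkcg(css);	/* bios charged to @css */
		/* ... submit the io on behalf of the submitter ... */
		kthread_associate_blkcg(NULL);	/* detach again */
		if (css)
			css_put(css);
	}

In the patch, the request-prepare path does the grab step once per
request, and io_wq_switch_blkcg() does the associate step, caching the
css on the worker so back-to-back work for the same cgroup skips the
switch; the worker clears any leftover association when it is unused.
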
diff --git a/fs/io-wq.c b/fs/io-wq.c
index 414beb5..3c076d5 100644
--- a/fs/io-wq.c
+++ b/fs/io-wq.c
@@ -17,6 +17,7 @@
 #include <linux/rculist_nulls.h>
 #include <linux/fs_struct.h>
 #include <linux/task_work.h>
+#include <linux/blk-cgroup.h>
 
 #include "io-wq.h"
 
@@ -57,6 +58,9 @@
 
 	struct rcu_head rcu;
 	struct mm_struct *mm;
+#ifdef CONFIG_BLK_CGROUP
+	struct cgroup_subsys_state *blkcg_css;
+#endif
 	const struct cred *cur_creds;
 	const struct cred *saved_creds;
 	struct files_struct *restore_files;
@@ -175,6 +179,13 @@
 		worker->mm = NULL;
 	}
 
+#ifdef CONFIG_BLK_CGROUP
+	if (worker->blkcg_css) {
+		kthread_associate_blkcg(NULL);
+		worker->blkcg_css = NULL;
+	}
+#endif
+
 	return dropped_lock;
 }
 
@@ -436,6 +447,17 @@
 	work->flags |= IO_WQ_WORK_CANCEL;
 }
 
+static inline void io_wq_switch_blkcg(struct io_worker *worker,
+				      struct io_wq_work *work)
+{
+#ifdef CONFIG_BLK_CGROUP
+	if (work->blkcg_css != worker->blkcg_css) {
+		kthread_associate_blkcg(work->blkcg_css);
+		worker->blkcg_css = work->blkcg_css;
+	}
+#endif
+}
+
 static void io_wq_switch_creds(struct io_worker *worker,
 			       struct io_wq_work *work)
 {
@@ -463,6 +485,7 @@
 	if (worker->cur_creds != work->creds)
 		io_wq_switch_creds(worker, work);
 	current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->fsize;
+	io_wq_switch_blkcg(worker, work);
 }
 
 static void io_assign_current_work(struct io_worker *worker,
diff --git a/fs/io-wq.h b/fs/io-wq.h
index ddaf961..126a82a 100644
--- a/fs/io-wq.h
+++ b/fs/io-wq.h
@@ -87,6 +87,9 @@
 	struct io_wq_work_node list;
 	struct files_struct *files;
 	struct mm_struct *mm;
+#ifdef CONFIG_BLK_CGROUP
+	struct cgroup_subsys_state *blkcg_css;
+#endif
 	const struct cred *creds;
 	struct fs_struct *fs;
 	unsigned long fsize;
diff --git a/fs/io_uring.c b/fs/io_uring.c
index ce69bd9..944c048 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -79,6 +79,7 @@
 #include <linux/splice.h>
 #include <linux/task_work.h>
 #include <linux/pagemap.h>
+#include <linux/blk-cgroup.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/io_uring.h>
@@ -267,6 +268,9 @@
 	struct task_struct	*sqo_thread;	/* if using sq thread polling */
 	struct mm_struct	*sqo_mm;
 	wait_queue_head_t	sqo_wait;
+#ifdef CONFIG_BLK_CGROUP
+	struct cgroup_subsys_state *sqo_blkcg_css;
+#endif
 
 	/*
 	 * If used, fixed file set. Writers must ensure that ->refs is dead,
@@ -701,6 +705,8 @@
 	unsigned		async_ctx : 1;
 	/* needs current->mm setup, does mm access */
 	unsigned		needs_mm : 1;
+	/* needs blkcg context, issues async io */
+	unsigned		needs_blkcg : 1;
 	/* needs req->file assigned */
 	unsigned		needs_file : 1;
 	/* don't fail if file grab fails */
@@ -728,6 +734,7 @@
 	[IORING_OP_READV] = {
 		.async_ctx		= 1,
 		.needs_mm		= 1,
+		.needs_blkcg		= 1,
 		.needs_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollin			= 1,
@@ -736,6 +743,7 @@
 	[IORING_OP_WRITEV] = {
 		.async_ctx		= 1,
 		.needs_mm		= 1,
+		.needs_blkcg		= 1,
 		.needs_file		= 1,
 		.hash_reg_file		= 1,
 		.unbound_nonreg_file	= 1,
@@ -743,14 +751,17 @@
 		.needs_fsize		= 1,
 	},
 	[IORING_OP_FSYNC] = {
+		.needs_blkcg		= 1,
 		.needs_file		= 1,
 	},
 	[IORING_OP_READ_FIXED] = {
+		.needs_blkcg		= 1,
 		.needs_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollin			= 1,
 	},
 	[IORING_OP_WRITE_FIXED] = {
+		.needs_blkcg		= 1,
 		.needs_file		= 1,
 		.hash_reg_file		= 1,
 		.unbound_nonreg_file	= 1,
@@ -763,10 +774,12 @@
 	},
 	[IORING_OP_POLL_REMOVE] = {},
 	[IORING_OP_SYNC_FILE_RANGE] = {
+		.needs_blkcg		= 1,
 		.needs_file		= 1,
 	},
 	[IORING_OP_SENDMSG] = {
 		.async_ctx		= 1,
+		.needs_blkcg		= 1,
 		.needs_mm		= 1,
 		.needs_file		= 1,
 		.unbound_nonreg_file	= 1,
@@ -807,14 +820,17 @@
 		.pollout		= 1,
 	},
 	[IORING_OP_FALLOCATE] = {
+		.needs_blkcg		= 1,
 		.needs_file		= 1,
 		.needs_fsize		= 1,
 	},
 	[IORING_OP_OPENAT] = {
+		.needs_blkcg		= 1,
 		.file_table		= 1,
 		.needs_fs		= 1,
 	},
 	[IORING_OP_CLOSE] = {
+		.needs_blkcg		= 1,
 		.needs_file		= 1,
 		.needs_file_no_error	= 1,
 		.file_table		= 1,
@@ -825,11 +841,13 @@
 	},
 	[IORING_OP_STATX] = {
 		.needs_mm		= 1,
+		.needs_blkcg		= 1,
 		.needs_fs		= 1,
 		.file_table		= 1,
 	},
 	[IORING_OP_READ] = {
 		.needs_mm		= 1,
+		.needs_blkcg		= 1,
 		.needs_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollin			= 1,
@@ -837,31 +855,37 @@
 	},
 	[IORING_OP_WRITE] = {
 		.needs_mm		= 1,
+		.needs_blkcg		= 1,
 		.needs_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollout		= 1,
 		.needs_fsize		= 1,
 	},
 	[IORING_OP_FADVISE] = {
+		.needs_blkcg		= 1,
 		.needs_file		= 1,
 	},
 	[IORING_OP_MADVISE] = {
 		.needs_mm		= 1,
+		.needs_blkcg		= 1,
 	},
 	[IORING_OP_SEND] = {
 		.needs_mm		= 1,
+		.needs_blkcg		= 1,
 		.needs_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollout		= 1,
 	},
 	[IORING_OP_RECV] = {
 		.needs_mm		= 1,
+		.needs_blkcg		= 1,
 		.needs_file		= 1,
 		.unbound_nonreg_file	= 1,
 		.pollin			= 1,
 		.buffer_select		= 1,
 	},
 	[IORING_OP_OPENAT2] = {
+		.needs_blkcg		= 1,
 		.file_table		= 1,
 		.needs_fs		= 1,
 	},
@@ -870,6 +894,7 @@
 		.file_table		= 1,
 	},
 	[IORING_OP_SPLICE] = {
+		.needs_blkcg		= 1,
 		.needs_file		= 1,
 		.hash_reg_file		= 1,
 		.unbound_nonreg_file	= 1,
@@ -985,6 +1010,20 @@
 	return __io_sq_thread_acquire_mm(ctx);
 }
 
+static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx)
+{
+#ifdef CONFIG_BLK_CGROUP
+	kthread_associate_blkcg(ctx->sqo_blkcg_css);
+#endif
+}
+
+static void io_sq_thread_unassociate_blkcg(void)
+{
+#ifdef CONFIG_BLK_CGROUP
+	kthread_associate_blkcg(NULL);
+#endif
+}
+
 static inline void req_set_fail_links(struct io_kiocb *req)
 {
 	if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK)
@@ -1121,6 +1160,10 @@
 		mmdrop(req->work.mm);
 		req->work.mm = NULL;
 	}
+#ifdef CONFIG_BLK_CGROUP
+	if (req->work.blkcg_css)
+		css_put(req->work.blkcg_css);
+#endif
 	if (req->work.creds) {
 		put_cred(req->work.creds);
 		req->work.creds = NULL;
@@ -1160,6 +1203,19 @@
 		mmgrab(current->mm);
 		req->work.mm = current->mm;
 	}
+#ifdef CONFIG_BLK_CGROUP
+	if (!req->work.blkcg_css && def->needs_blkcg) {
+		rcu_read_lock();
+		req->work.blkcg_css = blkcg_css();
+		/*
+		 * This should be rare, either the cgroup is dying or the task
+		 * is moving cgroups. Just punt to root for the handful of ios.
+		 */
+		if (!css_tryget_online(req->work.blkcg_css))
+			req->work.blkcg_css = NULL;
+		rcu_read_unlock();
+	}
+#endif
 	if (!req->work.creds)
 		req->work.creds = get_current_cred();
 	if (!req->work.fs && def->needs_fs) {
@@ -6528,6 +6584,7 @@
 	complete(&ctx->sq_thread_comp);
 
 	old_cred = override_creds(ctx->creds);
+	io_sq_thread_associate_blkcg(ctx);
 
 	timeout = jiffies + ctx->sq_thread_idle;
 	while (!kthread_should_park()) {
@@ -6626,6 +6683,7 @@
 
 	io_run_task_work();
 
+	io_sq_thread_unassociate_blkcg();
 	io_sq_thread_drop_mm();
 	revert_creds(old_cred);
 
@@ -7861,6 +7919,10 @@
 		mmdrop(ctx->sqo_mm);
 		ctx->sqo_mm = NULL;
 	}
+#ifdef CONFIG_BLK_CGROUP
+	if (ctx->sqo_blkcg_css)
+		css_put(ctx->sqo_blkcg_css);
+#endif
 
 	io_sqe_files_unregister(ctx);
 	io_eventfd_unregister(ctx);
@@ -8548,6 +8610,25 @@
 	mmgrab(current->mm);
 	ctx->sqo_mm = current->mm;
 
+#ifdef CONFIG_BLK_CGROUP
+	/*
+	 * The sq thread will belong to the original cgroup it was inited in.
+	 * If the cgroup goes offline (e.g. disabling the io controller), then
+	 * issued bios will be associated with the closest cgroup later in the
+	 * block layer.
+	 */
+	rcu_read_lock();
+	ctx->sqo_blkcg_css = blkcg_css();
+	ret = css_tryget_online(ctx->sqo_blkcg_css);
+	rcu_read_unlock();
+	if (!ret) {
+		/* don't init against a dying cgroup, have the user try again */
+		ctx->sqo_blkcg_css = NULL;
+		ret = -ENODEV;
+		goto err;
+	}
+#endif
+
 	/*
 	 * Account memory _before_ installing the file descriptor. Once
 	 * the descriptor is installed, it can get closed at any time. Also