| From 3217d38dbfba0f08b7c6a5e6b02153949a8c041e Mon Sep 17 00:00:00 2001 |
| From: Liu Bo <bo.liu@linux.alibaba.com> |
| Date: Fri, 25 Jan 2019 08:12:47 +0800 |
| Subject: blk-iolatency: fix IO hang due to negative inflight counter |
| |
| [ Upstream commit 8c772a9bfc7c07c76f4a58b58910452fbb20843b ] |
| |
| Our test reported the following stack, and vmcore showed that |
| ->inflight counter is -1. |
| |
| [ffffc9003fcc38d0] __schedule at ffffffff8173d95d |
| [ffffc9003fcc3958] schedule at ffffffff8173de26 |
| [ffffc9003fcc3970] io_schedule at ffffffff810bb6b6 |
| [ffffc9003fcc3988] blkcg_iolatency_throttle at ffffffff813911cb |
| [ffffc9003fcc3a20] rq_qos_throttle at ffffffff813847f3 |
| [ffffc9003fcc3a48] blk_mq_make_request at ffffffff8137468a |
| [ffffc9003fcc3b08] generic_make_request at ffffffff81368b49 |
| [ffffc9003fcc3b68] submit_bio at ffffffff81368d7d |
| [ffffc9003fcc3bb8] ext4_io_submit at ffffffffa031be00 [ext4] |
| [ffffc9003fcc3c00] ext4_writepages at ffffffffa03163de [ext4] |
| [ffffc9003fcc3d68] do_writepages at ffffffff811c49ae |
| [ffffc9003fcc3d78] __filemap_fdatawrite_range at ffffffff811b6188 |
| [ffffc9003fcc3e30] filemap_write_and_wait_range at ffffffff811b6301 |
| [ffffc9003fcc3e60] ext4_sync_file at ffffffffa030cee8 [ext4] |
| [ffffc9003fcc3ea8] vfs_fsync_range at ffffffff8128594b |
| [ffffc9003fcc3ee8] do_fsync at ffffffff81285abd |
| [ffffc9003fcc3f18] sys_fsync at ffffffff81285d50 |
| [ffffc9003fcc3f28] do_syscall_64 at ffffffff81003c04 |
| [ffffc9003fcc3f50] entry_SYSCALL_64_after_swapgs at ffffffff81742b8e |
| |
| The ->inflight counter may be negative (-1) if |
| |
| 1) blk-iolatency was disabled when the IO was issued, |
| |
| 2) blk-iolatency was enabled before this IO reached its endio, |
| |
| 3) the ->inflight counter is decreased from 0 to -1 in endio() |
| |
| In fact the hang can be easily reproduced by the below script, |
| |
| H=/sys/fs/cgroup/unified/ |
| P=/sys/fs/cgroup/unified/test |
| |
| echo "+io" > $H/cgroup.subtree_control |
| mkdir -p $P |
| |
| echo $$ > $P/cgroup.procs |
| |
| xfs_io -f -d -c "pwrite 0 4k" /dev/sdg |
| |
| echo "`cat /sys/block/sdg/dev` target=1000000" > $P/io.latency |
| |
| xfs_io -f -d -c "pwrite 0 4k" /dev/sdg |
| |
| This fixes the problem by freezing the queue so that while |
| enabling/disabling iolatency, there is no inflight rq running. |
| |
| Note that quiesce_queue is not needed as this only updating iolatency |
| configuration about which dispatching request_queue doesn't care. |
| |
| Signed-off-by: Liu Bo <bo.liu@linux.alibaba.com> |
| Signed-off-by: Jens Axboe <axboe@kernel.dk> |
| Signed-off-by: Sasha Levin <sashal@kernel.org> |
| --- |
| block/blk-iolatency.c | 52 +++++++++++++++++++++++++++++++++++++------ |
| 1 file changed, 45 insertions(+), 7 deletions(-) |
| |
| diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c |
| index 19923f8a029d..b154e057ca67 100644 |
| --- a/block/blk-iolatency.c |
| +++ b/block/blk-iolatency.c |
| @@ -72,6 +72,7 @@ |
| #include <linux/sched/loadavg.h> |
| #include <linux/sched/signal.h> |
| #include <trace/events/block.h> |
| +#include <linux/blk-mq.h> |
| #include "blk-rq-qos.h" |
| #include "blk-stat.h" |
| |
| @@ -568,6 +569,9 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) |
| return; |
| |
| enabled = blk_iolatency_enabled(iolat->blkiolat); |
| + if (!enabled) |
| + return; |
| + |
| while (blkg && blkg->parent) { |
| iolat = blkg_to_lat(blkg); |
| if (!iolat) { |
| @@ -577,7 +581,7 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) |
| rqw = &iolat->rq_wait; |
| |
| atomic_dec(&rqw->inflight); |
| - if (!enabled || iolat->min_lat_nsec == 0) |
| + if (iolat->min_lat_nsec == 0) |
| goto next; |
| iolatency_record_time(iolat, &bio->bi_issue, now, |
| issue_as_root); |
| @@ -721,10 +725,13 @@ int blk_iolatency_init(struct request_queue *q) |
| return 0; |
| } |
| |
| -static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) |
| +/* |
| + * return 1 for enabling iolatency, return -1 for disabling iolatency, otherwise |
| + * return 0. |
| + */ |
| +static int iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) |
| { |
| struct iolatency_grp *iolat = blkg_to_lat(blkg); |
| - struct blk_iolatency *blkiolat = iolat->blkiolat; |
| u64 oldval = iolat->min_lat_nsec; |
| |
| iolat->min_lat_nsec = val; |
| @@ -733,9 +740,10 @@ static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) |
| BLKIOLATENCY_MAX_WIN_SIZE); |
| |
| if (!oldval && val) |
| - atomic_inc(&blkiolat->enabled); |
| + return 1; |
| if (oldval && !val) |
| - atomic_dec(&blkiolat->enabled); |
| + return -1; |
| + return 0; |
| } |
| |
| static void iolatency_clear_scaling(struct blkcg_gq *blkg) |
| @@ -768,6 +776,7 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, |
| u64 lat_val = 0; |
| u64 oldval; |
| int ret; |
| + int enable = 0; |
| |
| ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx); |
| if (ret) |
| @@ -803,7 +812,12 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, |
| blkg = ctx.blkg; |
| oldval = iolat->min_lat_nsec; |
| |
| - iolatency_set_min_lat_nsec(blkg, lat_val); |
| + enable = iolatency_set_min_lat_nsec(blkg, lat_val); |
| + if (enable) { |
| + WARN_ON_ONCE(!blk_get_queue(blkg->q)); |
| + blkg_get(blkg); |
| + } |
| + |
| if (oldval != iolat->min_lat_nsec) { |
| iolatency_clear_scaling(blkg); |
| } |
| @@ -811,6 +825,24 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, |
| ret = 0; |
| out: |
| blkg_conf_finish(&ctx); |
| + if (ret == 0 && enable) { |
| + struct iolatency_grp *tmp = blkg_to_lat(blkg); |
| + struct blk_iolatency *blkiolat = tmp->blkiolat; |
| + |
| + blk_mq_freeze_queue(blkg->q); |
| + |
| + if (enable == 1) |
| + atomic_inc(&blkiolat->enabled); |
| + else if (enable == -1) |
| + atomic_dec(&blkiolat->enabled); |
| + else |
| + WARN_ON_ONCE(1); |
| + |
| + blk_mq_unfreeze_queue(blkg->q); |
| + |
| + blkg_put(blkg); |
| + blk_put_queue(blkg->q); |
| + } |
| return ret ?: nbytes; |
| } |
| |
| @@ -910,8 +942,14 @@ static void iolatency_pd_offline(struct blkg_policy_data *pd) |
| { |
| struct iolatency_grp *iolat = pd_to_lat(pd); |
| struct blkcg_gq *blkg = lat_to_blkg(iolat); |
| + struct blk_iolatency *blkiolat = iolat->blkiolat; |
| + int ret; |
| |
| - iolatency_set_min_lat_nsec(blkg, 0); |
| + ret = iolatency_set_min_lat_nsec(blkg, 0); |
| + if (ret == 1) |
| + atomic_inc(&blkiolat->enabled); |
| + if (ret == -1) |
| + atomic_dec(&blkiolat->enabled); |
| iolatency_clear_scaling(blkg); |
| } |
| |
| -- |
| 2.19.1 |
| |