| From 0048b4837affd153897ed1222283492070027aa9 Mon Sep 17 00:00:00 2001 |
| From: Ming Lei <ming.lei@canonical.com> |
| Date: Sun, 9 Aug 2015 03:41:51 -0400 |
| Subject: blk-mq: fix race between timeout and freeing request |
| |
| From: Ming Lei <ming.lei@canonical.com> |
| |
| commit 0048b4837affd153897ed1222283492070027aa9 upstream. |
| |
| Inside timeout handler, blk_mq_tag_to_rq() is called |
| to retrieve the request from one tag. This way is obviously |
| wrong because the request can be freed any time and some |
| fields of the request can't be trusted, then kernel oops |
| might be triggered[1]. |
| |
| Currently wrt. blk_mq_tag_to_rq(), the only special case is |
| that the flush request can share same tag with the request |
| cloned from, and the two requests can't be active at the same |
| time, so this patch fixes the above issue by updating tags->rqs[tag] |
| with the active request(either flush rq or the request cloned |
| from) of the tag. |
| |
| Also blk_mq_tag_to_rq() gets much simplified with this patch. |
| |
| Given blk_mq_tag_to_rq() is mainly for drivers and the caller must |
| make sure the request can't be freed, so in bt_for_each() this |
| helper is replaced with tags->rqs[tag]. |
| |
| [1] kernel oops log |
| [ 439.696220] BUG: unable to handle kernel NULL pointer dereference at 0000000000000158^M |
| [ 439.697162] IP: [<ffffffff812d89ba>] blk_mq_tag_to_rq+0x21/0x6e^M |
| [ 439.700653] PGD 7ef765067 PUD 7ef764067 PMD 0 ^M |
| [ 439.700653] Oops: 0000 [#1] PREEMPT SMP DEBUG_PAGEALLOC ^M |
| [ 439.700653] Dumping ftrace buffer:^M |
| [ 439.700653] (ftrace buffer empty)^M |
| [ 439.700653] Modules linked in: nbd ipv6 kvm_intel kvm serio_raw^M |
| [ 439.700653] CPU: 6 PID: 2779 Comm: stress-ng-sigfd Not tainted 4.2.0-rc5-next-20150805+ #265^M |
| [ 439.730500] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011^M |
| [ 439.730500] task: ffff880605308000 ti: ffff88060530c000 task.ti: ffff88060530c000^M |
| [ 439.730500] RIP: 0010:[<ffffffff812d89ba>] [<ffffffff812d89ba>] blk_mq_tag_to_rq+0x21/0x6e^M |
| [ 439.730500] RSP: 0018:ffff880819203da0 EFLAGS: 00010283^M |
| [ 439.730500] RAX: ffff880811b0e000 RBX: ffff8800bb465f00 RCX: 0000000000000002^M |
| [ 439.730500] RDX: 0000000000000000 RSI: 0000000000000202 RDI: 0000000000000000^M |
| [ 439.730500] RBP: ffff880819203db0 R08: 0000000000000002 R09: 0000000000000000^M |
| [ 439.730500] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000202^M |
| [ 439.730500] R13: ffff880814104800 R14: 0000000000000002 R15: ffff880811a2ea00^M |
| [ 439.730500] FS: 00007f165b3f5740(0000) GS:ffff880819200000(0000) knlGS:0000000000000000^M |
| [ 439.730500] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b^M |
| [ 439.730500] CR2: 0000000000000158 CR3: 00000007ef766000 CR4: 00000000000006e0^M |
| [ 439.730500] Stack:^M |
| [ 439.730500] 0000000000000008 ffff8808114eed90 ffff880819203e00 ffffffff812dc104^M |
| [ 439.755663] ffff880819203e40 ffffffff812d9f5e 0000020000000000 ffff8808114eed80^M |
| [ 439.755663] Call Trace:^M |
| [ 439.755663] <IRQ> ^M |
| [ 439.755663] [<ffffffff812dc104>] bt_for_each+0x6e/0xc8^M |
| [ 439.755663] [<ffffffff812d9f5e>] ? blk_mq_rq_timed_out+0x6a/0x6a^M |
| [ 439.755663] [<ffffffff812d9f5e>] ? blk_mq_rq_timed_out+0x6a/0x6a^M |
| [ 439.755663] [<ffffffff812dc1b3>] blk_mq_tag_busy_iter+0x55/0x5e^M |
| [ 439.755663] [<ffffffff812d88b4>] ? blk_mq_bio_to_request+0x38/0x38^M |
| [ 439.755663] [<ffffffff812d8911>] blk_mq_rq_timer+0x5d/0xd4^M |
| [ 439.755663] [<ffffffff810a3e10>] call_timer_fn+0xf7/0x284^M |
| [ 439.755663] [<ffffffff810a3d1e>] ? call_timer_fn+0x5/0x284^M |
| [ 439.755663] [<ffffffff812d88b4>] ? blk_mq_bio_to_request+0x38/0x38^M |
| [ 439.755663] [<ffffffff810a46d6>] run_timer_softirq+0x1ce/0x1f8^M |
| [ 439.755663] [<ffffffff8104c367>] __do_softirq+0x181/0x3a4^M |
| [ 439.755663] [<ffffffff8104c76e>] irq_exit+0x40/0x94^M |
| [ 439.755663] [<ffffffff81031482>] smp_apic_timer_interrupt+0x33/0x3e^M |
| [ 439.755663] [<ffffffff815559a4>] apic_timer_interrupt+0x84/0x90^M |
| [ 439.755663] <EOI> ^M |
| [ 439.755663] [<ffffffff81554350>] ? _raw_spin_unlock_irq+0x32/0x4a^M |
| [ 439.755663] [<ffffffff8106a98b>] finish_task_switch+0xe0/0x163^M |
| [ 439.755663] [<ffffffff8106a94d>] ? finish_task_switch+0xa2/0x163^M |
| [ 439.755663] [<ffffffff81550066>] __schedule+0x469/0x6cd^M |
| [ 439.755663] [<ffffffff8155039b>] schedule+0x82/0x9a^M |
| [ 439.789267] [<ffffffff8119b28b>] signalfd_read+0x186/0x49a^M |
| [ 439.790911] [<ffffffff8106d86a>] ? wake_up_q+0x47/0x47^M |
| [ 439.790911] [<ffffffff811618c2>] __vfs_read+0x28/0x9f^M |
| [ 439.790911] [<ffffffff8117a289>] ? __fget_light+0x4d/0x74^M |
| [ 439.790911] [<ffffffff811620a7>] vfs_read+0x7a/0xc6^M |
| [ 439.790911] [<ffffffff8116292b>] SyS_read+0x49/0x7f^M |
| [ 439.790911] [<ffffffff81554c17>] entry_SYSCALL_64_fastpath+0x12/0x6f^M |
| [ 439.790911] Code: 48 89 e5 e8 a9 b8 e7 ff 5d c3 0f 1f 44 00 00 55 89 |
| f2 48 89 e5 41 54 41 89 f4 53 48 8b 47 60 48 8b 1c d0 48 8b 7b 30 48 8b |
| 53 38 <48> 8b 87 58 01 00 00 48 85 c0 75 09 48 8b 97 88 0c 00 00 eb 10 |
| ^M |
| [ 439.790911] RIP [<ffffffff812d89ba>] blk_mq_tag_to_rq+0x21/0x6e^M |
| [ 439.790911] RSP <ffff880819203da0>^M |
| [ 439.790911] CR2: 0000000000000158^M |
| [ 439.790911] ---[ end trace d40af58949325661 ]---^M |
| |
| Signed-off-by: Ming Lei <ming.lei@canonical.com> |
| Signed-off-by: Jens Axboe <axboe@fb.com> |
| Signed-off-by: Dmitry Shmidt <dimitrysh@google.com> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| --- |
| block/blk-flush.c | 15 ++++++++++++++- |
| block/blk-mq-tag.c | 2 +- |
| block/blk-mq-tag.h | 12 ++++++++++++ |
| block/blk-mq.c | 16 +--------------- |
| block/blk.h | 6 ++++++ |
| 5 files changed, 34 insertions(+), 17 deletions(-) |
| |
| --- a/block/blk-flush.c |
| +++ b/block/blk-flush.c |
| @@ -73,6 +73,7 @@ |
| |
| #include "blk.h" |
| #include "blk-mq.h" |
| +#include "blk-mq-tag.h" |
| |
| /* FLUSH/FUA sequences */ |
| enum { |
| @@ -226,7 +227,12 @@ static void flush_end_io(struct request |
| struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx); |
| |
| if (q->mq_ops) { |
| + struct blk_mq_hw_ctx *hctx; |
| + |
| + /* release the tag's ownership to the req cloned from */ |
| spin_lock_irqsave(&fq->mq_flush_lock, flags); |
| + hctx = q->mq_ops->map_queue(q, flush_rq->mq_ctx->cpu); |
| + blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); |
| flush_rq->tag = -1; |
| } |
| |
| @@ -308,11 +314,18 @@ static bool blk_kick_flush(struct reques |
| |
| /* |
| * Borrow tag from the first request since they can't |
| - * be in flight at the same time. |
| + * be in flight at the same time. And acquire the tag's |
| + * ownership for flush req. |
| */ |
| if (q->mq_ops) { |
| + struct blk_mq_hw_ctx *hctx; |
| + |
| flush_rq->mq_ctx = first_rq->mq_ctx; |
| flush_rq->tag = first_rq->tag; |
| + fq->orig_rq = first_rq; |
| + |
| + hctx = q->mq_ops->map_queue(q, first_rq->mq_ctx->cpu); |
| + blk_mq_tag_set_rq(hctx, first_rq->tag, flush_rq); |
| } |
| |
| flush_rq->cmd_type = REQ_TYPE_FS; |
| --- a/block/blk-mq-tag.c |
| +++ b/block/blk-mq-tag.c |
| @@ -403,7 +403,7 @@ static void bt_for_each(struct blk_mq_hw |
| for (bit = find_first_bit(&bm->word, bm->depth); |
| bit < bm->depth; |
| bit = find_next_bit(&bm->word, bm->depth, bit + 1)) { |
| - rq = blk_mq_tag_to_rq(hctx->tags, off + bit); |
| + rq = hctx->tags->rqs[off + bit]; |
| if (rq->q == hctx->queue) |
| fn(hctx, rq, data, reserved); |
| } |
| --- a/block/blk-mq-tag.h |
| +++ b/block/blk-mq-tag.h |
| @@ -85,4 +85,16 @@ static inline void blk_mq_tag_idle(struc |
| __blk_mq_tag_idle(hctx); |
| } |
| |
| +/* |
| + * This helper should only be used for flush request to share tag |
| + * with the request cloned from, and both the two requests can't be |
| + * in flight at the same time. The caller has to make sure the tag |
| + * can't be freed. |
| + */ |
| +static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx, |
| + unsigned int tag, struct request *rq) |
| +{ |
| + hctx->tags->rqs[tag] = rq; |
| +} |
| + |
| #endif |
| --- a/block/blk-mq.c |
| +++ b/block/blk-mq.c |
| @@ -498,23 +498,9 @@ void blk_mq_kick_requeue_list(struct req |
| } |
| EXPORT_SYMBOL(blk_mq_kick_requeue_list); |
| |
| -static inline bool is_flush_request(struct request *rq, |
| - struct blk_flush_queue *fq, unsigned int tag) |
| -{ |
| - return ((rq->cmd_flags & REQ_FLUSH_SEQ) && |
| - fq->flush_rq->tag == tag); |
| -} |
| - |
| struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) |
| { |
| - struct request *rq = tags->rqs[tag]; |
| - /* mq_ctx of flush rq is always cloned from the corresponding req */ |
| - struct blk_flush_queue *fq = blk_get_flush_queue(rq->q, rq->mq_ctx); |
| - |
| - if (!is_flush_request(rq, fq, tag)) |
| - return rq; |
| - |
| - return fq->flush_rq; |
| + return tags->rqs[tag]; |
| } |
| EXPORT_SYMBOL(blk_mq_tag_to_rq); |
| |
| --- a/block/blk.h |
| +++ b/block/blk.h |
| @@ -22,6 +22,12 @@ struct blk_flush_queue { |
| struct list_head flush_queue[2]; |
| struct list_head flush_data_in_flight; |
| struct request *flush_rq; |
| + |
| + /* |
| + * flush_rq shares tag with this rq, both can't be active |
| + * at the same time |
| + */ |
| + struct request *orig_rq; |
| spinlock_t mq_flush_lock; |
| }; |
| |