| From 247e4498a44bab217134a1377bcd6e5072b92f21 Mon Sep 17 00:00:00 2001 |
| From: Jan Kara <jack@suse.cz> |
| Date: Thu, 6 Feb 2020 15:28:12 +0100 |
| Subject: [PATCH] blktrace: Protect q->blk_trace with RCU |
| |
| commit c780e86dd48ef6467a1146cf7d0fe1e05a635039 upstream. |
| |
| KASAN is reporting that __blk_add_trace() has a use-after-free issue |
| when accessing q->blk_trace. Indeed the switching of block tracing (and |
| thus eventual freeing of q->blk_trace) is completely unsynchronized with |
| the currently running tracing and thus it can happen that the blk_trace |
| structure is being freed just while __blk_add_trace() works on it. |
| Protect accesses to q->blk_trace by RCU during tracing and make sure we |
| wait for the end of RCU grace period when shutting down tracing. Luckily |
| that is a rare enough event that we can afford it. Note that postponing |
| the freeing of blk_trace to an RCU callback should better be avoided as |
| it could have unexpected user visible side-effects as debugfs files |
| would still exist for a short while after block tracing has been shut |
| down. |
| |
| Link: https://bugzilla.kernel.org/show_bug.cgi?id=205711 |
| CC: stable@vger.kernel.org |
| Reviewed-by: Chaitanya Kulkarni <chaitanya.kulkarni@wdc.com> |
| Reviewed-by: Ming Lei <ming.lei@redhat.com> |
| Tested-by: Ming Lei <ming.lei@redhat.com> |
| Reviewed-by: Bart Van Assche <bvanassche@acm.org> |
| Reported-by: Tristan Madani <tristmd@gmail.com> |
| Signed-off-by: Jan Kara <jack@suse.cz> |
| Signed-off-by: Jens Axboe <axboe@kernel.dk> |
| Signed-off-by: Paul Gortmaker <paul.gortmaker@windriver.com> |
| |
| diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h |
| index 5d9fe980125c..ab54e897eb47 100644 |
| --- a/include/linux/blkdev.h |
| +++ b/include/linux/blkdev.h |
| @@ -529,7 +529,7 @@ struct request_queue { |
| unsigned int sg_reserved_size; |
| int node; |
| #ifdef CONFIG_BLK_DEV_IO_TRACE |
| - struct blk_trace *blk_trace; |
| + struct blk_trace __rcu *blk_trace; |
| struct mutex blk_trace_mutex; |
| #endif |
| /* |
| diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h |
| index 7bb2d8de9f30..3b6ff5902edc 100644 |
| --- a/include/linux/blktrace_api.h |
| +++ b/include/linux/blktrace_api.h |
| @@ -51,9 +51,13 @@ void __trace_note_message(struct blk_trace *, struct blkcg *blkcg, const char *f |
| **/ |
| #define blk_add_cgroup_trace_msg(q, cg, fmt, ...) \ |
| do { \ |
| - struct blk_trace *bt = (q)->blk_trace; \ |
| + struct blk_trace *bt; \ |
| + \ |
| + rcu_read_lock(); \ |
| + bt = rcu_dereference((q)->blk_trace); \ |
| if (unlikely(bt)) \ |
| __trace_note_message(bt, cg, fmt, ##__VA_ARGS__);\ |
| + rcu_read_unlock(); \ |
| } while (0) |
| #define blk_add_trace_msg(q, fmt, ...) \ |
| blk_add_cgroup_trace_msg(q, NULL, fmt, ##__VA_ARGS__) |
| @@ -61,10 +65,14 @@ void __trace_note_message(struct blk_trace *, struct blkcg *blkcg, const char *f |
| |
| static inline bool blk_trace_note_message_enabled(struct request_queue *q) |
| { |
| - struct blk_trace *bt = q->blk_trace; |
| - if (likely(!bt)) |
| - return false; |
| - return bt->act_mask & BLK_TC_NOTIFY; |
| + struct blk_trace *bt; |
| + bool ret; |
| + |
| + rcu_read_lock(); |
| + bt = rcu_dereference(q->blk_trace); |
| + ret = bt && (bt->act_mask & BLK_TC_NOTIFY); |
| + rcu_read_unlock(); |
| + return ret; |
| } |
| |
| extern void blk_add_driver_data(struct request_queue *q, struct request *rq, |
| diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c |
| index e1c6d79fb4cc..debd1bea7fe6 100644 |
| --- a/kernel/trace/blktrace.c |
| +++ b/kernel/trace/blktrace.c |
| @@ -336,6 +336,7 @@ static void put_probe_ref(void) |
| |
| static void blk_trace_cleanup(struct blk_trace *bt) |
| { |
| + synchronize_rcu(); |
| blk_trace_free(bt); |
| put_probe_ref(); |
| } |
| @@ -636,8 +637,10 @@ static int compat_blk_trace_setup(struct request_queue *q, char *name, |
| static int __blk_trace_startstop(struct request_queue *q, int start) |
| { |
| int ret; |
| - struct blk_trace *bt = q->blk_trace; |
| + struct blk_trace *bt; |
| |
| + bt = rcu_dereference_protected(q->blk_trace, |
| + lockdep_is_held(&q->blk_trace_mutex)); |
| if (bt == NULL) |
| return -EINVAL; |
| |
| @@ -747,8 +750,8 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) |
| void blk_trace_shutdown(struct request_queue *q) |
| { |
| mutex_lock(&q->blk_trace_mutex); |
| - |
| - if (q->blk_trace) { |
| + if (rcu_dereference_protected(q->blk_trace, |
| + lockdep_is_held(&q->blk_trace_mutex))) { |
| __blk_trace_startstop(q, 0); |
| __blk_trace_remove(q); |
| } |
| @@ -760,8 +763,10 @@ void blk_trace_shutdown(struct request_queue *q) |
| static union kernfs_node_id * |
| blk_trace_bio_get_cgid(struct request_queue *q, struct bio *bio) |
| { |
| - struct blk_trace *bt = q->blk_trace; |
| + struct blk_trace *bt; |
| |
| + /* We don't use the 'bt' value here except as an optimization... */ |
| + bt = rcu_dereference_protected(q->blk_trace, 1); |
| if (!bt || !(blk_tracer_flags.val & TRACE_BLK_OPT_CGROUP)) |
| return NULL; |
| |
| @@ -806,10 +811,14 @@ static void blk_add_trace_rq(struct request *rq, int error, |
| unsigned int nr_bytes, u32 what, |
| union kernfs_node_id *cgid) |
| { |
| - struct blk_trace *bt = rq->q->blk_trace; |
| + struct blk_trace *bt; |
| |
| - if (likely(!bt)) |
| + rcu_read_lock(); |
| + bt = rcu_dereference(rq->q->blk_trace); |
| + if (likely(!bt)) { |
| + rcu_read_unlock(); |
| return; |
| + } |
| |
| if (blk_rq_is_passthrough(rq)) |
| what |= BLK_TC_ACT(BLK_TC_PC); |
| @@ -818,6 +827,7 @@ static void blk_add_trace_rq(struct request *rq, int error, |
| |
| __blk_add_trace(bt, blk_rq_trace_sector(rq), nr_bytes, req_op(rq), |
| rq->cmd_flags, what, error, 0, NULL, cgid); |
| + rcu_read_unlock(); |
| } |
| |
| static void blk_add_trace_rq_insert(void *ignore, |
| @@ -863,14 +873,19 @@ static void blk_add_trace_rq_complete(void *ignore, struct request *rq, |
| static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, |
| u32 what, int error) |
| { |
| - struct blk_trace *bt = q->blk_trace; |
| + struct blk_trace *bt; |
| |
| - if (likely(!bt)) |
| + rcu_read_lock(); |
| + bt = rcu_dereference(q->blk_trace); |
| + if (likely(!bt)) { |
| + rcu_read_unlock(); |
| return; |
| + } |
| |
| __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, |
| bio_op(bio), bio->bi_opf, what, error, 0, NULL, |
| blk_trace_bio_get_cgid(q, bio)); |
| + rcu_read_unlock(); |
| } |
| |
| static void blk_add_trace_bio_bounce(void *ignore, |
| @@ -915,11 +930,14 @@ static void blk_add_trace_getrq(void *ignore, |
| if (bio) |
| blk_add_trace_bio(q, bio, BLK_TA_GETRQ, 0); |
| else { |
| - struct blk_trace *bt = q->blk_trace; |
| + struct blk_trace *bt; |
| |
| + rcu_read_lock(); |
| + bt = rcu_dereference(q->blk_trace); |
| if (bt) |
| __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_GETRQ, 0, 0, |
| NULL, NULL); |
| + rcu_read_unlock(); |
| } |
| } |
| |
| @@ -931,27 +949,35 @@ static void blk_add_trace_sleeprq(void *ignore, |
| if (bio) |
| blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ, 0); |
| else { |
| - struct blk_trace *bt = q->blk_trace; |
| + struct blk_trace *bt; |
| |
| + rcu_read_lock(); |
| + bt = rcu_dereference(q->blk_trace); |
| if (bt) |
| __blk_add_trace(bt, 0, 0, rw, 0, BLK_TA_SLEEPRQ, |
| 0, 0, NULL, NULL); |
| + rcu_read_unlock(); |
| } |
| } |
| |
| static void blk_add_trace_plug(void *ignore, struct request_queue *q) |
| { |
| - struct blk_trace *bt = q->blk_trace; |
| + struct blk_trace *bt; |
| |
| + rcu_read_lock(); |
| + bt = rcu_dereference(q->blk_trace); |
| if (bt) |
| __blk_add_trace(bt, 0, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL, NULL); |
| + rcu_read_unlock(); |
| } |
| |
| static void blk_add_trace_unplug(void *ignore, struct request_queue *q, |
| unsigned int depth, bool explicit) |
| { |
| - struct blk_trace *bt = q->blk_trace; |
| + struct blk_trace *bt; |
| |
| + rcu_read_lock(); |
| + bt = rcu_dereference(q->blk_trace); |
| if (bt) { |
| __be64 rpdu = cpu_to_be64(depth); |
| u32 what; |
| @@ -963,14 +989,17 @@ static void blk_add_trace_unplug(void *ignore, struct request_queue *q, |
| |
| __blk_add_trace(bt, 0, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu, NULL); |
| } |
| + rcu_read_unlock(); |
| } |
| |
| static void blk_add_trace_split(void *ignore, |
| struct request_queue *q, struct bio *bio, |
| unsigned int pdu) |
| { |
| - struct blk_trace *bt = q->blk_trace; |
| + struct blk_trace *bt; |
| |
| + rcu_read_lock(); |
| + bt = rcu_dereference(q->blk_trace); |
| if (bt) { |
| __be64 rpdu = cpu_to_be64(pdu); |
| |
| @@ -979,6 +1008,7 @@ static void blk_add_trace_split(void *ignore, |
| BLK_TA_SPLIT, bio->bi_status, sizeof(rpdu), |
| &rpdu, blk_trace_bio_get_cgid(q, bio)); |
| } |
| + rcu_read_unlock(); |
| } |
| |
| /** |
| @@ -998,11 +1028,15 @@ static void blk_add_trace_bio_remap(void *ignore, |
| struct request_queue *q, struct bio *bio, |
| dev_t dev, sector_t from) |
| { |
| - struct blk_trace *bt = q->blk_trace; |
| + struct blk_trace *bt; |
| struct blk_io_trace_remap r; |
| |
| - if (likely(!bt)) |
| + rcu_read_lock(); |
| + bt = rcu_dereference(q->blk_trace); |
| + if (likely(!bt)) { |
| + rcu_read_unlock(); |
| return; |
| + } |
| |
| r.device_from = cpu_to_be32(dev); |
| r.device_to = cpu_to_be32(bio_dev(bio)); |
| @@ -1011,6 +1045,7 @@ static void blk_add_trace_bio_remap(void *ignore, |
| __blk_add_trace(bt, bio->bi_iter.bi_sector, bio->bi_iter.bi_size, |
| bio_op(bio), bio->bi_opf, BLK_TA_REMAP, bio->bi_status, |
| sizeof(r), &r, blk_trace_bio_get_cgid(q, bio)); |
| + rcu_read_unlock(); |
| } |
| |
| /** |
| @@ -1031,11 +1066,15 @@ static void blk_add_trace_rq_remap(void *ignore, |
| struct request *rq, dev_t dev, |
| sector_t from) |
| { |
| - struct blk_trace *bt = q->blk_trace; |
| + struct blk_trace *bt; |
| struct blk_io_trace_remap r; |
| |
| - if (likely(!bt)) |
| + rcu_read_lock(); |
| + bt = rcu_dereference(q->blk_trace); |
| + if (likely(!bt)) { |
| + rcu_read_unlock(); |
| return; |
| + } |
| |
| r.device_from = cpu_to_be32(dev); |
| r.device_to = cpu_to_be32(disk_devt(rq->rq_disk)); |
| @@ -1044,6 +1083,7 @@ static void blk_add_trace_rq_remap(void *ignore, |
| __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), |
| rq_data_dir(rq), 0, BLK_TA_REMAP, 0, |
| sizeof(r), &r, blk_trace_request_get_cgid(q, rq)); |
| + rcu_read_unlock(); |
| } |
| |
| /** |
| @@ -1061,14 +1101,19 @@ void blk_add_driver_data(struct request_queue *q, |
| struct request *rq, |
| void *data, size_t len) |
| { |
| - struct blk_trace *bt = q->blk_trace; |
| + struct blk_trace *bt; |
| |
| - if (likely(!bt)) |
| + rcu_read_lock(); |
| + bt = rcu_dereference(q->blk_trace); |
| + if (likely(!bt)) { |
| + rcu_read_unlock(); |
| return; |
| + } |
| |
| __blk_add_trace(bt, blk_rq_trace_sector(rq), blk_rq_bytes(rq), 0, 0, |
| BLK_TA_DRV_DATA, 0, len, data, |
| blk_trace_request_get_cgid(q, rq)); |
| + rcu_read_unlock(); |
| } |
| EXPORT_SYMBOL_GPL(blk_add_driver_data); |
| |
| @@ -1595,6 +1640,7 @@ static int blk_trace_remove_queue(struct request_queue *q) |
| return -EINVAL; |
| |
| put_probe_ref(); |
| + synchronize_rcu(); |
| blk_trace_free(bt); |
| return 0; |
| } |
| @@ -1756,6 +1802,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, |
| struct hd_struct *p = dev_to_part(dev); |
| struct request_queue *q; |
| struct block_device *bdev; |
| + struct blk_trace *bt; |
| ssize_t ret = -ENXIO; |
| |
| bdev = bdget(part_devt(p)); |
| @@ -1768,21 +1815,23 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev, |
| |
| mutex_lock(&q->blk_trace_mutex); |
| |
| + bt = rcu_dereference_protected(q->blk_trace, |
| + lockdep_is_held(&q->blk_trace_mutex)); |
| if (attr == &dev_attr_enable) { |
| - ret = sprintf(buf, "%u\n", !!q->blk_trace); |
| + ret = sprintf(buf, "%u\n", !!bt); |
| goto out_unlock_bdev; |
| } |
| |
| - if (q->blk_trace == NULL) |
| + if (bt == NULL) |
| ret = sprintf(buf, "disabled\n"); |
| else if (attr == &dev_attr_act_mask) |
| - ret = blk_trace_mask2str(buf, q->blk_trace->act_mask); |
| + ret = blk_trace_mask2str(buf, bt->act_mask); |
| else if (attr == &dev_attr_pid) |
| - ret = sprintf(buf, "%u\n", q->blk_trace->pid); |
| + ret = sprintf(buf, "%u\n", bt->pid); |
| else if (attr == &dev_attr_start_lba) |
| - ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba); |
| + ret = sprintf(buf, "%llu\n", bt->start_lba); |
| else if (attr == &dev_attr_end_lba) |
| - ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba); |
| + ret = sprintf(buf, "%llu\n", bt->end_lba); |
| |
| out_unlock_bdev: |
| mutex_unlock(&q->blk_trace_mutex); |
| @@ -1799,6 +1848,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, |
| struct block_device *bdev; |
| struct request_queue *q; |
| struct hd_struct *p; |
| + struct blk_trace *bt; |
| u64 value; |
| ssize_t ret = -EINVAL; |
| |
| @@ -1829,8 +1879,10 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, |
| |
| mutex_lock(&q->blk_trace_mutex); |
| |
| + bt = rcu_dereference_protected(q->blk_trace, |
| + lockdep_is_held(&q->blk_trace_mutex)); |
| if (attr == &dev_attr_enable) { |
| - if (!!value == !!q->blk_trace) { |
| + if (!!value == !!bt) { |
| ret = 0; |
| goto out_unlock_bdev; |
| } |
| @@ -1842,18 +1894,18 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, |
| } |
| |
| ret = 0; |
| - if (q->blk_trace == NULL) |
| + if (bt == NULL) |
| ret = blk_trace_setup_queue(q, bdev); |
| |
| if (ret == 0) { |
| if (attr == &dev_attr_act_mask) |
| - q->blk_trace->act_mask = value; |
| + bt->act_mask = value; |
| else if (attr == &dev_attr_pid) |
| - q->blk_trace->pid = value; |
| + bt->pid = value; |
| else if (attr == &dev_attr_start_lba) |
| - q->blk_trace->start_lba = value; |
| + bt->start_lba = value; |
| else if (attr == &dev_attr_end_lba) |
| - q->blk_trace->end_lba = value; |
| + bt->end_lba = value; |
| } |
| |
| out_unlock_bdev: |
| -- |
| 2.7.4 |
| |