| From: NeilBrown <neilb@suse.com> |
| Date: Fri, 14 Aug 2015 11:26:17 +1000 |
| Subject: md/raid10: ensure device failure recorded before write request |
| returns. |
| |
| commit 95af587e95aacb9cfda4a9641069a5244a540dc8 upstream. |
| |
| When a write to one of the legs of a RAID10 fails, the failure is |
| recorded in the metadata of the other legs so that after a restart |
| the data on the failed drive won't be trusted even if that drive seems |
| to be working again (maybe a cable was unplugged). |
| |
| Currently there is no interlock between the write request completing |
| and the metadata update. So it is possible that the write will |
| complete, the app will confirm success in some way, and then the |
| machine will crash before the metadata update completes. |
| |
| This is an extremely small hole for a race to fit in, but it is |
| theoretically possible and so should be closed. |
| |
| So: |
| - set MD_CHANGE_PENDING when requesting a metadata update for a |
| failed device, so we can know with certainty when it completes |
| - queue requests that experienced an error on a new queue which |
| is only processed after the metadata update completes |
| - call raid_end_bio_io() on bios in that queue when the time comes. |
| |
| Signed-off-by: NeilBrown <neilb@suse.com> |
| [bwh: Backported to 3.2: adjust context] |
| Signed-off-by: Ben Hutchings <ben@decadent.org.uk> |
| --- |
| drivers/md/raid10.c | 29 ++++++++++++++++++++++++++++- |
| drivers/md/raid10.h | 6 ++++++ |
| 2 files changed, 34 insertions(+), 1 deletion(-) |
| |
| --- a/drivers/md/raid10.c |
| +++ b/drivers/md/raid10.c |
| @@ -1280,6 +1280,7 @@ static void error(struct mddev *mddev, s |
| set_bit(Blocked, &rdev->flags); |
| set_bit(Faulty, &rdev->flags); |
| set_bit(MD_CHANGE_DEVS, &mddev->flags); |
| + set_bit(MD_CHANGE_PENDING, &mddev->flags); |
| printk(KERN_ALERT |
| "md/raid10:%s: Disk failure on %s, disabling device.\n" |
| "md/raid10:%s: Operation continuing on %d devices.\n", |
| @@ -2215,6 +2216,7 @@ static void handle_write_completed(struc |
| } |
| put_buf(r10_bio); |
| } else { |
| + bool fail = false; |
| for (m = 0; m < conf->copies; m++) { |
| int dev = r10_bio->devs[m].devnum; |
| struct bio *bio = r10_bio->devs[m].bio; |
| @@ -2227,6 +2229,7 @@ static void handle_write_completed(struc |
| rdev_dec_pending(rdev, conf->mddev); |
| } else if (bio != NULL && |
| !test_bit(BIO_UPTODATE, &bio->bi_flags)) { |
| + fail = true; |
| if (!narrow_write_error(r10_bio, m)) { |
| md_error(conf->mddev, rdev); |
| set_bit(R10BIO_Degraded, |
| @@ -2238,7 +2241,13 @@ static void handle_write_completed(struc |
| if (test_bit(R10BIO_WriteError, |
| &r10_bio->state)) |
| close_write(r10_bio); |
| - raid_end_bio_io(r10_bio); |
| + if (fail) { |
| + spin_lock_irq(&conf->device_lock); |
| + list_add(&r10_bio->retry_list, &conf->bio_end_io_list); |
| + spin_unlock_irq(&conf->device_lock); |
| + md_wakeup_thread(conf->mddev->thread); |
| + } else |
| + raid_end_bio_io(r10_bio); |
| } |
| } |
| |
| @@ -2252,6 +2261,23 @@ static void raid10d(struct mddev *mddev) |
| |
| md_check_recovery(mddev); |
| |
| + if (!list_empty_careful(&conf->bio_end_io_list) && |
| + !test_bit(MD_CHANGE_PENDING, &mddev->flags)) { |
| + LIST_HEAD(tmp); |
| + spin_lock_irqsave(&conf->device_lock, flags); |
| + if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) { |
| + list_add(&tmp, &conf->bio_end_io_list); |
| + list_del_init(&conf->bio_end_io_list); |
| + } |
| + spin_unlock_irqrestore(&conf->device_lock, flags); |
| + while (!list_empty(&tmp)) { |
| + r10_bio = list_first_entry(&tmp, |
| + struct r10bio, retry_list); |
| + list_del(&r10_bio->retry_list); |
| + raid_end_bio_io(r10_bio); |
| + } |
| + } |
| + |
| blk_start_plug(&plug); |
| for (;;) { |
| |
| @@ -2860,6 +2886,7 @@ static struct r10conf *setup_conf(struct |
| |
| spin_lock_init(&conf->device_lock); |
| INIT_LIST_HEAD(&conf->retry_list); |
| + INIT_LIST_HEAD(&conf->bio_end_io_list); |
| |
| spin_lock_init(&conf->resync_lock); |
| init_waitqueue_head(&conf->wait_barrier); |
| --- a/drivers/md/raid10.h |
| +++ b/drivers/md/raid10.h |
| @@ -40,6 +40,12 @@ struct r10conf { |
| sector_t chunk_mask; |
| |
| struct list_head retry_list; |
| + /* A separate list of r10bio which just need raid_end_bio_io called. |
| + * This mustn't happen for writes which had any errors if the superblock |
| + * needs to be written. |
| + */ |
| + struct list_head bio_end_io_list; |
| + |
| /* queue pending writes and submit them on unplug */ |
| struct bio_list pending_bio_list; |
| int pending_count; |