| From stable-bounces@linux.kernel.org Tue May 13 12:10:35 2008 |
| From: Dan Williams <dan.j.williams@intel.com> |
| Date: Tue, 13 May 2008 19:10:11 GMT |
| Subject: md: fix raid5 'repair' operations |
| To: jejb@kernel.org, stable@kernel.org |
| Message-ID: <200805131910.m4DJABfs007550@hera.kernel.org> |
| |
| From: Dan Williams <dan.j.williams@intel.com> |
| |
| commit c8894419acf5e56851de9741c5047bebd78acd1f upstream |
| Date: Mon, 12 May 2008 14:02:12 -0700 |
| Subject: md: fix raid5 'repair' operations |
| |
| commit bd2ab67030e9116f1e4aae1289220255412b37fd "md: close a livelock window |
| in handle_parity_checks5" introduced a bug in handling 'repair' operations. |
| After a repair operation completes we clear the state bits tracking this |
| operation. However, they are cleared too early and this results in the code |
| deciding to re-run the parity check operation. Since we have done the repair |
| in memory the second check does not find a mismatch and thus does not do a |
| writeback. |
| |
| Test results: |
| $ echo repair > /sys/block/md0/md/sync_action |
| $ cat /sys/block/md0/md/mismatch_cnt |
| 51072 |
| $ echo repair > /sys/block/md0/md/sync_action |
| $ cat /sys/block/md0/md/mismatch_cnt |
| 0 |
| |
| (also fix incorrect indentation) |
| |
| Tested-by: George Spelvin <linux@horizon.com> |
| Acked-by: NeilBrown <neilb@suse.de> |
| Signed-off-by: Dan Williams <dan.j.williams@intel.com> |
| Signed-off-by: Andrew Morton <akpm@linux-foundation.org> |
| Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de> |
| |
| --- |
| drivers/md/raid5.c | 25 +++++++++++++------------ |
| 1 file changed, 13 insertions(+), 12 deletions(-) |
| |
| --- a/drivers/md/raid5.c |
| +++ b/drivers/md/raid5.c |
| @@ -2354,8 +2354,8 @@ static void handle_parity_checks5(raid5_ |
| |
| /* complete a check operation */ |
| if (test_and_clear_bit(STRIPE_OP_CHECK, &sh->ops.complete)) { |
| - clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); |
| - clear_bit(STRIPE_OP_CHECK, &sh->ops.pending); |
| + clear_bit(STRIPE_OP_CHECK, &sh->ops.ack); |
| + clear_bit(STRIPE_OP_CHECK, &sh->ops.pending); |
| if (s->failed == 0) { |
| if (sh->ops.zero_sum_result == 0) |
| /* parity is correct (on disc, |
| @@ -2385,16 +2385,6 @@ static void handle_parity_checks5(raid5_ |
| canceled_check = 1; /* STRIPE_INSYNC is not set */ |
| } |
| |
| - /* check if we can clear a parity disk reconstruct */ |
| - if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && |
| - test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { |
| - |
| - clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending); |
| - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); |
| - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); |
| - clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); |
| - } |
| - |
| /* start a new check operation if there are no failures, the stripe is |
| * not insync, and a repair is not in flight |
| */ |
| @@ -2409,6 +2399,17 @@ static void handle_parity_checks5(raid5_ |
| } |
| } |
| |
| + /* check if we can clear a parity disk reconstruct */ |
| + if (test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete) && |
| + test_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending)) { |
| + |
| + clear_bit(STRIPE_OP_MOD_REPAIR_PD, &sh->ops.pending); |
| + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.complete); |
| + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.ack); |
| + clear_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending); |
| + } |
| + |
| + |
| /* Wait for check parity and compute block operations to complete |
| * before write-back. If a failure occurred while the check operation |
| * was in flight we need to cycle this stripe through handle_stripe |