| From 72d8c36ec364c82bf1bf0c64dfa1041cfaf139f7 Mon Sep 17 00:00:00 2001 |
| From: Wei Fang <fangwei1@huawei.com> |
| Date: Tue, 7 Jun 2016 14:53:56 +0800 |
| Subject: scsi: fix race between simultaneous decrements of ->host_failed |
| |
| From: Wei Fang <fangwei1@huawei.com> |
| |
| commit 72d8c36ec364c82bf1bf0c64dfa1041cfaf139f7 upstream. |
| |
| sas_ata_strategy_handler() adds the works of the ata error handler to |
| system_unbound_wq. This workqueue asynchronously runs work items, so the |
| ata error handler will be performed concurrently on different CPUs. In |
| this case, ->host_failed will be decreased simultaneously in |
| scsi_eh_finish_cmd() on different CPUs, and become abnormal. |
| |
| It will lead to permanently inequality between ->host_failed and |
| ->host_busy, and scsi error handler thread won't start running. IO |
| errors after that won't be handled. |
| |
| Since all scmds must have been handled in the strategy handler, just |
| remove the decrement in scsi_eh_finish_cmd() and zero ->host_busy after |
| the strategy handler to fix this race. |
| |
| Fixes: 50824d6c5657 ("[SCSI] libsas: async ata-eh") |
| Signed-off-by: Wei Fang <fangwei1@huawei.com> |
| Reviewed-by: James Bottomley <jejb@linux.vnet.ibm.com> |
| Signed-off-by: Martin K. Petersen <martin.petersen@oracle.com> |
| Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org> |
| |
| --- |
| Documentation/scsi/scsi_eh.txt | 8 ++++++-- |
| drivers/ata/libata-eh.c | 2 +- |
| drivers/scsi/scsi_error.c | 4 +++- |
| 3 files changed, 10 insertions(+), 4 deletions(-) |
| |
| --- a/Documentation/scsi/scsi_eh.txt |
| +++ b/Documentation/scsi/scsi_eh.txt |
| @@ -263,19 +263,23 @@ scmd->allowed. |
| |
| 3. scmd recovered |
| ACTION: scsi_eh_finish_cmd() is invoked to EH-finish scmd |
| - - shost->host_failed-- |
| - clear scmd->eh_eflags |
| - scsi_setup_cmd_retry() |
| - move from local eh_work_q to local eh_done_q |
| LOCKING: none |
| + CONCURRENCY: at most one thread per separate eh_work_q to |
| + keep queue manipulation lockless |
| |
| 4. EH completes |
| ACTION: scsi_eh_flush_done_q() retries scmds or notifies upper |
| - layer of failure. |
| + layer of failure. May be called concurrently but must have |
| + a no more than one thread per separate eh_work_q to |
| + manipulate the queue locklessly |
| - scmd is removed from eh_done_q and scmd->eh_entry is cleared |
| - if retry is necessary, scmd is requeued using |
| scsi_queue_insert() |
| - otherwise, scsi_finish_command() is invoked for scmd |
| + - zero shost->host_failed |
| LOCKING: queue or finish function performs appropriate locking |
| |
| |
| --- a/drivers/ata/libata-eh.c |
| +++ b/drivers/ata/libata-eh.c |
| @@ -606,7 +606,7 @@ void ata_scsi_error(struct Scsi_Host *ho |
| ata_scsi_port_error_handler(host, ap); |
| |
| /* finish or retry handled scmd's and clean up */ |
| - WARN_ON(host->host_failed || !list_empty(&eh_work_q)); |
| + WARN_ON(!list_empty(&eh_work_q)); |
| |
| DPRINTK("EXIT\n"); |
| } |
| --- a/drivers/scsi/scsi_error.c |
| +++ b/drivers/scsi/scsi_error.c |
| @@ -1127,7 +1127,6 @@ static int scsi_eh_action(struct scsi_cm |
| */ |
| void scsi_eh_finish_cmd(struct scsi_cmnd *scmd, struct list_head *done_q) |
| { |
| - scmd->device->host->host_failed--; |
| scmd->eh_eflags = 0; |
| list_move_tail(&scmd->eh_entry, done_q); |
| } |
| @@ -2226,6 +2225,9 @@ int scsi_error_handler(void *data) |
| else |
| scsi_unjam_host(shost); |
| |
| + /* All scmds have been handled */ |
| + shost->host_failed = 0; |
| + |
| /* |
| * Note - if the above fails completely, the action is to take |
| * individual devices offline and flush the queue of any |