| From 99afadd4cd85e1c005fd0089e6aa035895aa4b1b Mon Sep 17 00:00:00 2001 |
| From: Sagi Grimberg <sagi@grimberg.me> |
| Date: Tue, 8 Jan 2019 00:53:22 -0800 |
| Subject: nvme-rdma: fix timeout handler |
| |
| [ Upstream commit 4c174e6366746ae8d49f9cc409f728eebb7a9ac9 ] |
| |
| Currently, we have several problems with the timeout |
| handler: |
| 1. If we time out on the controller establishment flow, we will hang |
| because we don't execute the error recovery (and we shouldn't because |
| the create_ctrl flow needs to fail and cleanup on its own) |
| 2. We might also hang if we get a disconnect on a queue while the |
| controller is already deleting. This racy flow can cause the controller |
| disable/shutdown admin command to hang. |
| |
| We cannot complete a timed out request from the timeout handler without |
| mutual exclusion from the teardown flow (e.g. nvme_rdma_error_recovery_work). |
| So we serialize it in the timeout handler and teardown io and admin |
| queues to guarantee that no one races with us from completing the |
| request. |
| |
| Reported-by: Jaesoo Lee <jalee@purestorage.com> |
| Reviewed-by: Christoph Hellwig <hch@lst.de> |
| Signed-off-by: Sagi Grimberg <sagi@grimberg.me> |
| Signed-off-by: Jens Axboe <axboe@kernel.dk> |
| Signed-off-by: Sasha Levin <sashal@kernel.org> |
| --- |
| drivers/nvme/host/rdma.c | 26 ++++++++++++++++++-------- |
| 1 file changed, 18 insertions(+), 8 deletions(-) |
| |
| diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c |
| index b6a28de682e85..0939a4e178fb9 100644 |
| --- a/drivers/nvme/host/rdma.c |
| +++ b/drivers/nvme/host/rdma.c |
| @@ -1672,18 +1672,28 @@ static enum blk_eh_timer_return |
| nvme_rdma_timeout(struct request *rq, bool reserved) |
| { |
| struct nvme_rdma_request *req = blk_mq_rq_to_pdu(rq); |
| + struct nvme_rdma_queue *queue = req->queue; |
| + struct nvme_rdma_ctrl *ctrl = queue->ctrl; |
| |
| - dev_warn(req->queue->ctrl->ctrl.device, |
| - "I/O %d QID %d timeout, reset controller\n", |
| - rq->tag, nvme_rdma_queue_idx(req->queue)); |
| + dev_warn(ctrl->ctrl.device, "I/O %d QID %d timeout\n", |
| + rq->tag, nvme_rdma_queue_idx(queue)); |
| |
| - /* queue error recovery */ |
| - nvme_rdma_error_recovery(req->queue->ctrl); |
| + if (ctrl->ctrl.state != NVME_CTRL_LIVE) { |
| + /* |
| + * Teardown immediately if controller times out while starting |
| + * or we are already started error recovery. all outstanding |
| + * requests are completed on shutdown, so we return BLK_EH_DONE. |
| + */ |
| + flush_work(&ctrl->err_work); |
| + nvme_rdma_teardown_io_queues(ctrl, false); |
| + nvme_rdma_teardown_admin_queue(ctrl, false); |
| + return BLK_EH_DONE; |
| + } |
| |
| - /* fail with DNR on cmd timeout */ |
| - nvme_req(rq)->status = NVME_SC_ABORT_REQ | NVME_SC_DNR; |
| + dev_warn(ctrl->ctrl.device, "starting error recovery\n"); |
| + nvme_rdma_error_recovery(ctrl); |
| |
| - return BLK_EH_DONE; |
| + return BLK_EH_RESET_TIMER; |
| } |
| |
| static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx, |
| -- |
| 2.19.1 |
| |