[PATCH] nvme-rdma: fix deadlock when delete ctrl due to reconnect fail

Sagi Grimberg sagi at grimberg.me
Mon Jul 27 19:31:58 EDT 2020


>> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
>> index f8f856dc0c67..b381e2cde50a 100644
>> --- a/drivers/nvme/host/rdma.c
>> +++ b/drivers/nvme/host/rdma.c
>> @@ -989,8 +989,7 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
>>                   nvme_cancel_request, &ctrl->ctrl);
>>               blk_mq_tagset_wait_completed_request(ctrl->ctrl.tagset);
>>           }
>> -        if (remove)
>> -            nvme_start_queues(&ctrl->ctrl);
>> +        nvme_start_queues(&ctrl->ctrl);
> 
> This will fail I/O during controller reset, so nak on this.

Can you try this:
--
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index d58231636d11..96c0d664fe9b 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1149,6 +1149,11 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
         return;

  requeue:
+       /*
+        * make sure queues are not quiesced due to a reconnect
+        * sequence that failed after creating some I/O queues
+        */
+       nvme_start_queues(ctrl);
         dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
                         ctrl->ctrl.nr_reconnects);
         nvme_rdma_reconnect_or_remove(ctrl);
--



More information about the Linux-nvme mailing list