[PATCH] nvme/mpath: fix hang when disk goes live over reconnect

Sagi Grimberg sagi at grimberg.me
Tue Oct 5 05:38:01 PDT 2021


>> How do we proceed with this fix?
> 
> Please resend with the suggested updates.
> 
>> I believe error propagation here is not wanted because:
>> 1) A failure to fetch or parse the ANA log should not be considered
>>     an error in ctrl initialization.
> 
> We must handle errors that are due to a controller failing to initialize.
> 
>> 2) Such error will not cause problems in teardown.
>> 3) The same failure is possible in ANA work and we do not take
>>     any action in such case.
> 
> Failing in a workqueue is different from failing in the initialization
> path.
> 
>> 4) Adding support for failure to nvme_start_ctrl() adds complexity
>>     and does not look useful due to the above 1-3.

The feedback here is that this patch changes the functionality:
previously we failed initialization in this case, and now we don't.

What is the issue with propagating the error and then modifying
the call sites? Shouldn't it be simple enough to do?
--
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index aa14ad963d91..d86623b90eea 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -3158,8 +3158,11 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)

         ctrl->ctrl.nr_reconnects = 0;

-       if (changed)
-               nvme_start_ctrl(&ctrl->ctrl);
+       if (changed) {
+               ret = nvme_start_ctrl(&ctrl->ctrl);
+               if (ret)
+                       goto out_term_aen_ops;
+       }

         return 0;       /* Success */

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index b82492cd7503..59bfdd72a51a 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -2827,7 +2827,10 @@ static void nvme_reset_work(struct work_struct *work)
                         &nvme_pci_attr_group))
                 dev->attrs_added = true;

-       nvme_start_ctrl(&dev->ctrl);
+       ret = nvme_start_ctrl(&dev->ctrl);
+       if (ret)
+               goto out_unlock;
+
         return;

   out_unlock:
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 042c594bc57e..9e9e34a012e7 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1141,7 +1141,10 @@ static int nvme_rdma_setup_ctrl(struct 
nvme_rdma_ctrl *ctrl, bool new)
                 goto destroy_io;
         }

-       nvme_start_ctrl(&ctrl->ctrl);
+       ret = nvme_start_ctrl(&ctrl->ctrl);
+       if (ret)
+               goto destroy_io;
+
         return 0;

  destroy_io:
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 10b164f02b5d..22199281ad17 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -923,6 +923,11 @@ static void nvme_tcp_fail_request(struct 
nvme_tcp_request *req)
         nvme_tcp_end_request(blk_mq_rq_from_pdu(req), 
NVME_SC_HOST_PATH_ERROR);
  }
@@ -2045,7 +2052,10 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl 
*ctrl, bool new)
                 goto destroy_io;
         }

-       nvme_start_ctrl(ctrl);
+       ret = nvme_start_ctrl(ctrl);
+       if (ret)
+               goto destroy_io;
+
         return 0;

  destroy_io:
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 0285ccc7541f..3fa89650c0d1 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -490,7 +490,9 @@ static void nvme_loop_reset_ctrl_work(struct 
work_struct *work)
         if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE))
                 WARN_ON_ONCE(1);

-       nvme_start_ctrl(&ctrl->ctrl);
+       ret = nvme_start_ctrl(&ctrl->ctrl);
+       if (ret)
+               goto out_destroy_io;

         return;
--



More information about the Linux-nvme mailing list