[PATCH 02/10] nvme-fabrics: introduce nvmf_reconnect_or_remove API

Tue Nov 2 16:38:34 PDT 2021

On 10/20/2021 3:38 AM, Max Gurtovoy wrote:
> This logic is duplicated today for RDMA and TCP controllers. Move it to
> the fabrics driver and export it as a new API.
> 
> Also update the RDMA/TCP transport drivers to use this API and remove
> the duplicated code.
> 
> Reviewed-by: Israel Rukshin <israelr at nvidia.com>
> Reviewed-by: Chaitanya Kulkarni <kch at nvidia.com>
> Reviewed-by: Hannes Reinecke <hare at suse.de>
> Signed-off-by: Max Gurtovoy <mgurtovoy at nvidia.com>
> ---
>   drivers/nvme/host/fabrics.c | 21 +++++++++++++++++++++
>   drivers/nvme/host/fabrics.h |  1 +
>   drivers/nvme/host/rdma.c    | 25 +++----------------------
>   drivers/nvme/host/tcp.c     | 26 +++-----------------------
>   4 files changed, 28 insertions(+), 45 deletions(-)
> 
> diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
> index 668c6bb7a567..4a1ef67c6fb3 100644
> --- a/drivers/nvme/host/fabrics.c
> +++ b/drivers/nvme/host/fabrics.c
> @@ -472,6 +472,27 @@ bool nvmf_should_reconnect(struct nvme_ctrl *ctrl)
>   }
>   EXPORT_SYMBOL_GPL(nvmf_should_reconnect);
>   
> +void nvmf_reconnect_or_remove(struct nvme_ctrl *ctrl)
> +{
> +	/* If we are resetting/deleting then do nothing */
> +	if (ctrl->state != NVME_CTRL_CONNECTING) {
> +		WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
> +			ctrl->state == NVME_CTRL_LIVE);
> +		return;
> +	}
> +
> +	if (nvmf_should_reconnect(ctrl)) {
> +		dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
> +			ctrl->opts->reconnect_delay);
> +		queue_delayed_work(nvme_wq, &ctrl->connect_work,
> +				ctrl->opts->reconnect_delay * HZ);
> +	} else {
> +		dev_info(ctrl->device, "Removing controller...\n");
> +		nvme_delete_ctrl(ctrl);
> +	}
> +}
> +EXPORT_SYMBOL_GPL(nvmf_reconnect_or_remove);
> +

This won't be sufficient for FC, so FC can't use it. I'd have to think
about whether there's a way to restructure or wrap it, but it's not a great fit.

I do think what FC is doing relative to NVME_SC_DNR should be done in 
rdma/tcp as well.

In other words, this should minimally be:

/*
 * Decide what to do with a fabrics controller after a failed (re)connect:
 * either schedule another connect attempt or delete the controller.
 *
 * @ctrl:   controller whose connect attempt failed
 * @status: result that caused the reschedule; either a negative -Exxx
 *          value or a positive NVMe status code
 *
 * Nothing is done unless the controller is in the CONNECTING state.
 * A positive status with NVME_SC_DNR set is never rescheduled; the
 * controller is removed instead.
 */
void nvmf_reconnect_or_remove(struct nvme_ctrl *ctrl, int status)
{
	/* A reset or delete is in progress: leave the controller alone. */
	if (ctrl->state != NVME_CTRL_CONNECTING) {
		WARN_ON_ONCE(ctrl->state == NVME_CTRL_NEW ||
			     ctrl->state == NVME_CTRL_LIVE);
		return;
	}

	/*
	 * Give up when the failure is flagged DNR or when
	 * nvmf_should_reconnect() declines another attempt.
	 */
	if ((status > 0 && (status & NVME_SC_DNR)) ||
	    !nvmf_should_reconnect(ctrl)) {
		dev_info(ctrl->device, "Removing controller...\n");
		nvme_delete_ctrl(ctrl);
		return;
	}

	dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
		 ctrl->opts->reconnect_delay);
	queue_delayed_work(nvme_wq, &ctrl->connect_work,
			   ctrl->opts->reconnect_delay * HZ);
}
EXPORT_SYMBOL_GPL(nvmf_reconnect_or_remove);

Then change the callers to pass, as status, the return value of the
operation that caused the reschedule. It will be either a -Exxx value
or an NVMe status code returned by one of the core routines during
controller init. This way, an uncorrectable failure during controller
init will just fail without rescheduling.


...
> @@ -1181,7 +1162,7 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
>   requeue:
>   	dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
>   			ctrl->ctrl.nr_reconnects);
> -	nvme_rdma_reconnect_or_remove(ctrl);
> +	nvmf_reconnect_or_remove(&ctrl->ctrl);

This would become:

@@ -2,10 +2,12 @@ static void nvme_rdma_reconnect_ctrl_wor
  {
  	struct nvme_rdma_ctrl *ctrl = container_of(to_delayed_work(work),
  			struct nvme_rdma_ctrl, reconnect_work);
+	int ret;

  	++ctrl->ctrl.nr_reconnects;

-	if (nvme_rdma_setup_ctrl(ctrl, false))
+	ret = nvme_rdma_setup_ctrl(ctrl, false);
+	if (ret)
  		goto requeue;

  	dev_info(ctrl->ctrl.device, "Successfully reconnected (%d attempts)\n",
@@ -18,5 +20,5 @@ static void nvme_rdma_reconnect_ctrl_wor
  requeue:
  	dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
  			ctrl->ctrl.nr_reconnects);
-	nvme_rdma_reconnect_or_remove(ctrl);
+	nvme_rdma_reconnect_or_remove(ctrl, ret);
  }


>   }
>   
>   static void nvme_rdma_error_recovery_work(struct work_struct *work)
> @@ -1202,7 +1183,7 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
>   		return;
>   	}
>   
> -	nvme_rdma_reconnect_or_remove(ctrl);
> +	nvmf_reconnect_or_remove(&ctrl->ctrl);
>   }

@@ -16,5 +16,5 @@ static void nvme_rdma_error_recovery_wor
  		return;
  	}

-	nvme_rdma_reconnect_or_remove(ctrl);
+	nvme_rdma_reconnect_or_remove(ctrl, 0);
  }


>   
>   static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
> @@ -2265,7 +2246,7 @@ static void nvme_rdma_reset_ctrl_work(struct work_struct *work)
>   
>   out_fail:
>   	++ctrl->ctrl.nr_reconnects;
> -	nvme_rdma_reconnect_or_remove(ctrl);
> +	nvmf_reconnect_or_remove(&ctrl->ctrl);
>   }

@@ -2,6 +2,7 @@ static void nvme_rdma_reset_ctrl_work(st
  {
  	struct nvme_rdma_ctrl *ctrl =
  		container_of(work, struct nvme_rdma_ctrl, ctrl.reset_work);
+	int ret;

  	nvme_stop_ctrl(&ctrl->ctrl);
  	nvme_rdma_shutdown_ctrl(ctrl, false);
@@ -12,12 +13,13 @@ static void nvme_rdma_reset_ctrl_work(st
  		return;
  	}

-	if (nvme_rdma_setup_ctrl(ctrl, false))
+	ret = nvme_rdma_setup_ctrl(ctrl, false);
+	if (ret)
  		goto out_fail;

  	return;

  out_fail:
  	++ctrl->ctrl.nr_reconnects;
-	nvme_rdma_reconnect_or_remove(ctrl);
+	nvme_rdma_reconnect_or_remove(ctrl, ret);
  }


And similar mods to tcp.

-- james