[PATCH 4/7] nvme-fabrics: introduce nvmf_error_recovery API

Himanshu Madhani himanshu.madhani at oracle.com
Wed Oct 20 06:34:27 PDT 2021



> On Oct 18, 2021, at 8:40 AM, Max Gurtovoy <mgurtovoy at nvidia.com> wrote:
> 
> Error recovery mechanism is duplicated in RDMA and TCP transports. Move
> this logic to common code.
> 
> Also update the RDMA/TCP transport drivers to use this API and remove
> the duplicated code.
> 
> Reviewed-by: Chaitanya Kulkarni <kch at nvidia.com>
> Reviewed-by: Israel Rukshin <israelr at nvidia.com>
> Signed-off-by: Max Gurtovoy <mgurtovoy at nvidia.com>
> ---
> drivers/nvme/host/fabrics.c | 10 ++++++++++
> drivers/nvme/host/fabrics.h |  1 +
> drivers/nvme/host/rdma.c    | 25 ++++++++-----------------
> drivers/nvme/host/tcp.c     | 19 +++++--------------
> 4 files changed, 24 insertions(+), 31 deletions(-)
> 
> diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
> index 4a1ef67c6fb3..2edd086fa922 100644
> --- a/drivers/nvme/host/fabrics.c
> +++ b/drivers/nvme/host/fabrics.c
> @@ -493,6 +493,16 @@ void nvmf_reconnect_or_remove(struct nvme_ctrl *ctrl)
> }
> EXPORT_SYMBOL_GPL(nvmf_reconnect_or_remove);
> 
> +void nvmf_error_recovery(struct nvme_ctrl *ctrl)
> +{
> +	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
> +		return;
> +
> +	dev_warn(ctrl->device, "starting error recovery\n");
> +	queue_work(nvme_reset_wq, &ctrl->err_work);
> +}
> +EXPORT_SYMBOL_GPL(nvmf_error_recovery);
> +
> /**
>  * nvmf_register_transport() - NVMe Fabrics Library registration function.
>  * @ops:	Transport ops instance to be registered to the
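
Nice consolidation. For anyone following along, the intended pairing is:
nvmf_error_recovery() flips the controller to RESETTING and queues
ctrl->err_work on nvme_reset_wq, and the transport's err_work handler then
tears down its queues and hands off to nvmf_reconnect_or_remove(). A rough
sketch of such a handler (the nvme_foo_* names are made up here, and the
teardown step is transport specific):

static void nvme_foo_error_recovery_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl = container_of(work, struct nvme_ctrl,
					      err_work);

	nvme_stop_keep_alive(ctrl);
	/* transport-specific teardown: stop/fail I/O and admin queues */
	nvme_foo_teardown_io_queues(ctrl);
	nvme_foo_teardown_admin_queue(ctrl);
	/* common code decides whether to reconnect or remove the ctrl */
	nvmf_reconnect_or_remove(ctrl);
}

With that in place, every error path in the transport just calls
nvmf_error_recovery(), as the rdma.c/tcp.c hunks below do.
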
> diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
> index de213ab26977..3d8ec7133fc8 100644
> --- a/drivers/nvme/host/fabrics.h
> +++ b/drivers/nvme/host/fabrics.h
> @@ -189,6 +189,7 @@ void nvmf_free_options(struct nvmf_ctrl_options *opts);
> int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size);
> bool nvmf_should_reconnect(struct nvme_ctrl *ctrl);
> void nvmf_reconnect_or_remove(struct nvme_ctrl *ctrl);
> +void nvmf_error_recovery(struct nvme_ctrl *ctrl);
> bool nvmf_ip_options_match(struct nvme_ctrl *ctrl,
> 		struct nvmf_ctrl_options *opts);
> 
> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
> index da7f61a5fac4..1c57e371af61 100644
> --- a/drivers/nvme/host/rdma.c
> +++ b/drivers/nvme/host/rdma.c
> @@ -1185,15 +1185,6 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
> 	nvmf_reconnect_or_remove(&ctrl->ctrl);
> }
> 
> -static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
> -{
> -	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
> -		return;
> -
> -	dev_warn(ctrl->ctrl.device, "starting error recovery\n");
> -	queue_work(nvme_reset_wq, &ctrl->ctrl.err_work);
> -}
> -
> static void nvme_rdma_end_request(struct nvme_rdma_request *req)
> {
> 	struct request *rq = blk_mq_rq_from_pdu(req);
> @@ -1215,7 +1206,7 @@ static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc,
> 			     "%s for CQE 0x%p failed with status %s (%d)\n",
> 			     op, wc->wr_cqe,
> 			     ib_wc_status_msg(wc->status), wc->status);
> -	nvme_rdma_error_recovery(ctrl);
> +	nvmf_error_recovery(&ctrl->ctrl);
> }
> 
> static void nvme_rdma_memreg_done(struct ib_cq *cq, struct ib_wc *wc)
> @@ -1715,7 +1706,7 @@ static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
> 		dev_err(queue->ctrl->ctrl.device,
> 			"got bad command_id %#x on QP %#x\n",
> 			cqe->command_id, queue->qp->qp_num);
> -		nvme_rdma_error_recovery(queue->ctrl);
> +		nvmf_error_recovery(&queue->ctrl->ctrl);
> 		return;
> 	}
> 	req = blk_mq_rq_to_pdu(rq);
> @@ -1729,7 +1720,7 @@ static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
> 			dev_err(queue->ctrl->ctrl.device,
> 				"Bogus remote invalidation for rkey %#x\n",
> 				req->mr ? req->mr->rkey : 0);
> -			nvme_rdma_error_recovery(queue->ctrl);
> +			nvmf_error_recovery(&queue->ctrl->ctrl);
> 		}
> 	} else if (req->mr) {
> 		int ret;
> @@ -1739,7 +1730,7 @@ static void nvme_rdma_process_nvme_rsp(struct nvme_rdma_queue *queue,
> 			dev_err(queue->ctrl->ctrl.device,
> 				"Queueing INV WR for rkey %#x failed (%d)\n",
> 				req->mr->rkey, ret);
> -			nvme_rdma_error_recovery(queue->ctrl);
> +			nvmf_error_recovery(&queue->ctrl->ctrl);
> 		}
> 		/* the local invalidation completion will end the request */
> 		return;
> @@ -1766,7 +1757,7 @@ static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
> 	if (unlikely(wc->byte_len < len)) {
> 		dev_err(queue->ctrl->ctrl.device,
> 			"Unexpected nvme completion length(%d)\n", wc->byte_len);
> -		nvme_rdma_error_recovery(queue->ctrl);
> +		nvmf_error_recovery(&queue->ctrl->ctrl);
> 		return;
> 	}
> 
> @@ -1936,7 +1927,7 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
> 	case RDMA_CM_EVENT_TIMEWAIT_EXIT:
> 		dev_dbg(queue->ctrl->ctrl.device,
> 			"disconnect received - connection closed\n");
> -		nvme_rdma_error_recovery(queue->ctrl);
> +		nvmf_error_recovery(&queue->ctrl->ctrl);
> 		break;
> 	case RDMA_CM_EVENT_DEVICE_REMOVAL:
> 		/* device removal is handled via the ib_client API */
> @@ -1944,7 +1935,7 @@ static int nvme_rdma_cm_handler(struct rdma_cm_id *cm_id,
> 	default:
> 		dev_err(queue->ctrl->ctrl.device,
> 			"Unexpected RDMA CM event (%d)\n", ev->event);
> -		nvme_rdma_error_recovery(queue->ctrl);
> +		nvmf_error_recovery(&queue->ctrl->ctrl);
> 		break;
> 	}
> 
> @@ -2000,7 +1991,7 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
> 	 * LIVE state should trigger the normal error recovery which will
> 	 * handle completing this request.
> 	 */
> -	nvme_rdma_error_recovery(ctrl);
> +	nvmf_error_recovery(&ctrl->ctrl);
> 	return BLK_EH_RESET_TIMER;
> }
> 
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 07a9cc4f2274..fe1f2fec457b 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -479,15 +479,6 @@ static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
> 	queue->ddgst_remaining = 0;
> }
> 
> -static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
> -{
> -	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
> -		return;
> -
> -	dev_warn(ctrl->device, "starting error recovery\n");
> -	queue_work(nvme_reset_wq, &ctrl->err_work);
> -}
> -
> static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
> 		struct nvme_completion *cqe)
> {
> @@ -499,7 +490,7 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
> 		dev_err(queue->ctrl->ctrl.device,
> 			"got bad cqe.command_id %#x on queue %d\n",
> 			cqe->command_id, nvme_tcp_queue_id(queue));
> -		nvme_tcp_error_recovery(&queue->ctrl->ctrl);
> +		nvmf_error_recovery(&queue->ctrl->ctrl);
> 		return -EINVAL;
> 	}
> 
> @@ -541,7 +532,7 @@ static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
> 		dev_err(queue->ctrl->ctrl.device,
> 			"queue %d tag %#x SUCCESS set but not last PDU\n",
> 			nvme_tcp_queue_id(queue), rq->tag);
> -		nvme_tcp_error_recovery(&queue->ctrl->ctrl);
> +		nvmf_error_recovery(&queue->ctrl->ctrl);
> 		return -EPROTO;
> 	}
> 
> @@ -850,7 +841,7 @@ static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
> 			dev_err(queue->ctrl->ctrl.device,
> 				"receive failed:  %d\n", result);
> 			queue->rd_enabled = false;
> -			nvme_tcp_error_recovery(&queue->ctrl->ctrl);
> +			nvmf_error_recovery(&queue->ctrl->ctrl);
> 			return result;
> 		}
> 	}
> @@ -898,7 +889,7 @@ static void nvme_tcp_state_change(struct sock *sk)
> 	case TCP_LAST_ACK:
> 	case TCP_FIN_WAIT1:
> 	case TCP_FIN_WAIT2:
> -		nvme_tcp_error_recovery(&queue->ctrl->ctrl);
> +		nvmf_error_recovery(&queue->ctrl->ctrl);
> 		break;
> 	default:
> 		dev_info(queue->ctrl->ctrl.device,
> @@ -2252,7 +2243,7 @@ nvme_tcp_timeout(struct request *rq, bool reserved)
> 	 * LIVE state should trigger the normal error recovery which will
> 	 * handle completing this request.
> 	 */
> -	nvme_tcp_error_recovery(ctrl);
> +	nvmf_error_recovery(ctrl);
> 	return BLK_EH_RESET_TIMER;
> }
> 
> -- 
> 2.18.1
> 
> 

Reviewed-by: Himanshu Madhani <himanshu.madhani at oracle.com>

--
Himanshu Madhani	 Oracle Linux Engineering



