[PATCH] nvme-tcp: delay error recovery after link drop
John Meneghini
jmeneghi at redhat.com
Tue Sep 6 10:30:05 PDT 2022
I think the problem Hannes is trying to address with this patch is: the Spec currently says:
NVMe Base Specification 2.0, section 3.3.2.4:
If an NVMe Transport connection is lost as a result of an NVMe Transport error, then before performing
recovery actions related to commands sent on I/O queues associated with that NVMe Transport connection,
the host should wait for at least the longer of:
- the NVMe Keep Alive timeout; or
- the underlying fabric transport timeout, if any.
I'm not sure the NVMe/TCP host stack obeys this rule.
The problem is, when the host fails to follow this rule, some NVMe/TCP controllers are found to corrupt data during cable pull
tests.
/John
On 7/14/22 08:27, Hannes Reinecke wrote:
> When the connection unexpectedly closes we must not start error
> recovery right away, as the controller might not have registered
> the connection failure, and so retrying commands directly
> might lead to data corruption.
> So wait for KATO before starting error recovery to be on the safe
> side; chances are the commands will time out before that anyway.
>
> Signed-off-by: Hannes Reinecke <hare at suse.de>
> ---
> drivers/nvme/host/tcp.c | 37 +++++++++++++++++++++----------
> drivers/nvme/target/io-cmd-bdev.c | 18 +++++++++++++++
> 2 files changed, 43 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index a7848e430a5c..70096a2e8762 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -167,7 +167,7 @@ struct nvme_tcp_ctrl {
> struct sockaddr_storage src_addr;
> struct nvme_ctrl ctrl;
>
> - struct work_struct err_work;
> + struct delayed_work err_work;
> struct delayed_work connect_work;
> struct nvme_tcp_request async_req;
> u32 io_queues[HCTX_MAX_TYPES];
> @@ -522,13 +522,21 @@ static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
> queue->ddgst_remaining = 0;
> }
>
> -static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
> +static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl, bool delay)
> {
> + int tmo = delay ? (ctrl->kato ? ctrl->kato : NVME_DEFAULT_KATO) : 0;
> +
> if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
> return;
>
> - dev_warn(ctrl->device, "starting error recovery\n");
> - queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
> + if (tmo)
> + dev_warn(ctrl->device,
> + "delaying error recovery by %d seconds\n", tmo);
> + else
> + dev_warn(ctrl->device,
> + "starting error recovery\n");
> + queue_delayed_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work,
> + tmo * HZ);
> }
>
> static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
> @@ -542,7 +550,7 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
> dev_err(queue->ctrl->ctrl.device,
> "got bad cqe.command_id %#x on queue %d\n",
> cqe->command_id, nvme_tcp_queue_id(queue));
> - nvme_tcp_error_recovery(&queue->ctrl->ctrl);
> + nvme_tcp_error_recovery(&queue->ctrl->ctrl, false);
> return -EINVAL;
> }
>
> @@ -584,7 +592,7 @@ static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
> dev_err(queue->ctrl->ctrl.device,
> "queue %d tag %#x SUCCESS set but not last PDU\n",
> nvme_tcp_queue_id(queue), rq->tag);
> - nvme_tcp_error_recovery(&queue->ctrl->ctrl);
> + nvme_tcp_error_recovery(&queue->ctrl->ctrl, false);
> return -EPROTO;
> }
>
> @@ -895,7 +903,7 @@ static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
> dev_err(queue->ctrl->ctrl.device,
> "receive failed: %d\n", result);
> queue->rd_enabled = false;
> - nvme_tcp_error_recovery(&queue->ctrl->ctrl);
> + nvme_tcp_error_recovery(&queue->ctrl->ctrl, false);
> return result;
> }
> }
> @@ -943,7 +951,12 @@ static void nvme_tcp_state_change(struct sock *sk)
> case TCP_LAST_ACK:
> case TCP_FIN_WAIT1:
> case TCP_FIN_WAIT2:
> - nvme_tcp_error_recovery(&queue->ctrl->ctrl);
> + /*
> + * We cannot start error recovery right away, as we need
> + * to wait for KATO to trigger to give the controller a
> + * chance to detect the failure.
> + */
> + nvme_tcp_error_recovery(&queue->ctrl->ctrl, true);
> break;
> default:
> dev_info(queue->ctrl->ctrl.device,
> @@ -2171,7 +2184,7 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
> static void nvme_tcp_error_recovery_work(struct work_struct *work)
> {
> struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
> - struct nvme_tcp_ctrl, err_work);
> + struct nvme_tcp_ctrl, err_work.work);
> struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
>
> nvme_auth_stop(ctrl);
> @@ -2195,7 +2208,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
>
> static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
> {
> - cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
> + cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work.work);
> cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
>
> nvme_tcp_teardown_io_queues(ctrl, shutdown);
> @@ -2355,7 +2368,7 @@ nvme_tcp_timeout(struct request *rq, bool reserved)
> * LIVE state should trigger the normal error recovery which will
> * handle completing this request.
> */
> - nvme_tcp_error_recovery(ctrl);
> + nvme_tcp_error_recovery(ctrl, false);
> return BLK_EH_RESET_TIMER;
> }
>
> @@ -2596,7 +2609,7 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
>
> INIT_DELAYED_WORK(&ctrl->connect_work,
> nvme_tcp_reconnect_ctrl_work);
> - INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
> + INIT_DELAYED_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
> INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
>
> if (!(opts->mask & NVMF_OPT_TRSVCID)) {
> diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
> index 27a72504d31c..bbeeeeaedc9b 100644
> --- a/drivers/nvme/target/io-cmd-bdev.c
> +++ b/drivers/nvme/target/io-cmd-bdev.c
> @@ -157,6 +157,24 @@ u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts)
> req->error_loc = offsetof(struct nvme_rw_command, nsid);
> break;
> case BLK_STS_IOERR:
> + switch (req->cmd->common.opcode) {
> + case nvme_cmd_read:
> + status = NVME_SC_READ_ERROR;
> + req->error_loc = offsetof(struct nvme_rw_command,
> + slba);
> + break;
> + case nvme_cmd_write:
> + status = NVME_SC_WRITE_FAULT;
> + req->error_loc = offsetof(struct nvme_rw_command,
> + slba);
> + break;
> + default:
> + status = NVME_SC_DATA_XFER_ERROR | NVME_SC_DNR;
> + req->error_loc = offsetof(struct nvme_common_command,
> + opcode);
> + break;
> + }
> + break;
> default:
> status = NVME_SC_INTERNAL | NVME_SC_DNR;
> req->error_loc = offsetof(struct nvme_common_command, opcode);
More information about the Linux-nvme
mailing list