[PATCH] nvme-tcp: delay error recovery after link drop
John Meneghini
jmeneghi at redhat.com
Tue Sep 6 10:30:05 PDT 2022
I think the problem Hannes is trying to address with this patch is: the Spec currently says:
NVMe Base Specification 2.0, section 3.3.2.4:
If an NVMe Transport connection is lost as a result of an NVMe Transport error, then before performing
recovery actions related to commands sent on I/O queues associated with that NVMe Transport connection,
the host should wait for at least the longer of:
- the NVMe Keep Alive timeout; or
- the underlying fabric transport timeout, if any.
I'm not sure the NVMe/TCP host stack obeys this rule.
The problem is, when the host fails to follow this rule, some NVMe/TCP controllers are found to corrupt data during cable pull
tests.
/John
On 7/14/22 08:27, Hannes Reinecke wrote:
> When the connection unexpectedly closes we must not start error
> recovery right away, as the controller might not have registered
> the connection failure, and so retrying commands directly
> might lead to data corruption.
> So wait for KATO before starting error recovery to be on the safe
> side; chances are the commands will time out before that anyway.
>
> Signed-off-by: Hannes Reinecke <hare at suse.de>
> ---
> drivers/nvme/host/tcp.c | 37 +++++++++++++++++++++----------
> drivers/nvme/target/io-cmd-bdev.c | 18 +++++++++++++++
> 2 files changed, 43 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index a7848e430a5c..70096a2e8762 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -167,7 +167,7 @@ struct nvme_tcp_ctrl {
> struct sockaddr_storage src_addr;
> struct nvme_ctrl ctrl;
>
> - struct work_struct err_work;
> + struct delayed_work err_work;
> struct delayed_work connect_work;
> struct nvme_tcp_request async_req;
> u32 io_queues[HCTX_MAX_TYPES];
> @@ -522,13 +522,21 @@ static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
> queue->ddgst_remaining = 0;
> }
>
> -static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
> +static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl, bool delay)
> {
> + int tmo = delay ? (ctrl->kato ? ctrl->kato : NVME_DEFAULT_KATO) : 0;
> +
> if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
> return;
>
> - dev_warn(ctrl->device, "starting error recovery\n");
> - queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
> + if (tmo)
> + dev_warn(ctrl->device,
> + "delaying error recovery by %d seconds\n", tmo);
> + else
> + dev_warn(ctrl->device,
> + "starting error recovery\n");
> + queue_delayed_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work,
> + tmo * HZ);
> }
>
> static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
> @@ -542,7 +550,7 @@ static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
> dev_err(queue->ctrl->ctrl.device,
> "got bad cqe.command_id %#x on queue %d\n",
> cqe->command_id, nvme_tcp_queue_id(queue));
> - nvme_tcp_error_recovery(&queue->ctrl->ctrl);
> + nvme_tcp_error_recovery(&queue->ctrl->ctrl, false);
> return -EINVAL;
> }
>
> @@ -584,7 +592,7 @@ static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
> dev_err(queue->ctrl->ctrl.device,
> "queue %d tag %#x SUCCESS set but not last PDU\n",
> nvme_tcp_queue_id(queue), rq->tag);
> - nvme_tcp_error_recovery(&queue->ctrl->ctrl);
> + nvme_tcp_error_recovery(&queue->ctrl->ctrl, false);
> return -EPROTO;
> }
>
> @@ -895,7 +903,7 @@ static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
> dev_err(queue->ctrl->ctrl.device,
> "receive failed: %d\n", result);
> queue->rd_enabled = false;
> - nvme_tcp_error_recovery(&queue->ctrl->ctrl);
> + nvme_tcp_error_recovery(&queue->ctrl->ctrl, false);
> return result;
> }
> }
> @@ -943,7 +951,12 @@ static void nvme_tcp_state_change(struct sock *sk)
> case TCP_LAST_ACK:
> case TCP_FIN_WAIT1:
> case TCP_FIN_WAIT2:
> - nvme_tcp_error_recovery(&queue->ctrl->ctrl);
> + /*
> + * We cannot start error recovery right away, as we need
> + * to wait for KATO to trigger to give the controller a
> + * chance to detect the failure.
> + */
> + nvme_tcp_error_recovery(&queue->ctrl->ctrl, true);
> break;
> default:
> dev_info(queue->ctrl->ctrl.device,
> @@ -2171,7 +2184,7 @@ static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
> static void nvme_tcp_error_recovery_work(struct work_struct *work)
> {
> struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
> - struct nvme_tcp_ctrl, err_work);
> + struct nvme_tcp_ctrl, err_work.work);
> struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
>
> nvme_auth_stop(ctrl);
> @@ -2195,7 +2208,7 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
>
> static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
> {
> - cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work);
> + cancel_work_sync(&to_tcp_ctrl(ctrl)->err_work.work);
> cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
>
> nvme_tcp_teardown_io_queues(ctrl, shutdown);
> @@ -2355,7 +2368,7 @@ nvme_tcp_timeout(struct request *rq, bool reserved)
> * LIVE state should trigger the normal error recovery which will
> * handle completing this request.
> */
> - nvme_tcp_error_recovery(ctrl);
> + nvme_tcp_error_recovery(ctrl, false);
> return BLK_EH_RESET_TIMER;
> }
>
> @@ -2596,7 +2609,7 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
>
> INIT_DELAYED_WORK(&ctrl->connect_work,
> nvme_tcp_reconnect_ctrl_work);
> - INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
> + INIT_DELAYED_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
> INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);
>
> if (!(opts->mask & NVMF_OPT_TRSVCID)) {
> diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
> index 27a72504d31c..bbeeeeaedc9b 100644
> --- a/drivers/nvme/target/io-cmd-bdev.c
> +++ b/drivers/nvme/target/io-cmd-bdev.c
> @@ -157,6 +157,24 @@ u16 blk_to_nvme_status(struct nvmet_req *req, blk_status_t blk_sts)
> req->error_loc = offsetof(struct nvme_rw_command, nsid);
> break;
> case BLK_STS_IOERR:
> + switch (req->cmd->common.opcode) {
> + case nvme_cmd_read:
> + status = NVME_SC_READ_ERROR;
> + req->error_loc = offsetof(struct nvme_rw_command,
> + slba);
> + break;
> + case nvme_cmd_write:
> + status = NVME_SC_WRITE_FAULT;
> + req->error_loc = offsetof(struct nvme_rw_command,
> + slba);
> + break;
> + default:
> + status = NVME_SC_DATA_XFER_ERROR | NVME_SC_DNR;
> + req->error_loc = offsetof(struct nvme_common_command,
> + opcode);
> + break;
> + }
> + break;
> default:
> status = NVME_SC_INTERNAL | NVME_SC_DNR;
> req->error_loc = offsetof(struct nvme_common_command, opcode);
More information about the Linux-nvme
mailing list