[PATCH 2/3] nvme: refactor command completion
Sagi Grimberg
sagi at grimberg.me
Fri Aug 14 14:37:14 EDT 2020
On 8/14/20 8:15 AM, Christoph Hellwig wrote:
> Lift all the code to decide the disposition of a completed command
> from nvme_complete_rq and nvme_failover_req into a new helper, which
> returns an enum of the potential actions. nvme_complete_rq then
> just switches on those and calls the proper helper for the action.
>
> Signed-off-by: Christoph Hellwig <hch at lst.de>
> ---
>  drivers/nvme/host/core.c      | 76 ++++++++++++++++++++++-------------
>  drivers/nvme/host/multipath.c | 47 ++++++----------------
>  drivers/nvme/host/nvme.h      | 31 ++++++++++++--
>  3 files changed, 90 insertions(+), 64 deletions(-)
>
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 88cff309d8e4f0..8d474adad721fb 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -241,17 +241,6 @@ static blk_status_t nvme_error_status(u16 status)
>  	}
>  }
>
> -static inline bool nvme_req_needs_retry(struct request *req)
> -{
> -	if (blk_noretry_request(req))
> -		return false;
> -	if (nvme_req(req)->status & NVME_SC_DNR)
> -		return false;
> -	if (nvme_req(req)->retries >= nvme_max_retries)
> -		return false;
> -	return true;
> -}
> -
>  static void nvme_retry_req(struct request *req)
>  {
>  	struct nvme_ns *ns = req->q->queuedata;
> @@ -268,34 +257,67 @@ static void nvme_retry_req(struct request *req)
>  	blk_mq_delay_kick_requeue_list(req->q, delay);
>  }
>
> -void nvme_complete_rq(struct request *req)
> +enum nvme_disposition {
> +	COMPLETE,
> +	RETRY,
> +	FAILOVER,
> +};
> +
> +static inline enum nvme_disposition nvme_decide_disposition(struct request *req)
>  {
> -	blk_status_t status = nvme_error_status(nvme_req(req)->status);
> +	u16 status = nvme_req(req)->status & 0x7ff;
>
> -	trace_nvme_complete_rq(req);
> +	if (likely(status == 0))
> +		return COMPLETE;
>
> -	nvme_cleanup_cmd(req);
> +	if (blk_noretry_request(req) || (nvme_req(req)->status & NVME_SC_DNR) ||
> +	    nvme_req(req)->retries >= nvme_max_retries)
> +		return COMPLETE;
>
> -	if (nvme_req(req)->ctrl->kas)
> -		nvme_req(req)->ctrl->comp_seen = true;
> +	if (req->cmd_flags & REQ_NVME_MPATH) {
> +		if (nvme_is_path_error(status))
> +			return FAILOVER;
> +	}
>
> -	if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
> -		if ((req->cmd_flags & REQ_NVME_MPATH) && nvme_failover_req(req))
> -			return;
> +	if (blk_queue_dying(req->q))
> +		return COMPLETE;
>
> -		if (!blk_queue_dying(req->q)) {
> -			nvme_retry_req(req);
> -			return;
> -		}
> -	} else if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> -		   req_op(req) == REQ_OP_ZONE_APPEND) {
> +	return RETRY;
> +}
> +
> +static inline void nvme_end_req(struct request *req)
> +{
> +	blk_status_t status = nvme_error_status(nvme_req(req)->status);
> +
> +	if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
> +	    req_op(req) == REQ_OP_ZONE_APPEND)
>  		req->__sector = nvme_lba_to_sect(req->q->queuedata,
>  			le64_to_cpu(nvme_req(req)->result.u64));
> -	}
>
>  	nvme_trace_bio_complete(req, status);
>  	blk_mq_end_request(req, status);
>  }
> +
> +void nvme_complete_rq(struct request *req)
> +{
> +	trace_nvme_complete_rq(req);
> +	nvme_cleanup_cmd(req);
> +
> +	if (nvme_req(req)->ctrl->kas)
> +		nvme_req(req)->ctrl->comp_seen = true;
> +
> +	switch (nvme_decide_disposition(req)) {
> +	case COMPLETE:
> +		nvme_end_req(req);
> +		return;
> +	case RETRY:
> +		nvme_retry_req(req);
> +		return;
> +	case FAILOVER:
> +		nvme_failover_req(req);
> +		return;
> +	}
> +}
>  EXPORT_SYMBOL_GPL(nvme_complete_rq);
>
>  bool nvme_cancel_request(struct request *req, void *data, bool reserved)
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 3ded54d2c9c6ad..abc5bcf7209506 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -65,51 +65,30 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
>  	}
>  }
>
> -bool nvme_failover_req(struct request *req)
> +void nvme_failover_req(struct request *req)
>  {
>  	struct nvme_ns *ns = req->q->queuedata;
> -	u16 status = nvme_req(req)->status;
> +	u16 status = nvme_req(req)->status & 0x7ff;
>  	unsigned long flags;
>
> -	switch (status & 0x7ff) {
> -	case NVME_SC_ANA_TRANSITION:
> -	case NVME_SC_ANA_INACCESSIBLE:
> -	case NVME_SC_ANA_PERSISTENT_LOSS:
> -		/*
> -		 * If we got back an ANA error we know the controller is alive,
> -		 * but not ready to serve this namespaces. The spec suggests
> -		 * we should update our general state here, but due to the fact
> -		 * that the admin and I/O queues are not serialized that is
> -		 * fundamentally racy. So instead just clear the current path,
> -		 * mark the the path as pending and kick of a re-read of the ANA
> -		 * log page ASAP.
> -		 */
> -		nvme_mpath_clear_current_path(ns);
> -		if (ns->ctrl->ana_log_buf) {
> -			set_bit(NVME_NS_ANA_PENDING, &ns->flags);
> -			queue_work(nvme_wq, &ns->ctrl->ana_work);
> -		}
> -		break;
> -	case NVME_SC_HOST_PATH_ERROR:
> -	case NVME_SC_HOST_ABORTED_CMD:
> -		/*
> -		 * Temporary transport disruption in talking to the controller.
> -		 * Try to send on a new path.
> -		 */
> -		nvme_mpath_clear_current_path(ns);
> -		break;
> -	default:
> -		/* This was a non-ANA error so follow the normal error path. */
> -		return false;
> +	nvme_mpath_clear_current_path(ns);
> +
> +	/*
> +	 * If we got back an ANA error, we know the controller is alive but not
> +	 * ready to serve this namespace. Kick off a re-read of the ANA
> +	 * information page, and just try any other available path for now.
> +	 */
> +	if (nvme_is_ana_error(status) && ns->ctrl->ana_log_buf) {
> +		set_bit(NVME_NS_ANA_PENDING, &ns->flags);
> +		queue_work(nvme_wq, &ns->ctrl->ana_work);
>  	}
>
>  	spin_lock_irqsave(&ns->head->requeue_lock, flags);
>  	blk_steal_bios(&ns->head->requeue_list, req);
>  	spin_unlock_irqrestore(&ns->head->requeue_lock, flags);
> -	blk_mq_end_request(req, 0);
>
> +	blk_mq_end_request(req, 0);
>  	kblockd_schedule_work(&ns->head->requeue_work);
> -	return true;
>  }
>
>  void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 6d69cc7306d034..dfcdeb318f3ab6 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -523,6 +523,32 @@ static inline u32 nvme_bytes_to_numd(size_t len)
>  	return (len >> 2) - 1;
>  }
>
> +static inline bool nvme_is_ana_error(u16 status)
> +{
> +	switch (status & 0x7ff) {
> +	case NVME_SC_ANA_TRANSITION:
> +	case NVME_SC_ANA_INACCESSIBLE:
> +	case NVME_SC_ANA_PERSISTENT_LOSS:
> +		return true;
> +	default:
> +		return false;
> +	}
> +}
> +
> +static inline bool nvme_is_path_error(u16 status)
> +{
> +	switch (status & 0x7ff) {
> +	case NVME_SC_HOST_PATH_ERROR:
> +	case NVME_SC_HOST_ABORTED_CMD:
> +	case NVME_SC_ANA_TRANSITION:
> +	case NVME_SC_ANA_INACCESSIBLE:
> +	case NVME_SC_ANA_PERSISTENT_LOSS:
> +		return true;
> +	default:
> +		return false;
> +	}
> +}
I think it would be better to reuse nvme_is_ana_error here, so we
don't need to maintain the ANA status codes in both places:
static inline bool nvme_is_path_error(u16 status)
{
	if (nvme_is_ana_error(status))
		return true;

	switch (status & 0x7ff) {
	case NVME_SC_HOST_PATH_ERROR:
	case NVME_SC_HOST_ABORTED_CMD:
		return true;
	default:
		return false;
	}
}
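That way a new ANA status code only has to be added in one spot. As an
aside, all of these codes share the 'path related status' status code
type (3h), so a sketch of an even simpler check, assuming we are fine
with failing over on any path-related status, could be:

static inline bool nvme_is_path_error(u16 status)
{
	/* check for a status code type of 'path related status' */
	return (status & 0x700) == 0x300;
}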
Otherwise looks good.