nvme: restore use of blk_path_error() in nvme_complete_rq()

Meneghini, John John.Meneghini at netapp.com
Tue Aug 11 08:36:34 EDT 2020


Chao,

I don't see this patch on patchwork.

https://patchwork.kernel.org/project/linux-block/list/?submitter=173677

Can you upload this there.  I'd like to apply this patch to my sandbox and take a closer look.

Thanks,

/John

On 8/10/20, 4:10 AM, "Chao Leng" <lengchao at huawei.com> wrote:

    NetApp Security WARNING: This is an external email. Do not click links or open attachments unless you recognize the sender and know the content is safe.




    I can not agree with you more.
    The root cause of the conflict is REQ_FAILFAST_TRANSPORT.
    REQ_FAILFAST_TRANSPORT may be designed for scsi, because scsi protocol
    do not difine the local retry mechanism. SCSI implements a fuzzy local
    retry mechanism, so need the REQ_FAILFAST_TRANSPORT for multipath
    software, multipath software retry according error code is expected.
    nvme is different with scsi about this. It define local retry mechanism
    and path error code, so nvme should not care REQ_FAILFAST_TRANSPORT.

    Another, for nvme multipath, if the error code is not a path error,
    multipath will not fail over to retry. but maybe blk_queue_dying return
    true, IO can not be retry at current path, thus IO will interrupted.
    blk_queue_dying and path error both need fail over to retry.

    So we can do like this:
    ---
      drivers/nvme/host/core.c      | 26 +++++++++++++++++++-------
      drivers/nvme/host/multipath.c | 11 +++--------
      drivers/nvme/host/nvme.h      |  5 ++---
      include/linux/nvme.h          |  9 +++++++++
      4 files changed, 33 insertions(+), 18 deletions(-)

    diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
    index 4ee2330c603e..07471bd37f60 100644
    --- a/drivers/nvme/host/core.c
    +++ b/drivers/nvme/host/core.c
    @@ -243,7 +243,7 @@ static blk_status_t nvme_error_status(u16 status)

      static inline bool nvme_req_needs_retry(struct request *req)
      {
    -       if (blk_noretry_request(req))
    +       if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_FAILFAST_DRIVER))
                    return false;
            if (nvme_req(req)->status & NVME_SC_DNR)
                    return false;
    @@ -252,6 +252,14 @@ static inline bool nvme_req_needs_retry(struct request *req)
            return true;
      }

    +static inline bool nvme_req_path_error(struct request *req)
    +{
    +       if ((nvme_req(req)->status & NVME_SCT_MASK) == NVME_SCT_PATH ||
    +               blk_queue_dying(req->q))
    +               return true;
    +       return false;
    +}
    +
      static void nvme_retry_req(struct request *req)
      {
            struct nvme_ns *ns = req->q->queuedata;
    @@ -270,7 +278,7 @@ static void nvme_retry_req(struct request *req)

      void nvme_complete_rq(struct request *req)
      {
    -       blk_status_t status = nvme_error_status(nvme_req(req)->status);
    +       blk_status_t status;

            trace_nvme_complete_rq(req);

    @@ -279,16 +287,20 @@ void nvme_complete_rq(struct request *req)
            if (nvme_req(req)->ctrl->kas)
                    nvme_req(req)->ctrl->comp_seen = true;

    -       if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
    -               if ((req->cmd_flags & REQ_NVME_MPATH) && nvme_failover_req(req))
    -                       return;
    -
    -               if (!blk_queue_dying(req->q)) {
    +       if (unlikely(nvme_req(req)->status != NVME_SC_SUCCESS &&
    +               nvme_req_needs_retry(req))) {
    +               if (nvme_req_path_error(req)) {
    +                       if (req->cmd_flags & REQ_NVME_MPATH) {
    +                               nvme_failover_req(req);
    +                               return;
    +                       }
    +               } else {
                            nvme_retry_req(req);
                            return;
                    }
            }

    +       status = nvme_error_status(nvme_req(req)->status);
            nvme_trace_bio_complete(req, status);
            blk_mq_end_request(req, status);
      }
    diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
    index 66509472fe06..e182fb3bcd0c 100644
    --- a/drivers/nvme/host/multipath.c
    +++ b/drivers/nvme/host/multipath.c
    @@ -65,7 +65,7 @@ void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
            }
      }

    -bool nvme_failover_req(struct request *req)
    +void nvme_failover_req(struct request *req)
      {
            struct nvme_ns *ns = req->q->queuedata;
            u16 status = nvme_req(req)->status;
    @@ -90,17 +90,13 @@ bool nvme_failover_req(struct request *req)
                            queue_work(nvme_wq, &ns->ctrl->ana_work);
                    }
                    break;
    -       case NVME_SC_HOST_PATH_ERROR:
    -       case NVME_SC_HOST_ABORTED_CMD:
    +       default:
                    /*
    -                * Temporary transport disruption in talking to the controller.
    +                * Normal error path.
                     * Try to send on a new path.
                     */
                    nvme_mpath_clear_current_path(ns);
                    break;
    -       default:
    -               /* This was a non-ANA error so follow the normal error path. */
    -               return false;
            }

            spin_lock_irqsave(&ns->head->requeue_lock, flags);
    @@ -109,7 +105,6 @@ bool nvme_failover_req(struct request *req)
            blk_mq_end_request(req, 0);

            kblockd_schedule_work(&ns->head->requeue_work);
    -       return true;
      }

      void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
    diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
    index 09ffc3246f60..cbb5d4ba6241 100644
    --- a/drivers/nvme/host/nvme.h
    +++ b/drivers/nvme/host/nvme.h
    @@ -582,7 +582,7 @@ void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys);
      void nvme_mpath_start_freeze(struct nvme_subsystem *subsys);
      void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
                            struct nvme_ctrl *ctrl, int *flags);
    -bool nvme_failover_req(struct request *req);
    +void nvme_failover_req(struct request *req);
      void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
      int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
      void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id);
    @@ -640,9 +640,8 @@ static inline void nvme_set_disk_name(char *disk_name, struct nvme_ns *ns,
            sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->head->instance);
      }

    -static inline bool nvme_failover_req(struct request *req)
    +static inline void nvme_failover_req(struct request *req)
      {
    -       return false;
      }
      static inline void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
      {
    diff --git a/include/linux/nvme.h b/include/linux/nvme.h
    index 5ce51ab4c50e..8c4a5b4d5b4d 100644
    --- a/include/linux/nvme.h
    +++ b/include/linux/nvme.h
    @@ -1441,6 +1441,15 @@ enum {
            NVME_SC_DNR                     = 0x4000,
      };

    +#define NVME_SCT_MASK 0x700
    +enum {
    +       NVME_SCT_GENERIC = 0,
    +       NVME_SCT_COMMAND_SPECIFIC = 0x100,
    +       NVME_SCT_MEDIA = 0x200,
    +       NVME_SCT_PATH = 0x300,
    +       NVME_SCT_VENDOR = 0x700
    +};
    +
      struct nvme_completion {
            /*
             * Used by Admin and Fabrics commands to return data:
    --
    2.16.4


    On 2020/8/9 5:08, Meneghini, John wrote:
    > I'd like to up level this whole conversation for a minute by talking about exactly what ACRE does.
    >
    > The genesis of the changes discussed in this thread is NVMe TP-4033 - Enhanced Command Retry.  You can find a copy of this TP here:
    >
    > http://nvmexpress.org/wp-content/uploads/NVM-Express-1.3-Ratified-TPs.zip
    >
    > This technical proposal added a command retry delay feature which is programmed by the controller. The controller advertises a set of 3 different timing delays though the Identify Controller data structure CRDT{1-2} fields.  To make use of these delay fields a new CRD field was added to the CQE Status Field.  This allows the NVMe controller to specify exactly how long a command retry should be delayed, with 3 possible timers that it chooses and controls.  CRDTs can range from 100 milliseconds to 6559 seconds.  Because this capability can have such a radical effect on backwards compatibility a new NVMe Feature Identifier was added (Host Behavior Support - Feature ID 16h) with an Advanced Command Retry Enable (ACRE) bit.  This allows the host to enable or disable the feature.
    >
    > With this background there are a couple of misconceptions in this thread which I'd like to address.
    >
    > The first is: ACRE has nothing to do with the NVME_SC_CMD_INTERRUPTED status.  Yes, this new error status was added as a part of TP-4033 but the CRD field of the CQE status can be set by the controller with *any* NVMe error status. As long as the DNR bit is not set the Command Retry Delay can come into effect. This is how the spec is written and this is exactly how it has been implemented in the core nvme_complete_rq() function
    > (after change 507fe46ac91276120). For example, the controller can return NVME_SC_NS_NOT_READY with a CRDT of 2 seconds.^  So CDRT needs to be supported with all error status if the host is going to enable ACRE, and I think it's a big mistake to get hung up over the NVME_SC_CMD_INTERRUPTED status translation. The NVME_SC_CMD_INTERRUPTED status was only added to the spec to provide a general purpose "busy" status, something that was missing from NVMe, and focusing your solution on NVME_SC_CMD_INTERRUPTED, or any other specific NVMe error status, is the wrong thing to do.  There is a much larger change in error semantics going on with ACRE than just this single error.
    >
    > The second is:  All NVMe error status that do not have a Status Code Type of 3h (Path Related Status) are subsystem scoped. This is a topic that has gone through some debate on the linux-nvme mailing list and at NVMexpress.org; and there have been some ECNs to the spec to address this. There may be some exceptions to this rule because there are always implementations out there that may not follow, and there are bugs in the spec.  However, this is the intention of the NVMe spec and it matters. This means that, in a multi-pathing environment, retrying any command on a different path will not provide a different result. Retries should all occur on the same controller - unless it is a path related status.  This is how NVMe error semantics work and this is a part of what was behind Keith's patch .
    >
    > https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=35038bffa87da
    >
    > Retrying NVME_SC_NS_NOT_READY or NVME_SC_CMD_INTERRUPTED on another path is simply not the right thing to do, and returning BLK_STS_TARGET after all command retries, with CRDT, have been exhausted communicates the right thing to the upper layer. From the perspective of nvme-multipathing Keith's patch was exactly the correct thing to do.  I understand that this may have caused a backwards compatibly problem with dm-multipath, and that's the only reason why I've talked about backing it out.  However, ultimately, I think nvme-core should return an error status like  BLK_STS_TARGET that says, semantically - the IO has failed, no retry will work - because this is what the NVMe error semantics are.
    >
    > Taken together both of these facts about the NVMe protocol semantics are what's behind my patch which removed blk_path_error() from nvme_complete_rq()
    >
    > https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/commit/?id=764e9332098c0e60251386a507fe46ac91276120
    >
    > I understand that there is a goal to try and avoid having different failure/recovery handling semantically in response to different error status between nvme-multipath and dm-multipath, but NVMe error semantics are truly different from SCSI error semantics, and they are changing. The Linux host needs to enable and support those changes unhampered by the past. With this goal in mind, removing the blk_path_error() code from nvme-core was the right thing to do.  Hannes and I struggled with the patch to try and make it work with blk_path_error() for weeks.  As pointed out in the thread below, blk_path_error() is the SCSI multipathing logic and we can't use it in nvme_complete_rq().  All it does is import all of the legacy problems of dm-multipath, and of SCSI, into the nvme completion/multipath logic.
    >
    > At NVMexpress.org we consciously added the ACRE feature because the SCSI protocol had no such capability.  This is something which has plagued SCSI implementations for years,  and all kinds of tricks have been played, in both the SCSI host stack and in SCSI target stack, to deal with the problem. The goal of NVMe is to create a better block storage protocol and ACRE is just one example of many places where the industry is trying to do this.  There are plans to introduce more Host Behavior Support features in the future.
    >
    > In the end, we are consciously changing NVMe, both in the spec and in its implementation, to make it different from SCSI. I think this is what's at the bottom of the changes discussed in this thread, and this is why so many people are so passionate about this.  We don't want to turn NVMe into SCSI.  I know I don't want to.
    >
    > /John
    >
    > ^Note: (maybe a 2 second delay sounds unreasonable for a flash storage device but there implementors that that want to use NVMe with spinning disks... so the CDRT gives them exactly what they need).
    >
    > On 8/7/20, 7:35 PM, "Sagi Grimberg" <sagi at grimberg.me> wrote:
    >
    >      >> Hey Mike,
    >      >>
    >      >>>> The point is: blk_path_error() has nothing to do with NVMe errors.
    >      >>>> This is dm-multipath logic stuck in the middle of the NVMe error
    >      >>>> handling code.
    >      >>>
    >      >>> No, it is a means to have multiple subsystems (to this point both SCSI
    >      >>> and NVMe) doing the correct thing when translating subsystem specific
    >      >>> error codes to BLK_STS codes.
    >      >>
    >      >> Not exactly, don't find any use of this in scsi. The purpose is to make
    >      >> sure that nvme and dm speak the same language.
    >      >
    >      > SCSI doesn't need to do additional work to train a multipath layer
    >      > because dm-multipath _is_ SCSI's multipathing in Linux.
    >
    >      Agree.
    >
    >      > So ensuring SCSI properly classifies its error codes happens as a
    >      > side-effect of ensuring continued multipath functionality.
    >      >
    >      > Hannes introduced all these differentiated error codes in block core
    >      > because of SCSI.  NVMe is meant to build on the infrastructure that was
    >      > established.
    >
    >      Yes, exactly my point. blk_path_error is designed to make nvme and
    >      dm-multipath speak the same language.
    >
    >      >>> If you, or others you name drop below, understood the point we wouldn't
    >      >>> be having this conversation.  You'd accept the point of blk_path_error()
    >      >>> to be valid and required codification of what constitutes a retryable
    >      >>> path error for the Linux block layer.
    >      >>
    >      >> This incident is a case where the specific nvme status was designed
    >      >> to retry on the same path respecting the controller retry delay.
    >      >> And because nvme used blk_path_error at the time it caused us to use a
    >      >> non-retryable status to get around that. Granted, no one had
    >      >> dm-multipath in mind.
    >      >>
    >      >> So in a sense, there is consensus on changing patch 35038bffa87da
    >      >> _because_ nvme no longer uses blk_path_error. Otherwise it would break.
    >      >
    >      > "break" meaning it would do failover instead of the more optimal local
    >      > retry to the same controller.
    >      >
    >      > I see.  Wish the header for commit 35038bffa87da touched on this even a
    >      > little bit ;)
    >
    >      I think it did, but maybe didn't put too much emphasis on it.
    >
    >      > But AFAICT the patch I provided doesn't compromise proper local retry --
    >      > as long as we first fix nvme_error_status() to return a retryable
    >      > BLK_STS for NVME_SC_CMD_INTERRUPTED -- which I assumed as a prereq.
    >      >
    >      > Think of blk_path_error() as a more coarse-grained "is this retryable or
    >      > a hard failure?" check.  So for NVME_SC_CMD_INTERRUPTED,
    >      > nvme_error_status() _should_ respond with something retryable (I'd
    >      > prefer BLK_STS_RESOURCE to be honest).
    >
    >      But blk_path_error semantically mean "is this a pathing error", or at
    >      least that what its name suggest.
    >
    >      > And then nvme_failover_req() is finer-grained; by returning false it now
    >      > allows short-circuiting failover and reverting back to NVMe's normal
    >      > controller based error recovery -- which it does for
    >      > NVME_SC_CMD_INTERRUPTED due to "default" case in nvme_failover_req().
    >      >
    >      > And then the previous nvme_error_status() classification of retryable
    >      > BLK_STS obviously never gets returned up the IO stack; it gets thrown
    >      > away.
    >
    >      I see what you are saying. The issue is that the code becomes
    >      convoluted (it's a pathing error, oh wait, no its not a pathing error).
    >
    >      >>> Any BLK_STS mapping of NVMe specific error codes would need to not screw
    >      >>> up by categorizing a retryable error as non-retryable (and vice-versa).
    >      >>
    >      >> But it is a special type of retryable. There is nothing that fits the
    >      >> semantics of the current behavior.
    >      >
    >      > I agree.  But that's fine actually.
    >      >
    >      > And this issue is the poster-child for why properly supporting a duality
    >      > of driver-level vs upper level multipathing capabilities is pretty much
    >      > impossible unless a clean design factors out the different error classes
    >      > -- and local error retry is handled before punting to higher level
    >      > failover retry.  Think if NVMe were to adopt a bit more disciplined
    >      > "local then failover" error handling it all gets easier.
    >
    >      I don't think punting before is easier, because we do a local retry if:
    >      - no multipathing sw on top
    >      - request needs retry (e.g. no DNR, notretry is off etc..)
    >      - nvme error is not pathing related (nvme_failover_req returned false)
    >
    >      > This local retry _is_ NVMe specific.  NVMe should just own retrying on
    >      > the same controller no matter what (I'll hope that such retry has
    >      > awareness to not retry indefinitely though!).
    >
    >      it will retry until the retry limit.
    >
    >      >  And this has nothing to
    >      > do with multipathing, so the logic to handle it shouldn't be trapped in
    >      > nvme_failover_req().
    >
    >      Well given that nvme_failover_req already may not actually failover this
    >      makes some sense to me (although I did have some resistance to make it
    >      that way in the first place, but was convinced otherwise).
    >
    >      > I think NVMe can easily fix this by having an earlier stage of checking,
    >      > e.g. nvme_local_retry_req(), that shortcircuits ever getting to
    >      > higher-level multipathing consideration (be it native NVMe or DM
    >      > multipathing) for cases like NVME_SC_CMD_INTERRUPTED.
    >      > To be clear: the "default" case of nvme_failover_req() that returns
    >      > false to fallback to NVMe's "local" normal NVMe error handling -- that
    >      > can stay.. but a more explicit handling of cases like
    >      > NVME_SC_CMD_INTERRUPTED should be added to a nvme_local_retry_req()
    >      > check that happens before nvme_failover_req() in nvme_complete_rq().
    >
    >      I don't necessarily agree with having a dedicated nvme_local_retry_req().
    >      a request that isn't failed over, goes to local error handling (retry or
    >      not). I actually think that just adding the condition to
    >      nvme_complete_req and having nvme_failover_req reject it would work.
    >
    >      Keith?
    >
    >      >>> Anyway, no new BLK_STS is needed at this point.  More discipline with
    >      >>> how NVMe's error handling is changed is.
    >      >>
    >      >> Please read the above.
    >      >
    >      > I agree we'd need a new BLK_STS only if NVMe cannot be made to trap
    >      > NVME_SC_CMD_INTERRUPTED for local retry _before_ considering path
    >      > failover.
    >
    >      Not sure that is better, but we can see a patch first to determine.
    >
    >      >>> If NVMe wants to ensure its
    >      >>> interface isn't broken regularly it _should_ use blk_path_error() to
    >      >>> validate future nvme_error_status() changes.  Miscategorizing NVMe
    >      >>> errors to upper layers is a bug -- not open for debate.
    >      >>
    >      >> Again, don't agree is a Miscategorization nor a bug, its just something
    >      >> that is NVMe specific.
    >      >
    >      > Right I understand.
    >      >
    >      > Think it safe to assume these types of details are why Christoph wanted
    >      > to avoid the notion of native NVMe and DM multipathing having
    >      > compatible error handling.  There was some wisdom with that position
    >      > (especially with native NVMe goals in mind).  But if things are tweaked
    >      > slightly then both camps _can_ be made happy.
    >      >
    >      > There just needs to be a willingness to work through the details,
    >      > defensiveness needs to be shed on both sides, so constructive
    >      > review/consideration of problems can happen.
    >
    >      Agreed.
    >
    >      > Think that has already
    >      > happened here with our exchange.  I'm open to investing effort here if
    >      > others are up for humoring my attempt to explore fixing the issues in a
    >      > mutually beneficial way.  What's the worst that can happen?  My simple
    >      > patches might continue to be ignored? ;)
    >
    >      I won't ignore it, and I apologize of ignoring the original patch
    >      posted, I guess it flew under the radar...
    >



More information about the Linux-nvme mailing list