NVMeoF: multipath stuck after bringing one ethernet port down
shahar.salzman
shahar.salzman at gmail.com
Mon Jun 5 00:11:30 PDT 2017
I tested the patch, and it works great.
IO (dd), "multipath -ll", and "nvme list" all return instantaneously
with an IO error, and multipath is reinstated as soon as the path is reconnected.
Sagi, thanks for the fix!
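
For anyone skimming the thread: the instant IO error comes from the new
return-code split in nvme_rdma_queue_is_ready() in the patch quoted below.
A condensed sketch of that decision (nvme_rdma_rq_is_connect() is a
hypothetical stand-in for the passthrough/Connect-command check in the diff;
this is a paraphrase, not the literal hunk):

static inline int nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue,
		struct request *rq)
{
	if (likely(test_bit(NVME_RDMA_Q_LIVE, &queue->flags)))
		return 0;	/* queue is live, dispatch normally */

	/* only the fabrics Connect command may pass on a non-live queue */
	if (nvme_rdma_rq_is_connect(rq))	/* hypothetical helper */
		return 0;

	/*
	 * While the controller is RECONNECTING, fail new IO with -EIO so
	 * upper layers (e.g. dm-multipath) see the error immediately and
	 * can switch paths; otherwise -EAGAIN lets the block layer retry
	 * once the queue becomes live again.
	 */
	if (queue->ctrl->ctrl.state == NVME_CTRL_RECONNECTING)
		return -EIO;
	return -EAGAIN;
}
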
On 05/30/2017 05:17 PM, Sagi Grimberg wrote:
>
>> Hi guys,
>> this is a known issue in the upstream code and in the Mellanox OFED code as well.
>> I agree with Sagi's approach for future issues when using our package.
>> For this one, we will test the proposed fixes and update you regarding
>> the results.
>
> You can try:
>
> --
> [PATCH] nvme-rdma: fast fail incoming requests while we reconnect
>
> When we encounter a transport/controller error, error recovery
> kicks in, which performs:
> 1. stops the io/admin queues
> 2. moves the transport queues out of LIVE state
> 3. fast fails pending io
> 4. schedules periodic reconnects.
>
> But we also need to fast fail incoming IO that arrives after error
> recovery has already been scheduled. Given that our queue is not LIVE
> anymore, simply restart the request queues to fail in .queue_rq.
>
> Signed-off-by: Sagi Grimberg <sagi at grimberg.me>
> ---
> drivers/nvme/host/rdma.c | 37 ++++++++++++++++++++++---------------
> 1 file changed, 22 insertions(+), 15 deletions(-)
>
> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
> index 28bd255c144d..ce8f1e992e64 100644
> --- a/drivers/nvme/host/rdma.c
> +++ b/drivers/nvme/host/rdma.c
> @@ -753,28 +753,26 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
> if (ret)
> goto requeue;
>
> - blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true);
> -
> ret = nvmf_connect_admin_queue(&ctrl->ctrl);
> if (ret)
> - goto stop_admin_q;
> + goto requeue;
>
> set_bit(NVME_RDMA_Q_LIVE, &ctrl->queues[0].flags);
>
> ret = nvme_enable_ctrl(&ctrl->ctrl, ctrl->cap);
> if (ret)
> - goto stop_admin_q;
> + goto requeue;
>
> nvme_start_keep_alive(&ctrl->ctrl);
>
> if (ctrl->queue_count > 1) {
> ret = nvme_rdma_init_io_queues(ctrl);
> if (ret)
> - goto stop_admin_q;
> + goto requeue;
>
> ret = nvme_rdma_connect_io_queues(ctrl);
> if (ret)
> - goto stop_admin_q;
> + goto requeue;
> }
>
> changed = nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_LIVE);
> @@ -782,7 +780,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
> ctrl->ctrl.opts->nr_reconnects = 0;
>
> if (ctrl->queue_count > 1) {
> - nvme_start_queues(&ctrl->ctrl);
> nvme_queue_scan(&ctrl->ctrl);
> nvme_queue_async_events(&ctrl->ctrl);
> }
> @@ -791,8 +788,6 @@ static void nvme_rdma_reconnect_ctrl_work(struct work_struct *work)
>
> return;
>
> -stop_admin_q:
> - blk_mq_stop_hw_queues(ctrl->ctrl.admin_q);
> requeue:
> dev_info(ctrl->ctrl.device, "Failed reconnect attempt %d\n",
> ctrl->ctrl.opts->nr_reconnects);
> @@ -823,6 +818,13 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
> blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
> nvme_cancel_request, &ctrl->ctrl);
>
> + /*
> + * queues are not live anymore, so restart the queues to fail fast
> + * new IO
> + */
> + blk_mq_start_stopped_hw_queues(ctrl->ctrl.admin_q, true);
> + nvme_start_queues(&ctrl->ctrl);
> +
> nvme_rdma_reconnect_or_remove(ctrl);
> }
>
> @@ -1433,7 +1435,7 @@ nvme_rdma_timeout(struct request *rq, bool reserved)
> /*
> * We cannot accept any other command until the Connect command has completed.
> */
> -static inline bool nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue,
> +static inline int nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue,
> struct request *rq)
> {
> if (unlikely(!test_bit(NVME_RDMA_Q_LIVE, &queue->flags))) {
> @@ -1441,11 +1443,15 @@ static inline bool nvme_rdma_queue_is_ready(struct nvme_rdma_queue *queue,
>
> if (!blk_rq_is_passthrough(rq) ||
> cmd->common.opcode != nvme_fabrics_command ||
> - cmd->fabrics.fctype != nvme_fabrics_type_connect)
> - return false;
> + cmd->fabrics.fctype != nvme_fabrics_type_connect) {
> + if (queue->ctrl->ctrl.state == NVME_CTRL_RECONNECTING)
> + return -EIO;
> + else
> + return -EAGAIN;
> + }
> }
>
> - return true;
> + return 0;
> }
>
> static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
> @@ -1463,8 +1469,9 @@ static int nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
>
> WARN_ON_ONCE(rq->tag < 0);
>
> - if (!nvme_rdma_queue_is_ready(queue, rq))
> - return BLK_MQ_RQ_QUEUE_BUSY;
> + ret = nvme_rdma_queue_is_ready(queue, rq);
> + if (unlikely(ret))
> + goto err;
>
> dev = queue->device->dev;
> ib_dma_sync_single_for_cpu(dev, sqe->dma,
> --
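
One note for anyone applying this by hand: the last hunk is cut off above
before the new err label in nvme_rdma_queue_rq(), so the mapping from the
errno back to a blk-mq status is not visible here. Presumably it looks
roughly like the following (my assumption, not part of the quoted diff):

err:
	/*
	 * -ENOMEM/-EAGAIN: let blk-mq retry the request later; anything
	 * else (e.g. the new -EIO while reconnecting) completes it with
	 * an IO error.
	 */
	return (ret == -ENOMEM || ret == -EAGAIN) ?
		BLK_MQ_RQ_QUEUE_BUSY : BLK_MQ_RQ_QUEUE_ERROR;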