[PATCH] nvme-core: reduce io pause time when fail over

Sagi Grimberg sagi at grimberg.me
Thu Jul 23 12:30:57 EDT 2020


>>> Using synchronize_rcu() at the end may be looking too much into blk-mq's
>>> internal quiesce implementation. It happens to be the right thing to do
>>> for non-blocking hctx, so this patch assumes that will be true for any
>>> nvme request_queue.
>>
>> nvme-tcp became a blocking hctx since we optimized to do network sends from
>> inside queue_rq. So this should either be split into two functions or
>> nvme-core needs to look at BLK_MQ_F_BLOCKING and do the right thing.
> 
> Another solution: introduce blk_mq_quiesce_queue_async, which does not wait
> for all ongoing dispatches to complete if the hctx is not a blocking hctx.
> A caller such as nvme_stop_queues then waits for all ongoing dispatches to
> complete after all queues have been quiesced.

But then you keep the problem for blocking hctx, so it's not a good
solution (and it's also not good practice to abuse the return code the
way you did, IMO).
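
For reference (illustrative only, not part of the patches below): a transport
"becomes" a blocking hctx by setting BLK_MQ_F_BLOCKING in its tag_set flags,
which blk-mq copies into hctx->flags; that is why the quiesce path has to
branch on it. A minimal sketch, with the ops pointer and sizing values assumed:
--
static int example_alloc_blocking_tagset(struct blk_mq_tag_set *set,
                                         const struct blk_mq_ops *ops)
{
        /*
         * ->queue_rq() of this transport may sleep (e.g. a network send),
         * so mark the tag_set blocking; blk-mq then protects dispatch of
         * these hw queues with SRCU instead of plain RCU.
         */
        memset(set, 0, sizeof(*set));
        set->ops = ops;
        set->nr_hw_queues = num_online_cpus();  /* assumed value */
        set->queue_depth = 128;                 /* assumed value */
        set->numa_node = NUMA_NO_NODE;
        set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
        return blk_mq_alloc_tag_set(set);
}
--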

What we need is something like the following untested patches:
[1]:
--
diff --git a/block/blk-mq.c b/block/blk-mq.c
index abcf590f6238..c63bf34866eb 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -209,23 +209,12 @@ void blk_mq_quiesce_queue_nowait(struct request_queue *q)
  }
  EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);

-/**
- * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
- * @q: request queue.
- *
- * Note: this function does not prevent that the struct request end_io()
- * callback function is invoked. Once this function is returned, we make
- * sure no dispatch can happen until the queue is unquiesced via
- * blk_mq_unquiesce_queue().
- */
-void blk_mq_quiesce_queue(struct request_queue *q)
+void blk_mq_quiesce_queue_sync(struct request_queue *q)
  {
         struct blk_mq_hw_ctx *hctx;
         unsigned int i;
         bool rcu = false;

-       blk_mq_quiesce_queue_nowait(q);
-
         queue_for_each_hw_ctx(q, hctx, i) {
                 if (hctx->flags & BLK_MQ_F_BLOCKING)
                         synchronize_srcu(hctx->srcu);
@@ -235,6 +224,22 @@ void blk_mq_quiesce_queue(struct request_queue *q)
         if (rcu)
                 synchronize_rcu();
  }
+EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_sync);
+
+/**
+ * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
+ * @q: request queue.
+ *
+ * Note: this function does not prevent that the struct request end_io()
+ * callback function is invoked. Once this function is returned, we make
+ * sure no dispatch can happen until the queue is unquiesced via
+ * blk_mq_unquiesce_queue().
+ */
+void blk_mq_quiesce_queue(struct request_queue *q)
+{
+       blk_mq_quiesce_queue_nowait(q);
+       blk_mq_quiesce_queue_sync(q);
+}
  EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);

  /*
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 23230c1d031e..7fd6994d5b32 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -532,6 +532,7 @@ int blk_mq_map_queues(struct blk_mq_queue_map *qmap);
  void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues);

  void blk_mq_quiesce_queue_nowait(struct request_queue *q);
+void blk_mq_quiesce_queue_sync(struct request_queue *q);

  unsigned int blk_mq_rq_cpu(struct request *rq);
--
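
Just to spell out the intended calling convention of the split API
(illustration only, not a real caller): mark every queue quiesced first,
then pay a single grace-period wait, assuming non-blocking hctxs:
--
static void example_quiesce_pair(struct request_queue *q1,
                                 struct request_queue *q2)
{
        blk_mq_quiesce_queue_nowait(q1);
        blk_mq_quiesce_queue_nowait(q2);
        /* one synchronize_rcu() inside the sync covers both queues */
        blk_mq_quiesce_queue_sync(q1);
}
--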

[2]:
--
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c16bfdff2953..35c39932c491 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4535,8 +4535,13 @@ void nvme_stop_queues(struct nvme_ctrl *ctrl)

         down_read(&ctrl->namespaces_rwsem);
         list_for_each_entry(ns, &ctrl->namespaces, list)
-               blk_mq_quiesce_queue(ns->queue);
+               blk_mq_quiesce_queue_nowait(ns->queue);
+       /* all namespaces share the tagset, so sync just once */
+       if (!list_empty(&ctrl->namespaces)) {
+               ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
+               blk_mq_quiesce_queue_sync(ns->queue);
+       }
         up_read(&ctrl->namespaces_rwsem);
  }
  EXPORT_SYMBOL_GPL(nvme_stop_queues);
--
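
And if blocking hctxs (the nvme-tcp case above) need to be covered as well,
syncing a single queue would not flush every queue's SRCU domain; a variant
of the caller could look like the sketch below (hypothetical helper, not part
of the patch):
--
static void example_stop_queues_blocking(struct nvme_ctrl *ctrl)
{
        struct nvme_ns *ns;

        down_read(&ctrl->namespaces_rwsem);
        list_for_each_entry(ns, &ctrl->namespaces, list)
                blk_mq_quiesce_queue_nowait(ns->queue);
        /*
         * Every blocking queue has its own SRCU domain, so sync each one;
         * the grace periods now largely overlap instead of starting from
         * scratch for every namespace.
         */
        list_for_each_entry(ns, &ctrl->namespaces, list)
                blk_mq_quiesce_queue_sync(ns->queue);
        up_read(&ctrl->namespaces_rwsem);
}
--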

I can send formal patches if you care to test them...


