[PATCH] nvme-core: reduce io pause time when fail over

Chao Leng lengchao at huawei.com
Thu Jul 23 04:06:34 EDT 2020


On 2020/7/23 0:13, Sagi Grimberg wrote:
>
>>> To reduce io pause time, nvme_stop_queues uses
>>> blk_mq_quiesce_queue_nowait to quiesce each queue, then waits for all
>>> ongoing dispatches to complete after all queues have been quiesced.
>>
>> The comment above blk_mq_quiesce_queue_nowait() wants to remove this
>> function. I'm not sure we should be introducing more users if that's the
>> case.
>>
>> Using synchronize_rcu() at the end may be looking too much into blk-mq's
>> internal quiesce implementation. It happens to be the right thing to do
>> for non-blocking hctx, so this patch assumes that will be true for any
>> nvme request_queue.
>
> nvme-tcp became a blocking hctx since we optimized to do network sends
> from inside queue_rq. So this should either be split into two functions, or
> nvme-core needs to look at BLK_MQ_F_BLOCKING and do the right thing.

Another solution: introduce blk_mq_quiesce_queue_async(), which does not
wait for ongoing dispatches to complete when the hctx is not a blocking
hctx. A caller such as nvme_stop_queues() then waits for all ongoing
dispatches to complete once all queues have been quiesced.

---
  block/blk-mq.c           | 28 ++++++++++++++++++++++++++++
  drivers/nvme/host/core.c |  5 ++++-
  include/linux/blk-mq.h   |  1 +
  3 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4e0d173beaa3..0053ff42bb47 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -235,6 +235,34 @@ void blk_mq_quiesce_queue(struct request_queue *q)
  }
  EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);

+/*
+ * blk_mq_quiesce_queue_async() - quiesce without waiting on non-blocking hctxs
+ * @q: request queue.
+ *
+ * Note: this function does not wait for ongoing dispatches to complete
+ * on non-blocking hctxs; it only synchronizes blocking (SRCU) hctxs.
+ * Use it to reduce the total wait time when quiescing a batch of
+ * queues. After all queues have been quiesced, the caller must do a
+ * single synchronize_rcu() if this function returned true for any queue.
+ */
+bool blk_mq_quiesce_queue_async(struct request_queue *q)
+{
+    struct blk_mq_hw_ctx *hctx;
+    unsigned int i;
+    bool rcu = false;
+
+    blk_mq_quiesce_queue_nowait(q);
+
+    queue_for_each_hw_ctx(q, hctx, i) {
+        if (hctx->flags & BLK_MQ_F_BLOCKING)
+            synchronize_srcu(hctx->srcu);
+        else
+            rcu = true;
+    }
+    return rcu;
+}
+EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_async);
+
  /*
   * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue()
   * @q: request queue.
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index add040168e67..a1915f0276eb 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4320,11 +4320,14 @@ EXPORT_SYMBOL_GPL(nvme_start_freeze);
  void nvme_stop_queues(struct nvme_ctrl *ctrl)
  {
      struct nvme_ns *ns;
+    bool rcu = false;

      down_read(&ctrl->namespaces_rwsem);
      list_for_each_entry(ns, &ctrl->namespaces, list)
-        blk_mq_quiesce_queue(ns->queue);
+        rcu = blk_mq_quiesce_queue_async(ns->queue) || rcu;
      up_read(&ctrl->namespaces_rwsem);
+    if (rcu)
+        synchronize_rcu();
  }
  EXPORT_SYMBOL_GPL(nvme_stop_queues);

diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d6fcae17da5a..6ad83bfd17b2 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -515,6 +515,7 @@ void blk_mq_start_hw_queues(struct request_queue *q);
  void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
  void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
  void blk_mq_quiesce_queue(struct request_queue *q);
+bool blk_mq_quiesce_queue_async(struct request_queue *q);
  void blk_mq_unquiesce_queue(struct request_queue *q);
  void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
  void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
-- 