[PATCH V2 5/5] nvme: pci: simplify timeout handling
Ming Lei
ming.lei at redhat.com
Sun Apr 29 08:41:52 PDT 2018
When a request times out, nvme_timeout() currently handles it in the
following way:

	nvme_dev_disable(dev, false);
	nvme_reset_ctrl(&dev->ctrl);
	return BLK_EH_HANDLED;

which may introduce the following issues:

1) a subsequent timeout on another request may call nvme_dev_disable()
again, which may quiesce the queues again while the reset is still in
progress, so that in the end nothing can make progress.

2) when a timeout is triggered inside the reset work function,
nvme_wait_freeze() may wait forever because the controller can no
longer be recovered at all.
This patch fixes these issues by:

1) handling timeout events in a single EH kthread, and waking this
thread up whenever controller recovery is needed (see the condensed
sketch below)

2) draining timeout work via nvme_unquiesce_timeout() inside the EH
handler before canceling in-flight requests, so every request is
either canceled or completed by the EH or the block timeout handler

3) moving the draining of IO and the unfreezing of queues into a
separate post_eh work context, so the controller can always be
recovered.

This patch fixes the failures reported by the horrible block/011 test.
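For reference, the wake-up handshake between the timeout path and the EH
thread boils down to the following condensed, self-contained sketch. The
demo_* names are illustrative only and are not symbols from this patch;
the real code below uses dev->eh_lock, dev->eh_in_recovery and
dev->ehandler, and performs the actual disable/reset where the sketch
only has a placeholder comment:

	/* Condensed sketch of the EH-thread wake-up handshake (illustrative names). */
	#include <linux/err.h>
	#include <linux/init.h>
	#include <linux/kthread.h>
	#include <linux/module.h>
	#include <linux/sched.h>
	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(demo_eh_lock);
	static bool demo_recovery_needed;
	static struct task_struct *demo_ehandler;

	/* Timeout path: mark recovery as pending and wake the EH thread. */
	static void demo_eh_schedule(void)
	{
		spin_lock(&demo_eh_lock);
		if (!demo_recovery_needed) {
			demo_recovery_needed = true;
			wake_up_process(demo_ehandler);
		}
		spin_unlock(&demo_eh_lock);
	}

	static int demo_eh_thread(void *data)
	{
		while (true) {
			/*
			 * Enter a non-running state before checking the stop
			 * flag so the wake-up issued by kthread_stop() cannot
			 * be missed.
			 */
			set_current_state(TASK_INTERRUPTIBLE);
			if (kthread_should_stop())
				break;

			spin_lock(&demo_eh_lock);
			if (!demo_recovery_needed) {
				spin_unlock(&demo_eh_lock);
				schedule();	/* sleep until demo_eh_schedule() wakes us */
				continue;
			}
			demo_recovery_needed = false;
			spin_unlock(&demo_eh_lock);

			__set_current_state(TASK_RUNNING);
			/* ... disable the controller and reset it synchronously ... */
		}
		__set_current_state(TASK_RUNNING);
		return 0;
	}

	static int __init demo_init(void)
	{
		demo_ehandler = kthread_run(demo_eh_thread, NULL, "demo_eh");
		return PTR_ERR_OR_ZERO(demo_ehandler);
	}

	static void __exit demo_exit(void)
	{
		kthread_stop(demo_ehandler);
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");

The patch wires the same pattern up with kthread_run() at probe time and
kthread_stop() in nvme_remove(), as shown in the pci.c hunks below.
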
Cc: Jianchao Wang <jianchao.w.wang at oracle.com>
Cc: Christoph Hellwig <hch at lst.de>
Cc: Sagi Grimberg <sagi at grimberg.me>
Cc: linux-nvme at lists.infradead.org
Signed-off-by: Ming Lei <ming.lei at redhat.com>
---
drivers/nvme/host/core.c | 22 ++++++++
drivers/nvme/host/nvme.h | 3 +
drivers/nvme/host/pci.c | 140 +++++++++++++++++++++++++++++++++++++++++++----
3 files changed, 155 insertions(+), 10 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f9028873298e..664a268accd1 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3582,6 +3582,28 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_start_freeze);
+void nvme_unquiesce_timeout(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_unquiesce_timeout(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_unquiesce_timeout);
+
+void nvme_quiesce_timeout(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_quiesce_timeout(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_quiesce_timeout);
+
void nvme_stop_queues(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index a19b7f04ac24..956ee19ff403 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -19,6 +19,7 @@
#include <linux/pci.h>
#include <linux/kref.h>
#include <linux/blk-mq.h>
+#include <linux/blkdev.h>
#include <linux/lightnvm.h>
#include <linux/sed-opal.h>
#include <linux/fault-inject.h>
@@ -405,6 +406,8 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
		union nvme_result *res);
+void nvme_unquiesce_timeout(struct nvme_ctrl *ctrl);
+void nvme_quiesce_timeout(struct nvme_ctrl *ctrl);
void nvme_stop_queues(struct nvme_ctrl *ctrl);
void nvme_start_queues(struct nvme_ctrl *ctrl);
void nvme_kill_queues(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8172ee584130..86c14d9051ff 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -29,6 +29,7 @@
#include <linux/types.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/sed-opal.h>
+#include <linux/kthread.h>
#include "nvme.h"
@@ -112,6 +113,11 @@ struct nvme_dev {
	dma_addr_t host_mem_descs_dma;
	struct nvme_host_mem_buf_desc *host_mem_descs;
	void **host_mem_desc_bufs;
+
+	spinlock_t eh_lock;
+	bool eh_in_recovery;
+	struct task_struct *ehandler;
+	struct work_struct post_eh_work;
};
static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@@ -1176,6 +1182,100 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
csts, result);
}
+static void nvme_eh_schedule(struct nvme_dev *dev)
+{
+	spin_lock(&dev->eh_lock);
+	if (!dev->eh_in_recovery) {
+		dev->eh_in_recovery = true;
+		wake_up_process(dev->ehandler);
+	}
+	spin_unlock(&dev->eh_lock);
+}
+
+static void nvme_eh_done(struct nvme_dev *dev)
+{
+	spin_lock(&dev->eh_lock);
+	if (dev->eh_in_recovery)
+		dev->eh_in_recovery = false;
+	spin_unlock(&dev->eh_lock);
+}
+
+static int nvme_error_handler(void *data)
+{
+	struct nvme_dev *dev = data;
+
+	while (true) {
+		/*
+		 * The sequence in kthread_stop() sets the stop flag first
+		 * and then wakes the process. To avoid missed wake-ups, the
+		 * task must always be in a non-running state before the
+		 * stop flag is checked.
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (kthread_should_stop())
+			break;
+
+		spin_lock(&dev->eh_lock);
+		if (!dev->eh_in_recovery) {
+			spin_unlock(&dev->eh_lock);
+			schedule();
+			continue;
+		}
+		spin_unlock(&dev->eh_lock);
+
+		__set_current_state(TASK_RUNNING);
+
+		dev_info(dev->ctrl.device, "start eh recovery\n");
+
+		/* freeze queues before recovery */
+		nvme_start_freeze(&dev->ctrl);
+
+		nvme_dev_disable(dev, false);
+
+		/*
+		 * The reset no longer waits for IO completion, so it is
+		 * safe to reset the controller synchronously here.
+		 */
+		nvme_reset_ctrl_sync(&dev->ctrl);
+
+		dev_info(dev->ctrl.device, "eh recovery done\n");
+
+		/*
+		 * Drain IO and unfreeze queues in another context because
+		 * this IO may trigger timeouts again.
+		 */
+		queue_work(nvme_reset_wq, &dev->post_eh_work);
+	}
+	__set_current_state(TASK_RUNNING);
+	dev->ehandler = NULL;
+
+	return 0;
+}
+
+static void nvme_post_eh_work(struct work_struct *work)
+{
+	struct nvme_dev *dev =
+		container_of(work, struct nvme_dev, post_eh_work);
+
+	nvme_wait_freeze(&dev->ctrl);
+	nvme_unfreeze(&dev->ctrl);
+}
+
+static int nvme_eh_init(struct nvme_dev *dev)
+{
+	spin_lock_init(&dev->eh_lock);
+	INIT_WORK(&dev->post_eh_work, nvme_post_eh_work);
+
+	dev->ehandler = kthread_run(nvme_error_handler, dev,
+			"nvme_eh_%d", dev->ctrl.instance);
+	if (IS_ERR(dev->ehandler)) {
+		dev_err(dev->ctrl.device, "error handler thread failed to spawn, error = %ld\n",
+			PTR_ERR(dev->ehandler));
+		return PTR_ERR(dev->ehandler);
+	}
+	return 0;
+}
+
static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1197,8 +1297,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
	 */
	if (nvme_should_reset(dev, csts)) {
		nvme_warn_reset(dev, csts);
-		nvme_dev_disable(dev, false);
-		nvme_reset_ctrl(&dev->ctrl);
+		nvme_eh_schedule(dev);
		return BLK_EH_HANDLED;
	}
@@ -1224,8 +1323,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
		dev_warn(dev->ctrl.device,
			 "I/O %d QID %d timeout, disable controller\n",
			 req->tag, nvmeq->qid);
-		nvme_dev_disable(dev, false);
		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
+		nvme_eh_schedule(dev);
		return BLK_EH_HANDLED;
	default:
		break;
@@ -1240,14 +1339,12 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
		dev_warn(dev->ctrl.device,
			 "I/O %d QID %d timeout, reset controller\n",
			 req->tag, nvmeq->qid);
-		nvme_dev_disable(dev, false);
-		nvme_reset_ctrl(&dev->ctrl);
-
		/*
		 * Mark the request as handled, since the inline shutdown
		 * forces all outstanding requests to complete.
		 */
		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
+		nvme_eh_schedule(dev);
		return BLK_EH_HANDLED;
	}
@@ -2246,8 +2343,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
	if (pci_is_enabled(pdev)) {
		u32 csts = readl(dev->bar + NVME_REG_CSTS);
-		if (dev->ctrl.state == NVME_CTRL_LIVE ||
-		    dev->ctrl.state == NVME_CTRL_RESETTING) {
+		if (shutdown && (dev->ctrl.state == NVME_CTRL_LIVE ||
+		    dev->ctrl.state == NVME_CTRL_RESETTING)) {
			nvme_start_freeze(&dev->ctrl);
			frozen = true;
		}
@@ -2281,11 +2378,23 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
	for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
		nvme_suspend_queue(&dev->queues[i]);
+	/* safe to sync the timeout handler after the queues are quiesced */
+	nvme_quiesce_timeout(&dev->ctrl);
+	blk_quiesce_timeout(dev->ctrl.admin_q);
+
	nvme_pci_disable(dev);
+	/*
+	 * Both the timeout handler and the interrupt handler have been
+	 * drained, so all in-flight requests can be canceled now.
+	 */
	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
+	/* all requests have been canceled, so re-enable timeout handling */
+	nvme_unquiesce_timeout(&dev->ctrl);
+	blk_unquiesce_timeout(dev->ctrl.admin_q);
+
	/*
	 * The driver will not be starting up queues again if shutting down so
	 * must flush all entered requests to their failed completion to avoid
@@ -2416,9 +2525,14 @@ static void nvme_reset_work(struct work_struct *work)
	if (result)
		goto out;
+	nvme_eh_done(dev);
+
	/*
	 * Keep the controller around but remove all namespaces if we don't have
	 * any working I/O queue.
+	 *
+	 * We no longer wait for IO completion here, so a synchronous reset
+	 * can be used safely anywhere.
	 */
	if (dev->online_queues < 2) {
		dev_warn(dev->ctrl.device, "IO queues not created\n");
@@ -2427,11 +2541,9 @@ static void nvme_reset_work(struct work_struct *work)
		new_state = NVME_CTRL_ADMIN_ONLY;
	} else {
		nvme_start_queues(&dev->ctrl);
-		nvme_wait_freeze(&dev->ctrl);
		/* hit this only when allocate tagset fails */
		if (nvme_dev_add(dev))
			new_state = NVME_CTRL_ADMIN_ONLY;
-		nvme_unfreeze(&dev->ctrl);
	}
	/*
@@ -2449,6 +2561,7 @@ static void nvme_reset_work(struct work_struct *work)
 out:
	nvme_remove_dead_ctrl(dev, result);
+	nvme_eh_done(dev);
}
static void nvme_remove_dead_ctrl_work(struct work_struct *work)
@@ -2590,10 +2703,15 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
+	if (nvme_eh_init(dev))
+		goto uninit_ctrl;
+
	nvme_reset_ctrl(&dev->ctrl);
	return 0;
+ uninit_ctrl:
+	nvme_uninit_ctrl(&dev->ctrl);
 release_pools:
	nvme_release_prp_pools(dev);
 unmap:
@@ -2647,6 +2765,8 @@ static void nvme_remove(struct pci_dev *pdev)
	nvme_stop_ctrl(&dev->ctrl);
	nvme_remove_namespaces(&dev->ctrl);
	nvme_dev_disable(dev, true);
+	if (dev->ehandler)
+		kthread_stop(dev->ehandler);
	nvme_free_host_mem(dev);
	nvme_dev_remove_admin(dev);
	nvme_free_queues(dev, 0);
--
2.9.5