[PATCH V4 7/7] nvme: pci: support nested EH

Ming Lei ming.lei at redhat.com
Sat May 5 06:59:05 PDT 2018


When one req is timed out, now nvme_timeout() handles it by the
following way:

	nvme_dev_disable(dev, false);
	nvme_reset_ctrl(&dev->ctrl);
	return BLK_EH_HANDLED.

There are several issues about the above approach:

1) IO may fail during resetting

Admin IO timeout may be triggered in nvme_reset_dev() when error happens.
Normal IO timeout may be triggered too during nvme_wait_freeze() in
reset path. When the two kinds of timeout happen, the current reset mechanism
can't work any more.

2) race between nvme_start_freeze and nvme_wait_freeze() & nvme_unfreeze()

nvme_dev_disable() and resetting controller are required for recovering
controller, but the two are run from different contexts. nvme_start_freeze()
is call from nvme_dev_disable() which is run timeout work context, and
nvme_unfreeze() is run from reset work context. Unfortunatley timeout may be
triggered during resetting controller, so nvme_start_freeze() may be run
several times. Also two reset work may run one by one, this may cause
hang in nvme_wait_freeze() forever.

3) all namespace's EH require to shutdown & reset the controller

block's timeout handler is per-request-queue, that means each
namespace's error handling may shutdown & reset the whole controller,
then the shutdown from one namespace may quiese queues when resetting
from another namespace is in-progress.

This patch fixes the above issues by using nested EH:

1) run controller shutdown(nvme_dev_disable()) and resetting(nvme_reset_dev)
from one same EH context

2) always start a new context for handling EH, and cancel all
in-flight requests(include the timed-out ones) in nvme_dev_disable()
by quiescing timeout event before shutdown controller.

3) limit the max number of nested EH, when the limit is reached, fails
the controller by marking its state as DELETING and fail all in-flight
request. This approach for failing controller is from Keith's previous
patch.

With this approach, blktest block/011 can be passed.

Cc: Jianchao Wang <jianchao.w.wang at oracle.com>
Cc: Christoph Hellwig <hch at lst.de>
Cc: Sagi Grimberg <sagi at grimberg.me>
Cc: linux-nvme at lists.infradead.org
Cc: Laurence Oberman <loberman at redhat.com>
Signed-off-by: Ming Lei <ming.lei at redhat.com>
---
 drivers/nvme/host/core.c |  26 ++++++++
 drivers/nvme/host/nvme.h |   2 +
 drivers/nvme/host/pci.c  | 161 ++++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 173 insertions(+), 16 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 3aaee4dbf58e..d9a62e2cc33e 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -254,6 +254,8 @@ EXPORT_SYMBOL_GPL(nvme_complete_rq);
 
 void nvme_cancel_request(struct request *req, void *data, bool reserved)
 {
+	struct nvme_ctrl *ctrl = data;
+
 	if (!blk_mq_request_started(req))
 		return;
 
@@ -261,6 +263,8 @@ void nvme_cancel_request(struct request *req, void *data, bool reserved)
 				"Cancelling I/O %d", req->tag);
 
 	nvme_req(req)->status = NVME_SC_ABORT_REQ;
+	if (ctrl->state == NVME_CTRL_DELETING)
+		nvme_req(req)->status |= NVME_SC_DNR;
 	blk_mq_complete_request(req);
 
 }
@@ -3583,6 +3587,28 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl)
 }
 EXPORT_SYMBOL_GPL(nvme_start_freeze);
 
+void nvme_unquiesce_timeout(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_unquiesce_timeout(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_unquiesce_timeout);
+
+void nvme_quiesce_timeout(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_quiesce_timeout(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_quiesce_timeout);
+
 void nvme_stop_queues(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 99f55c6f69f8..32f76cc8bb65 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -405,6 +405,8 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
 void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
 		union nvme_result *res);
 
+void nvme_unquiesce_timeout(struct nvme_ctrl *ctrl);
+void nvme_quiesce_timeout(struct nvme_ctrl *ctrl);
 void nvme_stop_queues(struct nvme_ctrl *ctrl);
 void nvme_start_queues(struct nvme_ctrl *ctrl);
 void nvme_kill_queues(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 2fbe24274ad0..105d02fcac2d 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -71,6 +71,7 @@ struct nvme_queue;
 static void nvme_process_cq(struct nvme_queue *nvmeq);
 static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown, bool
 		freeze_queue);
+static void nvme_reset_dev(struct nvme_dev *dev, bool update_state);
 
 /*
  * Represents an NVM Express device.  Each nvme_dev is a PCI function.
@@ -113,6 +114,20 @@ struct nvme_dev {
 	dma_addr_t host_mem_descs_dma;
 	struct nvme_host_mem_buf_desc *host_mem_descs;
 	void **host_mem_desc_bufs;
+
+	/* EH handler */
+	spinlock_t	eh_lock;
+	bool		ctrl_shutdown_started;
+	bool		ctrl_failed;
+	unsigned int	nested_eh;
+	struct work_struct fail_ctrl_work;
+};
+
+#define  NVME_MAX_NESTED_EH	32
+struct nvme_eh_work {
+	struct work_struct	work;
+	struct nvme_dev		*dev;
+	int			seq;
 };
 
 static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@@ -1177,6 +1192,93 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
 			 csts, result);
 }
 
+static void nvme_eh_fail_ctrl_work(struct work_struct *work)
+{
+	struct nvme_dev *dev =
+		container_of(work, struct nvme_dev, fail_ctrl_work);
+
+	dev_info(dev->ctrl.device, "EH: fail controller\n");
+	nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
+	nvme_dev_disable(dev, false, true);
+}
+
+static void nvme_eh_mark_ctrl_shutdown(struct nvme_dev *dev)
+{
+	spin_lock(&dev->eh_lock);
+	dev->ctrl_shutdown_started = false;
+	spin_unlock(&dev->eh_lock);
+}
+
+static void nvme_eh_done(struct nvme_dev *dev)
+{
+	spin_lock(&dev->eh_lock);
+	dev->nested_eh--;
+	spin_unlock(&dev->eh_lock);
+}
+
+static void nvme_eh_work(struct work_struct *work)
+{
+	struct nvme_eh_work *eh_work =
+		container_of(work, struct nvme_eh_work, work);
+	struct nvme_dev *dev = eh_work->dev;
+
+	dev_info(dev->ctrl.device, "EH %d: before shutdown\n",
+			eh_work->seq);
+	nvme_dev_disable(dev, false, true);
+	nvme_eh_mark_ctrl_shutdown(dev);
+
+	dev_info(dev->ctrl.device, "EH %d: after shutdown\n",
+			eh_work->seq);
+
+	nvme_reset_dev(dev, true);
+	nvme_eh_done(dev);
+	dev_info(dev->ctrl.device, "EH %d: after recovery\n",
+			eh_work->seq);
+
+	kfree(eh_work);
+}
+
+static void nvme_eh_schedule(struct nvme_dev *dev)
+{
+	bool need_sched = false;
+	bool fail_ctrl = false;
+	struct nvme_eh_work *eh_work;
+	int seq;
+
+	spin_lock(&dev->eh_lock);
+	if (!dev->ctrl_shutdown_started) {
+		need_sched = true;
+		seq = dev->nested_eh;
+		if (++dev->nested_eh >= NVME_MAX_NESTED_EH) {
+			if (!dev->ctrl_failed)
+				dev->ctrl_failed = fail_ctrl = true;
+			else
+				need_sched = false;
+		} else
+			dev->ctrl_shutdown_started = true;
+	}
+	spin_unlock(&dev->eh_lock);
+
+	if (!need_sched)
+		return;
+
+	if (fail_ctrl) {
+ fail_ctrl:
+		INIT_WORK(&dev->fail_ctrl_work, nvme_eh_fail_ctrl_work);
+		queue_work(nvme_reset_wq, &dev->fail_ctrl_work);
+		return;
+	}
+
+	eh_work = kzalloc(sizeof(*eh_work), GFP_NOIO);
+	if (!eh_work)
+		goto fail_ctrl;
+
+	eh_work->dev = dev;
+	eh_work->seq = seq;
+	INIT_WORK(&eh_work->work, nvme_eh_work);
+	queue_work(nvme_reset_wq, &eh_work->work);
+}
+
 static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 {
 	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1198,9 +1300,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 	 */
 	if (nvme_should_reset(dev, csts)) {
 		nvme_warn_reset(dev, csts);
-		nvme_dev_disable(dev, false, true);
-		nvme_reset_ctrl(&dev->ctrl);
-		return BLK_EH_HANDLED;
+		nvme_eh_schedule(dev);
+		return BLK_EH_RESET_TIMER;
 	}
 
 	/*
@@ -1225,9 +1326,9 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 		dev_warn(dev->ctrl.device,
 			 "I/O %d QID %d timeout, disable controller\n",
 			 req->tag, nvmeq->qid);
-		nvme_dev_disable(dev, false, false);
 		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
-		return BLK_EH_HANDLED;
+		nvme_eh_schedule(dev);
+		return BLK_EH_RESET_TIMER;
 	default:
 		break;
 	}
@@ -1241,15 +1342,13 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
 		dev_warn(dev->ctrl.device,
 			 "I/O %d QID %d timeout, reset controller\n",
 			 req->tag, nvmeq->qid);
-		nvme_dev_disable(dev, false, true);
-		nvme_reset_ctrl(&dev->ctrl);
-
 		/*
 		 * Mark the request as handled, since the inline shutdown
 		 * forces all outstanding requests to complete.
 		 */
 		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
-		return BLK_EH_HANDLED;
+		nvme_eh_schedule(dev);
+		return BLK_EH_RESET_TIMER;
 	}
 
 	if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
@@ -2301,12 +2400,26 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown, bool
 	}
 	for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
 		nvme_suspend_queue(&dev->queues[i]);
+	/*
+	 * safe to sync timeout after queues are quiesced, then all
+	 * requests(include the time-out ones) will be canceled.
+	 */
+	nvme_quiesce_timeout(&dev->ctrl);
+	blk_quiesce_timeout(dev->ctrl.admin_q);
 
 	nvme_pci_disable(dev);
 
+	/*
+	 * Both timeout and interrupt handler have been drained, and all
+	 * in-flight requests will be canceled now.
+	 */
 	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
 	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
 
+	/* all requests have been canceled now, so enable timeout now */
+	nvme_unquiesce_timeout(&dev->ctrl);
+	blk_unquiesce_timeout(dev->ctrl.admin_q);
+
 	/*
 	 * The driver will not be starting up queues again if shutting down so
 	 * must flush all entered requests to their failed completion to avoid
@@ -2365,7 +2478,7 @@ static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
 		nvme_put_ctrl(&dev->ctrl);
 }
 
-static void nvme_reset_dev(struct nvme_dev *dev)
+static void nvme_reset_dev(struct nvme_dev *dev, bool update_state)
 {
 	bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
 	int result = -ENODEV;
@@ -2373,7 +2486,19 @@ static void nvme_reset_dev(struct nvme_dev *dev)
 
 	mutex_lock(&dev->ctrl.reset_lock);
 
-	if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
+	if (update_state) {
+		if (dev->ctrl.state != NVME_CTRL_RESETTING &&
+		    dev->ctrl.state != NVME_CTRL_CONNECTING) {
+		    if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING)) {
+			dev_warn(dev->ctrl.device, "failed to change state to %d\n",
+					NVME_CTRL_RESETTING);
+			goto out;
+		    }
+		}
+	}
+
+	if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING &&
+				dev->ctrl.state != NVME_CTRL_CONNECTING))
 		goto out;
 
 	/*
@@ -2387,10 +2512,12 @@ static void nvme_reset_dev(struct nvme_dev *dev)
 	 * Introduce CONNECTING state from nvme-fc/rdma transports to mark the
 	 * initializing procedure here.
 	 */
-	if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
-		dev_warn(dev->ctrl.device,
-			"failed to mark controller CONNECTING\n");
-		goto out;
+	if (dev->ctrl.state != NVME_CTRL_CONNECTING) {
+		if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
+			dev_warn(dev->ctrl.device,
+				 "failed to mark controller CONNECTING\n");
+			goto out;
+		}
 	}
 
 	result = nvme_pci_enable(dev);
@@ -2483,7 +2610,7 @@ static void nvme_reset_work(struct work_struct *work)
 	struct nvme_dev *dev =
 		container_of(work, struct nvme_dev, ctrl.reset_work);
 
-	nvme_reset_dev(dev);
+	nvme_reset_dev(dev, false);
 }
 
 static void nvme_remove_dead_ctrl_work(struct work_struct *work)
@@ -2625,6 +2752,8 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 
 	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
 
+	spin_lock_init(&dev->eh_lock);
+
 	nvme_reset_ctrl(&dev->ctrl);
 
 	return 0;
-- 
2.9.5




More information about the Linux-nvme mailing list