[PATCHv2 2/2] NVMe: Shutdown fixes
Keith Busch
keith.busch at intel.com
Tue Nov 24 10:35:28 PST 2015
The controller must be disabled prior to completing a presumed lost
command. This patch makes the shutdown safe against simultaneous
invocations, and calls it directly from the timeout handler if a
timeout occurs during initialization.
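As a sketch, the timeout-handler path this adds (it mirrors the
nvme_timeout() hunk in the diff below): if a command times out while
the controller is still initializing, disable the device before
declaring the request handled, so the hardware can no longer complete
it behind our back:

    if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) {
            /* Quiesce the hardware first; shutdown cancels all
             * outstanding requests, so BLK_EH_HANDLED is safe.
             */
            nvme_dev_shutdown(dev);
            return BLK_EH_HANDLED;
    }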
blk-mq marks requests as completed in its timeout handler, though the
driver still owns the request. To propagate the appropriate status
to the caller, the driver must set req->errors directly. This also
happens to fix a race where the controller being reset actually
completes the request that timed out.
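The ordering matters: the status must land in req->errors before blk-mq
is told about the completion, because a request that blk-mq already
marked complete from its timeout handler is finished with whatever
req->errors holds. A minimal sketch of the helper (added as
nvme_end_req() in the diff below):

    static inline void nvme_end_req(struct request *req, int error)
    {
            /* Publish the status first; if blk-mq already marked
             * this request complete in its timeout handler, the
             * call below is a no-op and the timeout path reports
             * req->errors instead.
             */
            req->errors = error;
            blk_mq_complete_request(req, req->errors);
    }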
A side effect of this patch is that a controller that times out during
IO queue creation will be failed. The driver silently proceeded before,
so this sounds like an improvement. There haven't been any reports of
such a condition actually occurring, though.
Signed-off-by: Keith Busch <keith.busch at intel.com>
---
v1 -> v2:
Get rid of the struct work for shutdown. Make shutdown robust to
multiple invocations instead.
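Pieced together from the two nvme_dev_shutdown() hunks below, the
pattern is: the first caller to atomically set the flag performs the
teardown, and any concurrent caller sleeps until the winner clears the
flag and wakes it, which makes shutdown effectively idempotent:

    if (test_and_set_bit(NVME_CTRL_SHUTDOWN, &dev->flags)) {
            /* Someone else is already shutting down; wait for it. */
            wait_event(dev->shutdown_wait,
                            !test_bit(NVME_CTRL_SHUTDOWN, &dev->flags));
            return;
    }

    /* ... disable the controller, delete queues, cancel IO ... */

    clear_bit(NVME_CTRL_SHUTDOWN, &dev->flags);
    wake_up_all(&dev->shutdown_wait);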
drivers/nvme/host/pci.c | 56 +++++++++++++++++++++++++++++++++++++------------
1 file changed, 43 insertions(+), 13 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index ff253c8..cf9722f 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -86,6 +86,7 @@ struct nvme_queue;
static int nvme_reset(struct nvme_dev *dev);
static void nvme_process_cq(struct nvme_queue *nvmeq);
static void nvme_remove_dead_ctrl(struct nvme_dev *dev);
+static void nvme_dev_shutdown(struct nvme_dev *dev);
struct async_cmd_info {
struct kthread_work work;
@@ -117,6 +118,7 @@ struct nvme_dev {
struct work_struct reset_work;
struct work_struct scan_work;
struct work_struct remove_work;
+ wait_queue_head_t shutdown_wait;
bool subsystem;
u32 page_size;
void __iomem *cmb;
@@ -125,7 +127,7 @@ struct nvme_dev {
u32 cmbsz;
unsigned long flags;
#define NVME_CTRL_RESETTING 0
-
+#define NVME_CTRL_SHUTDOWN 1
struct nvme_ctrl ctrl;
};
@@ -723,6 +725,19 @@ static void nvme_complete_rq(struct request *req)
blk_mq_end_request(req, error);
}
+static inline void nvme_end_req(struct request *req, int error)
+{
+ /*
+ * The request may already be marked "complete" by blk-mq if
+ * completion occurs during a timeout. We set req->errors
+ * before telling blk-mq in case this is the target request
+ * of a timeout handler. This is safe since this driver never
+ * completes the same command twice.
+ */
+ req->errors = error;
+ blk_mq_complete_request(req, req->errors);
+}
+
static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
{
u16 head, phase;
@@ -770,8 +785,7 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
u32 result = le32_to_cpu(cqe.result);
req->special = (void *)(uintptr_t)result;
}
- blk_mq_complete_request(req, status >> 1);
-
+ nvme_end_req(req, status >> 1);
}
/* If the controller ignores the cq head doorbell and continuously
@@ -938,13 +952,16 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
struct nvme_command cmd;
/*
- * Just return an error to reset_work if we're already called from
- * probe context. We don't worry about request / tag reuse as
- * reset_work will immeditalely unwind and remove the controller
- * once we fail a command.
+ * Shutdown immediately if controller times out while starting. The
+ * reset work will see the pci device disabled when it gets the forced
+ * cancellation error. All outstanding requests are completed on
+ * shutdown, so we return BLK_EH_HANDLED.
*/
if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) {
- req->errors = NVME_SC_ABORT_REQ | NVME_SC_DNR;
+ dev_warn(dev->dev,
+ "I/O %d QID %d timeout, disable controller\n",
+ req->tag, nvmeq->qid);
+ nvme_dev_shutdown(dev);
return BLK_EH_HANDLED;
}
@@ -1015,7 +1032,8 @@ static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved
if (blk_queue_dying(req->q))
status |= NVME_SC_DNR;
- blk_mq_complete_request(req, status);
+
+ nvme_end_req(req, status);
}
static void nvme_free_queue(struct nvme_queue *nvmeq)
@@ -1510,9 +1528,7 @@ static int set_queue_count(struct nvme_dev *dev, int count)
status = nvme_set_features(&dev->ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0,
&result);
- if (status < 0)
- return status;
- if (status > 0) {
+ if (status) {
dev_err(dev->dev, "Could not set queue count (%d)\n", status);
return 0;
}
@@ -2023,6 +2039,11 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
nvme_dev_list_remove(dev);
+ if (test_and_set_bit(NVME_CTRL_SHUTDOWN, &dev->flags)) {
+ wait_event(dev->shutdown_wait, !test_bit(NVME_CTRL_SHUTDOWN,
+ &dev->flags));
+ return;
+ }
if (dev->bar) {
nvme_freeze_queues(dev);
csts = readl(dev->bar + NVME_REG_CSTS);
@@ -2041,6 +2062,9 @@ static void nvme_dev_shutdown(struct nvme_dev *dev)
for (i = dev->queue_count - 1; i >= 0; i--)
nvme_clear_queue(dev->queues[i]);
+
+ clear_bit(NVME_CTRL_SHUTDOWN, &dev->flags);
+ wake_up_all(&dev->shutdown_wait);
}
static int nvme_setup_prp_pools(struct nvme_dev *dev)
@@ -2115,7 +2139,12 @@ static void nvme_reset_work(struct work_struct *work)
goto free_tags;
result = nvme_setup_io_queues(dev);
- if (result)
+
+ /*
+ * The pci device will be disabled if a timeout occurs while setting up
+ * IO queues, but return 0 for "result", so we have to check both.
+ */
+ if (result || !pci_is_enabled(to_pci_dev(dev->dev)))
goto free_tags;
dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS;
@@ -2251,6 +2280,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
INIT_WORK(&dev->scan_work, nvme_dev_scan);
INIT_WORK(&dev->reset_work, nvme_reset_work);
INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
+ init_waitqueue_head(&dev->shutdown_wait);
result = nvme_setup_prp_pools(dev);
if (result)
--
2.6.2.307.g37023ba