[PATCH V2 5/5] nvme: pci: simplify timeout handling
Ming Lei
ming.lei at redhat.com
Sun Apr 29 08:41:52 PDT 2018
When a request times out, nvme_timeout() currently handles it in the
following way:

	nvme_dev_disable(dev, false);
	nvme_reset_ctrl(&dev->ctrl);
	return BLK_EH_HANDLED;

which may introduce the following issues:

1) a subsequent timeout on another request may call nvme_dev_disable()
again, which may quiesce the queues again while the reset is still in
progress, so that in the end nothing can make progress.

2) when a timeout is triggered inside the reset work function,
nvme_wait_freeze() may wait forever because the controller can no
longer be recovered at all.
This patch fixes these issues by:

1) handling timeout events in a single EH kthread, and waking this
thread up whenever controller recovery is needed (see the condensed
sketch below)

2) draining timeout work via nvme_unquiesce_timeout() inside the EH
handler before canceling in-flight requests, so every request is
either canceled or completed by the EH or the block timeout handler

3) moving the draining of IO and the unfreezing of queues into a
separate post_eh work context, so the controller can always be
recovered.

This patch fixes the failures reported by the horrible block/011 test.
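For reference, the wake-up handshake between the timeout path and the EH
thread boils down to the following condensed, self-contained sketch. The
demo_* names are illustrative only and are not symbols from this patch;
the real code below uses dev->eh_lock, dev->eh_in_recovery and
dev->ehandler, and performs the actual disable/reset where the sketch
only has a placeholder comment:

	/* Condensed sketch of the EH-thread wake-up handshake (illustrative names). */
	#include <linux/err.h>
	#include <linux/init.h>
	#include <linux/kthread.h>
	#include <linux/module.h>
	#include <linux/sched.h>
	#include <linux/spinlock.h>

	static DEFINE_SPINLOCK(demo_eh_lock);
	static bool demo_recovery_needed;
	static struct task_struct *demo_ehandler;

	/* Timeout path: mark recovery as pending and wake the EH thread. */
	static void demo_eh_schedule(void)
	{
		spin_lock(&demo_eh_lock);
		if (!demo_recovery_needed) {
			demo_recovery_needed = true;
			wake_up_process(demo_ehandler);
		}
		spin_unlock(&demo_eh_lock);
	}

	static int demo_eh_thread(void *data)
	{
		while (true) {
			/*
			 * Enter a non-running state before checking the stop
			 * flag so the wake-up issued by kthread_stop() cannot
			 * be missed.
			 */
			set_current_state(TASK_INTERRUPTIBLE);
			if (kthread_should_stop())
				break;

			spin_lock(&demo_eh_lock);
			if (!demo_recovery_needed) {
				spin_unlock(&demo_eh_lock);
				schedule();	/* sleep until demo_eh_schedule() wakes us */
				continue;
			}
			demo_recovery_needed = false;
			spin_unlock(&demo_eh_lock);

			__set_current_state(TASK_RUNNING);
			/* ... disable the controller and reset it synchronously ... */
		}
		__set_current_state(TASK_RUNNING);
		return 0;
	}

	static int __init demo_init(void)
	{
		demo_ehandler = kthread_run(demo_eh_thread, NULL, "demo_eh");
		return PTR_ERR_OR_ZERO(demo_ehandler);
	}

	static void __exit demo_exit(void)
	{
		kthread_stop(demo_ehandler);
	}

	module_init(demo_init);
	module_exit(demo_exit);
	MODULE_LICENSE("GPL");

The patch wires the same pattern up with kthread_run() at probe time and
kthread_stop() in nvme_remove(), as shown in the pci.c hunks below.
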
Cc: Jianchao Wang <jianchao.w.wang at oracle.com>
Cc: Christoph Hellwig <hch at lst.de>
Cc: Sagi Grimberg <sagi at grimberg.me>
Cc: linux-nvme at lists.infradead.org
Signed-off-by: Ming Lei <ming.lei at redhat.com>
---
drivers/nvme/host/core.c | 22 ++++++++
drivers/nvme/host/nvme.h | 3 +
drivers/nvme/host/pci.c | 140 +++++++++++++++++++++++++++++++++++++++++++----
3 files changed, 155 insertions(+), 10 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index f9028873298e..664a268accd1 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -3582,6 +3582,28 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_start_freeze);
+void nvme_unquiesce_timeout(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_unquiesce_timeout(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_unquiesce_timeout);
+
+void nvme_quiesce_timeout(struct nvme_ctrl *ctrl)
+{
+	struct nvme_ns *ns;
+
+	down_read(&ctrl->namespaces_rwsem);
+	list_for_each_entry(ns, &ctrl->namespaces, list)
+		blk_quiesce_timeout(ns->queue);
+	up_read(&ctrl->namespaces_rwsem);
+}
+EXPORT_SYMBOL_GPL(nvme_quiesce_timeout);
+
void nvme_stop_queues(struct nvme_ctrl *ctrl)
{
	struct nvme_ns *ns;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index a19b7f04ac24..956ee19ff403 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -19,6 +19,7 @@
#include <linux/pci.h>
#include <linux/kref.h>
#include <linux/blk-mq.h>
+#include <linux/blkdev.h>
#include <linux/lightnvm.h>
#include <linux/sed-opal.h>
#include <linux/fault-inject.h>
@@ -405,6 +406,8 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
		union nvme_result *res);
+void nvme_unquiesce_timeout(struct nvme_ctrl *ctrl);
+void nvme_quiesce_timeout(struct nvme_ctrl *ctrl);
void nvme_stop_queues(struct nvme_ctrl *ctrl);
void nvme_start_queues(struct nvme_ctrl *ctrl);
void nvme_kill_queues(struct nvme_ctrl *ctrl);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 8172ee584130..86c14d9051ff 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -29,6 +29,7 @@
#include <linux/types.h>
#include <linux/io-64-nonatomic-lo-hi.h>
#include <linux/sed-opal.h>
+#include <linux/kthread.h>
#include "nvme.h"
@@ -112,6 +113,11 @@ struct nvme_dev {
	dma_addr_t host_mem_descs_dma;
	struct nvme_host_mem_buf_desc *host_mem_descs;
	void **host_mem_desc_bufs;
+
+	spinlock_t eh_lock;
+	bool eh_in_recovery;
+	struct task_struct *ehandler;
+	struct work_struct post_eh_work;
};
static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@@ -1176,6 +1182,100 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
csts, result);
}
+static void nvme_eh_schedule(struct nvme_dev *dev)
+{
+	spin_lock(&dev->eh_lock);
+	if (!dev->eh_in_recovery) {
+		dev->eh_in_recovery = true;
+		wake_up_process(dev->ehandler);
+	}
+	spin_unlock(&dev->eh_lock);
+}
+
+static void nvme_eh_done(struct nvme_dev *dev)
+{
+	spin_lock(&dev->eh_lock);
+	if (dev->eh_in_recovery)
+		dev->eh_in_recovery = false;
+	spin_unlock(&dev->eh_lock);
+}
+
+static int nvme_error_handler(void *data)
+{
+	struct nvme_dev *dev = data;
+
+	while (true) {
+		/*
+		 * The sequence in kthread_stop() sets the stop flag first
+		 * and then wakes the process. To avoid missed wake-ups, the
+		 * task must always be in a non-running state before the
+		 * stop flag is checked.
+		 */
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (kthread_should_stop())
+			break;
+
+		spin_lock(&dev->eh_lock);
+		if (!dev->eh_in_recovery) {
+			spin_unlock(&dev->eh_lock);
+			schedule();
+			continue;
+		}
+		spin_unlock(&dev->eh_lock);
+
+		__set_current_state(TASK_RUNNING);
+
+		dev_info(dev->ctrl.device, "start eh recovery\n");
+
+		/* freeze queues before recovery */
+		nvme_start_freeze(&dev->ctrl);
+
+		nvme_dev_disable(dev, false);
+
+		/*
+		 * The reset no longer waits for IO completion, so it is
+		 * safe to reset the controller synchronously here.
+		 */
+		nvme_reset_ctrl_sync(&dev->ctrl);
+
+		dev_info(dev->ctrl.device, "eh recovery done\n");
+
+		/*
+		 * Drain IO and unfreeze queues in another context because
+		 * this IO may trigger timeouts again.
+		 */
+		queue_work(nvme_reset_wq, &dev->post_eh_work);
+	}
+	__set_current_state(TASK_RUNNING);
+	dev->ehandler = NULL;
+
+	return 0;
+}
+
+static void nvme_post_eh_work(struct work_struct *work)
+{
+	struct nvme_dev *dev =
+		container_of(work, struct nvme_dev, post_eh_work);
+
+	nvme_wait_freeze(&dev->ctrl);
+	nvme_unfreeze(&dev->ctrl);
+}
+
+static int nvme_eh_init(struct nvme_dev *dev)
+{
+	spin_lock_init(&dev->eh_lock);
+	INIT_WORK(&dev->post_eh_work, nvme_post_eh_work);
+
+	dev->ehandler = kthread_run(nvme_error_handler, dev,
+			"nvme_eh_%d", dev->ctrl.instance);
+	if (IS_ERR(dev->ehandler)) {
+		dev_err(dev->ctrl.device, "error handler thread failed to spawn, error = %ld\n",
+			PTR_ERR(dev->ehandler));
+		return PTR_ERR(dev->ehandler);
+	}
+	return 0;
+}
+
static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
{
	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1197,8 +1297,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
	 */
	if (nvme_should_reset(dev, csts)) {
		nvme_warn_reset(dev, csts);
-		nvme_dev_disable(dev, false);
-		nvme_reset_ctrl(&dev->ctrl);
+		nvme_eh_schedule(dev);
		return BLK_EH_HANDLED;
	}
@@ -1224,8 +1323,8 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
		dev_warn(dev->ctrl.device,
			 "I/O %d QID %d timeout, disable controller\n",
			 req->tag, nvmeq->qid);
-		nvme_dev_disable(dev, false);
		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
+		nvme_eh_schedule(dev);
		return BLK_EH_HANDLED;
	default:
		break;
@@ -1240,14 +1339,12 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
		dev_warn(dev->ctrl.device,
			 "I/O %d QID %d timeout, reset controller\n",
			 req->tag, nvmeq->qid);
-		nvme_dev_disable(dev, false);
-		nvme_reset_ctrl(&dev->ctrl);
-
		/*
		 * Mark the request as handled, since the inline shutdown
		 * forces all outstanding requests to complete.
		 */
		nvme_req(req)->flags |= NVME_REQ_CANCELLED;
+		nvme_eh_schedule(dev);
		return BLK_EH_HANDLED;
	}
@@ -2246,8 +2343,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
	if (pci_is_enabled(pdev)) {
		u32 csts = readl(dev->bar + NVME_REG_CSTS);
-		if (dev->ctrl.state == NVME_CTRL_LIVE ||
-		    dev->ctrl.state == NVME_CTRL_RESETTING) {
+		if (shutdown && (dev->ctrl.state == NVME_CTRL_LIVE ||
+		    dev->ctrl.state == NVME_CTRL_RESETTING)) {
			nvme_start_freeze(&dev->ctrl);
			frozen = true;
		}
@@ -2281,11 +2378,23 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
	for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
		nvme_suspend_queue(&dev->queues[i]);
+	/* safe to sync the timeout handler after the queues are quiesced */
+	nvme_quiesce_timeout(&dev->ctrl);
+	blk_quiesce_timeout(dev->ctrl.admin_q);
+
	nvme_pci_disable(dev);
+	/*
+	 * Both the timeout handler and the interrupt handler have been
+	 * drained, so all in-flight requests can be canceled now.
+	 */
	blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
	blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
+	/* all requests have been canceled, so re-enable timeout handling */
+	nvme_unquiesce_timeout(&dev->ctrl);
+	blk_unquiesce_timeout(dev->ctrl.admin_q);
+
	/*
	 * The driver will not be starting up queues again if shutting down so
	 * must flush all entered requests to their failed completion to avoid
@@ -2416,9 +2525,14 @@ static void nvme_reset_work(struct work_struct *work)
	if (result)
		goto out;
+	nvme_eh_done(dev);
+
	/*
	 * Keep the controller around but remove all namespaces if we don't have
	 * any working I/O queue.
+	 *
+	 * We no longer wait for IO completion here, so a synchronous reset
+	 * can be used safely anywhere.
	 */
	if (dev->online_queues < 2) {
		dev_warn(dev->ctrl.device, "IO queues not created\n");
@@ -2427,11 +2541,9 @@ static void nvme_reset_work(struct work_struct *work)
		new_state = NVME_CTRL_ADMIN_ONLY;
	} else {
		nvme_start_queues(&dev->ctrl);
-		nvme_wait_freeze(&dev->ctrl);
		/* hit this only when allocate tagset fails */
		if (nvme_dev_add(dev))
			new_state = NVME_CTRL_ADMIN_ONLY;
-		nvme_unfreeze(&dev->ctrl);
	}
	/*
@@ -2449,6 +2561,7 @@ static void nvme_reset_work(struct work_struct *work)
 out:
	nvme_remove_dead_ctrl(dev, result);
+	nvme_eh_done(dev);
}
static void nvme_remove_dead_ctrl_work(struct work_struct *work)
@@ -2590,10 +2703,15 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
	dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
+	if (nvme_eh_init(dev))
+		goto uninit_ctrl;
+
	nvme_reset_ctrl(&dev->ctrl);
	return 0;
+ uninit_ctrl:
+	nvme_uninit_ctrl(&dev->ctrl);
 release_pools:
	nvme_release_prp_pools(dev);
 unmap:
@@ -2647,6 +2765,8 @@ static void nvme_remove(struct pci_dev *pdev)
	nvme_stop_ctrl(&dev->ctrl);
	nvme_remove_namespaces(&dev->ctrl);
	nvme_dev_disable(dev, true);
+	if (dev->ehandler)
+		kthread_stop(dev->ehandler);
	nvme_free_host_mem(dev);
	nvme_dev_remove_admin(dev);
	nvme_free_queues(dev, 0);
--
2.9.5