[PATCH 5/5] nvme/pci: Complete all stuck requests
Marc MERLIN
marc at merlins.org
Wed Feb 15 10:14:35 PST 2017
On Fri, Feb 10, 2017 at 06:15:53PM -0500, Keith Busch wrote:
> If the nvme driver is shutting down, it will not start the queues back
> up until asked to resume. If the block layer has entered requests and
> gets a CPU hot plug event prior to the resume event, it will wait for
> those requests to exit. Those requests will never exit since the NVMe
> driver is quiesced, creating a deadlock.
>
> This patch fixes that by freezing the queue and flushing all entered
> requests to either their natural completion or a forced demise. We
> only need to do this when requesting to shut down the controller since
> we will not be starting the IO queues back up again.
>
> Signed-off-by: Keith Busch <keith.busch at intel.com>
Tested-by: Marc MERLIN <marc at merlins.org>
> ---
> drivers/nvme/host/core.c | 33 +++++++++++++++++++++++++++++++++
> drivers/nvme/host/nvme.h | 3 +++
> drivers/nvme/host/pci.c | 34 +++++++++++++++++++++++++++++++++-
> 3 files changed, 69 insertions(+), 1 deletion(-)
>
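For anyone reading along, the deadlock in the commit message is easy to model
outside the kernel. Below is a toy userspace analogy (not kernel code; every
name in it is made up): a dispatcher that is "quiesced" never drains the
queue, so a hot-unplug-style waiter that blocks until the queue is empty
waits forever.

    /* toy_deadlock.c - userspace analogy of the stuck-request deadlock.
     * Build: cc -pthread -o toy_deadlock toy_deadlock.c */
    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  drained = PTHREAD_COND_INITIALIZER;
    static int  entered_requests = 1;  /* a request already entered */
    static bool quiesced = true;       /* the driver stopped the queues */

    /* Stands in for the dispatcher: while quiesced it returns without
     * completing anything, so "drained" is never signalled. */
    static void *dispatcher(void *unused)
    {
            pthread_mutex_lock(&lock);
            if (!quiesced) {
                    entered_requests = 0;
                    pthread_cond_signal(&drained);
            }
            pthread_mutex_unlock(&lock);
            return NULL;
    }

    /* Stands in for blk-mq's hot-cpu handling: waits for entered
     * requests to exit, which never happens here. */
    static void *hotplug_waiter(void *unused)
    {
            pthread_mutex_lock(&lock);
            while (entered_requests > 0)
                    pthread_cond_wait(&drained, &lock);
            pthread_mutex_unlock(&lock);
            return NULL;
    }

    int main(void)
    {
            pthread_t d, w;

            pthread_create(&d, NULL, dispatcher, NULL);
            pthread_create(&w, NULL, hotplug_waiter, NULL);
            pthread_join(d, NULL);
            pthread_join(w, NULL);  /* hangs: this is the deadlock */
            return 0;
    }

The fix below is the moral equivalent of flipping the quiesced flag off just
long enough to run everything that already entered to completion (or
failure), which is what the freeze/drain sequence in nvme_dev_disable() does.
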
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index c302270..1888451 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -2125,6 +2125,39 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl)
> }
> EXPORT_SYMBOL_GPL(nvme_kill_queues);
>
> +void nvme_unfreeze(struct nvme_ctrl *ctrl)
> +{
> + struct nvme_ns *ns;
> +
> + mutex_lock(&ctrl->namespaces_mutex);
> + list_for_each_entry(ns, &ctrl->namespaces, list)
> + blk_mq_unfreeze_queue(ns->queue);
> + mutex_unlock(&ctrl->namespaces_mutex);
> +}
> +EXPORT_SYMBOL_GPL(nvme_unfreeze);
> +
> +void nvme_wait_freeze(struct nvme_ctrl *ctrl)
> +{
> + struct nvme_ns *ns;
> +
> + mutex_lock(&ctrl->namespaces_mutex);
> + list_for_each_entry(ns, &ctrl->namespaces, list)
> + blk_mq_freeze_queue(ns->queue);
> + mutex_unlock(&ctrl->namespaces_mutex);
> +}
> +EXPORT_SYMBOL_GPL(nvme_wait_freeze);
> +
> +void nvme_start_freeze(struct nvme_ctrl *ctrl)
> +{
> + struct nvme_ns *ns;
> +
> + mutex_lock(&ctrl->namespaces_mutex);
> + list_for_each_entry(ns, &ctrl->namespaces, list)
> + blk_mq_freeze_queue_start(ns->queue);
> + mutex_unlock(&ctrl->namespaces_mutex);
> +}
> +EXPORT_SYMBOL_GPL(nvme_start_freeze);
> +
> void nvme_stop_queues(struct nvme_ctrl *ctrl)
> {
> struct nvme_ns *ns;
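For reference, the helper trio above is meant to be used as a sequence. Here
is a minimal sketch of the caller pattern, condensed from the
nvme_dev_disable() shutdown path further down (error handling elided, the
function name is mine):

    /* Sketch of the intended caller pattern for the new freeze API,
     * condensed from nvme_dev_disable()'s shutdown path below. */
    static void freeze_drain_sketch(struct nvme_dev *dev)
    {
            nvme_start_freeze(&dev->ctrl);  /* depth 1: hold off new submitters */
            nvme_stop_queues(&dev->ctrl);   /* quiesce the hw queues */

            /* ... disable the controller, cancel outstanding requests ... */

            nvme_start_queues(&dev->ctrl);  /* let entered requests run out */
            nvme_wait_freeze(&dev->ctrl);   /* depth 2: wait for them to exit */
            nvme_unfreeze(&dev->ctrl);      /* back down to depth 1 */
            nvme_stop_queues(&dev->ctrl);   /* quiesce again for shutdown */
    }

The queues are left frozen at depth 1 across the shutdown; the resume path
(the reset_work hunk at the end) drops that last reference.
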
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 569cba1..7408373 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -289,6 +289,9 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
> void nvme_stop_queues(struct nvme_ctrl *ctrl);
> void nvme_start_queues(struct nvme_ctrl *ctrl);
> void nvme_kill_queues(struct nvme_ctrl *ctrl);
> +void nvme_unfreeze(struct nvme_ctrl *ctrl);
> +void nvme_wait_freeze(struct nvme_ctrl *ctrl);
> +void nvme_start_freeze(struct nvme_ctrl *ctrl);
>
> #define NVME_QID_ANY -1
> struct request *nvme_alloc_request(struct request_queue *q,
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 92010fd..b6451d8 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -1672,12 +1672,15 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
> {
> int i, queues;
> u32 csts = -1;
> + bool drain_queue = pci_is_enabled(to_pci_dev(dev->dev));
>
> del_timer_sync(&dev->watchdog_timer);
> cancel_work_sync(&dev->reset_work);
>
> mutex_lock(&dev->shutdown_lock);
> - if (pci_is_enabled(to_pci_dev(dev->dev))) {
> + if (drain_queue) {
> + if (shutdown)
> + nvme_start_freeze(&dev->ctrl);
> nvme_stop_queues(&dev->ctrl);
> csts = readl(dev->bar + NVME_REG_CSTS);
> }
> @@ -1701,6 +1704,25 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
>
> blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
> blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
> +
> + /*
> + * If shutting down, the driver will not be starting up queues again,
> + * so it must drain all entered requests to their demise to avoid
> + * deadlocking the blk-mq hot-cpu notifier.
> + */
> + if (drain_queue && shutdown) {
> + nvme_start_queues(&dev->ctrl);
> + /*
> + * Waiting for frozen increases the freeze depth. Since we
> + * already started the freeze earlier in this function to stop
> + * incoming requests, we have to unfreeze once frozen to bring
> + * the depth back down to the desired level.
> + */
> + nvme_wait_freeze(&dev->ctrl);
> + nvme_unfreeze(&dev->ctrl);
> + nvme_stop_queues(&dev->ctrl);
> + }
> +
> mutex_unlock(&dev->shutdown_lock);
> }
>
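The depth accounting that comment describes, spelled out as a conceptual
sketch (not blk-mq's real implementation, which tracks this with an atomic
counter plus a percpu_ref):

    /* Toy model of blk-mq freeze depth for this patch's sequence. */
    struct toy_queue { int freeze_depth; };

    static void toy_freeze_start(struct toy_queue *q)
    {
            q->freeze_depth++;          /* new submissions now held off */
    }

    static void toy_freeze(struct toy_queue *q)  /* ~ blk_mq_freeze_queue() */
    {
            toy_freeze_start(q);        /* waiting also takes a reference */
            /* ... then block until all entered requests complete ... */
    }

    static void toy_unfreeze(struct toy_queue *q)
    {
            q->freeze_depth--;          /* at depth 0, submissions resume */
    }

So the shutdown path runs start_freeze (depth 1), then wait_freeze (depth 2),
then unfreeze (depth 1), and the queue stays frozen at depth 1 until resume.
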
> @@ -1817,6 +1839,16 @@ static void nvme_reset_work(struct work_struct *work)
> } else {
> nvme_start_queues(&dev->ctrl);
> nvme_dev_add(dev);
> +
> + /*
> + * If we are resuming from suspend, the queues were frozen
> + * to prevent blk-mq's hot CPU notifier from getting stuck on
> + * requests that entered a queue that NVMe had quiesced. Now
> + * that we are resuming and have notified blk-mq of the new h/w
> + * context queue count, it is safe to unfreeze the queues.
> + */
> + if (was_suspend)
> + nvme_unfreeze(&dev->ctrl);
> }
>
> if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
> --
> 1.8.3.1
>
>
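For completeness, the resume side of the pairing: the depth-1 freeze taken at
shutdown is only dropped after blk-mq has been told the (possibly changed)
hw context queue count. Condensed from the reset_work hunk above (the
function name is mine):

    /* Drop the shutdown-time freeze reference only after the
     * tagset / hw queue count has been re-registered with blk-mq. */
    static void resume_unfreeze_sketch(struct nvme_dev *dev, bool was_suspend)
    {
            nvme_start_queues(&dev->ctrl);     /* unquiesce the hw queues */
            nvme_dev_add(dev);                 /* update tagset / hctx count */
            if (was_suspend)
                    nvme_unfreeze(&dev->ctrl); /* depth 1 -> 0 */
    }
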
--
"A mouse is a device used to point at the xterm you want to type in" - A.S.R.
Microsoft is to operating systems ....
.... what McDonalds is to gourmet cooking
Home page: http://marc.merlins.org/ | PGP 1024R/763BE901