[PATCH] nvme/pci: Remove watchdog timer
Christoph Hellwig
hch at lst.de
Sun May 28 02:41:52 PDT 2017
On Fri, May 26, 2017 at 08:16:28AM -0400, Keith Busch wrote:
> The controller status polling was added to preemptively reset a failed
> controller. This early detection would allow commands that would normally
> timeout a chance for a retry, or find broken links when the platform
> didn't support hotplug.
>
> This once-per-second MMIO read, however, created more problems than
> it solves. This often races with PCIe Hotplug events that required
> complicated syncing between work queues, frequently triggered PCIe
> Completion Timeout errors that also lead to fatal machine checks, and
> unnecessarily disrupts low power modes by running on idle controllers.
>
> This patch removes the watchdog timer, and instead checks controller
> health only on an IO timeout when we have a reason to believe something
> is wrong. If the controller is failed, the driver will disable immediately
> and request scheduling a reset.
>
> Suggested-by: Andy Lutomirski <luto at amacapital.net>
> Signed-off-by: Keith Busch <keith.busch at intel.com>
> Cc: Christoph Hellwig <hch at lst.de>
> ---
> drivers/nvme/host/pci.c | 123 ++++++++++++++++++++++--------------------------
> 1 file changed, 56 insertions(+), 67 deletions(-)
>
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index d52701df..eedb9b4 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -94,7 +94,6 @@ struct nvme_dev {
> void __iomem *bar;
> struct work_struct reset_work;
> struct work_struct remove_work;
> - struct timer_list watchdog_timer;
> struct mutex shutdown_lock;
> bool subsystem;
> void __iomem *cmb;
> @@ -950,6 +949,51 @@ static void abort_endio(struct request *req, int error)
> blk_mq_free_request(req);
> }
>
> +static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
> +{
> +
> + /* If true, indicates loss of adapter communication, possibly by a
> + * NVMe Subsystem reset.
> + */
> + bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
> +
> + /* If there is a reset ongoing, we shouldn't reset again. */
> + if (work_busy(&dev->reset_work))
> + return false;
> +
> + /* We shouldn't reset unless the controller is on fatal error state
> + * _or_ if we lost the communication with it.
> + */
> + if (!(csts & NVME_CSTS_CFS) && !nssro)
> + return false;
> +
> + /* If PCI error recovery process is happening, we cannot reset or
> + * the recovery mechanism will surely fail.
> + */
> + if (pci_channel_offline(to_pci_dev(dev->dev)))
> + return false;
> +
> + return true;
> +}
> +
> +static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
> +{
> + /* Read a config register to help see what died. */
> + u16 pci_status;
> + int result;
> +
> + result = pci_read_config_word(to_pci_dev(dev->dev), PCI_STATUS,
> + &pci_status);
> + if (result == PCIBIOS_SUCCESSFUL)
> + dev_warn(dev->ctrl.device,
> + "controller is down; will reset: CSTS=0x%x, PCI_STATUS=0x%hx\n",
> + csts, pci_status);
> + else
> + dev_warn(dev->ctrl.device,
> + "controller is down; will reset: CSTS=0x%x, PCI_STATUS read failed (%d)\n",
> + csts, result);
> +}
> +
> static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
> {
> struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> @@ -957,6 +1001,17 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
> struct nvme_dev *dev = nvmeq->dev;
> struct request *abort_req;
> struct nvme_command cmd;
> + u32 csts = readl(dev->bar + NVME_REG_CSTS);
> +
> + /*
> + * Reset immediately if the controller is failed
s/is/has/ ?
Otherwise this looks good to me, but I'd like to defer it until
we have the multiple reset issue sorted out, to avoid conflicts.
More information about the Linux-nvme
mailing list