[PATCH V4 3/5] nvme-pci: avoid nvme_dev_disable to be invoked in nvme_timeout
jianchao.wang
jianchao.w.wang at oracle.com
Tue Mar 13 06:29:55 PDT 2018
Hi Keith
Would you please take a look at this patch.
I really need your suggestion on this.
Sincerely
Jianchao
On 03/09/2018 10:01 AM, jianchao.wang wrote:
> Hi Keith
>
> Can I have the honor of getting your comment on this patch?
>
> Thanks in advance
> Jianchao
>
> On 03/08/2018 02:19 PM, Jianchao Wang wrote:
>> nvme_dev_disable will issue command on adminq to clear HMB and
>> delete io cq/sqs, maybe more in the future. When adminq no response,
>> it has to depends on timeout path. However, nvme_timeout has to
>> invoke nvme_dev_disable before return, so that the DMA mappings
>> could be released safely. This will introduce dangerous circular
>> dependence. Moreover, the whole nvme_dev_disable is under
>> shutdown_lock even waiting for the command, this makes things
>> worse.
>>
>> To avoid this, this patch refactors the nvme_timeout. The basic
>> principle is:
>> - When need to schedule reset_work, hand over expired requests
>> to nvme_dev_disable. They will be completed after the controller
>> is disabled/shtudown.
>> - When requests from nvme_dev_disable and nvme_reset_work expires,
>> disable the controller directly then the request could be completed
>> to wakeup the waiter. nvme_pci_disable_ctrl_directly is introduced
>> for this, it doesn't send commands on adminq and the shutdown_lock
>> is not needed here, because the nvme_abort_requests_sync in
>> nvme_dev_disable could synchronize with nvme_timeout.
>>
>> Signed-off-by: Jianchao Wang <jianchao.w.wang at oracle.com>
>> ---
>> drivers/nvme/host/pci.c | 199 ++++++++++++++++++++++++++++++++----------------
>> 1 file changed, 134 insertions(+), 65 deletions(-)
>>
>> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
>> index e186158..ce09057 100644
>> --- a/drivers/nvme/host/pci.c
>> +++ b/drivers/nvme/host/pci.c
>> @@ -70,6 +70,7 @@ struct nvme_queue;
>>
>> static void nvme_process_cq(struct nvme_queue *nvmeq);
>> static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
>> +static void nvme_pci_disable_ctrl_directly(struct nvme_dev *dev);
>>
>> /*
>> * Represents an NVM Express device. Each nvme_dev is a PCI function.
>> @@ -98,6 +99,7 @@ struct nvme_dev {
>> u32 cmbloc;
>> struct nvme_ctrl ctrl;
>> struct completion ioq_wait;
>> + bool inflight_flushed;
>>
>> /* shadow doorbell buffer support: */
>> u32 *dbbuf_dbs;
>> @@ -1180,73 +1182,13 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts)
>> csts, result);
>> }
>>
>> -static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
>> +static enum blk_eh_timer_return nvme_pci_abort_io_req(struct request *req)
>> {
>> struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>> struct nvme_queue *nvmeq = iod->nvmeq;
>> struct nvme_dev *dev = nvmeq->dev;
>> - struct request *abort_req;
>> struct nvme_command cmd;
>> - u32 csts = readl(dev->bar + NVME_REG_CSTS);
>> -
>> - /*
>> - * Reset immediately if the controller is failed
>> - */
>> - if (nvme_should_reset(dev, csts)) {
>> - nvme_warn_reset(dev, csts);
>> - nvme_dev_disable(dev, false);
>> - nvme_reset_ctrl(&dev->ctrl);
>> - return BLK_EH_HANDLED;
>> - }
>> -
>> - /*
>> - * Did we miss an interrupt?
>> - */
>> - if (__nvme_poll(nvmeq, req->tag)) {
>> - dev_warn(dev->ctrl.device,
>> - "I/O %d QID %d timeout, completion polled\n",
>> - req->tag, nvmeq->qid);
>> - return BLK_EH_HANDLED;
>> - }
>> -
>> - /*
>> - * Shutdown immediately if controller times out while starting. The
>> - * reset work will see the pci device disabled when it gets the forced
>> - * cancellation error. All outstanding requests are completed on
>> - * shutdown, so we return BLK_EH_HANDLED.
>> - */
>> - switch (dev->ctrl.state) {
>> - case NVME_CTRL_CONNECTING:
>> - case NVME_CTRL_RESETTING:
>> - dev_warn(dev->ctrl.device,
>> - "I/O %d QID %d timeout, disable controller\n",
>> - req->tag, nvmeq->qid);
>> - nvme_dev_disable(dev, false);
>> - set_bit(NVME_REQ_CANCELLED, &nvme_req(req)->flags);
>> - return BLK_EH_HANDLED;
>> - default:
>> - break;
>> - }
>> -
>> - /*
>> - * Shutdown the controller immediately and schedule a reset if the
>> - * command was already aborted once before and still hasn't been
>> - * returned to the driver, or if this is the admin queue.
>> - */
>> - if (!nvmeq->qid || iod->aborted) {
>> - dev_warn(dev->ctrl.device,
>> - "I/O %d QID %d timeout, reset controller\n",
>> - req->tag, nvmeq->qid);
>> - nvme_dev_disable(dev, false);
>> - nvme_reset_ctrl(&dev->ctrl);
>> -
>> - /*
>> - * Mark the request as handled, since the inline shutdown
>> - * forces all outstanding requests to complete.
>> - */
>> - set_bit(NVME_REQ_CANCELLED, &nvme_req(req)->flags);
>> - return BLK_EH_HANDLED;
>> - }
>> + struct request *abort_req;
>>
>> if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) {
>> atomic_inc(&dev->ctrl.abort_limit);
>> @@ -1282,6 +1224,105 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
>> return BLK_EH_RESET_TIMER;
>> }
>>
>> +static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved)
>> +{
>> + struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>> + struct nvme_queue *nvmeq = iod->nvmeq;
>> + struct nvme_dev *dev = nvmeq->dev;
>> + u32 csts = readl(dev->bar + NVME_REG_CSTS);
>> + enum {ABORT, RESET, DISABLE} action;
>> + enum blk_eh_timer_return ret;
>> + /*
>> + * This is for nvme_abort_req. This request will be completed
>> + * by nvme_flush_aborted_requests after the controller is
>> + * disabled/shutdown
>> + */
>> + if (test_bit(NVME_REQ_ABORTED, &nvme_req(req)->flags))
>> + return BLK_EH_NOT_HANDLED;
>> +
>> + /*
>> + * Reset immediately if the controller is failed.
>> + * Defer the completion to nvme_flush_aborted_requests.
>> + */
>> + if (nvme_should_reset(dev, csts)) {
>> + nvme_warn_reset(dev, csts);
>> + nvme_reset_ctrl(&dev->ctrl);
>> + return BLK_EH_RESET_TIMER;
>> + }
>> +
>> + /*
>> + * Did we miss an interrupt?
>> + */
>> + if (__nvme_poll(nvmeq, req->tag)) {
>> + dev_warn(dev->ctrl.device,
>> + "I/O %d QID %d timeout, completion polled\n",
>> + req->tag, nvmeq->qid);
>> + return BLK_EH_HANDLED;
>> + }
>> +
>> + if (nvmeq->qid) {
>> + if (dev->ctrl.state == NVME_CTRL_RESETTING ||
>> + iod->aborted)
>> + action = RESET;
>> + else
>> + action = ABORT;
>> + } else {
>> + /*
>> + * Disable immediately if controller times out while disabling/
>> + * shuting down/starting. The nvme_dev_disable/nvme_reset_work
>> + * will see the error.
>> + * Note: inflight_flushed is set in nvme_dev_disable when it
>> + * abort all the inflight requests. Introducing this flag is due
>> + * to there is no state to represent the shutdown procedure.
>> + */
>> + if (dev->ctrl.state == NVME_CTRL_CONNECTING ||
>> + dev->inflight_flushed)
>> + action = DISABLE;
>> + else
>> + action = RESET;
>> + }
>> +
>> + switch (action) {
>> + case ABORT:
>> + ret = nvme_pci_abort_io_req(req);
>> + break;
>> + case RESET:
>> + dev_warn(dev->ctrl.device,
>> + "I/O %d QID %d timeout, reset controller\n",
>> + req->tag, nvmeq->qid);
>> + set_bit(NVME_REQ_CANCELLED, &nvme_req(req)->flags);
>> + nvme_reset_ctrl(&dev->ctrl);
>> + /*
>> + * The reset work will take over this request. nvme_abort_req
>> + * employs blk_abort_request to force the request to be timed
>> + * out. So we need to return BLK_EH_RESET_TIMER then the
>> + * RQF_MQ_TIMEOUT_EXPIRED could be cleared.
>> + */
>> + ret = BLK_EH_RESET_TIMER;
>> + break;
>> + case DISABLE:
>> + /*
>> + * We disable the controller directly here so that we could
>> + * complete the request safely to wake up the nvme_dev_disable/
>> + * nvme_reset_work who is waiting on adminq. We cannot return
>> + * BLK_EH_RESET_TIMER to depend on error recovery procedure
>> + * because it is waiting for timeout path.
>> + */
>> + dev_warn(dev->ctrl.device,
>> + "I/O %d QID %d timeout, disable controller\n",
>> + req->tag, nvmeq->qid);
>> + nvme_pci_disable_ctrl_directly(dev);
>> + set_bit(NVME_REQ_CANCELLED, &nvme_req(req)->flags);
>> + ret = BLK_EH_HANDLED;
>> + break;
>> + default:
>> + WARN_ON(1);
>> + break;
>> + }
>> +
>> + return ret;
>> +}
>> +
>> static void nvme_free_queue(struct nvme_queue *nvmeq)
>> {
>> dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth),
>> @@ -2169,6 +2210,33 @@ static void nvme_pci_disable(struct nvme_dev *dev)
>> }
>> }
>>
>> +/*
>> + * This is only invoked by nvme_timeout. shutdown_lock is not needed
>> + * here because nvme_abort_requests_sync in nvme_dev_disable will
>> + * synchronize with the timeout path.
>> + */
>> +static void nvme_pci_disable_ctrl_directly(struct nvme_dev *dev)
>> +{
>> + int i;
>> + struct pci_dev *pdev = to_pci_dev(dev->dev);
>> + bool dead = true;
>> +
>> + if (pci_is_enabled(pdev)) {
>> + u32 csts = readl(dev->bar + NVME_REG_CSTS);
>> +
>> + dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
>> + pdev->error_state != pci_channel_io_normal);
>> +
>> + if (!dead)
>> + nvme_disable_ctrl(&dev->ctrl, dev->ctrl.cap);
>> + }
>> +
>> + for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
>> + nvme_suspend_queue(&dev->queues[i]);
>> +
>> + nvme_pci_disable(dev);
>> +}
>> +
>> static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
>> {
>> int i;
>> @@ -2205,6 +2273,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
>>
>> }
>> nvme_stop_queues(&dev->ctrl);
>> + nvme_abort_requests_sync(&dev->ctrl);
>> + dev->inflight_flushed = true;
>>
>> if (!dead) {
>> nvme_disable_io_queues(dev);
>> @@ -2215,9 +2285,8 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
>>
>> nvme_pci_disable(dev);
>>
>> - blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
>> - blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
>> -
>> + nvme_flush_aborted_requests(&dev->ctrl);
>> + dev->inflight_flushed = false;
>> /*
>> * The driver will not be starting up queues again if shutting down so
>> * must flush all entered requests to their failed completion to avoid
>>
>
> _______________________________________________
> Linux-nvme mailing list
> Linux-nvme at lists.infradead.org
> https://urldefense.proofpoint.com/v2/url?u=http-3A__lists.infradead.org_mailman_listinfo_linux-2Dnvme&d=DwICAg&c=RoP1YumCXCgaWHvlZYR8PZh8Bv7qIrMUB65eapI_JnE&r=7WdAxUBeiTUTCy8v-7zXyr4qk7sx26ATvfo6QSTvZyQ&m=5Mw5gjePWHT_YZ77OroeGMhDmXLF0LW6EtV-hBMRYas&s=VM31J1nvWkmoWzuZIHV0f5nlrSW0ZTKUQ1TjLwbB18g&e=
>
More information about the Linux-nvme
mailing list