completion timeouts with pin-based interrupts in QEMU hw/nvme

Klaus Jensen its at irrelevant.dk
Mon Jan 16 13:14:07 PST 2023


On Jan 12 14:10, Klaus Jensen wrote:
> Hi all (linux-nvme, qemu-devel, maintainers),
> 
> On QEMU riscv64, which does not use MSI/MSI-X and thus relies on
> pin-based interrupts, I'm seeing occasional completion timeouts, i.e.
> 
>   nvme nvme0: I/O 333 QID 1 timeout, completion polled
> 
> To rule out issues with shadow doorbells (which have been a source of
> frustration in the past), those are disabled here. FWIW, I'm also
> seeing the issue with shadow doorbells enabled.
> 
> 	diff --git a/hw/nvme/ctrl.c b/hw/nvme/ctrl.c
> 	index f25cc2c235e9..28d8e7f4b56c 100644
> 	--- a/hw/nvme/ctrl.c
> 	+++ b/hw/nvme/ctrl.c
> 	@@ -7407,7 +7407,7 @@ static void nvme_init_ctrl(NvmeCtrl *n, PCIDevice *pci_dev)
> 	     id->mdts = n->params.mdts;
> 	     id->ver = cpu_to_le32(NVME_SPEC_VER);
> 	     id->oacs =
> 	-        cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT | NVME_OACS_DBBUF);
> 	+        cpu_to_le16(NVME_OACS_NS_MGMT | NVME_OACS_FORMAT);
> 	     id->cntrltype = 0x1;
> 
> 	     /*
> 
> 
> I captured a trace from QEMU when this happens:
> 
> pci_nvme_mmio_write addr 0x1008 data 0x4e size 4
> pci_nvme_mmio_doorbell_sq sqid 1 new_tail 78
> pci_nvme_io_cmd cid 4428 nsid 0x1 sqid 1 opc 0x2 opname 'NVME_NVM_CMD_READ'
> pci_nvme_read cid 4428 nsid 1 nlb 32 count 16384 lba 0x1324
> pci_nvme_map_prp trans_len 4096 len 16384 prp1 0x80aca000 prp2 0x82474100 num_prps 5
> pci_nvme_map_addr addr 0x80aca000 len 4096
> pci_nvme_map_addr addr 0x80ac9000 len 4096
> pci_nvme_map_addr addr 0x80ac8000 len 4096
> pci_nvme_map_addr addr 0x80ac7000 len 4096
> pci_nvme_io_cmd cid 4429 nsid 0x1 sqid 1 opc 0x2 opname 'NVME_NVM_CMD_READ'
> pci_nvme_read cid 4429 nsid 1 nlb 224 count 114688 lba 0x1242
> pci_nvme_map_prp trans_len 4096 len 114688 prp1 0x80ae6000 prp2 0x82474000 num_prps 29
> pci_nvme_map_addr addr 0x80ae6000 len 4096
> pci_nvme_map_addr addr 0x80ae5000 len 4096
> pci_nvme_map_addr addr 0x80ae4000 len 4096
> pci_nvme_map_addr addr 0x80ae3000 len 4096
> pci_nvme_map_addr addr 0x80ae2000 len 4096
> pci_nvme_map_addr addr 0x80ae1000 len 4096
> pci_nvme_map_addr addr 0x80ae0000 len 4096
> pci_nvme_map_addr addr 0x80adf000 len 4096
> pci_nvme_map_addr addr 0x80ade000 len 4096
> pci_nvme_map_addr addr 0x80add000 len 4096
> pci_nvme_map_addr addr 0x80adc000 len 4096
> pci_nvme_map_addr addr 0x80adb000 len 4096
> pci_nvme_map_addr addr 0x80ada000 len 4096
> pci_nvme_map_addr addr 0x80ad9000 len 4096
> pci_nvme_map_addr addr 0x80ad8000 len 4096
> pci_nvme_map_addr addr 0x80ad7000 len 4096
> pci_nvme_map_addr addr 0x80ad6000 len 4096
> pci_nvme_map_addr addr 0x80ad5000 len 4096
> pci_nvme_map_addr addr 0x80ad4000 len 4096
> pci_nvme_map_addr addr 0x80ad3000 len 4096
> pci_nvme_map_addr addr 0x80ad2000 len 4096
> pci_nvme_map_addr addr 0x80ad1000 len 4096
> pci_nvme_map_addr addr 0x80ad0000 len 4096
> pci_nvme_map_addr addr 0x80acf000 len 4096
> pci_nvme_map_addr addr 0x80ace000 len 4096
> pci_nvme_map_addr addr 0x80acd000 len 4096
> pci_nvme_map_addr addr 0x80acc000 len 4096
> pci_nvme_map_addr addr 0x80acb000 len 4096
> pci_nvme_rw_cb cid 4428 blk 'd0'
> pci_nvme_rw_complete_cb cid 4428 blk 'd0'
> pci_nvme_enqueue_req_completion cid 4428 cqid 1 dw0 0x0 dw1 0x0 status 0x0
> [1]: pci_nvme_irq_pin pulsing IRQ pin
> pci_nvme_rw_cb cid 4429 blk 'd0'
> pci_nvme_rw_complete_cb cid 4429 blk 'd0'
> pci_nvme_enqueue_req_completion cid 4429 cqid 1 dw0 0x0 dw1 0x0 status 0x0
> [2]: pci_nvme_irq_pin pulsing IRQ pin
> [3]: pci_nvme_mmio_write addr 0x100c data 0x4d size 4
> [4]: pci_nvme_mmio_doorbell_cq cqid 1 new_head 77
> ---- TIMEOUT HERE (30s) ---
> [5]: pci_nvme_mmio_read addr 0x1c size 4
> [6]: pci_nvme_mmio_write addr 0x100c data 0x4e size 4
> [7]: pci_nvme_mmio_doorbell_cq cqid 1 new_head 78
> --- Interrupt deasserted (cq->tail == cq->head)
> [   31.757821] nvme nvme0: I/O 333 QID 1 timeout, completion polled
> 
> Following the timeout, everything returns to "normal" and both device
> and driver happily continue.
> 
> The pin-based interrupt logic in hw/nvme seems sound enough to me, so I
> am wondering if there is something going on with the kernel driver (but
> I certainly do not rule out that hw/nvme is at fault here, since
> pin-based interrupts have also been a source of several issues in the
> past).
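
For reference, the pin-side logic in hw/nvme amounts to a per-vector
irq_status bitmap gated by INTMS, with the INTx pin following the OR of
the unmasked bits. Roughly (my paraphrase of hw/nvme/ctrl.c, not
verbatim, and details vary between QEMU versions):

/*
 * Sketch: the pin is asserted while any unmasked CQ still has an
 * unacknowledged completion, and deasserted only once every such bit
 * has been cleared (i.e. the cq head doorbells have caught up).
 */
static void nvme_irq_check(NvmeCtrl *n)
{
    uint32_t intms = ldl_le_p(&n->bar.intms);

    if (msix_enabled(&n->parent_obj)) {
        return;
    }

    if (~intms & n->irq_status) {
        pci_irq_assert(&n->parent_obj);
    } else {
        pci_irq_deassert(&n->parent_obj);
    }
}

static void nvme_irq_assert(NvmeCtrl *n, NvmeCQueue *cq)
{
    if (cq->irq_enabled && !msix_enabled(&n->parent_obj)) {
        n->irq_status |= 1 << cq->vector;  /* latch this CQ's bit */
        nvme_irq_check(n);
    }
}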
> 
> What I'm thinking is that, following the interrupt in [1], the driver
> picks up the completion for cid 4428 but does not find cid 4429 in the
> queue since it has not been posted yet. Before the device sees a cq head
> doorbell write (which would cause the pin to be deasserted), it posts
> the completion for cid 4429, which simply keeps the interrupt asserted
> in [2]. The trace then shows the cq head doorbell update for cid 4428 in
> [3,4], and then we hit the timeout, since the driver is not aware that
> cid 4429 was posted in between (why is it not aware of this?). On timing
> out, the driver polls the queue, notices cid 4429, and updates the cq
> head doorbell in [5-7], causing the device to deassert the interrupt,
> and we are "back in shape".
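
For context: the driver decides CQE validity purely by the phase tag, so
a completion posted "in between" should simply be consumed on the next
pass through the handler. Roughly (paraphrasing drivers/nvme/host/pci.c,
not verbatim):

/*
 * A CQE is valid when its phase bit matches the queue's current phase.
 * The handler consumes valid CQEs, then writes the cq head doorbell;
 * a CQE posted after the last phase check is only noticed on the next
 * interrupt (or, as above, by the timeout-driven poll).
 */
static inline bool nvme_cqe_pending(struct nvme_queue *nvmeq)
{
    struct nvme_completion *cqe = &nvmeq->cqes[nvmeq->cq_head];

    return (le16_to_cpu(READ_ONCE(cqe->status)) & 1) == nvmeq->cq_phase;
}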
> 
> I'm observing this on 6.0 kernels and v6.2-rc3 (I have not tested
> <6.0). Tested on QEMU v7.0.0 (to rule out all the shadow doorbell
> optimizations) as well as QEMU nvme-next (infradead). In other words,
> this is not a recent regression in either project; potentially it has
> always been like this. I've not tested other platforms for now, but I
> would assume others using pin-based interrupts would observe the same.
> 
> Any ideas on how to shed some light on this issue from the kernel side
> of things?

I noticed that the Linux driver does not use the INTMS/INTMC registers
to mask interrupts on the controller while processing CQEs. While not
required by the spec, such masking is *recommended* in setups not using
MSI-X, to reduce the risk of spurious and/or missed interrupts.
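
For reference, INTMS/INTMC are write-1-to-set / write-1-to-clear views
of one per-vector interrupt mask (offsets 0x0c and 0x10 in the NVMe base
spec; the spec forbids touching them when MSI-X is in use). A minimal
sketch of the intended sequence, mirroring the patch below:

u32 mask = 1 << nvmeq->cq_vector;

writel(mask, dev->bar + NVME_REG_INTMS); /* mask vector; pin may deassert */
/* ... consume CQEs, write the cq head doorbell ... */
writel(mask, dev->bar + NVME_REG_INTMC); /* unmask; if new CQEs arrived in
                                          * the meantime, the (level) pin
                                          * asserts again and we get a fresh
                                          * interrupt instead of a lost one */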

With the patch below, across 100 boot iterations, no timeouts were
observed on QEMU-emulated riscv64 or mips64.
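
(For anyone wanting to reproduce: something along the lines of the
invocation below should do. This is a hypothetical setup, not the exact
command used for the runs above.)

qemu-system-riscv64 -M virt -m 2G -smp 2 \
    -kernel Image -append "root=/dev/nvme0n1 rw" \
    -drive file=nvme.img,if=none,id=d0,format=raw \
    -device nvme,serial=deadbeef,drive=d0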

No changes are required in the QEMU hw/nvme interrupt logic.


diff --git i/drivers/nvme/host/pci.c w/drivers/nvme/host/pci.c
index b13baccedb4a..75f6b87c4c3f 100644
--- i/drivers/nvme/host/pci.c
+++ w/drivers/nvme/host/pci.c
@@ -1128,6 +1128,27 @@ static inline int nvme_poll_cq(struct nvme_queue *nvmeq,
 }

 static irqreturn_t nvme_irq(int irq, void *data)
+{
+       struct nvme_queue *nvmeq = data;
+       struct nvme_dev *dev = nvmeq->dev;
+       u32 mask = 1 << nvmeq->cq_vector;
+       irqreturn_t ret = IRQ_NONE;
+       DEFINE_IO_COMP_BATCH(iob);
+
+       writel(mask, dev->bar + NVME_REG_INTMS);
+
+       if (nvme_poll_cq(nvmeq, &iob)) {
+               if (!rq_list_empty(iob.req_list))
+                       nvme_pci_complete_batch(&iob);
+               ret = IRQ_HANDLED;
+       }
+
+       writel(mask, dev->bar + NVME_REG_INTMC);
+
+       return ret;
+}
+
+static irqreturn_t nvme_irq_msix(int irq, void *data)
 {
        struct nvme_queue *nvmeq = data;
        DEFINE_IO_COMP_BATCH(iob);
@@ -1602,12 +1623,13 @@ static int queue_request_irq(struct nvme_queue *nvmeq)
 {
        struct pci_dev *pdev = to_pci_dev(nvmeq->dev->dev);
        int nr = nvmeq->dev->ctrl.instance;
+       irq_handler_t handler = pdev->msix_enabled ? nvme_irq_msix : nvme_irq;

        if (use_threaded_interrupts) {
                return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq_check,
-                               nvme_irq, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
+                               handler, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
        } else {
-               return pci_request_irq(pdev, nvmeq->cq_vector, nvme_irq,
+               return pci_request_irq(pdev, nvmeq->cq_vector, handler,
                                NULL, nvmeq, "nvme%dq%d", nr, nvmeq->qid);
        }
 }
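
Note the reason for splitting the handler: the spec does not allow
INTMS/INTMC to be accessed while MSI-X is in use (the per-vector MSI-X
table masks apply instead), so the masking writes are confined to the
non-MSI-X handler, and nvme_irq_msix keeps the previous handler body
unchanged.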

