[PATCH -qemu] nvme: support Google vendor extension
Ming Lin
mlin at kernel.org
Tue Nov 17 21:47:04 PST 2015
From: Mihai Rusu <dizzy at google.com>
This implements the device side of an NVMe vendor extension that
reduces the number of MMIO writes, which can yield a very large
performance benefit in virtualized environments.
See the following link for a description of the mechanism and the
kernel NVMe driver changes to support this vendor extension:
http://lists.infradead.org/pipermail/linux-nvme/2014-July/001076.html
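For context, the guest side mirrors virtio's eventidx scheme: the
driver always writes the new tail to the shadow doorbell buffer, but
only issues a real MMIO doorbell write when the tail passes the
eventidx value the device publishes. A rough guest-side sketch
(hypothetical names such as sq_db_shadow/sq_eventidx; the actual
driver changes are in the link above):

static inline int nvme_need_mmio_db(uint16_t event_idx, uint16_t new_tail,
                                    uint16_t old_tail)
{
    /* True when event_idx lies in (old_tail, new_tail], modulo wrap,
     * i.e. the device asked to be notified somewhere in this range.
     * Same arithmetic as virtio's vring_need_event(). */
    return (uint16_t)(new_tail - event_idx - 1) <
           (uint16_t)(new_tail - old_tail);
}

static void nvme_ring_sq(struct nvme_queue *q, uint16_t new_tail)
{
    uint16_t old_tail = *q->sq_db_shadow;

    *q->sq_db_shadow = new_tail;           /* always update the shadow copy */
    wmb();                                 /* shadow visible before doorbell */
    if (nvme_need_mmio_db(*q->sq_eventidx, new_tail, old_tail))
        writel(new_tail, q->sq_doorbell);  /* MMIO exit only when needed */
}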
On my workstation (3.2GHz Xeon E5-1650), running QEMU:
$ bin/opt/native/x86_64-softmmu/qemu-system-x86_64 \
-enable-kvm -m 2048 -smp 4 \
-drive if=virtio,file=debian.raw,cache=none \
-drive file=nvme.raw,if=none,id=nvme-dev \
-device nvme,drive=nvme-dev,serial=nvme-serial
Using "fio":
vm # fio -time_based --name=benchmark --ioengine=libaio --iodepth=32 \
--numjobs=1 --runtime=30 --blocksize=4k --filename=/dev/nvme0n1 \
--nrfiles=1 --invalidate=1 --verify=0 --direct=1 --rw=randread
I get about 20k IOPS with the original code and about 85k IOPS with
the vendor extension changes applied (with a 3.14-based guest kernel
that supports the vendor extension).
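The patch also exposes the PCI IDs as device properties (defaulting to
Google's vendor ID); if needed, the previous Intel identity can be
restored on the command line, e.g.:

-device nvme,drive=nvme-dev,serial=nvme-serial,vid=0x8086,did=0x5845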
Signed-off-by: Mihai Rusu <dizzy at google.com>
[fixed for merging into a different tree; added VID/DID params]
Signed-off-by: Keith Busch <keith.busch at intel.com>
[mlin: port for upstream]
Signed-off-by: Ming Lin <mlin at kernel.org>
---
hw/block/nvme.c | 92 ++++++++++++++++++++++++++++++++++++++++++++++++++++++---
hw/block/nvme.h | 18 +++++++++++
2 files changed, 106 insertions(+), 4 deletions(-)
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 169e4fa..3e1c38d 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -20,6 +20,7 @@
* -device nvme,drive=<drive_id>,serial=<serial>,id=<id[optional]>
*/
+#include <exec/memory.h>
#include <hw/block/block.h>
#include <hw/hw.h>
#include <hw/pci/msix.h>
@@ -158,6 +159,14 @@ static uint16_t nvme_dma_read_prp(NvmeCtrl *n, uint8_t *ptr, uint32_t len,
return NVME_SUCCESS;
}
+static void nvme_update_cq_head(NvmeCQueue *cq)
+{
+ if (cq->db_addr) {
+ pci_dma_read(&cq->ctrl->parent_obj, cq->db_addr,
+ &cq->head, sizeof(cq->head));
+ }
+}
+
static void nvme_post_cqes(void *opaque)
{
NvmeCQueue *cq = opaque;
@@ -168,6 +177,8 @@ static void nvme_post_cqes(void *opaque)
NvmeSQueue *sq;
hwaddr addr;
+ nvme_update_cq_head(cq);
+
if (nvme_cq_full(cq)) {
break;
}
@@ -350,6 +361,8 @@ static void nvme_init_sq(NvmeSQueue *sq, NvmeCtrl *n, uint64_t dma_addr,
QTAILQ_INSERT_TAIL(&(sq->req_list), &sq->io_req[i], entry);
}
sq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_process_sq, sq);
+ sq->db_addr = 0;
+ sq->eventidx_addr = 0;
assert(n->cq[cqid]);
cq = n->cq[cqid];
@@ -430,6 +443,8 @@ static void nvme_init_cq(NvmeCQueue *cq, NvmeCtrl *n, uint64_t dma_addr,
cq->head = cq->tail = 0;
QTAILQ_INIT(&cq->req_list);
QTAILQ_INIT(&cq->sq_list);
+ cq->db_addr = 0;
+ cq->eventidx_addr = 0;
msix_vector_use(&n->parent_obj, cq->vector);
n->cq[cqid] = cq;
cq->timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, nvme_post_cqes, cq);
@@ -528,6 +543,40 @@ static uint16_t nvme_set_feature(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
return NVME_SUCCESS;
}
+static uint16_t nvme_set_db_memory(NvmeCtrl *n, const NvmeCmd *cmd)
+{
+ uint64_t db_addr = le64_to_cpu(cmd->prp1);
+ uint64_t eventidx_addr = le64_to_cpu(cmd->prp2);
+ int i;
+
+ /* Addresses should not be NULL and should be page aligned. */
+ if (db_addr == 0 || db_addr & (n->page_size - 1) ||
+ eventidx_addr == 0 || eventidx_addr & (n->page_size - 1)) {
+ return NVME_INVALID_MEMORY_ADDRESS | NVME_DNR;
+ }
+
+ /* This assumes all I/O queues are created before this command is handled.
+ * We skip the admin queues. */
+ for (i = 1; i < n->num_queues; i++) {
+ NvmeSQueue *sq = n->sq[i];
+ NvmeCQueue *cq = n->cq[i];
+
+ if (sq != NULL) {
+ /* Submission queue tail pointer location, 2 * QID * stride. */
+ sq->db_addr = db_addr + 2 * i * 4;
+ sq->eventidx_addr = eventidx_addr + 2 * i * 4;
+ }
+
+ if (cq != NULL) {
+ /* Completion queue head pointer location, (2 * QID + 1) * stride.
+ */
+ cq->db_addr = db_addr + (2 * i + 1) * 4;
+ cq->eventidx_addr = eventidx_addr + (2 * i + 1) * 4;
+ }
+ }
+ return NVME_SUCCESS;
+}
+
static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
{
switch (cmd->opcode) {
@@ -545,11 +594,29 @@ static uint16_t nvme_admin_cmd(NvmeCtrl *n, NvmeCmd *cmd, NvmeRequest *req)
return nvme_set_feature(n, cmd, req);
case NVME_ADM_CMD_GET_FEATURES:
return nvme_get_feature(n, cmd, req);
+ case NVME_ADM_CMD_SET_DB_MEMORY:
+ return nvme_set_db_memory(n, cmd);
default:
return NVME_INVALID_OPCODE | NVME_DNR;
}
}
+static void nvme_update_sq_eventidx(const NvmeSQueue *sq)
+{
+ if (sq->eventidx_addr) {
+ pci_dma_write(&sq->ctrl->parent_obj, sq->eventidx_addr,
+ &sq->tail, sizeof(sq->tail));
+ }
+}
+
+static void nvme_update_sq_tail(NvmeSQueue *sq)
+{
+ if (sq->db_addr) {
+ pci_dma_read(&sq->ctrl->parent_obj, sq->db_addr,
+ &sq->tail, sizeof(sq->tail));
+ }
+}
+
static void nvme_process_sq(void *opaque)
{
NvmeSQueue *sq = opaque;
@@ -561,6 +628,8 @@ static void nvme_process_sq(void *opaque)
NvmeCmd cmd;
NvmeRequest *req;
+ nvme_update_sq_tail(sq);
+
while (!(nvme_sq_empty(sq) || QTAILQ_EMPTY(&sq->req_list))) {
addr = sq->dma_addr + sq->head * n->sqe_size;
pci_dma_read(&n->parent_obj, addr, (void *)&cmd, sizeof(cmd));
@@ -578,6 +647,9 @@ static void nvme_process_sq(void *opaque)
req->status = status;
nvme_enqueue_req_completion(cq, req);
}
+
+ nvme_update_sq_eventidx(sq);
+ nvme_update_sq_tail(sq);
}
}
@@ -726,7 +798,11 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
}
start_sqs = nvme_cq_full(cq) ? 1 : 0;
- cq->head = new_head;
+ /* When the mapped pointer memory area is setup, we don't rely on
+ * the MMIO written values to update the head pointer. */
+ if (!cq->db_addr) {
+ cq->head = new_head;
+ }
if (start_sqs) {
NvmeSQueue *sq;
QTAILQ_FOREACH(sq, &cq->sq_list, entry) {
@@ -752,7 +828,11 @@ static void nvme_process_db(NvmeCtrl *n, hwaddr addr, int val)
return;
}
- sq->tail = new_tail;
+ /* When the mapped pointer memory area is setup, we don't rely on
+ * the MMIO written values to update the tail pointer. */
+ if (!sq->db_addr) {
+ sq->tail = new_tail;
+ }
timer_mod(sq->timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 500);
}
}
@@ -805,6 +885,8 @@ static int nvme_init(PCIDevice *pci_dev)
pci_conf = pci_dev->config;
pci_conf[PCI_INTERRUPT_PIN] = 1;
pci_config_set_prog_interface(pci_dev->config, 0x2);
+ pci_config_set_vendor_id(pci_dev->config, n->vid);
+ pci_config_set_device_id(pci_dev->config, n->did);
pci_config_set_class(pci_dev->config, PCI_CLASS_STORAGE_EXPRESS);
pcie_endpoint_cap_init(&n->parent_obj, 0x80);
@@ -885,9 +967,13 @@ static void nvme_exit(PCIDevice *pci_dev)
msix_uninit_exclusive_bar(pci_dev);
}
+#define PCI_VENDOR_ID_GOOGLE 0x1AE0
+
static Property nvme_props[] = {
DEFINE_BLOCK_PROPERTIES(NvmeCtrl, conf),
DEFINE_PROP_STRING("serial", NvmeCtrl, serial),
+ DEFINE_PROP_UINT16("vid", NvmeCtrl, vid, PCI_VENDOR_ID_GOOGLE),
+ DEFINE_PROP_UINT16("did", NvmeCtrl, did, 0x5845),
DEFINE_PROP_END_OF_LIST(),
};
@@ -905,8 +991,6 @@ static void nvme_class_init(ObjectClass *oc, void *data)
pc->exit = nvme_exit;
pc->class_id = PCI_CLASS_STORAGE_EXPRESS;
pc->vendor_id = PCI_VENDOR_ID_INTEL;
- pc->device_id = 0x5845;
- pc->revision = 1;
pc->is_express = 1;
set_bit(DEVICE_CATEGORY_STORAGE, dc->categories);
diff --git a/hw/block/nvme.h b/hw/block/nvme.h
index bf3a3cc..82aeab4 100644
--- a/hw/block/nvme.h
+++ b/hw/block/nvme.h
@@ -170,6 +170,7 @@ enum NvmeAdminCommands {
NVME_ADM_CMD_FORMAT_NVM = 0x80,
NVME_ADM_CMD_SECURITY_SEND = 0x81,
NVME_ADM_CMD_SECURITY_RECV = 0x82,
+ NVME_ADM_CMD_SET_DB_MEMORY = 0xC0, /* Vendor specific. */
};
enum NvmeIoCommands {
@@ -381,6 +382,7 @@ enum NvmeStatusCodes {
NVME_CONFLICTING_ATTRS = 0x0180,
NVME_INVALID_PROT_INFO = 0x0181,
NVME_WRITE_TO_RO = 0x0182,
+ NVME_INVALID_MEMORY_ADDRESS = 0x01C0, /* Vendor extension. */
NVME_WRITE_FAULT = 0x0280,
NVME_UNRECOVERED_READ = 0x0281,
NVME_E2E_GUARD_ERROR = 0x0282,
@@ -658,6 +660,13 @@ typedef struct NvmeSQueue {
QTAILQ_HEAD(sq_req_list, NvmeRequest) req_list;
QTAILQ_HEAD(out_req_list, NvmeRequest) out_req_list;
QTAILQ_ENTRY(NvmeSQueue) entry;
+ /* Mapped memory location where the tail pointer is stored by the guest
+ * without triggering MMIO exits. */
+ uint64_t db_addr;
+ /* virtio-like eventidx pointer, guest updates to the tail pointer that
+ * do not go over this value will not result in MMIO writes (but will
+ * still write the tail pointer to the "db_addr" location above). */
+ uint64_t eventidx_addr;
} NvmeSQueue;
typedef struct NvmeCQueue {
@@ -673,6 +682,13 @@ typedef struct NvmeCQueue {
QEMUTimer *timer;
QTAILQ_HEAD(sq_list, NvmeSQueue) sq_list;
QTAILQ_HEAD(cq_req_list, NvmeRequest) req_list;
+ /* Mapped memory location where the head pointer is stored by the guest
+ * without triggering MMIO exits. */
+ uint64_t db_addr;
+ /* virtio-like eventidx pointer, guest updates to the head pointer that
+ * do not go over this value will not result in MMIO writes (but will
+ * still write the head pointer to the "db_addr" location above). */
+ uint64_t eventidx_addr;
} NvmeCQueue;
typedef struct NvmeNamespace {
@@ -699,6 +715,8 @@ typedef struct NvmeCtrl {
uint32_t num_queues;
uint32_t max_q_ents;
uint64_t ns_size;
+ uint16_t vid;
+ uint16_t did;
char *serial;
NvmeNamespace *namespaces;
--
1.9.1