[PATCH v2 RFC] nvme: improve performance for virtual NVMe devices
Helen Koike
helen.koike at collabora.co.uk
Thu Apr 21 06:33:11 PDT 2016
Hi
On 14-04-2016 15:04, Helen Mae Koike Fornazier wrote:
> From: Rob Nelson <rlnelson at google.com>
>
> This change provides a mechanism to reduce the number of MMIO doorbell
> writes for the NVMe driver. When running in a virtualized environment
> like QEMU, the cost of an MMIO is quite hefy here. The main idea for
> the patch is provide the device two memory location locations:
> 1) to store the doorbell values so they can be lookup without the doorbell
> MMIO write
> 2) to store an event index.
> I believe the doorbell value is obvious, the event index not so much.
> Similar to the virtio specificaiton, the virtual device can tell the
> driver (guest OS) not to write MMIO unless you are writing past this
> value.
>
> FYI: doorbell values are written by the nvme driver (guest OS) and the
> event index is written by the virtual device (host OS).
>
> The patch implements a new admin command that will communicate where
> these two memory locations reside. If the command fails, the nvme
> driver will work as before without any optimizations.
>
> Contributions:
> Eric Northup <digitaleric at google.com>
> Frank Swiderski <fes at google.com>
> Ted Tso <tytso at mit.edu>
> Keith Busch <keith.busch at intel.com>
>
> Just to give an idea on the performance boost with the vendor
> extension: Running fio [1], a stock NVMe driver I get about 200K read
> IOPs with my vendor patch I get about 1000K read IOPs. This was
> running with a null device i.e. the backing device simply returned
> success on every read IO request.
>
> [1] Running on a 4 core machine:
> fio --time_based --name=benchmark --runtime=30
> --filename=/dev/nvme0n1 --nrfiles=1 --ioengine=libaio --iodepth=32
> --direct=1 --invalidate=1 --verify=0 --verify_fatal=0 --numjobs=4
> --rw=randread --blocksize=4k --randrepeat=false
>
> Signed-off-by: Rob Nelson <rlnelson at google.com>
> [mlin: port for upstream]
> Signed-off-by: Ming Lin <mlin at kernel.org>
> [koike: updated for current APIs]
> Signed-off-by: Helen Mae Koike Fornazier <helen.koike at collabora.co.uk>
>
> Conflicts:
> drivers/nvme/host/Kconfig
> drivers/nvme/host/pci.c
> ---
>
> As stated above, this patch provides a HUGE improvement in performance. I would like to work on that to get it upstream.
>
> I also would like to know if you think this code can be made more generic, maybe s/CONFIG_NVME_VENDOS_EXT_GOOGLE/CONFIG_NVME_VM_OPTIMIZATION ? And remove the if(vendor == PCI_VENDOR_ID_GOOGLE)?
>
> I am not sure if all the code blocks inside the if(vendor == PCI_VENDOR_ID_GOOGLE) are only specific to google or if we can remove this check, what do you think?
>
> Thanks in advance for your comments.
I understand now that it can't be generic, as it is based on a vendor
specific command, but I still would like to know your opinion about this
patch and if it can make to upstream.
Thank you
>
> drivers/nvme/host/Kconfig | 7 +++
> drivers/nvme/host/pci.c | 150 ++++++++++++++++++++++++++++++++++++++++++++++
> include/linux/nvme.h | 21 +++++++
> 3 files changed, 178 insertions(+)
>
> diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
> index c894841..dc4fddc 100644
> --- a/drivers/nvme/host/Kconfig
> +++ b/drivers/nvme/host/Kconfig
> @@ -24,3 +24,10 @@ config BLK_DEV_NVME_SCSI
> to say N here, unless you run a distro that abuses the SCSI
> emulation to provide stable device names for mount by id, like
> some OpenSuSE and SLES versions.
> +
> +config NVME_VENDOR_EXT_GOOGLE
> + bool "NVMe Vendor Extension for Improved Virtualization"
> + depends on BLK_DEV_NVME
> + ---help---
> + Google extension to reduce the number of MMIO doorbell
> + writes for the NVMe driver
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 24ccda3..b6d8fd8 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -50,6 +50,9 @@
> #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command))
> #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion))
>
> +/* Google Vendor ID is not in include/linux/pci_ids.h */
> +#define PCI_VENDOR_ID_GOOGLE 0x1AE0
> +
> /*
> * We handle AEN commands ourselves and don't even let the
> * block layer know about them.
> @@ -107,6 +110,13 @@ struct nvme_dev {
> #define NVME_CTRL_RESETTING 0
> #define NVME_CTRL_REMOVING 1
>
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> + u32 *db_mem;
> + dma_addr_t doorbell;
> + u32 *ei_mem;
> + dma_addr_t eventidx;
> +#endif
> +
> struct nvme_ctrl ctrl;
> struct completion ioq_wait;
> };
> @@ -139,6 +149,12 @@ struct nvme_queue {
> u16 qid;
> u8 cq_phase;
> u8 cqe_seen;
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> + u32 *sq_doorbell_addr;
> + u32 *sq_eventidx_addr;
> + u32 *cq_doorbell_addr;
> + u32 *cq_eventidx_addr;
> +#endif
> };
>
> /*
> @@ -176,6 +192,9 @@ static inline void _nvme_check_size(void)
> BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
> BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
> BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> + BUILD_BUG_ON(sizeof(struct nvme_doorbell_memory) != 64);
> +#endif
> }
>
> /*
> @@ -305,6 +324,51 @@ static void nvme_complete_async_event(struct nvme_dev *dev,
> }
> }
>
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> +static int nvme_vendor_memory_size(struct nvme_dev *dev)
> +{
> + return ((num_possible_cpus() + 1) * 8 * dev->db_stride);
> +}
> +
> +static int nvme_set_doorbell_memory(struct nvme_dev *dev)
> +{
> + struct nvme_command c;
> +
> + memset(&c, 0, sizeof(c));
> + c.doorbell_memory.opcode = nvme_admin_doorbell_memory;
> + c.doorbell_memory.prp1 = cpu_to_le64(dev->doorbell);
> + c.doorbell_memory.prp2 = cpu_to_le64(dev->eventidx);
> +
> + return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
> +}
> +
> +static inline int nvme_ext_need_event(u16 event_idx, u16 new_idx, u16 old)
> +{
> + /* Borrowed from vring_need_event */
> + return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
> +}
> +
> +static void nvme_ext_write_doorbell(u16 value, u32 __iomem* q_db,
> + u32* db_addr, volatile u32* event_idx)
> +{
> + u16 old_value;
> + if (!db_addr)
> + goto ring_doorbell;
> +
> + old_value = *db_addr;
> + *db_addr = value;
> +
> + rmb();
> + if (!nvme_ext_need_event(*event_idx, value, old_value))
> + goto no_doorbell;
> +
> +ring_doorbell:
> + writel(value, q_db);
> +no_doorbell:
> + return;
> +}
> +#endif
> +
> /**
> * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
> * @nvmeq: The queue to use
> @@ -322,9 +386,19 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
> else
> memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
>
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> + if (nvmeq->sq_doorbell_addr)
> + wmb();
> +#endif
> +
> if (++tail == nvmeq->q_depth)
> tail = 0;
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> + nvme_ext_write_doorbell(tail, nvmeq->q_db,
> + nvmeq->sq_doorbell_addr, nvmeq->sq_eventidx_addr);
> +#else
> writel(tail, nvmeq->q_db);
> +#endif
> nvmeq->sq_tail = tail;
> }
>
> @@ -741,6 +815,10 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
> struct nvme_completion cqe = nvmeq->cqes[head];
> struct request *req;
>
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> + if (to_pci_dev(nvmeq->dev->dev)->vendor == PCI_VENDOR_ID_GOOGLE)
> + rmb();
> +#endif
> if (++head == nvmeq->q_depth) {
> head = 0;
> phase = !phase;
> @@ -785,7 +863,14 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
> return;
>
> if (likely(nvmeq->cq_vector >= 0))
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> + nvme_ext_write_doorbell(head,
> + nvmeq->q_db + nvmeq->dev->db_stride,
> + nvmeq->cq_doorbell_addr,
> + nvmeq->cq_eventidx_addr);
> +#else
> writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
> +#endif
> nvmeq->cq_head = head;
> nvmeq->cq_phase = phase;
>
> @@ -1168,6 +1253,17 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
> dev->queues[qid] = nvmeq;
> dev->queue_count++;
>
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> + if (dev->db_mem && dev->ei_mem && qid != 0) {
> + nvmeq->sq_doorbell_addr = &dev->db_mem[qid * 2 * dev->db_stride];
> + nvmeq->cq_doorbell_addr =
> + &dev->db_mem[(qid * 2 + 1) * dev->db_stride];
> + nvmeq->sq_eventidx_addr = &dev->ei_mem[qid * 2 * dev->db_stride];
> + nvmeq->cq_eventidx_addr =
> + &dev->ei_mem[(qid * 2 + 1) * dev->db_stride];
> + }
> +#endif
> +
> return nvmeq;
>
> free_cqdma:
> @@ -1198,6 +1294,16 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
> nvmeq->cq_head = 0;
> nvmeq->cq_phase = 1;
> nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> + if (to_pci_dev(dev->dev)->vendor == PCI_VENDOR_ID_GOOGLE && qid != 0) {
> + nvmeq->sq_doorbell_addr = &dev->db_mem[qid * 2 * dev->db_stride];
> + nvmeq->cq_doorbell_addr =
> + &dev->db_mem[(qid * 2 + 1) * dev->db_stride];
> + nvmeq->sq_eventidx_addr = &dev->ei_mem[qid * 2 * dev->db_stride];
> + nvmeq->cq_eventidx_addr =
> + &dev->ei_mem[(qid * 2 + 1) * dev->db_stride];
> + }
> +#endif
> memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
> dev->online_queues++;
> spin_unlock_irq(&nvmeq->q_lock);
> @@ -1676,6 +1782,19 @@ static int nvme_dev_add(struct nvme_dev *dev)
> if (blk_mq_alloc_tag_set(&dev->tagset))
> return 0;
> dev->ctrl.tagset = &dev->tagset;
> +
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> + if (to_pci_dev(dev->dev)->vendor == PCI_VENDOR_ID_GOOGLE) {
> + int res = nvme_set_doorbell_memory(dev);
> + if (res) {
> + // Free memory and continue on.
> + dma_free_coherent(dev->dev, 8192, dev->db_mem, dev->doorbell);
> + dma_free_coherent(dev->dev, 8192, dev->ei_mem, dev->doorbell);
> + dev->db_mem = 0;
> + dev->ei_mem = 0;
> + }
> + }
> +#endif
> } else {
> blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
>
> @@ -1740,8 +1859,31 @@ static int nvme_pci_enable(struct nvme_dev *dev)
>
> pci_enable_pcie_error_reporting(pdev);
> pci_save_state(pdev);
> +
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> + if (pdev->vendor == PCI_VENDOR_ID_GOOGLE) {
> + int mem_size = nvme_vendor_memory_size(dev);
> + dev->db_mem = dma_alloc_coherent(&pdev->dev, mem_size, &dev->doorbell, GFP_KERNEL);
> + if (!dev->db_mem) {
> + result = -ENOMEM;
> + goto disable;
> + }
> + dev->ei_mem = dma_alloc_coherent(&pdev->dev, mem_size, &dev->eventidx, GFP_KERNEL);
> + if (!dev->ei_mem) {
> + result = -ENOMEM;
> + goto dma_free;
> + }
> + }
> +#endif
> +
> return 0;
>
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> + dma_free:
> + dma_free_coherent(&pdev->dev, nvme_vendor_memory_size(dev), dev->db_mem, dev->doorbell);
> + dev->db_mem = 0;
> +#endif
> +
> disable:
> pci_disable_device(pdev);
> return result;
> @@ -1757,6 +1899,14 @@ static void nvme_dev_unmap(struct nvme_dev *dev)
> static void nvme_pci_disable(struct nvme_dev *dev)
> {
> struct pci_dev *pdev = to_pci_dev(dev->dev);
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> + int mem_size = nvme_vendor_memory_size(dev);
> +
> + if (dev->db_mem)
> + dma_free_coherent(&pdev->dev, mem_size, dev->db_mem, dev->doorbell);
> + if (dev->ei_mem)
> + dma_free_coherent(&pdev->dev, mem_size, dev->ei_mem, dev->eventidx);
> +#endif
>
> if (pdev->msi_enabled)
> pci_disable_msi(pdev);
> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
> index a55986f..d3a8289 100644
> --- a/include/linux/nvme.h
> +++ b/include/linux/nvme.h
> @@ -387,6 +387,9 @@ enum nvme_admin_opcode {
> nvme_admin_format_nvm = 0x80,
> nvme_admin_security_send = 0x81,
> nvme_admin_security_recv = 0x82,
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> + nvme_admin_doorbell_memory = 0xC0,
> +#endif
> };
>
> enum {
> @@ -516,6 +519,18 @@ struct nvme_format_cmd {
> __u32 rsvd11[5];
> };
>
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> +struct nvme_doorbell_memory {
> + __u8 opcode;
> + __u8 flags;
> + __u16 command_id;
> + __u32 rsvd1[5];
> + __le64 prp1;
> + __le64 prp2;
> + __u32 rsvd12[6];
> +};
> +#endif
> +
> struct nvme_command {
> union {
> struct nvme_common_command common;
> @@ -529,6 +544,9 @@ struct nvme_command {
> struct nvme_format_cmd format;
> struct nvme_dsm_cmd dsm;
> struct nvme_abort_cmd abort;
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> + struct nvme_doorbell_memory doorbell_memory;
> +#endif
> };
> };
>
> @@ -575,6 +593,9 @@ enum {
> NVME_SC_BAD_ATTRIBUTES = 0x180,
> NVME_SC_INVALID_PI = 0x181,
> NVME_SC_READ_ONLY = 0x182,
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> + NVME_SC_DOORBELL_MEMORY_INVALID = 0x1C0,
> +#endif
> NVME_SC_WRITE_FAULT = 0x280,
> NVME_SC_READ_ERROR = 0x281,
> NVME_SC_GUARD_CHECK = 0x282,
More information about the Linux-nvme
mailing list