[PATCH v2 RFC] nvme: improve performance for virtual NVMe devices

Helen Koike helen.koike at collabora.co.uk
Thu Apr 21 06:33:11 PDT 2016


Hi

On 14-04-2016 15:04, Helen Mae Koike Fornazier wrote:
> From: Rob Nelson <rlnelson at google.com>
>
> This change provides a mechanism to reduce the number of MMIO doorbell
> writes for the NVMe driver. When running in a virtualized environment
> like QEMU, the cost of an MMIO is quite hefy here. The main idea for
> the patch is provide the device two memory location locations:
>   1) to store the doorbell values so they can be lookup without the doorbell
>      MMIO write
>   2) to store an event index.
> I believe the doorbell value is obvious, the event index not so much.
> Similar to the virtio specificaiton, the virtual device can tell the
> driver (guest OS) not to write MMIO unless you are writing past this
> value.
>
> FYI: doorbell values are written by the nvme driver (guest OS) and the
> event index is written by the virtual device (host OS).
>
> The patch implements a new admin command that will communicate where
> these two memory locations reside. If the command fails, the nvme
> driver will work as before without any optimizations.
>
> Contributions:
>    Eric Northup <digitaleric at google.com>
>    Frank Swiderski <fes at google.com>
>    Ted Tso <tytso at mit.edu>
>    Keith Busch <keith.busch at intel.com>
>
> Just to give an idea on the performance boost with the vendor
> extension: Running fio [1], a stock NVMe driver I get about 200K read
> IOPs with my vendor patch I get about 1000K read IOPs. This was
> running with a null device i.e. the backing device simply returned
> success on every read IO request.
>
> [1] Running on a 4 core machine:
>    fio --time_based --name=benchmark --runtime=30
>    --filename=/dev/nvme0n1 --nrfiles=1 --ioengine=libaio --iodepth=32
>    --direct=1 --invalidate=1 --verify=0 --verify_fatal=0 --numjobs=4
>    --rw=randread --blocksize=4k --randrepeat=false
>
> Signed-off-by: Rob Nelson <rlnelson at google.com>
> [mlin: port for upstream]
> Signed-off-by: Ming Lin <mlin at kernel.org>
> [koike: updated for current APIs]
> Signed-off-by: Helen Mae Koike Fornazier <helen.koike at collabora.co.uk>
>
> Conflicts:
> 	drivers/nvme/host/Kconfig
> 	drivers/nvme/host/pci.c
> ---
>
> As stated above, this patch provides a HUGE improvement in performance. I would like to work on that to get it upstream.
>
> I also would like to know if you think this code can be made more generic, maybe s/CONFIG_NVME_VENDOS_EXT_GOOGLE/CONFIG_NVME_VM_OPTIMIZATION ? And remove the if(vendor == PCI_VENDOR_ID_GOOGLE)?
>
> I am not sure if all the code blocks inside the if(vendor == PCI_VENDOR_ID_GOOGLE) are only specific to google or if we can remove this check, what do you think?
>
> Thanks in advance for your comments.

I understand now that it can't be generic, as it is based on a vendor 
specific command, but I still would like to know your opinion about this 
patch and if it can make to upstream.

Thank you

>
>   drivers/nvme/host/Kconfig |   7 +++
>   drivers/nvme/host/pci.c   | 150 ++++++++++++++++++++++++++++++++++++++++++++++
>   include/linux/nvme.h      |  21 +++++++
>   3 files changed, 178 insertions(+)
>
> diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig
> index c894841..dc4fddc 100644
> --- a/drivers/nvme/host/Kconfig
> +++ b/drivers/nvme/host/Kconfig
> @@ -24,3 +24,10 @@ config BLK_DEV_NVME_SCSI
>   	  to say N here, unless you run a distro that abuses the SCSI
>   	  emulation to provide stable device names for mount by id, like
>   	  some OpenSuSE and SLES versions.
> +
> +config NVME_VENDOR_EXT_GOOGLE
> +	bool "NVMe Vendor Extension for Improved Virtualization"
> +	depends on BLK_DEV_NVME
> +	---help---
> +	  Google extension to reduce the number of MMIO doorbell
> +	  writes for the NVMe driver
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 24ccda3..b6d8fd8 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -50,6 +50,9 @@
>   #define SQ_SIZE(depth)		(depth * sizeof(struct nvme_command))
>   #define CQ_SIZE(depth)		(depth * sizeof(struct nvme_completion))
>   		
> +/* Google Vendor ID is not in include/linux/pci_ids.h */
> +#define PCI_VENDOR_ID_GOOGLE 0x1AE0
> +
>   /*
>    * We handle AEN commands ourselves and don't even let the
>    * block layer know about them.
> @@ -107,6 +110,13 @@ struct nvme_dev {
>   #define NVME_CTRL_RESETTING    0
>   #define NVME_CTRL_REMOVING     1
>   
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> +	u32 *db_mem;
> +	dma_addr_t doorbell;
> +	u32 *ei_mem;
> +	dma_addr_t eventidx;
> +#endif
> +
>   	struct nvme_ctrl ctrl;
>   	struct completion ioq_wait;
>   };
> @@ -139,6 +149,12 @@ struct nvme_queue {
>   	u16 qid;
>   	u8 cq_phase;
>   	u8 cqe_seen;
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> +	u32 *sq_doorbell_addr;
> +	u32 *sq_eventidx_addr;
> +	u32 *cq_doorbell_addr;
> +	u32 *cq_eventidx_addr;
> +#endif
>   };
>   
>   /*
> @@ -176,6 +192,9 @@ static inline void _nvme_check_size(void)
>   	BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
>   	BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
>   	BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> +	BUILD_BUG_ON(sizeof(struct nvme_doorbell_memory) != 64);
> +#endif
>   }
>   
>   /*
> @@ -305,6 +324,51 @@ static void nvme_complete_async_event(struct nvme_dev *dev,
>   	}
>   }
>   
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> +static int nvme_vendor_memory_size(struct nvme_dev *dev)
> +{
> +	return ((num_possible_cpus() + 1) * 8 * dev->db_stride);
> +}
> +
> +static int nvme_set_doorbell_memory(struct nvme_dev *dev)
> +{
> +	struct nvme_command c;
> +
> +	memset(&c, 0, sizeof(c));
> +	c.doorbell_memory.opcode = nvme_admin_doorbell_memory;
> +	c.doorbell_memory.prp1 = cpu_to_le64(dev->doorbell);
> +	c.doorbell_memory.prp2 = cpu_to_le64(dev->eventidx);
> +
> +	return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
> +}
> +
> +static inline int nvme_ext_need_event(u16 event_idx, u16 new_idx, u16 old)
> +{
> +	/* Borrowed from vring_need_event */
> +	return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
> +}
> +
> +static void nvme_ext_write_doorbell(u16 value, u32 __iomem* q_db,
> +			   u32* db_addr, volatile u32* event_idx)
> +{
> +	u16 old_value;
> +	if (!db_addr)
> +		goto ring_doorbell;
> +
> +	old_value = *db_addr;
> +	*db_addr = value;
> +
> +	rmb();
> +	if (!nvme_ext_need_event(*event_idx, value, old_value))
> +		goto no_doorbell;
> +
> +ring_doorbell:
> +	writel(value, q_db);
> +no_doorbell:
> +	return;
> +}
> +#endif
> +
>   /**
>    * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell
>    * @nvmeq: The queue to use
> @@ -322,9 +386,19 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
>   	else
>   		memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd));
>   
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> +	if (nvmeq->sq_doorbell_addr)
> +		wmb();
> +#endif
> +
>   	if (++tail == nvmeq->q_depth)
>   		tail = 0;
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> +	nvme_ext_write_doorbell(tail, nvmeq->q_db,
> +		nvmeq->sq_doorbell_addr, nvmeq->sq_eventidx_addr);
> +#else
>   	writel(tail, nvmeq->q_db);
> +#endif
>   	nvmeq->sq_tail = tail;
>   }
>   
> @@ -741,6 +815,10 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
>   		struct nvme_completion cqe = nvmeq->cqes[head];
>   		struct request *req;
>   
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> +		if (to_pci_dev(nvmeq->dev->dev)->vendor == PCI_VENDOR_ID_GOOGLE)
> +			rmb();
> +#endif
>   		if (++head == nvmeq->q_depth) {
>   			head = 0;
>   			phase = !phase;
> @@ -785,7 +863,14 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
>   		return;
>   
>   	if (likely(nvmeq->cq_vector >= 0))
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> +		nvme_ext_write_doorbell(head,
> +			nvmeq->q_db + nvmeq->dev->db_stride,
> +			nvmeq->cq_doorbell_addr,
> +			nvmeq->cq_eventidx_addr);
> +#else
>   		writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
> +#endif
>   	nvmeq->cq_head = head;
>   	nvmeq->cq_phase = phase;
>   
> @@ -1168,6 +1253,17 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
>   	dev->queues[qid] = nvmeq;
>   	dev->queue_count++;
>   
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> +	if (dev->db_mem && dev->ei_mem && qid != 0) {
> +		nvmeq->sq_doorbell_addr = &dev->db_mem[qid * 2 * dev->db_stride];
> +		nvmeq->cq_doorbell_addr =
> +			&dev->db_mem[(qid * 2 + 1) * dev->db_stride];
> +		nvmeq->sq_eventidx_addr = &dev->ei_mem[qid * 2 * dev->db_stride];
> +		nvmeq->cq_eventidx_addr =
> +			&dev->ei_mem[(qid * 2 + 1) * dev->db_stride];
> +	}
> +#endif
> +
>   	return nvmeq;
>   
>    free_cqdma:
> @@ -1198,6 +1294,16 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
>   	nvmeq->cq_head = 0;
>   	nvmeq->cq_phase = 1;
>   	nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> +	if (to_pci_dev(dev->dev)->vendor == PCI_VENDOR_ID_GOOGLE && qid != 0) {
> +		nvmeq->sq_doorbell_addr = &dev->db_mem[qid * 2 * dev->db_stride];
> +		nvmeq->cq_doorbell_addr =
> +			&dev->db_mem[(qid * 2 + 1) * dev->db_stride];
> +		nvmeq->sq_eventidx_addr = &dev->ei_mem[qid * 2 * dev->db_stride];
> +		nvmeq->cq_eventidx_addr =
> +			&dev->ei_mem[(qid * 2 + 1) * dev->db_stride];
> +	}
> +#endif
>   	memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
>   	dev->online_queues++;
>   	spin_unlock_irq(&nvmeq->q_lock);
> @@ -1676,6 +1782,19 @@ static int nvme_dev_add(struct nvme_dev *dev)
>   		if (blk_mq_alloc_tag_set(&dev->tagset))
>   			return 0;
>   		dev->ctrl.tagset = &dev->tagset;
> +
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> +		if (to_pci_dev(dev->dev)->vendor == PCI_VENDOR_ID_GOOGLE) {
> +			int res = nvme_set_doorbell_memory(dev);
> +			if (res) {
> +				// Free memory and continue on.
> +				dma_free_coherent(dev->dev, 8192, dev->db_mem, dev->doorbell);
> +				dma_free_coherent(dev->dev, 8192, dev->ei_mem, dev->doorbell);
> +				dev->db_mem = 0;
> +				dev->ei_mem = 0;
> +			}
> +		}
> +#endif
>   	} else {
>   		blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
>   
> @@ -1740,8 +1859,31 @@ static int nvme_pci_enable(struct nvme_dev *dev)
>   
>   	pci_enable_pcie_error_reporting(pdev);
>   	pci_save_state(pdev);
> +
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> +	if (pdev->vendor == PCI_VENDOR_ID_GOOGLE) {
> +		int mem_size = nvme_vendor_memory_size(dev);
> +		dev->db_mem = dma_alloc_coherent(&pdev->dev, mem_size, &dev->doorbell, GFP_KERNEL);
> +		if (!dev->db_mem) {
> +			result = -ENOMEM;
> +			goto disable;
> +		}
> +		dev->ei_mem = dma_alloc_coherent(&pdev->dev, mem_size, &dev->eventidx, GFP_KERNEL);
> +		if (!dev->ei_mem) {
> +			result = -ENOMEM;
> +			goto dma_free;
> +		}
> +	}
> +#endif
> +
>   	return 0;
>   
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> +  dma_free:
> +	dma_free_coherent(&pdev->dev, nvme_vendor_memory_size(dev), dev->db_mem, dev->doorbell);
> +	dev->db_mem = 0;
> +#endif
> +
>    disable:
>   	pci_disable_device(pdev);
>   	return result;
> @@ -1757,6 +1899,14 @@ static void nvme_dev_unmap(struct nvme_dev *dev)
>   static void nvme_pci_disable(struct nvme_dev *dev)
>   {
>   	struct pci_dev *pdev = to_pci_dev(dev->dev);
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> +	int mem_size = nvme_vendor_memory_size(dev);
> +
> +	if (dev->db_mem)
> +		dma_free_coherent(&pdev->dev, mem_size, dev->db_mem, dev->doorbell);
> +	if (dev->ei_mem)
> +		dma_free_coherent(&pdev->dev, mem_size, dev->ei_mem, dev->eventidx);
> +#endif
>   
>   	if (pdev->msi_enabled)
>   		pci_disable_msi(pdev);
> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
> index a55986f..d3a8289 100644
> --- a/include/linux/nvme.h
> +++ b/include/linux/nvme.h
> @@ -387,6 +387,9 @@ enum nvme_admin_opcode {
>   	nvme_admin_format_nvm		= 0x80,
>   	nvme_admin_security_send	= 0x81,
>   	nvme_admin_security_recv	= 0x82,
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> +	nvme_admin_doorbell_memory	= 0xC0,
> +#endif
>   };
>   
>   enum {
> @@ -516,6 +519,18 @@ struct nvme_format_cmd {
>   	__u32			rsvd11[5];
>   };
>   
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> +struct nvme_doorbell_memory {
> +	__u8			opcode;
> +	__u8			flags;
> +	__u16			command_id;
> +	__u32			rsvd1[5];
> +	__le64			prp1;
> +	__le64			prp2;
> +	__u32			rsvd12[6];
> +};
> +#endif
> +
>   struct nvme_command {
>   	union {
>   		struct nvme_common_command common;
> @@ -529,6 +544,9 @@ struct nvme_command {
>   		struct nvme_format_cmd format;
>   		struct nvme_dsm_cmd dsm;
>   		struct nvme_abort_cmd abort;
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> +		struct nvme_doorbell_memory doorbell_memory;
> +#endif
>   	};
>   };
>   
> @@ -575,6 +593,9 @@ enum {
>   	NVME_SC_BAD_ATTRIBUTES		= 0x180,
>   	NVME_SC_INVALID_PI		= 0x181,
>   	NVME_SC_READ_ONLY		= 0x182,
> +#ifdef CONFIG_NVME_VENDOR_EXT_GOOGLE
> +	NVME_SC_DOORBELL_MEMORY_INVALID	= 0x1C0,
> +#endif
>   	NVME_SC_WRITE_FAULT		= 0x280,
>   	NVME_SC_READ_ERROR		= 0x281,
>   	NVME_SC_GUARD_CHECK		= 0x282,




More information about the Linux-nvme mailing list