[PATCH v8] nvme: improve performance for virtual NVMe devices
Helen Koike
helen.koike at collabora.com
Fri Apr 14 11:10:30 PDT 2017
On 2017-04-10 12:51 PM, Helen Koike wrote:
> From: Helen Koike <helen.koike at collabora.co.uk>
>
> This change provides a mechanism to reduce the number of MMIO doorbell
> writes for the NVMe driver. When running in a virtualized environment
> like QEMU, the cost of an MMIO is quite hefty. The main idea of the
> patch is to provide the device with two memory locations:
> 1) one to store the doorbell values, so they can be looked up without
> an MMIO doorbell write
> 2) one to store an event index.
> I believe the doorbell value is obvious; the event index not so much.
> Similar to the virtio specification, the virtual device can tell the
> driver (guest OS) not to issue the MMIO write unless the driver is
> writing past this value.
>
> FYI: doorbell values are written by the nvme driver (guest OS) and the
> event index is written by the virtual device (host OS).
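
To make the event-index rule concrete, here is a minimal user-space
sketch of the same unsigned-wraparound comparison that
nvme_dbbuf_need_event() performs in the patch below; the helper name
and the sample values are only illustrative:

    #include <stdint.h>
    #include <stdio.h>

    /* Same comparison as nvme_dbbuf_need_event() in the patch: ring
     * the doorbell only if the move from old to new_idx crossed the
     * event index published by the host. 16-bit unsigned arithmetic
     * handles queue-index wraparound. */
    static int need_event(uint16_t event_idx, uint16_t new_idx, uint16_t old)
    {
    	return (uint16_t)(new_idx - event_idx - 1) < (uint16_t)(new_idx - old);
    }

    int main(void)
    {
    	/* Host asked to be notified once the tail moves past 10. */
    	printf("%d\n", need_event(10, 9, 8));   /* 0: 10 not passed, skip MMIO */
    	printf("%d\n", need_event(10, 11, 8));  /* 1: crossed 10, do the MMIO */
    	printf("%d\n", need_event(10, 12, 11)); /* 0: 10 was already crossed */
    	return 0;
    }
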
>
> The patch implements a new admin command that will communicate where
> these two memory locations reside. If the command fails, the nvme
> driver will work as before without any optimizations.
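
For reference, the command carries nothing but the two PRP pointers;
the following standalone sketch mirrors the 64-byte layout of struct
nvme_dbbuf from the diff below (the struct and field names here are
illustrative stand-ins using fixed-width user-space types):

    #include <stdint.h>

    /* 64-byte admin command: opcode 0x7C, PRP1 points at the
     * doorbell-value buffer (guest-written), PRP2 at the event-index
     * buffer (host-written). */
    struct dbbuf_cmd {
    	uint8_t  opcode;      /* nvme_admin_dbbuf == 0x7C */
    	uint8_t  flags;
    	uint16_t command_id;
    	uint32_t rsvd1[5];
    	uint64_t prp1;        /* DMA address of doorbell values */
    	uint64_t prp2;        /* DMA address of event indexes */
    	uint32_t rsvd12[6];
    };
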
>
> Contributions:
> Eric Northup <digitaleric at google.com>
> Frank Swiderski <fes at google.com>
> Ted Tso <tytso at mit.edu>
> Keith Busch <keith.busch at intel.com>
>
> Just to give an idea of the performance boost with the vendor
> extension: running fio [1], with a stock NVMe driver I get about 200K
> read IOPS; with my vendor patch I get about 1000K read IOPS. This was
> running with a null device, i.e. the backing device simply returned
> success on every read IO request.
>
> [1] Running on a 4 core machine:
> fio --time_based --name=benchmark --runtime=30
> --filename=/dev/nvme0n1 --nrfiles=1 --ioengine=libaio --iodepth=32
> --direct=1 --invalidate=1 --verify=0 --verify_fatal=0 --numjobs=4
> --rw=randread --blocksize=4k --randrepeat=false
>
> Signed-off-by: Rob Nelson <rlnelson at google.com>
> [mlin: port for upstream]
> Signed-off-by: Ming Lin <mlin at kernel.org>
> [koike: updated for upstream]
> Signed-off-by: Helen Koike <helen.koike at collabora.co.uk>
> Reviewed-by: Christoph Hellwig <hch at lst.de>
>
> ---
>
> This patch is based on git://git.infradead.org/nvme.git master
> Tested through Google Cloud Engine
> The TPAR has been ratified by the NVMe working group
>
> changes since v7:
> - remove nvme_write_doorbell(); add
> nvme_dbbuf_update_and_check_event() instead, which doesn't write
> to the doorbell but only returns true if the doorbell needs to be
> updated
> - add reviewed-by tag from Christoph
> - add missing maintainers from scripts/get_maintainer.pl to the
> email thread
>
> changes since v6:
> - remove nvme_dbbuf_init() from nvme_alloc_queue() as it is
> already called in nvme_init_queue()
> - free dbbuf_dbs only in nvme_pci_free_ctrl(), change
> nvme_dbbuf_dma_alloc() to do nothing if dbbuf_dbs is already
> allocated
>
> changes since v5:
> - remove anonymous dbbuf struct in struct nvme_dev and struct nvme_queue
> - use inline functions for sq_idx and cq_idx instead of macros
> - add a warning when nvme_dbbuf_set fails
> - remove comment "Borrowed from vring_need_event"
> - change formatting of the parameters in nvme_write_doorbell
> - don't fail nvme_reset_work if nvme_dbbuf_dma_alloc fails
> - remove nvme_write_doorbell_{sq,cq} wrappers
> - in nvme_write_doorbell(), call writel last
>
> Changes since v4
> - Remove CONFIG_NVME_DBBUF
> - Remove dbbuf.{c,h}, move the code to pci.c
> - Coding style changes
> - Functions and variables renamed
>
> Changes since v3
> - Rename to dbbuf (to be closer to the latest TPAR)
> - Modify the opcodes according to the latest TPAR
> - Check if the device supports this feature through the OACS bit
> - little cleanups
>
> ---
> drivers/nvme/host/pci.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++-
> include/linux/nvme.h | 13 +++++
> 2 files changed, 154 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 5b7bd9c..0e5f2a1 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -103,8 +103,22 @@ struct nvme_dev {
> u32 cmbloc;
> struct nvme_ctrl ctrl;
> struct completion ioq_wait;
> + u32 *dbbuf_dbs;
> + dma_addr_t dbbuf_dbs_dma_addr;
> + u32 *dbbuf_eis;
> + dma_addr_t dbbuf_eis_dma_addr;
> };
>
> +static inline unsigned int sq_idx(unsigned int qid, u32 stride)
> +{
> + return qid * 2 * stride;
> +}
> +
> +static inline unsigned int cq_idx(unsigned int qid, u32 stride)
> +{
> + return (qid * 2 + 1) * stride;
> +}
> +
> static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl)
> {
> return container_of(ctrl, struct nvme_dev, ctrl);
> @@ -133,6 +147,10 @@ struct nvme_queue {
> u16 qid;
> u8 cq_phase;
> u8 cqe_seen;
> + u32 *dbbuf_sq_db;
> + u32 *dbbuf_cq_db;
> + u32 *dbbuf_sq_ei;
> + u32 *dbbuf_cq_ei;
> };
>
> /*
> @@ -174,6 +192,112 @@ static inline void _nvme_check_size(void)
> BUILD_BUG_ON(sizeof(struct nvme_id_ns) != 4096);
> BUILD_BUG_ON(sizeof(struct nvme_lba_range_type) != 64);
> BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512);
> + BUILD_BUG_ON(sizeof(struct nvme_dbbuf) != 64);
> +}
> +
> +static inline unsigned int nvme_dbbuf_size(u32 stride)
> +{
> + return ((num_possible_cpus() + 1) * 8 * stride);
> +}
> +
> +static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
> +{
> + unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);
> +
> + if (dev->dbbuf_dbs)
> + return 0;
> +
> + dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size,
> + &dev->dbbuf_dbs_dma_addr,
> + GFP_KERNEL);
> + if (!dev->dbbuf_dbs)
> + return -ENOMEM;
> + dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size,
> + &dev->dbbuf_eis_dma_addr,
> + GFP_KERNEL);
> + if (!dev->dbbuf_eis) {
> + dma_free_coherent(dev->dev, mem_size,
> + dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
> + dev->dbbuf_dbs = NULL;
> + return -ENOMEM;
> + }
> +
> + return 0;
> +}
> +
> +static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
> +{
> + unsigned int mem_size = nvme_dbbuf_size(dev->db_stride);
> +
> + if (dev->dbbuf_dbs) {
> + dma_free_coherent(dev->dev, mem_size,
> + dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
> + dev->dbbuf_dbs = NULL;
> + }
> + if (dev->dbbuf_eis) {
> + dma_free_coherent(dev->dev, mem_size,
> + dev->dbbuf_eis, dev->dbbuf_eis_dma_addr);
> + dev->dbbuf_eis = NULL;
> + }
> +}
> +
> +static void nvme_dbbuf_init(struct nvme_dev *dev,
> + struct nvme_queue *nvmeq, int qid)
> +{
> + if (!dev->dbbuf_dbs || !qid)
> + return;
> +
> + nvmeq->dbbuf_sq_db = &dev->dbbuf_dbs[sq_idx(qid, dev->db_stride)];
> + nvmeq->dbbuf_cq_db = &dev->dbbuf_dbs[cq_idx(qid, dev->db_stride)];
> + nvmeq->dbbuf_sq_ei = &dev->dbbuf_eis[sq_idx(qid, dev->db_stride)];
> + nvmeq->dbbuf_cq_ei = &dev->dbbuf_eis[cq_idx(qid, dev->db_stride)];
> +}
> +
> +static void nvme_dbbuf_set(struct nvme_dev *dev)
> +{
> + struct nvme_command c;
> +
> + if (!dev->dbbuf_dbs)
> + return;
> +
> + memset(&c, 0, sizeof(c));
> + c.dbbuf.opcode = nvme_admin_dbbuf;
> + c.dbbuf.prp1 = cpu_to_le64(dev->dbbuf_dbs_dma_addr);
> + c.dbbuf.prp2 = cpu_to_le64(dev->dbbuf_eis_dma_addr);
> +
> + if (nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0)) {
> + dev_warn(dev->dev, "unable to set dbbuf\n");
> + /* Free memory and continue on */
> + nvme_dbbuf_dma_free(dev);
> + }
> +}
> +
> +static inline int nvme_dbbuf_need_event(u16 event_idx, u16 new_idx, u16 old)
> +{
> + return (u16)(new_idx - event_idx - 1) < (u16)(new_idx - old);
> +}
> +
> +/* Update dbbuf and return true if an MMIO is required */
> +static bool nvme_dbbuf_update_and_check_event(u16 value, u32 *dbbuf_db,
> + volatile u32 *dbbuf_ei)
> +{
> + if (dbbuf_db) {
> + u16 old_value;
> +
> + /*
> + * Ensure that the queue is written before updating
> + * the doorbell in memory
> + */
> + wmb();
> +
> + old_value = *dbbuf_db;
> + *dbbuf_db = value;
> +
> + if (!nvme_dbbuf_need_event(*dbbuf_ei, value, old_value))
> + return false;
> + }
> +
> + return true;
> }
>
> /*
> @@ -300,7 +424,9 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq,
>
> if (++tail == nvmeq->q_depth)
> tail = 0;
> - writel(tail, nvmeq->q_db);
> + if (nvme_dbbuf_update_and_check_event(tail, nvmeq->dbbuf_sq_db,
> + nvmeq->dbbuf_sq_ei))
> + writel(tail, nvmeq->q_db);
> nvmeq->sq_tail = tail;
> }
>
> @@ -716,7 +842,9 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag)
> return;
>
> if (likely(nvmeq->cq_vector >= 0))
> - writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
> + if (nvme_dbbuf_update_and_check_event(head, nvmeq->dbbuf_cq_db,
> + nvmeq->dbbuf_cq_ei))
> + writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
> nvmeq->cq_head = head;
> nvmeq->cq_phase = phase;
>
> @@ -1099,6 +1227,7 @@ static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid)
> nvmeq->cq_phase = 1;
> nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride];
> memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth));
> + nvme_dbbuf_init(dev, nvmeq, qid);
> dev->online_queues++;
> spin_unlock_irq(&nvmeq->q_lock);
> }
> @@ -1568,6 +1697,8 @@ static int nvme_dev_add(struct nvme_dev *dev)
> if (blk_mq_alloc_tag_set(&dev->tagset))
> return 0;
> dev->ctrl.tagset = &dev->tagset;
> +
> + nvme_dbbuf_set(dev);
> } else {
> blk_mq_update_nr_hw_queues(&dev->tagset, dev->online_queues - 1);
>
> @@ -1733,6 +1864,7 @@ static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
> {
> struct nvme_dev *dev = to_nvme_dev(ctrl);
>
> + nvme_dbbuf_dma_free(dev);
> put_device(dev->dev);
> if (dev->tagset.tags)
> blk_mq_free_tag_set(&dev->tagset);
> @@ -1800,6 +1932,13 @@ static void nvme_reset_work(struct work_struct *work)
> dev->ctrl.opal_dev = NULL;
> }
>
> + if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) {
> + result = nvme_dbbuf_dma_alloc(dev);
> + if (result)
> + dev_warn(dev->dev,
> + "unable to allocate dma for dbbuf\n");
> + }
> +
> result = nvme_setup_io_queues(dev);
> if (result)
> goto out;
> diff --git a/include/linux/nvme.h b/include/linux/nvme.h
> index c43d435..43a6289 100644
> --- a/include/linux/nvme.h
> +++ b/include/linux/nvme.h
> @@ -245,6 +245,7 @@ enum {
> NVME_CTRL_ONCS_WRITE_ZEROES = 1 << 3,
> NVME_CTRL_VWC_PRESENT = 1 << 0,
> NVME_CTRL_OACS_SEC_SUPP = 1 << 0,
> + NVME_CTRL_OACS_DBBUF_SUPP = 1 << 8,
> };
>
> struct nvme_lbaf {
> @@ -603,6 +604,7 @@ enum nvme_admin_opcode {
> nvme_admin_download_fw = 0x11,
> nvme_admin_ns_attach = 0x15,
> nvme_admin_keep_alive = 0x18,
> + nvme_admin_dbbuf = 0x7C,
> nvme_admin_format_nvm = 0x80,
> nvme_admin_security_send = 0x81,
> nvme_admin_security_recv = 0x82,
> @@ -874,6 +876,16 @@ struct nvmf_property_get_command {
> __u8 resv4[16];
> };
>
> +struct nvme_dbbuf {
> + __u8 opcode;
> + __u8 flags;
> + __u16 command_id;
> + __u32 rsvd1[5];
> + __le64 prp1;
> + __le64 prp2;
> + __u32 rsvd12[6];
> +};
> +
> struct nvme_command {
> union {
> struct nvme_common_command common;
> @@ -893,6 +905,7 @@ struct nvme_command {
> struct nvmf_connect_command connect;
> struct nvmf_property_set_command prop_set;
> struct nvmf_property_get_command prop_get;
> + struct nvme_dbbuf dbbuf;
> };
> };
>
>
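
To spell out the shared-buffer layout implied by sq_idx(), cq_idx()
and nvme_dbbuf_size() above, here is a minimal user-space sketch; it
assumes a doorbell stride of 1 (CAP.DSTRD == 0) purely for
illustration:

    #include <stdint.h>
    #include <stdio.h>

    /* Same index math as the patch: each queue pair owns two u32
     * slots in the shared buffer, SQ tail then CQ head, spaced by the
     * controller's doorbell stride. */
    static unsigned int sq_idx(unsigned int qid, uint32_t stride)
    {
    	return qid * 2 * stride;
    }

    static unsigned int cq_idx(unsigned int qid, uint32_t stride)
    {
    	return (qid * 2 + 1) * stride;
    }

    int main(void)
    {
    	uint32_t stride = 1; /* assumption: CAP.DSTRD == 0 */
    	unsigned int qid;

    	/* qid 0 is the admin queue; I/O queues start at qid 1. */
    	for (qid = 0; qid < 3; qid++)
    		printf("qid %u: SQ tail at slot %u, CQ head at slot %u\n",
    		       qid, sq_idx(qid, stride), cq_idx(qid, stride));
    	/* nvme_dbbuf_size() reserves (num_possible_cpus() + 1) queue
    	 * pairs * 2 doorbells * 4 bytes * stride for each buffer. */
    	return 0;
    }
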
Hi,
I would like to know if it would be possible to get this patch into
kernel 4.12.
Should I send a pull request? Or do you usually get the patch from the
email thread?
Thanks,
Helen