[PATCH 4/4] nvme-pci: implement host memory buffer support
Max Gurtovoy
maxg at mellanox.com
Tue May 30 14:39:22 PDT 2017
On 5/20/2017 4:13 PM, Christoph Hellwig wrote:
> If a controller supports the host memory buffer we try to provide
> it with the requested size up to an upper cap set as a module
> parameter. We try to give as few as possible descriptors, eventually
> working our way down.
>
> Signed-off-by: Christoph Hellwig <hch at lst.de>
> ---
> drivers/nvme/host/pci.c | 187 +++++++++++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 185 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index fed803232edc..93868b9750cd 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -66,6 +66,11 @@ static bool use_cmb_sqes = true;
> module_param(use_cmb_sqes, bool, 0644);
> MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes");
>
> +static unsigned int max_host_mem_size_mb = 128;
> +module_param(max_host_mem_size_mb, uint, 0444);
> +MODULE_PARM_DESC(max_host_mem_size_mb,
> + "Maximum Host Memory Buffer (HMB) size per controller (in MiB)");
> +
> static struct workqueue_struct *nvme_workq;
>
> struct nvme_dev;
> @@ -104,10 +109,18 @@ struct nvme_dev {
> u32 cmbloc;
> struct nvme_ctrl ctrl;
> struct completion ioq_wait;
> +
> + /* shadow doorbell buffer support: */
> u32 *dbbuf_dbs;
> dma_addr_t dbbuf_dbs_dma_addr;
> u32 *dbbuf_eis;
> dma_addr_t dbbuf_eis_dma_addr;
> +
> + /* host memory buffer support: */
> + u64 host_mem_size;
> + u32 nr_host_mem_descs;
> + struct nvme_host_mem_buf_desc *host_mem_descs;
> + void **host_mem_desc_bufs;
> };
>
> static inline unsigned int sq_idx(unsigned int qid, u32 stride)
> @@ -1509,6 +1522,160 @@ static inline void nvme_release_cmb(struct nvme_dev *dev)
> }
> }
>
> +static int nvme_set_host_mem(struct nvme_dev *dev, u32 bits)
> +{
> + size_t len = dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs);
> + struct nvme_command c;
> + u64 dma_addr;
> + int ret;
> +
> + dma_addr = dma_map_single(dev->dev, dev->host_mem_descs, len,
> + DMA_TO_DEVICE);
> + if (dma_mapping_error(dev->dev, dma_addr))
> + return -ENOMEM;
> +
> + memset(&c, 0, sizeof(c));
> + c.features.opcode = nvme_admin_set_features;
> + c.features.fid = cpu_to_le32(NVME_FEAT_HOST_MEM_BUF);
> + c.features.dword11 = cpu_to_le32(bits);
> + c.features.dword12 = cpu_to_le32(dev->host_mem_size /
> + dev->ctrl.page_size);
> + c.features.dword13 = cpu_to_le32(lower_32_bits(dma_addr));
> + c.features.dword14 = cpu_to_le32(upper_32_bits(dma_addr));
> + c.features.dword15 = cpu_to_le32(dev->nr_host_mem_descs);
> +
> + ret = nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0);
> + if (ret) {
> + dev_warn(dev->ctrl.device,
> + "failed to set host mem (err %d, flags %#x).\n",
> + ret, bits);
> + }
> + dma_unmap_single(dev->dev, dma_addr, len, DMA_TO_DEVICE);
> + return ret;
> +}
> +
> +static void nvme_free_host_mem(struct nvme_dev *dev)
> +{
> + int i;
> +
> + for (i = 0; i < dev->nr_host_mem_descs; i++) {
> + struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i];
> + size_t size = le32_to_cpu(desc->size) * dev->ctrl.page_size;
> +
> + dma_free_coherent(dev->dev, size, dev->host_mem_desc_bufs[i],
> + le64_to_cpu(desc->addr));
> + }
> +
> + kfree(dev->host_mem_desc_bufs);
> + dev->host_mem_desc_bufs = NULL;
> + kfree(dev->host_mem_descs);
> + dev->host_mem_descs = NULL;
> +}
> +
> +static int nvme_alloc_host_mem(struct nvme_dev *dev, u64 min, u64 preferred)
> +{
> + struct nvme_host_mem_buf_desc *descs;
> + u32 chunk_size, max_entries, i = 0;
> + void **bufs;
> + u64 size;
> +
> + /* start big and work our way down */
> + chunk_size = min(preferred, (u64)PAGE_SIZE << MAX_ORDER);
> +retry:
> + max_entries = DIV_ROUND_UP(preferred, chunk_size);
> + descs = kcalloc(max_entries, sizeof(*descs), GFP_KERNEL);
> + if (!descs)
> + goto out;
> +
> + bufs = kcalloc(max_entries, sizeof(*bufs), GFP_KERNEL);
> + if (!bufs)
> + goto out_free_descs;
> +
> + for (size = 0; size < preferred; size += chunk_size) {
> + u32 len = min_t(u64, chunk_size, preferred - size);
> + dma_addr_t dma_addr;
> +
> + bufs[i] = dma_alloc_attrs(dev->dev, len, &dma_addr, GFP_KERNEL,
> + DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
> + if (!bufs[i])
> + break;
> +
> + descs[i].addr = cpu_to_le64(dma_addr);
> + descs[i].size = cpu_to_le32(len / dev->ctrl.page_size);
> + i++;
> + }
> +
> + if (!size || (min && size < min)) {
> + dev_warn(dev->ctrl.device,
> + "failed to allocate host memory buffer.\n");
> + goto out_free_bufs;
> + }
> +
> + dev_info(dev->ctrl.device,
> + "allocated %lld MiB host memory buffer.\n", size / 1024 / 1024);
> + dev->nr_host_mem_descs = i;
> + dev->host_mem_size = size;
> + dev->host_mem_descs = descs;
> + dev->host_mem_desc_bufs = bufs;
> + return 0;
> +
> +out_free_bufs:
> + while (--i >= 0) {
> + struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i];
> + size_t size = le32_to_cpu(desc->size) * dev->ctrl.page_size;
> +
> + dma_free_coherent(dev->dev, size, dev->host_mem_desc_bufs[i],
> + le64_to_cpu(desc->addr));
> + }
> +
> + kfree(bufs);
> +out_free_descs:
> + kfree(descs);
> +out:
> + /* try a smaller chunk size if we failed early */
> + if (chunk_size >= PAGE_SIZE * 2 && (i == 0 || size < min)) {
> + chunk_size /= 2;
> + goto retry;
> + }
> + dev->host_mem_descs = NULL;
> + return -ENOMEM;
> +}
> +
> +static void nvme_setup_host_mem(struct nvme_dev *dev)
> +{
> + u64 max = (u64)max_host_mem_size_mb * 1024 * 1024;
> + u64 preferred = (u64)dev->ctrl.hmpre * 4096;
> + u64 min = (u64)dev->ctrl.hmmin * 4096;
> + u32 enable_bits = NVME_HOST_MEM_ENABLE;
> +
> + preferred = min(preferred, max);
> + if (min > max) {
Should it be:
if (min > preferred) ?
for more intuitive code without assumptions?
> + dev_warn(dev->ctrl.device,
> + "min host memory (%lld MiB) above limit (%d MiB).\n",
> + preferred / 1024 / 1024, max_host_mem_size_mb);
here we can:
preferred ==> min
max_host_mem_size_mb ==> preferred / 1024 / 1024 ?
other option is change the warning print from "preferred" to "min" since
we assume that dev->ctrl.hmpre >= dev->ctrl.hmmin.
your call.
Otherwise Looks good,
Reviewed-by: Max Gurtovoy <maxg at mellanox.com>
More information about the Linux-nvme
mailing list