[PATCHv2 8/9] nvme-pci: add support for sgl metadata
Keith Busch
kbusch at meta.com
Wed Sep 4 11:38:16 PDT 2024
From: Keith Busch <kbusch at kernel.org>
Supporting this mode allows merging of requests with metadata, which
wouldn't be possible otherwise.
Signed-off-by: Keith Busch <kbusch at kernel.org>
---
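Note for reviewers (commentary, not part of the patch): the sketch below is a
standalone illustration of the descriptor list that nvme_pci_setup_meta_sgls()
builds when more than one integrity segment is mapped. The struct is a local
stand-in for the 16-byte nvme_sgl_desc and the DMA addresses are made up;
descriptor 0 is a last-segment descriptor pointing at the data descriptors
that follow it in the same pool allocation, and MPTR gets the address of
descriptor 0.

#include <stdint.h>
#include <stdio.h>

#define SGL_FMT_DATA		0x0	/* data block descriptor */
#define SGL_FMT_LAST_SEG	0x3	/* last segment descriptor */

struct sgl_desc {			/* stand-in for nvme_sgl_desc */
	uint64_t addr;
	uint32_t length;
	uint8_t  rsvd[3];
	uint8_t  type;			/* descriptor type in the high nibble */
};

int main(void)
{
	struct sgl_desc list[4];	/* one small-pool allocation */
	uint64_t sgl_dma = 0x1000;	/* pretend DMA address of list[] */
	uint64_t seg_addr[3] = { 0xa000, 0xb000, 0xc000 };
	uint32_t seg_len[3] = { 512, 512, 1024 };
	unsigned int entries = 3, i;

	/*
	 * Descriptor 0 is a last-segment descriptor covering the data
	 * descriptors that immediately follow it; MPTR gets sgl_dma, and
	 * the segment points at sgl_dma + sizeof(*list).
	 */
	list[0].addr = sgl_dma + sizeof(*list);
	list[0].length = entries * sizeof(*list);
	list[0].type = SGL_FMT_LAST_SEG << 4;

	for (i = 0; i < entries; i++) {
		list[i + 1].addr = seg_addr[i];
		list[i + 1].length = seg_len[i];
		list[i + 1].type = SGL_FMT_DATA << 4;
	}

	for (i = 0; i <= entries; i++)
		printf("desc %u: addr=0x%llx len=%u type=0x%02x\n", i,
		       (unsigned long long)list[i].addr, list[i].length,
		       list[i].type);
	return 0;
}

With entries == 1 the driver skips the segment descriptor entirely and inlines
a single data descriptor at MPTR.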
drivers/nvme/host/core.c | 4 +-
drivers/nvme/host/nvme.h | 5 ++
drivers/nvme/host/pci.c | 129 ++++++++++++++++++++++++++++++++++-----
include/linux/nvme.h | 1 +
4 files changed, 122 insertions(+), 17 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 053d5b4909cda..6a8411ef45f26 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1988,7 +1988,9 @@ static void nvme_set_ctrl_limits(struct nvme_ctrl *ctrl,
lim->max_hw_sectors = ctrl->max_hw_sectors;
lim->max_segments = min_t(u32, USHRT_MAX,
min_not_zero(nvme_max_drv_segments(ctrl), ctrl->max_segments));
- lim->max_integrity_segments = ctrl->max_integrity_segments;
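+ /* without metadata SGL support, only one integrity segment can be mapped */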
+ if (lim->max_integrity_segments > 1 &&
+ !nvme_ctrl_meta_sgl_supported(ctrl))
+ lim->max_integrity_segments = 1;
lim->virt_boundary_mask = NVME_CTRL_PAGE_SIZE - 1;
lim->max_segment_size = UINT_MAX;
lim->dma_alignment = 3;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index f900e44243aef..699cc36e596fa 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -1121,6 +1121,11 @@ static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl)
return ctrl->sgls & ((1 << 0) | (1 << 1));
}
+static inline bool nvme_ctrl_meta_sgl_supported(struct nvme_ctrl *ctrl)
+{
+ return ctrl->sgls & NVME_CTRL_SGLS_MPTR;
+}
+
#ifdef CONFIG_NVME_HOST_AUTH
int __init nvme_init_auth(void);
void __exit nvme_exit_auth(void);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index db0144a6bc5db..a0a10451d7da8 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -43,6 +43,7 @@
*/
#define NVME_MAX_KB_SZ 8192
#define NVME_MAX_SEGS 128
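+/* the 256-byte small pool holds 16 SGL descriptors: 1 segment + 15 data */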
+#define NVME_MAX_META_SEGS 15
#define NVME_MAX_NR_ALLOCATIONS 5
static int use_threaded_interrupts;
@@ -143,6 +144,7 @@ struct nvme_dev {
bool hmb;
mempool_t *iod_mempool;
+ mempool_t *iod_meta_mempool;
/* shadow doorbell buffer support: */
__le32 *dbbuf_dbs;
@@ -237,6 +239,8 @@ struct nvme_iod {
dma_addr_t first_dma;
dma_addr_t meta_dma;
struct sg_table sgt;
+ struct sg_table meta_sgt;
+ union nvme_descriptor meta_list;
union nvme_descriptor list[NVME_MAX_NR_ALLOCATIONS];
};
@@ -516,6 +520,11 @@ static inline bool nvme_pci_sgl_capable(struct nvme_dev *dev,
return sgl_threshold;
}
+static inline bool nvme_pci_metadata_use_sgls(struct request *req)
+{
+ return blk_rq_integrity_segments(req) > 1;
+}
+
static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
int nseg)
{
@@ -525,6 +534,8 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req,
if (!nvme_pci_sgl_capable(dev, req))
return false;
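+ /* a metadata SGL requires the data transfer to use SGLs as well */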
+ if (nvme_pci_metadata_use_sgls(req))
+ return true;
return avg_seg_size >= sgl_threshold;
}
@@ -834,19 +845,81 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req)
return __nvme_map_data(dev, req);
}
-static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req,
- struct nvme_command *cmnd)
+static blk_status_t nvme_pci_setup_meta_sgls(struct nvme_dev *dev,
+ struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct nvme_rw_command *cmnd = &iod->cmd.rw;
+ struct nvme_sgl_desc *sg_list;
+ struct scatterlist *sg;
+ unsigned int entries;
+ dma_addr_t sgl_dma;
+ int rc, i;
+
+ iod->meta_sgt.sgl = mempool_alloc(dev->iod_meta_mempool, GFP_ATOMIC);
+ if (!iod->meta_sgt.sgl)
+ return BLK_STS_RESOURCE;
+
+ sg_init_table(iod->meta_sgt.sgl, req->nr_integrity_segments);
+ iod->meta_sgt.orig_nents = blk_rq_map_integrity_sg(req,
+ iod->meta_sgt.sgl);
+ if (!iod->meta_sgt.orig_nents)
+ goto out_free_sg;
+
+ rc = dma_map_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req),
+ DMA_ATTR_NO_WARN);
+ if (rc)
+ goto out_free_sg;
+
+ sg_list = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, &sgl_dma);
+ if (!sg_list)
+ goto out_unmap_sg;
+
+ entries = iod->meta_sgt.nents;
+ iod->meta_list.sg_list = sg_list;
+ iod->meta_dma = sgl_dma;
+
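+ /* MPTR carries the DMA address of the descriptor list, flagged as a segment */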
+ cmnd->flags = NVME_CMD_SGL_METASEG;
+ cmnd->metadata = cpu_to_le64(sgl_dma);
+
+ sg = iod->meta_sgt.sgl;
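+ /* a single mapped segment is inlined as one data descriptor */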
+ if (entries == 1) {
+ nvme_pci_sgl_set_data(sg_list, sg);
+ return BLK_STS_OK;
+ }
+
+ sgl_dma += sizeof(*sg_list);
+ nvme_pci_sgl_set_seg(sg_list, sgl_dma, entries);
+ for (i = 1; i <= entries; i++) {
+ nvme_pci_sgl_set_data(&sg_list[i], sg);
+ sg = sg_next(sg);
+ }
+ return BLK_STS_OK;
+
+out_unmap_sg:
+ dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
+out_free_sg:
+ mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
+ return BLK_STS_RESOURCE;
+}
+
+static blk_status_t nvme_pci_setup_meta_mptr(struct nvme_dev *dev,
+ struct request *req)
+{
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct nvme_rw_command *cmnd = &iod->cmd.rw;
struct bio_vec bv = rq_integrity_vec(req);
iod->meta_dma = dma_map_bvec(dev->dev, &bv, rq_dma_dir(req), 0);
if (dma_mapping_error(dev->dev, iod->meta_dma))
return BLK_STS_IOERR;
- cmnd->rw.metadata = cpu_to_le64(iod->meta_dma);
+ cmnd->metadata = cpu_to_le64(iod->meta_dma);
return BLK_STS_OK;
}
+static blk_status_t nvme_map_metadata(struct nvme_dev *dev, struct request *req)
+{
+ if (nvme_pci_metadata_use_sgls(req))
+ return nvme_pci_setup_meta_sgls(dev, req);
+ return nvme_pci_setup_meta_mptr(dev, req);
+}
+
static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -855,6 +928,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
iod->aborted = false;
iod->nr_allocations = -1;
iod->sgt.nents = 0;
+ iod->meta_sgt.nents = 0;
ret = nvme_setup_cmd(req->q->queuedata, req);
if (ret)
@@ -867,7 +941,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
}
if (blk_integrity_rq(req)) {
- ret = nvme_map_metadata(dev, req, &iod->cmd);
+ ret = nvme_map_metadata(dev, req);
if (ret)
goto out_unmap_data;
}
@@ -971,17 +1045,31 @@ static void nvme_queue_rqs(struct request **rqlist)
*rqlist = requeue_list;
}
+static __always_inline void nvme_unmap_metadata(struct nvme_dev *dev,
+ struct request *req)
+{
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+
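+ /* nents == 0 means the single-segment MPTR path was used */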
+ if (!iod->meta_sgt.nents) {
+ dma_unmap_page(dev->dev, iod->meta_dma,
+ rq_integrity_vec(req).bv_len,
+ rq_dma_dir(req));
+ return;
+ }
+
+ dma_pool_free(dev->prp_small_pool, iod->meta_list.sg_list,
+ iod->meta_dma);
+ dma_unmap_sgtable(dev->dev, &iod->meta_sgt, rq_dma_dir(req), 0);
+ mempool_free(iod->meta_sgt.sgl, dev->iod_meta_mempool);
+}
+
static __always_inline void nvme_pci_unmap_rq(struct request *req)
{
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct nvme_dev *dev = nvmeq->dev;
- if (blk_integrity_rq(req)) {
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
-
- dma_unmap_page(dev->dev, iod->meta_dma,
- rq_integrity_vec(req).bv_len, rq_dma_dir(req));
- }
+ if (blk_integrity_rq(req))
+ nvme_unmap_metadata(dev, req);
if (blk_rq_nr_phys_segments(req))
nvme_unmap_data(dev, req);
@@ -2718,6 +2806,7 @@ static void nvme_release_prp_pools(struct nvme_dev *dev)
static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
{
+ size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1);
size_t alloc_size = sizeof(struct scatterlist) * NVME_MAX_SEGS;
dev->iod_mempool = mempool_create_node(1,
@@ -2726,7 +2815,18 @@ static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
dev_to_node(dev->dev));
if (!dev->iod_mempool)
return -ENOMEM;
+
+ dev->iod_meta_mempool = mempool_create_node(1,
+ mempool_kmalloc, mempool_kfree,
+ (void *)meta_size, GFP_KERNEL,
+ dev_to_node(dev->dev));
+ if (!dev->iod_meta_mempool)
+ goto free;
+
return 0;
+free:
+ mempool_destroy(dev->iod_mempool);
+ return -ENOMEM;
}
static void nvme_free_tagset(struct nvme_dev *dev)
@@ -3046,12 +3146,7 @@ static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
dev->ctrl.max_hw_sectors = min_t(u32,
NVME_MAX_KB_SZ << 1, dma_opt_mapping_size(&pdev->dev) >> 9);
dev->ctrl.max_segments = NVME_MAX_SEGS;
-
- /*
- * There is no support for SGLs for metadata (yet), so we are limited to
- * a single integrity segment for the separate metadata pointer.
- */
- dev->ctrl.max_integrity_segments = 1;
+ dev->ctrl.max_integrity_segments = NVME_MAX_META_SEGS;
return dev;
out_put_device:
@@ -3155,6 +3250,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
nvme_free_queues(dev, 0);
out_release_iod_mempool:
mempool_destroy(dev->iod_mempool);
+ mempool_destroy(dev->iod_meta_mempool);
out_release_prp_pools:
nvme_release_prp_pools(dev);
out_dev_unmap:
@@ -3220,6 +3316,7 @@ static void nvme_remove(struct pci_dev *pdev)
nvme_dbbuf_dma_free(dev);
nvme_free_queues(dev, 0);
mempool_destroy(dev->iod_mempool);
+ mempool_destroy(dev->iod_meta_mempool);
nvme_release_prp_pools(dev);
nvme_dev_unmap(dev);
nvme_uninit_ctrl(&dev->ctrl);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 7b2ae2e435447..87ea1673c60b8 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -388,6 +388,7 @@ enum {
NVME_CTRL_CTRATT_PREDICTABLE_LAT = 1 << 5,
NVME_CTRL_CTRATT_NAMESPACE_GRANULARITY = 1 << 7,
NVME_CTRL_CTRATT_UUID_LIST = 1 << 9,
+ NVME_CTRL_SGLS_MPTR = 1 << 18,
};
struct nvme_lbaf {
--
2.43.5