[PATCH v3 07/10] nvme-pci: implement dma_token backed requests
Anuj Gupta/Anuj Gupta
anuj20.g at samsung.com
Mon May 18 03:18:37 PDT 2026
On 5/18/2026 2:59 PM, Pavel Begunkov wrote:
>> FYI, I really want SGL support before this gets merged, but ignoring that
>> for now:
>
> I was hoping to let the Samsung guys send a follow-up they already have,
> but I'll ask them about taking it into this patch set.
I have written patches on top of v3 that add SGL support and a PRP list
reuse optimization for the dmabuf path.
Branch: https://github.com/SamsungDS/linux/commits/rw-dmabuf-v3-nvme-opt/
Also pasting the SGL patch here for quick reference:
Subject: [PATCH 1/2] nvme-pci: add sgl support for dmabuf path
Handle dmabuf-backed requests through the SGL setup path too.
Use the cached dmabuf sg_table and keep PRP fallback where allowed.
Signed-off-by: Anuj Gupta <anuj20.g at samsung.com>
---
drivers/nvme/host/pci.c | 194 +++++++++++++++++++++++++++++++++++++++-
1 file changed, 193 insertions(+), 1 deletion(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 0a49c94dd675..31e37ab8769b 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1270,6 +1270,14 @@ static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
sge->type = NVME_SGL_FMT_DATA_DESC << 4;
}
+static void nvme_pci_sgl_set_data_addr(struct nvme_sgl_desc *sge,
+ dma_addr_t addr, u32 len)
+{ /* Fill *sge as a data descriptor for a raw DMA address and byte length. */
+ sge->addr = cpu_to_le64(addr); /* address is stored little-endian */
+ sge->length = cpu_to_le32(len); /* length is stored little-endian */
+ sge->type = NVME_SGL_FMT_DATA_DESC << 4; /* format code shifted into bits 4 and up */
+}
+
static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
dma_addr_t dma_addr, int entries)
{
@@ -1321,6 +1329,176 @@ static blk_status_t nvme_pci_setup_data_sgl(struct request *req,
return iter->status;
}
+static unsigned int nvme_pci_dmabuf_sgl_nents(struct request *req,
+ dma_addr_t *first_dma, u32 *first_len)
+{ /* Count address-contiguous DMA runs covering the payload; returns 0 if the sg table is too short. */
+ struct bio *bio = req->bio;
+ struct nvme_dmabuf_map *map;
+ struct scatterlist *sg;
+ unsigned long tmp;
+ size_t offset = bio->bi_iter.bi_bvec_done; /* bytes of the sg table to skip before counting */
+ size_t remaining = blk_rq_payload_bytes(req); /* payload bytes still to be covered */
+ dma_addr_t last_end = 0; /* end address of the previous chunk */
+ unsigned int nents = 0; /* number of runs found so far */
+ dma_addr_t dma = 0; /* start address of the first run */
+ u32 len = 0; /* accumulated length of the first run */
+ bool have = false; /* true once at least one chunk was seen */
+
+ map = container_of(bio->dmabuf_map, struct nvme_dmabuf_map, base);
+
+ for_each_sgtable_dma_sg(map->sgt, sg, tmp) {
+ size_t sg_len = sg_dma_len(sg);
+ dma_addr_t addr = sg_dma_address(sg);
+
+ if (!remaining) /* payload fully covered */
+ break;
+ if (offset >= sg_len) { /* entry lies entirely before the start offset */
+ offset -= sg_len;
+ continue;
+ }
+
+ addr += offset; /* start partway into this entry */
+ sg_len -= offset;
+ offset = 0;
+
+ while (sg_len && remaining) {
+ u32 chunk = min_t(size_t, remaining, sg_len);
+
+ if (!have || last_end != addr) { /* first chunk, or not adjacent to the previous one: new run */
+ nents++;
+ if (nents == 1) {
+ dma = addr;
+ len = chunk;
+ }
+ } else if (nents == 1) { /* adjacent chunk that extends the first run */
+ len += chunk;
+ }
+
+ have = true;
+ last_end = addr + chunk;
+ addr += chunk;
+ sg_len -= chunk;
+ remaining -= chunk;
+ }
+ }
+
+ if (unlikely(remaining)) /* sg table ended before the payload was covered */
+ return 0;
+
+ *first_dma = dma; /* outputs are written only on the success path */
+ *first_len = len;
+ return nents;
+}
+
+static unsigned int nvme_pci_dmabuf_avg_seg_size(struct request *req)
+{ /* Average payload bytes per contiguous DMA run, rounded up; 0 if no run count is available. */
+ dma_addr_t first_dma;
+ u32 first_len;
+ unsigned int nseg;
+
+ nseg = nvme_pci_dmabuf_sgl_nents(req, &first_dma, &first_len); /* first_dma/first_len are not used here */
+ if (!nseg) /* also avoids dividing by zero below */
+ return 0;
+ return DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
+}
+
+static blk_status_t nvme_rq_setup_dmabuf_sgl(struct request *req,
+ struct nvme_queue *nvmeq)
+{ /* Describe the request payload with SGL descriptors built from the dmabuf map's DMA sg table. */
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ struct bio *bio = req->bio;
+ struct nvme_dmabuf_map *map;
+ size_t length = blk_rq_payload_bytes(req);
+ struct nvme_sgl_desc *sg_list = NULL;
+ dma_addr_t sgl_dma = 0, first_dma, last_end = 0;
+ unsigned int entries, mapped = 0; /* expected run count / descriptors written so far */
+ unsigned long tmp;
+ struct scatterlist *sg;
+ size_t offset, remaining;
+ u32 first_len;
+ bool have = false; /* true once at least one chunk was emitted */
+
+ map = container_of(bio->dmabuf_map, struct nvme_dmabuf_map, base);
+
+ entries = nvme_pci_dmabuf_sgl_nents(req, &first_dma, &first_len);
+ if (!entries) /* sg table does not cover the payload */
+ return BLK_STS_IOERR;
+ if (entries > NVME_MAX_SEGS) /* too many runs; nvme_map_data() treats AGAIN as "try PRPs" unless SGLs are forced */
+ return BLK_STS_AGAIN;
+
+ iod->cmd.common.flags = NVME_CMD_SGL_METABUF;
+ iod->total_len = length;
+
+ nvme_sync_dma(nvmeq->dev, req, false);
+
+ if (entries == 1) { /* single run: describe it inline in the command, no list allocation */
+ nvme_pci_sgl_set_data_addr(&iod->cmd.common.dptr.sgl, first_dma,
+ first_len);
+ return BLK_STS_OK;
+ }
+
+ if (entries <= NVME_SMALL_POOL_SIZE / sizeof(*sg_list)) /* list fits in NVME_SMALL_POOL_SIZE bytes */
+ iod->flags |= IOD_SMALL_DESCRIPTOR;
+
+ sg_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
+ &sgl_dma);
+ if (!sg_list)
+ return BLK_STS_RESOURCE;
+ iod->descriptors[iod->nr_descriptors++] = sg_list; /* record the list in the iod; err_free undoes this */
+
+ offset = bio->bi_iter.bi_bvec_done; /* bytes of the sg table to skip first */
+ remaining = length;
+
+ for_each_sgtable_dma_sg(map->sgt, sg, tmp) {
+ size_t sg_len = sg_dma_len(sg);
+ dma_addr_t addr = sg_dma_address(sg);
+
+ if (!remaining) /* payload fully described */
+ break;
+ if (offset >= sg_len) { /* entry lies entirely before the start offset */
+ offset -= sg_len;
+ continue;
+ }
+
+ addr += offset; /* start partway into this entry */
+ sg_len -= offset;
+ offset = 0;
+
+ while (sg_len && remaining) {
+ u32 chunk = min_t(size_t, remaining, sg_len);
+
+ if (have && last_end == addr) { /* adjacent to the previous chunk: grow the last descriptor */
+ u32 old = le32_to_cpu(sg_list[mapped - 1].length);
+
+ sg_list[mapped - 1].length =
+ cpu_to_le32(old + chunk);
+ } else {
+ if (WARN_ON_ONCE(mapped == entries)) /* would exceed the count computed up front */
+ goto err_free;
+ nvme_pci_sgl_set_data_addr(&sg_list[mapped++],
+ addr, chunk);
+ }
+
+ have = true;
+ last_end = addr + chunk;
+ addr += chunk;
+ sg_len -= chunk;
+ remaining -= chunk;
+ }
+ }
+
+ if (unlikely(remaining)) /* sg table ended before the payload was covered */
+ goto err_free;
+
+ nvme_pci_sgl_set_seg(&iod->cmd.common.dptr.sgl, sgl_dma, mapped); /* command references the list of `mapped` descriptors */
+ return BLK_STS_OK;
+
+err_free: /* drop the descriptor slot taken above and return the list to its pool */
+ iod->nr_descriptors--;
+ dma_pool_free(nvme_dma_pool(nvmeq, iod), sg_list, sgl_dma);
+ return BLK_STS_IOERR;
+}
+
static blk_status_t nvme_pci_setup_data_simple(struct request *req,
enum nvme_use_sgl use_sgl)
{
@@ -1369,8 +1547,22 @@ static blk_status_t nvme_map_data(struct request *req)
struct blk_dma_iter iter;
blk_status_t ret;
- if (nvme_rq_is_dmabuf_attached(req))
+ if (nvme_rq_is_dmabuf_attached(req)) {
+ if (use_sgl == SGL_FORCED) {
+ ret = nvme_rq_setup_dmabuf_sgl(req, nvmeq);
+ /* Regular path doesn't fall back if SGLs are forced. */
+ return ret == BLK_STS_AGAIN ? BLK_STS_IOERR : ret;
+ }
+
+ if (use_sgl == SGL_SUPPORTED && sgl_threshold &&
+ nvme_pci_dmabuf_avg_seg_size(req) >= sgl_threshold) {
+ ret = nvme_rq_setup_dmabuf_sgl(req, nvmeq);
+ if (ret != BLK_STS_AGAIN)
+ return ret;
+ }
+
return nvme_rq_setup_dmabuf_map(req, nvmeq);
+ }
/*
* Try to skip the DMA iterator for single segment requests, as that
--
2.43.0
More information about the Linux-nvme
mailing list