[PATCH v3 07/10] nvme-pci: implement dma_token backed requests

Mon May 18 03:18:37 PDT 2026

On 5/18/2026 2:59 PM, Pavel Begunkov wrote:
>> FYI, I really want SGL support before this gets merged, but ignoring that
>> for now:
> 
> I was hoping to let the Samsung guys send the follow-up they already have,
> but I'll ask them about taking it into this patch set.

I have prepared patches on top of v3 that add SGL support and a PRP list
reuse optimization for the dmabuf path.
Branch: https://github.com/SamsungDS/linux/commits/rw-dmabuf-v3-nvme-opt/

Also pasting the SGL patch here for quick reference:

Subject: [PATCH 1/2] nvme-pci: add sgl support for dmabuf path

Handle dmabuf-backed requests through the SGL setup path too.
Use the cached dmabuf sg_table and keep PRP fallback where allowed.

Signed-off-by: Anuj Gupta <anuj20.g at samsung.com>
---
  drivers/nvme/host/pci.c | 194 +++++++++++++++++++++++++++++++++++++++-
  1 file changed, 193 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 0a49c94dd675..31e37ab8769b 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -1270,6 +1270,14 @@ static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
         sge->type = NVME_SGL_FMT_DATA_DESC << 4;
  }

+static void nvme_pci_sgl_set_data_addr(struct nvme_sgl_desc *sge,
+               dma_addr_t addr, u32 len)
+{ /* Fill @sge as a data descriptor for @len bytes starting at @addr. */
+       sge->addr = cpu_to_le64(addr); /* descriptor fields are stored little-endian */
+       sge->length = cpu_to_le32(len);
+       sge->type = NVME_SGL_FMT_DATA_DESC << 4; /* same type value nvme_pci_sgl_set_data() writes */
+}
+
  static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
                 dma_addr_t dma_addr, int entries)
  {
@@ -1321,6 +1329,176 @@ static blk_status_t nvme_pci_setup_data_sgl(struct request *req,
         return iter->status;
  }

+static unsigned int nvme_pci_dmabuf_sgl_nents(struct request *req,
+               dma_addr_t *first_dma, u32 *first_len)
+{ /* Count the DMA-contiguous runs that cover the request payload. */
+       struct bio *bio = req->bio;
+       struct nvme_dmabuf_map *map;
+       struct scatterlist *sg;
+       unsigned long tmp;
+       size_t offset = bio->bi_iter.bi_bvec_done; /* bytes to skip at the start of the sg table */
+       size_t remaining = blk_rq_payload_bytes(req); /* payload bytes not yet accounted for */
+       dma_addr_t last_end = 0; /* address just past the previous chunk */
+       unsigned int nents = 0;
+       dma_addr_t dma = 0; /* start of the first run */
+       u32 len = 0; /* length of the first run */
+       bool have = false; /* true once at least one chunk was seen */
+
+       map = container_of(bio->dmabuf_map, struct nvme_dmabuf_map, base);
+
+       for_each_sgtable_dma_sg(map->sgt, sg, tmp) {
+               size_t sg_len = sg_dma_len(sg);
+               dma_addr_t addr = sg_dma_address(sg);
+
+               if (!remaining) /* payload fully covered */
+                       break;
+               if (offset >= sg_len) { /* entry lies entirely before the start offset */
+                       offset -= sg_len;
+                       continue;
+               }
+
+               addr += offset; /* start part-way into this entry */
+               sg_len -= offset;
+               offset = 0;
+
+               while (sg_len && remaining) { /* one pass: chunk uses up sg_len or remaining */
+                       u32 chunk = min_t(size_t, remaining, sg_len);
+
+                       if (!have || last_end != addr) { /* not adjacent to previous chunk: new run */
+                               nents++;
+                               if (nents == 1) {
+                                       dma = addr;
+                                       len = chunk;
+                               }
+                       } else if (nents == 1) { /* adjacent and still in the first run: grow it */
+                               len += chunk;
+                       }
+
+                       have = true;
+                       last_end = addr + chunk;
+                       addr += chunk;
+                       sg_len -= chunk;
+                       remaining -= chunk;
+               }
+       }
+
+       if (unlikely(remaining)) /* sg table ended before the payload was covered */
+               return 0;
+
+       *first_dma = dma; /* outputs are written only on this path */
+       *first_len = len;
+       return nents; /* stays 0 if no chunk was counted */
+}
+
+static unsigned int nvme_pci_dmabuf_avg_seg_size(struct request *req)
+{ /* Average payload bytes per DMA-contiguous run, rounded up. */
+       dma_addr_t first_dma; /* filled by the helper, not used here */
+       u32 first_len; /* filled by the helper, not used here */
+       unsigned int nseg;
+
+       nseg = nvme_pci_dmabuf_sgl_nents(req, &first_dma, &first_len);
+       if (!nseg) /* no runs counted; also avoids dividing by zero */
+               return 0;
+       return DIV_ROUND_UP(blk_rq_payload_bytes(req), nseg);
+}
+
+static blk_status_t nvme_rq_setup_dmabuf_sgl(struct request *req,
+                                            struct nvme_queue *nvmeq)
+{ /* Describe the request payload with SGL descriptors built from map->sgt. */
+       struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+       struct bio *bio = req->bio;
+       struct nvme_dmabuf_map *map;
+       size_t length = blk_rq_payload_bytes(req);
+       struct nvme_sgl_desc *sg_list = NULL;
+       dma_addr_t sgl_dma = 0, first_dma, last_end = 0;
+       unsigned int entries, mapped = 0; /* runs counted up front / descriptors written */
+       unsigned long tmp;
+       struct scatterlist *sg;
+       size_t offset, remaining;
+       u32 first_len;
+       bool have = false; /* true once at least one chunk was written */
+
+       map = container_of(bio->dmabuf_map, struct nvme_dmabuf_map, base);
+
+       entries = nvme_pci_dmabuf_sgl_nents(req, &first_dma, &first_len);
+       if (!entries) /* no runs counted, or the sg table does not cover the payload */
+               return BLK_STS_IOERR;
+       if (entries > NVME_MAX_SEGS) /* nvme_map_data() treats AGAIN as "SGL not usable" */
+               return BLK_STS_AGAIN;
+
+       iod->cmd.common.flags = NVME_CMD_SGL_METABUF;
+       iod->total_len = length;
+
+       nvme_sync_dma(nvmeq->dev, req, false);
+
+       if (entries == 1) { /* single run: descriptor goes directly into the command */
+               nvme_pci_sgl_set_data_addr(&iod->cmd.common.dptr.sgl, 
first_dma,
+                                          first_len);
+               return BLK_STS_OK;
+       }
+
+       if (entries <= NVME_SMALL_POOL_SIZE / sizeof(*sg_list)) /* list fits in NVME_SMALL_POOL_SIZE bytes */
+               iod->flags |= IOD_SMALL_DESCRIPTOR;
+
+       sg_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
+                                &sgl_dma);
+       if (!sg_list)
+               return BLK_STS_RESOURCE;
+       iod->descriptors[iod->nr_descriptors++] = sg_list; /* undone at err_free */
+
+       offset = bio->bi_iter.bi_bvec_done; /* bytes to skip at the start of the sg table */
+       remaining = length;
+
+       for_each_sgtable_dma_sg(map->sgt, sg, tmp) { /* same walk as nvme_pci_dmabuf_sgl_nents() */
+               size_t sg_len = sg_dma_len(sg);
+               dma_addr_t addr = sg_dma_address(sg);
+
+               if (!remaining) /* payload fully described */
+                       break;
+               if (offset >= sg_len) { /* entry lies entirely before the start offset */
+                       offset -= sg_len;
+                       continue;
+               }
+
+               addr += offset;
+               sg_len -= offset;
+               offset = 0;
+
+               while (sg_len && remaining) { /* one pass: chunk uses up sg_len or remaining */
+                       u32 chunk = min_t(size_t, remaining, sg_len);
+
+                       if (have && last_end == addr) { /* adjacent: extend the last descriptor */
+                               u32 old = le32_to_cpu(sg_list[mapped - 
1].length);
+
+                               sg_list[mapped - 1].length =
+                                       cpu_to_le32(old + chunk);
+                       } else {
+                               if (WARN_ON_ONCE(mapped == entries)) /* more runs than counted above */
+                                       goto err_free;
+ 
nvme_pci_sgl_set_data_addr(&sg_list[mapped++],
+                                                          addr, chunk);
+                       }
+
+                       have = true;
+                       last_end = addr + chunk;
+                       addr += chunk;
+                       sg_len -= chunk;
+                       remaining -= chunk;
+               }
+       }
+
+       if (unlikely(remaining)) /* sg table ended before the payload was covered */
+               goto err_free;
+
+       nvme_pci_sgl_set_seg(&iod->cmd.common.dptr.sgl, sgl_dma, mapped); /* command references the list */
+       return BLK_STS_OK;
+
+err_free: /* drop the descriptor slot taken above and return the list to its pool */
+       iod->nr_descriptors--;
+       dma_pool_free(nvme_dma_pool(nvmeq, iod), sg_list, sgl_dma);
+       return BLK_STS_IOERR;
+}
+
  static blk_status_t nvme_pci_setup_data_simple(struct request *req,
                 enum nvme_use_sgl use_sgl)
  {
@@ -1369,8 +1547,22 @@ static blk_status_t nvme_map_data(struct request *req)
         struct blk_dma_iter iter;
         blk_status_t ret;

-       if (nvme_rq_is_dmabuf_attached(req))
+       if (nvme_rq_is_dmabuf_attached(req)) {
+               if (use_sgl == SGL_FORCED) {
+                       ret = nvme_rq_setup_dmabuf_sgl(req, nvmeq);
+                       /* Regular path doesn't fall back if SGLs are forced. */
+                       return ret == BLK_STS_AGAIN ? BLK_STS_IOERR : ret;
+               }
+
+               if (use_sgl == SGL_SUPPORTED && sgl_threshold &&
+                   nvme_pci_dmabuf_avg_seg_size(req) >= sgl_threshold) {
+                       ret = nvme_rq_setup_dmabuf_sgl(req, nvmeq);
+                       if (ret != BLK_STS_AGAIN)
+                               return ret;
+               }
+
                 return nvme_rq_setup_dmabuf_map(req, nvmeq);
+       }

         /*
          * Try to skip the DMA iterator for single segment requests, as that
--
2.43.0