[PATCH] nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping
Klara Modin
klarasmodin at gmail.com
Thu Jul 10 17:25:12 PDT 2025
Hi,
On 2025-07-07 14:52:23 +0200, Christoph Hellwig wrote:
> The current version of the blk_rq_dma_map support in nvme-pci tries to
> reconstruct the DMA mappings from the on the wire descriptors if they
> are needed for unmapping. While this is not the case for the direct
> mapping fast path and the IOVA path, it is needed for the non-IOVA slow
> path, e.g. when the interconnect is not dma coherent, when using
> swiotlb bounce buffering, or when an IOMMU mapping can't coalesce.
>
> While the reconstruction is easy and works fine for the SGL path, where
> the on the wire representation maps 1:1 to DMA mappings, the code to
> reconstruct the DMA mapping ranges from PRPs can't always work, as a
> given PRP layout can come from different DMA mappings, and the current
> code doesn't even always get that right.
>
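To make the ambiguity concrete: a single 8 KiB DMA mapping and two
adjacent 4 KiB mappings produce exactly the same PRP entries, so the
unmap granularity cannot be recovered from the wire format alone. A
purely illustrative userspace sketch (helper name and numbers made up,
not kernel code):

/* Two different DMA mapping layouts that yield identical PRP entries,
 * so the original mapping boundaries cannot be recovered from the PRPs
 * alone. */
#include <stdio.h>
#include <stdint.h>

#define NVME_CTRL_PAGE_SIZE 4096

static void build_prps(const uint64_t *addr, const uint32_t *len, int nr)
{
	for (int i = 0; i < nr; i++)
		for (uint32_t off = 0; off < len[i]; off += NVME_CTRL_PAGE_SIZE)
			printf(" %#llx", (unsigned long long)(addr[i] + off));
	printf("\n");
}

int main(void)
{
	uint64_t one_addr[] = { 0x10000 };            /* one 8 KiB mapping  */
	uint32_t one_len[]  = { 8192 };
	uint64_t two_addr[] = { 0x10000, 0x11000 };   /* two 4 KiB mappings */
	uint32_t two_len[]  = { 4096, 4096 };

	build_prps(one_addr, one_len, 1);  /* prints: 0x10000 0x11000 */
	build_prps(two_addr, two_len, 2);  /* prints: 0x10000 0x11000 */
	return 0;
}
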
> Give up on this approach and instead track the actual DMA mappings when
> they are needed for unmapping.
>
> Fixes: 7ce3c1dd78fc ("nvme-pci: convert the data mapping to blk_rq_dma_map")
> Reported-by: Ben Copeland <ben.copeland at linaro.org>
> Signed-off-by: Christoph Hellwig <hch at lst.de>
This patch seems to introduce a memory leak, causing slab usage to grow
continuously. The easiest way I found to trigger it is something along
the lines of:
# dd if=/dev/nvme0n1 of=/dev/null bs=8M
It also seems to happen during normal use but much more slowly.
Reverting fixes the issue.
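For what it's worth, the growth is also easy to watch live while the dd
is running, e.g. with something like (assuming memory allocation
profiling is enabled, as for the allocinfo output below):
# watch -d 'grep nvme_pci_setup_data_prp /proc/allocinfo'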
# cat /proc/allocinfo | sort -n | numfmt --invalid=ignore --to=iec-i | tail -n5
146Mi 5285 mm/memory.c:4977 func:alloc_anon_folio
182Mi 24395 mm/shmem.c:1847 func:shmem_alloc_folio
420Mi 38746 mm/readahead.c:186 func:ractl_alloc_folio
29Gi 7383484 drivers/nvme/host/pci.c:767 func:nvme_pci_setup_data_prp
29Gi 937347 mm/slub.c:2487 func:alloc_slab_page
# slabtop --human --once | head
Active / Total Objects (% used) : 8966529 / 9031691 (99.3%)
Active / Total Slabs (% used) : 971257 / 971257 (100.0%)
Active / Total Caches (% used) : 128 / 211 (60.7%)
Active / Total Size (% used) : 28Gi / 28Gi (99.9%)
Minimum / Average / Maximum Object : 0.01K / 3.32K / 8.00K
OBJS ACTIVE USE OBJ SIZE SLABS OBJ/SLAB CACHE SIZE NAME
7410112 7410112 100% 4.00K 926264 8 28Gi kmalloc-4k
929920 929920 100% 0.12K 29060 32 113Mi kmalloc-128
81744 80996 99% 0.10K 2096 39 8.2Mi btrfs_free_space
My machine has 32 GiB of memory, and if I leave it running like this it
eventually locks up and sometimes panics.
The controller does not report SGL support, so I/O should be going
through the PRP path here:
# nvme id-ctrl /dev/nvme0 | grep sg
sgls : 0
Let me know if there's anything else you need.
Regards,
Klara Modin
> ---
> drivers/nvme/host/pci.c | 114 ++++++++++++++++++++++------------------
> 1 file changed, 62 insertions(+), 52 deletions(-)
>
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 6a7a8bdbf385..6af184f2b73b 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -173,6 +173,7 @@ struct nvme_dev {
> bool hmb;
> struct sg_table *hmb_sgt;
>
> + mempool_t *dmavec_mempool;
> mempool_t *iod_meta_mempool;
>
> /* shadow doorbell buffer support: */
> @@ -262,6 +263,11 @@ enum nvme_iod_flags {
> IOD_SINGLE_SEGMENT = 1U << 2,
> };
>
> +struct nvme_dma_vec {
> + dma_addr_t addr;
> + unsigned int len;
> +};
> +
> /*
> * The nvme_iod describes the data in an I/O.
> */
> @@ -274,6 +280,8 @@ struct nvme_iod {
> unsigned int total_len;
> struct dma_iova_state dma_state;
> void *descriptors[NVME_MAX_NR_DESCRIPTORS];
> + struct nvme_dma_vec *dma_vecs;
> + unsigned int nr_dma_vecs;
>
> dma_addr_t meta_dma;
> struct sg_table meta_sgt;
> @@ -674,44 +682,12 @@ static void nvme_free_prps(struct request *req)
> {
> struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
> - struct device *dma_dev = nvmeq->dev->dev;
> - enum dma_data_direction dir = rq_dma_dir(req);
> - int length = iod->total_len;
> - dma_addr_t dma_addr;
> - int i, desc;
> - __le64 *prp_list;
> - u32 dma_len;
> -
> - dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp1);
> - dma_len = min_t(u32, length,
> - NVME_CTRL_PAGE_SIZE - (dma_addr & (NVME_CTRL_PAGE_SIZE - 1)));
> - length -= dma_len;
> - if (!length) {
> - dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
> - return;
> - }
> -
> - if (length <= NVME_CTRL_PAGE_SIZE) {
> - dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
> - dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp2);
> - dma_unmap_page(dma_dev, dma_addr, length, dir);
> - return;
> - }
> -
> - i = 0;
> - desc = 0;
> - prp_list = iod->descriptors[desc];
> - do {
> - dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
> - if (i == NVME_CTRL_PAGE_SIZE >> 3) {
> - prp_list = iod->descriptors[++desc];
> - i = 0;
> - }
> + unsigned int i;
>
> - dma_addr = le64_to_cpu(prp_list[i++]);
> - dma_len = min(length, NVME_CTRL_PAGE_SIZE);
> - length -= dma_len;
> - } while (length);
> + for (i = 0; i < iod->nr_dma_vecs; i++)
> + dma_unmap_page(nvmeq->dev->dev, iod->dma_vecs[i].addr,
> + iod->dma_vecs[i].len, rq_dma_dir(req));
> + mempool_free(iod->dma_vecs, nvmeq->dev->dmavec_mempool);
> }
>
> static void nvme_free_sgls(struct request *req)
> @@ -760,6 +736,23 @@ static void nvme_unmap_data(struct request *req)
> nvme_free_descriptors(req);
> }
>
> +static bool nvme_pci_prp_iter_next(struct request *req, struct device *dma_dev,
> + struct blk_dma_iter *iter)
> +{
> + struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> +
> + if (iter->len)
> + return true;
> + if (!blk_rq_dma_map_iter_next(req, dma_dev, &iod->dma_state, iter))
> + return false;
> + if (dma_need_unmap(dma_dev)) {
> + iod->dma_vecs[iod->nr_dma_vecs].addr = iter->addr;
> + iod->dma_vecs[iod->nr_dma_vecs].len = iter->len;
> + iod->nr_dma_vecs++;
> + }
> + return true;
> +}
> +
> static blk_status_t nvme_pci_setup_data_prp(struct request *req,
> struct blk_dma_iter *iter)
> {
> @@ -770,6 +763,16 @@ static blk_status_t nvme_pci_setup_data_prp(struct request *req,
> unsigned int prp_len, i;
> __le64 *prp_list;
>
> + if (dma_need_unmap(nvmeq->dev->dev)) {
> + iod->dma_vecs = mempool_alloc(nvmeq->dev->dmavec_mempool,
> + GFP_ATOMIC);
> + if (!iod->dma_vecs)
> + return BLK_STS_RESOURCE;
> + iod->dma_vecs[0].addr = iter->addr;
> + iod->dma_vecs[0].len = iter->len;
> + iod->nr_dma_vecs = 1;
> + }
> +
> /*
> * PRP1 always points to the start of the DMA transfers.
> *
> @@ -786,13 +789,10 @@ static blk_status_t nvme_pci_setup_data_prp(struct request *req,
> if (!length)
> goto done;
>
> - if (!iter->len) {
> - if (!blk_rq_dma_map_iter_next(req, nvmeq->dev->dev,
> - &iod->dma_state, iter)) {
> - if (WARN_ON_ONCE(!iter->status))
> - goto bad_sgl;
> - goto done;
> - }
> + if (!nvme_pci_prp_iter_next(req, nvmeq->dev->dev, iter)) {
> + if (WARN_ON_ONCE(!iter->status))
> + goto bad_sgl;
> + goto done;
> }
>
> /*
> @@ -831,13 +831,10 @@ static blk_status_t nvme_pci_setup_data_prp(struct request *req,
> if (!length)
> break;
>
> - if (iter->len == 0) {
> - if (!blk_rq_dma_map_iter_next(req, nvmeq->dev->dev,
> - &iod->dma_state, iter)) {
> - if (WARN_ON_ONCE(!iter->status))
> - goto bad_sgl;
> - goto done;
> - }
> + if (!nvme_pci_prp_iter_next(req, nvmeq->dev->dev, iter)) {
> + if (WARN_ON_ONCE(!iter->status))
> + goto bad_sgl;
> + goto done;
> }
>
> /*
> @@ -3025,14 +3022,25 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
> static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
> {
> size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1);
> + size_t alloc_size = sizeof(struct nvme_dma_vec) * NVME_MAX_SEGS;
> +
> + dev->dmavec_mempool = mempool_create_node(1,
> + mempool_kmalloc, mempool_kfree,
> + (void *)alloc_size, GFP_KERNEL,
> + dev_to_node(dev->dev));
> + if (!dev->dmavec_mempool)
> + return -ENOMEM;
>
> dev->iod_meta_mempool = mempool_create_node(1,
> mempool_kmalloc, mempool_kfree,
> (void *)meta_size, GFP_KERNEL,
> dev_to_node(dev->dev));
> if (!dev->iod_meta_mempool)
> - return -ENOMEM;
> + goto free;
> return 0;
> +free:
> + mempool_destroy(dev->dmavec_mempool);
> + return -ENOMEM;
> }
>
> static void nvme_free_tagset(struct nvme_dev *dev)
> @@ -3477,6 +3485,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
> nvme_dbbuf_dma_free(dev);
> nvme_free_queues(dev, 0);
> out_release_iod_mempool:
> + mempool_destroy(dev->dmavec_mempool);
> mempool_destroy(dev->iod_meta_mempool);
> out_dev_unmap:
> nvme_dev_unmap(dev);
> @@ -3540,6 +3549,7 @@ static void nvme_remove(struct pci_dev *pdev)
> nvme_dev_remove_admin(dev);
> nvme_dbbuf_dma_free(dev);
> nvme_free_queues(dev, 0);
> + mempool_destroy(dev->dmavec_mempool);
> mempool_destroy(dev->iod_meta_mempool);
> nvme_release_descriptor_pools(dev);
> nvme_dev_unmap(dev);
> --
> 2.47.2
>