[PATCH] nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping

Klara Modin klarasmodin at gmail.com
Thu Jul 10 17:25:12 PDT 2025


Hi,

On 2025-07-07 14:52:23 +0200, Christoph Hellwig wrote:
> The current version of the blk_rq_dma_map support in nvme-pci tries to
> reconstruct the DMA mappings from the on the wire descriptors if they
> are needed for unmapping.  While this is not the case for the direct
> mapping fast path and the IOVA path, it is needed for the non-IOVA slow
> path, e.g. when the interconnect is not dma coherent, when using
> swiotlb bounce buffering, or when using an IOMMU mapping that can't coalesce.
> 
> While the reconstruction is easy and works fine for the SGL path, where
> the on the wire representation maps 1:1 to DMA mappings, the code to
> reconstruct the DMA mapping ranges from PRPs can't always work, as a
> given PRP layout can come from different DMA mappings, and the current
> code doesn't even always get that right.
> 
> Give up on this approach and track the actual DMA mapping when actually
> needed again.
> 
> Fixes: 7ce3c1dd78fc ("nvme-pci: convert the data mapping to blk_rq_dma_map")
> Reported-by: Ben Copeland <ben.copeland at linaro.org>
> Signed-off-by: Christoph Hellwig <hch at lst.de>

This patch seems to introduce a memory leak, causing slab memory usage
to grow continuously. The easiest way I found to trigger it is along the lines
of

# dd if=/dev/nvme0n1 of=/dev/null bs=8M

It also seems to happen during normal use but much more slowly.
Reverting fixes the issue.

# cat /proc/allocinfo | sort -n | numfmt --invalid=ignore --to=iec-i | tail -n5
       146Mi     5285 mm/memory.c:4977 func:alloc_anon_folio 
       182Mi    24395 mm/shmem.c:1847 func:shmem_alloc_folio 
       420Mi    38746 mm/readahead.c:186 func:ractl_alloc_folio 
        29Gi  7383484 drivers/nvme/host/pci.c:767 func:nvme_pci_setup_data_prp 
        29Gi   937347 mm/slub.c:2487 func:alloc_slab_page

# slabtop --human --once | head
 Active / Total Objects (% used)    : 8966529 / 9031691 (99.3%)
 Active / Total Slabs (% used)      : 971257 / 971257 (100.0%)
 Active / Total Caches (% used)     : 128 / 211 (60.7%)
 Active / Total Size (% used)       : 28Gi / 28Gi (99.9%)
 Minimum / Average / Maximum Object : 0.01K / 3.32K / 8.00K

  OBJS ACTIVE  USE OBJ SIZE  SLABS OBJ/SLAB CACHE SIZE NAME                   
7410112 7410112 100%    4.00K 926264        8       28Gi kmalloc-4k             
929920 929920 100%    0.12K  29060       32      113Mi kmalloc-128            
 81744  80996  99%    0.10K   2096       39      8.2Mi btrfs_free_space

My machine has 32 GiB of memory and if I leave it be it locks up and
sometimes panics.

# nvme id-ctrl /dev/nvme0 | grep sg
sgls      : 0

Let me know if there's anything else you need.

Regards,
Klara Modin

> ---
>  drivers/nvme/host/pci.c | 114 ++++++++++++++++++++++------------------
>  1 file changed, 62 insertions(+), 52 deletions(-)
> 
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 6a7a8bdbf385..6af184f2b73b 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -173,6 +173,7 @@ struct nvme_dev {
>  	bool hmb;
>  	struct sg_table *hmb_sgt;
>  
> +	mempool_t *dmavec_mempool;
>  	mempool_t *iod_meta_mempool;
>  
>  	/* shadow doorbell buffer support: */
> @@ -262,6 +263,11 @@ enum nvme_iod_flags {
>  	IOD_SINGLE_SEGMENT	= 1U << 2,
>  };
>  
> +struct nvme_dma_vec {
> +	dma_addr_t addr;
> +	unsigned int len;
> +};
> +
>  /*
>   * The nvme_iod describes the data in an I/O.
>   */
> @@ -274,6 +280,8 @@ struct nvme_iod {
>  	unsigned int total_len;
>  	struct dma_iova_state dma_state;
>  	void *descriptors[NVME_MAX_NR_DESCRIPTORS];
> +	struct nvme_dma_vec *dma_vecs;
> +	unsigned int nr_dma_vecs;
>  
>  	dma_addr_t meta_dma;
>  	struct sg_table meta_sgt;
> @@ -674,44 +682,12 @@ static void nvme_free_prps(struct request *req)
>  {
>  	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
>  	struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
> -	struct device *dma_dev = nvmeq->dev->dev;
> -	enum dma_data_direction dir = rq_dma_dir(req);
> -	int length = iod->total_len;
> -	dma_addr_t dma_addr;
> -	int i, desc;
> -	__le64 *prp_list;
> -	u32 dma_len;
> -
> -	dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp1);
> -	dma_len = min_t(u32, length,
> -		NVME_CTRL_PAGE_SIZE - (dma_addr & (NVME_CTRL_PAGE_SIZE - 1)));
> -	length -= dma_len;
> -	if (!length) {
> -		dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
> -		return;
> -	}
> -
> -	if (length <= NVME_CTRL_PAGE_SIZE) {
> -		dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
> -		dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp2);
> -		dma_unmap_page(dma_dev, dma_addr, length, dir);
> -		return;
> -	}
> -
> -	i = 0;
> -	desc = 0;
> -	prp_list = iod->descriptors[desc];
> -	do {
> -		dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
> -		if (i == NVME_CTRL_PAGE_SIZE >> 3) {
> -			prp_list = iod->descriptors[++desc];
> -			i = 0;
> -		}
> +	unsigned int i;
>  
> -		dma_addr = le64_to_cpu(prp_list[i++]);
> -		dma_len = min(length, NVME_CTRL_PAGE_SIZE);
> -		length -= dma_len;
> -	} while (length);
> +	for (i = 0; i < iod->nr_dma_vecs; i++)
> +		dma_unmap_page(nvmeq->dev->dev, iod->dma_vecs[i].addr,
> +				iod->dma_vecs[i].len, rq_dma_dir(req));
> +	mempool_free(iod->dma_vecs, nvmeq->dev->dmavec_mempool);
>  }
>  
>  static void nvme_free_sgls(struct request *req)
> @@ -760,6 +736,23 @@ static void nvme_unmap_data(struct request *req)
>  		nvme_free_descriptors(req);
>  }
>  
> +static bool nvme_pci_prp_iter_next(struct request *req, struct device *dma_dev,
> +		struct blk_dma_iter *iter)
> +{
> +	struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
> +
> +	if (iter->len)
> +		return true;
> +	if (!blk_rq_dma_map_iter_next(req, dma_dev, &iod->dma_state, iter))
> +		return false;
> +	if (dma_need_unmap(dma_dev)) {
> +		iod->dma_vecs[iod->nr_dma_vecs].addr = iter->addr;
> +		iod->dma_vecs[iod->nr_dma_vecs].len = iter->len;
> +		iod->nr_dma_vecs++;
> +	}
> +	return true;
> +}
> +
>  static blk_status_t nvme_pci_setup_data_prp(struct request *req,
>  		struct blk_dma_iter *iter)
>  {
> @@ -770,6 +763,16 @@ static blk_status_t nvme_pci_setup_data_prp(struct request *req,
>  	unsigned int prp_len, i;
>  	__le64 *prp_list;
>  
> +	if (dma_need_unmap(nvmeq->dev->dev)) {
> +		iod->dma_vecs = mempool_alloc(nvmeq->dev->dmavec_mempool,
> +				GFP_ATOMIC);
> +		if (!iod->dma_vecs)
> +			return BLK_STS_RESOURCE;
> +		iod->dma_vecs[0].addr = iter->addr;
> +		iod->dma_vecs[0].len = iter->len;
> +		iod->nr_dma_vecs = 1;
> +	}
> +
>  	/*
>  	 * PRP1 always points to the start of the DMA transfers.
>  	 *
> @@ -786,13 +789,10 @@ static blk_status_t nvme_pci_setup_data_prp(struct request *req,
>  	if (!length)
>  		goto done;
>  
> -	if (!iter->len) {
> -		if (!blk_rq_dma_map_iter_next(req, nvmeq->dev->dev,
> -				&iod->dma_state, iter)) {
> -			if (WARN_ON_ONCE(!iter->status))
> -				goto bad_sgl;
> -			goto done;
> -		}
> +	if (!nvme_pci_prp_iter_next(req, nvmeq->dev->dev, iter)) {
> +		if (WARN_ON_ONCE(!iter->status))
> +			goto bad_sgl;
> +		goto done;
>  	}
>  
>  	/*
> @@ -831,13 +831,10 @@ static blk_status_t nvme_pci_setup_data_prp(struct request *req,
>  		if (!length)
>  			break;
>  
> -		if (iter->len == 0) {
> -			if (!blk_rq_dma_map_iter_next(req, nvmeq->dev->dev,
> -					&iod->dma_state, iter)) {
> -				if (WARN_ON_ONCE(!iter->status))
> -					goto bad_sgl;
> -				goto done;
> -			}
> +		if (!nvme_pci_prp_iter_next(req, nvmeq->dev->dev, iter)) {
> +			if (WARN_ON_ONCE(!iter->status))
> +				goto bad_sgl;
> +			goto done;
>  		}
>  
>  		/*
> @@ -3025,14 +3022,25 @@ static int nvme_disable_prepare_reset(struct nvme_dev *dev, bool shutdown)
>  static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
>  {
>  	size_t meta_size = sizeof(struct scatterlist) * (NVME_MAX_META_SEGS + 1);
> +	size_t alloc_size = sizeof(struct nvme_dma_vec) * NVME_MAX_SEGS;
> +
> +	dev->dmavec_mempool = mempool_create_node(1,
> +			mempool_kmalloc, mempool_kfree,
> +			(void *)alloc_size, GFP_KERNEL,
> +			dev_to_node(dev->dev));
> +	if (!dev->dmavec_mempool)
> +		return -ENOMEM;
>  
>  	dev->iod_meta_mempool = mempool_create_node(1,
>  			mempool_kmalloc, mempool_kfree,
>  			(void *)meta_size, GFP_KERNEL,
>  			dev_to_node(dev->dev));
>  	if (!dev->iod_meta_mempool)
> -		return -ENOMEM;
> +		goto free;
>  	return 0;
> +free:
> +	mempool_destroy(dev->dmavec_mempool);
> +	return -ENOMEM;
>  }
>  
>  static void nvme_free_tagset(struct nvme_dev *dev)
> @@ -3477,6 +3485,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
>  	nvme_dbbuf_dma_free(dev);
>  	nvme_free_queues(dev, 0);
>  out_release_iod_mempool:
> +	mempool_destroy(dev->dmavec_mempool);
>  	mempool_destroy(dev->iod_meta_mempool);
>  out_dev_unmap:
>  	nvme_dev_unmap(dev);
> @@ -3540,6 +3549,7 @@ static void nvme_remove(struct pci_dev *pdev)
>  	nvme_dev_remove_admin(dev);
>  	nvme_dbbuf_dma_free(dev);
>  	nvme_free_queues(dev, 0);
> +	mempool_destroy(dev->dmavec_mempool);
>  	mempool_destroy(dev->iod_meta_mempool);
>  	nvme_release_descriptor_pools(dev);
>  	nvme_dev_unmap(dev);
> -- 
> 2.47.2
> 



More information about the Linux-nvme mailing list