[PATCH] nvme-pci: fix dma unmapping when using PRPs and not using the IOVA mapping
Keith Busch
kbusch at kernel.org
Mon Jul 7 07:29:57 PDT 2025
On Mon, Jul 07, 2025 at 02:52:23PM +0200, Christoph Hellwig wrote:
> While the reconstruction is easy and works fine for the SGL path, where
> the on-the-wire representation maps 1:1 to DMA mappings, the code to
> reconstruct the DMA mapping ranges from PRPs can't always work, as a
> given PRP layout can come from different DMA mappings, and the current
> code doesn't even always get that right.
Given it works fine for the SGL path, how do you feel about unconditionally
creating an NVMe SGL and then calling some "nvme_sgl_to_prp()" helper only
when needed? That way we only have one teardown path, and it matches the
setup.
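To make the splitting concrete, here's a rough standalone sketch of the
SGL-to-PRP conversion idea. The names (struct sgl_entry, sgl_entry_to_prps,
CTRL_PAGE_SIZE) are made up for the example, the page size is fixed and the
PRP list chaining is left out, so this is just an illustration and not the
driver code:

#include <stdint.h>
#include <stdio.h>

#define CTRL_PAGE_SIZE 4096u

struct sgl_entry {
        uint64_t addr;
        uint32_t len;
};

/*
 * Chop one already-mapped (addr, len) range into PRP entries.  Only the
 * very first PRP of a transfer may be unaligned; every later entry must
 * start on a controller page boundary.
 */
static int sgl_entry_to_prps(const struct sgl_entry *sge, uint64_t *prps,
                             int idx)
{
        uint64_t addr = sge->addr;
        uint32_t len = sge->len;

        while (len) {
                uint32_t chunk = CTRL_PAGE_SIZE -
                                 (uint32_t)(addr & (CTRL_PAGE_SIZE - 1));

                if (chunk > len)
                        chunk = len;
                prps[idx++] = addr;
                addr += chunk;
                len -= chunk;
        }
        return idx;
}

int main(void)
{
        /* Two mapped ranges: an unaligned head ending on a page boundary,
         * then a page-aligned 12k run (the layout PRPs require anyway). */
        struct sgl_entry sgl[] = {
                { .addr = 0x100200, .len = 0x0e00 },
                { .addr = 0x200000, .len = 0x3000 },
        };
        uint64_t prps[16];
        int i, nr = 0;

        for (i = 0; i < (int)(sizeof(sgl) / sizeof(sgl[0])); i++)
                nr = sgl_entry_to_prps(&sgl[i], prps, nr);

        /* prps[0] becomes PRP1, the remainder goes into the PRP list. */
        for (i = 0; i < nr; i++)
                printf("prp[%d] = 0x%llx\n", i, (unsigned long long)prps[i]);
        return 0;
}

The hack below keeps the built SGL in the iod, so the unmap side walks the
same (addr, len) entries it was set up with instead of guessing the mapping
boundaries back from the PRPs.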
This is something I hacked up over the weekend. It's only lightly
tested, and I know there's a bug somewhere in the chaining, but
it's a start.
---
drivers/nvme/host/pci.c | 342 +++++++++++++++++++++++++--------------------------------
1 file changed, 150 insertions(+), 192 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 10aa04879d6996..7ef56775e3be9 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -41,7 +41,7 @@
* Arbitrary upper bound.
*/
#define NVME_MAX_BYTES SZ_8M
-#define NVME_MAX_NR_DESCRIPTORS 5
+#define NVME_MAX_NR_DESCRIPTORS 6
/*
* For data SGLs we support a single descriptors worth of SGL entries.
@@ -70,7 +70,7 @@
(NVME_MAX_BYTES + 2 * (NVME_CTRL_PAGE_SIZE - 1))
static_assert(MAX_PRP_RANGE / NVME_CTRL_PAGE_SIZE <=
- (1 /* prp1 */ + NVME_MAX_NR_DESCRIPTORS * PRPS_PER_PAGE));
+ (1 /* prp1 */ + (NVME_MAX_NR_DESCRIPTORS - 1) * PRPS_PER_PAGE));
static int use_threaded_interrupts;
module_param(use_threaded_interrupts, int, 0444);
@@ -273,6 +273,7 @@ struct nvme_iod {
unsigned int total_len;
struct dma_iova_state dma_state;
void *descriptors[NVME_MAX_NR_DESCRIPTORS];
+ struct nvme_sgl_desc sgl;
dma_addr_t meta_dma;
struct nvme_sgl_desc *meta_descriptor;
@@ -644,99 +645,61 @@ static inline dma_addr_t nvme_pci_first_desc_dma_addr(struct nvme_command *cmd)
return le64_to_cpu(cmd->common.dptr.prp2);
}
-static void nvme_free_descriptors(struct request *req)
+static void nvme_free_descriptors(struct request *req, struct nvme_iod *iod)
{
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
const int last_prp = NVME_CTRL_PAGE_SIZE / sizeof(__le64) - 1;
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- dma_addr_t dma_addr = nvme_pci_first_desc_dma_addr(&iod->cmd);
- int i;
+ struct dma_pool *pool;
+ dma_addr_t dma_addr;
+ int i = 0;
- if (iod->nr_descriptors == 1) {
- dma_pool_free(nvme_dma_pool(nvmeq, iod), iod->descriptors[0],
- dma_addr);
- return;
+ if (iod->sgl.type == NVME_SGL_FMT_LAST_SEG_DESC << 4) {
+ struct nvme_sgl_desc *sg_list = iod->descriptors[0];
+ unsigned int dma_len = le32_to_cpu(iod->sgl.length);
+ unsigned int nr_entries = dma_len / sizeof(*sg_list);
+
+ if (nr_entries <= NVME_SMALL_POOL_SIZE / sizeof(__le64))
+ pool = nvmeq->descriptor_pools.small;
+ else
+ pool = nvmeq->descriptor_pools.large;
+
+ dma_addr = le64_to_cpu(iod->sgl.addr);
+ dma_pool_free(pool, iod->descriptors[i++], dma_addr);
}
- for (i = 0; i < iod->nr_descriptors; i++) {
+ pool = nvme_dma_pool(nvmeq, iod);
+ for (; i < iod->nr_descriptors; i++) {
__le64 *prp_list = iod->descriptors[i];
dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]);
- dma_pool_free(nvmeq->descriptor_pools.large, prp_list,
- dma_addr);
+ dma_pool_free(pool, prp_list, dma_addr);
dma_addr = next_dma_addr;
}
}
-static void nvme_free_prps(struct request *req)
+static void nvme_free_sgls(struct request *req, struct nvme_iod *iod)
{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
- struct device *dma_dev = nvmeq->dev->dev;
+ unsigned int dma_len = le32_to_cpu(iod->sgl.length);
enum dma_data_direction dir = rq_dma_dir(req);
- int length = iod->total_len;
- dma_addr_t dma_addr;
- int i, desc;
- __le64 *prp_list;
- u32 dma_len;
-
- dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp1);
- dma_len = min_t(u32, length,
- NVME_CTRL_PAGE_SIZE - (dma_addr & (NVME_CTRL_PAGE_SIZE - 1)));
- length -= dma_len;
- if (!length) {
- dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
- return;
- }
-
- if (length <= NVME_CTRL_PAGE_SIZE) {
- dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
- dma_addr = le64_to_cpu(iod->cmd.common.dptr.prp2);
- dma_unmap_page(dma_dev, dma_addr, length, dir);
- return;
- }
-
- i = 0;
- desc = 0;
- prp_list = iod->descriptors[desc];
- do {
- dma_unmap_page(dma_dev, dma_addr, dma_len, dir);
- if (i == NVME_CTRL_PAGE_SIZE >> 3) {
- prp_list = iod->descriptors[++desc];
- i = 0;
- }
-
- dma_addr = le64_to_cpu(prp_list[i++]);
- dma_len = min(length, NVME_CTRL_PAGE_SIZE);
- length -= dma_len;
- } while (length);
-}
-
-static void nvme_free_sgls(struct request *req)
-{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
- struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
struct device *dma_dev = nvmeq->dev->dev;
- dma_addr_t sqe_dma_addr = le64_to_cpu(iod->cmd.common.dptr.sgl.addr);
- unsigned int sqe_dma_len = le32_to_cpu(iod->cmd.common.dptr.sgl.length);
- struct nvme_sgl_desc *sg_list = iod->descriptors[0];
- enum dma_data_direction dir = rq_dma_dir(req);
- if (iod->nr_descriptors) {
- unsigned int nr_entries = sqe_dma_len / sizeof(*sg_list), i;
+ if (iod->sgl.type == NVME_SGL_FMT_LAST_SEG_DESC << 4) {
+ struct nvme_sgl_desc *sg_list = iod->descriptors[0];
+ unsigned int i, nr_entries = dma_len / sizeof(*sg_list);
for (i = 0; i < nr_entries; i++)
dma_unmap_page(dma_dev, le64_to_cpu(sg_list[i].addr),
le32_to_cpu(sg_list[i].length), dir);
} else {
- dma_unmap_page(dma_dev, sqe_dma_addr, sqe_dma_len, dir);
+ dma_unmap_page(dma_dev, le64_to_cpu(iod->sgl.addr), dma_len, dir);
}
}
static void nvme_unmap_data(struct request *req)
{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
+ struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct device *dma_dev = nvmeq->dev->dev;
if (iod->flags & IOD_SINGLE_SEGMENT) {
@@ -747,137 +710,116 @@ static void nvme_unmap_data(struct request *req)
return;
}
- if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len)) {
- if (nvme_pci_cmd_use_sgl(&iod->cmd))
- nvme_free_sgls(req);
- else
- nvme_free_prps(req);
+ if (!blk_rq_dma_unmap(req, dma_dev, &iod->dma_state, iod->total_len))
+ nvme_free_sgls(req, iod);
+ if (iod->nr_descriptors)
+ nvme_free_descriptors(req, iod);
+}
+
+static blk_status_t nvme_pci_setup_prp_list(dma_addr_t dma_addr,
+ unsigned int dma_len, unsigned int *index,
+ __le64 **pprp_list, struct nvme_iod *iod,
+ struct nvme_queue *nvmeq)
+{
+ __le64 *prp_list = *pprp_list;
+ int i = *index;
+
+ for (;;) {
+ unsigned int prp_len = min(dma_len, NVME_CTRL_PAGE_SIZE);
+
+ if (i == NVME_CTRL_PAGE_SIZE >> 3) {
+ __le64 *old_prp_list = prp_list;
+ dma_addr_t prp_list_dma;
+
+ BUG_ON(iod->nr_descriptors >= NVME_MAX_NR_DESCRIPTORS);
+ prp_list = dma_pool_alloc(nvmeq->descriptor_pools.large,
+ GFP_ATOMIC, &prp_list_dma);
+ if (!prp_list)
+ return BLK_STS_RESOURCE;
+
+ iod->descriptors[iod->nr_descriptors++] = prp_list;
+ prp_list[0] = old_prp_list[i - 1];
+ old_prp_list[i - 1] = cpu_to_le64(prp_list_dma);
+ i = 1;
+ }
+
+ prp_list[i++] = cpu_to_le64(dma_addr);
+ dma_len -= prp_len;
+ if (dma_len == 0)
+ break;
+ dma_addr += prp_len;
}
- if (iod->nr_descriptors)
- nvme_free_descriptors(req);
+ *index = i;
+ *pprp_list = prp_list;
+ return BLK_STS_OK;
}
-static blk_status_t nvme_pci_setup_data_prp(struct request *req,
- struct blk_dma_iter *iter)
+static blk_status_t nvme_pci_sgl_to_prp(struct request *req,
+ struct nvme_iod *iod)
{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
- unsigned int length = blk_rq_payload_bytes(req);
+ unsigned int dma_len, prp_len, entries, j, i = 0;
dma_addr_t prp1_dma, prp2_dma = 0;
- unsigned int prp_len, i;
+ struct nvme_sgl_desc *sg_list;
+ struct dma_pool *pool;
+ dma_addr_t dma_addr;
__le64 *prp_list;
+ blk_status_t ret;
- /*
- * PRP1 always points to the start of the DMA transfers.
- *
- * This is the only PRP (except for the list entries) that could be
- * non-aligned.
- */
- prp1_dma = iter->addr;
- prp_len = min(length, NVME_CTRL_PAGE_SIZE -
- (iter->addr & (NVME_CTRL_PAGE_SIZE - 1)));
- iod->total_len += prp_len;
- iter->addr += prp_len;
- iter->len -= prp_len;
- length -= prp_len;
- if (!length)
- goto done;
-
- if (!iter->len) {
- if (!blk_rq_dma_map_iter_next(req, nvmeq->dev->dev,
- &iod->dma_state, iter)) {
- if (WARN_ON_ONCE(!iter->status))
- goto bad_sgl;
- goto done;
- }
+ if (iod->sgl.type == NVME_SGL_FMT_LAST_SEG_DESC << 4) {
+ entries = iod->sgl.length / sizeof(*sg_list);
+ sg_list = iod->descriptors[0];
+ } else {
+ entries = 1;
+ sg_list = &iod->sgl;
+ }
+
+ dma_addr = le64_to_cpu(sg_list[i].addr);
+ dma_len = le32_to_cpu(sg_list[i].length);
+ prp1_dma = dma_addr;
+ prp_len = min(dma_len, NVME_CTRL_PAGE_SIZE -
+ (dma_addr & (NVME_CTRL_PAGE_SIZE - 1)));
+ dma_len -= prp_len;
+ if (!dma_len) {
+ if (++i == entries)
+ return BLK_STS_OK;
+ dma_addr = le64_to_cpu(sg_list[i].addr);
+ dma_len = le32_to_cpu(sg_list[i].length);
+ } else {
+ dma_addr += prp_len;
}
- /*
- * PRP2 is usually a list, but can point to data if all data to be
- * transferred fits into PRP1 + PRP2:
- */
- if (length <= NVME_CTRL_PAGE_SIZE) {
- prp2_dma = iter->addr;
- iod->total_len += length;
+ if (i + 1 == entries && dma_len <= NVME_CTRL_PAGE_SIZE) {
+ prp2_dma = dma_addr;
goto done;
}
- if (DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE) <=
+ if (DIV_ROUND_UP(iod->total_len - prp_len, NVME_CTRL_PAGE_SIZE) <=
NVME_SMALL_POOL_SIZE / sizeof(__le64))
iod->flags |= IOD_SMALL_DESCRIPTOR;
- prp_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
- &prp2_dma);
- if (!prp_list) {
- iter->status = BLK_STS_RESOURCE;
- goto done;
- }
- iod->descriptors[iod->nr_descriptors++] = prp_list;
+ pool = nvme_dma_pool(nvmeq, iod);
+ prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp2_dma);
+ if (!prp_list)
+ return BLK_STS_RESOURCE;
- i = 0;
+ j = 0;
for (;;) {
- prp_list[i++] = cpu_to_le64(iter->addr);
- prp_len = min(length, NVME_CTRL_PAGE_SIZE);
- if (WARN_ON_ONCE(iter->len < prp_len))
- goto bad_sgl;
-
- iod->total_len += prp_len;
- iter->addr += prp_len;
- iter->len -= prp_len;
- length -= prp_len;
- if (!length)
- break;
-
- if (iter->len == 0) {
- if (!blk_rq_dma_map_iter_next(req, nvmeq->dev->dev,
- &iod->dma_state, iter)) {
- if (WARN_ON_ONCE(!iter->status))
- goto bad_sgl;
- goto done;
- }
- }
-
- /*
- * If we've filled the entire descriptor, allocate a new that is
- * pointed to be the last entry in the previous PRP list. To
- * accommodate for that move the last actual entry to the new
- * descriptor.
- */
- if (i == NVME_CTRL_PAGE_SIZE >> 3) {
- __le64 *old_prp_list = prp_list;
- dma_addr_t prp_list_dma;
-
- prp_list = dma_pool_alloc(nvmeq->descriptor_pools.large,
- GFP_ATOMIC, &prp_list_dma);
- if (!prp_list) {
- iter->status = BLK_STS_RESOURCE;
- goto done;
- }
- iod->descriptors[iod->nr_descriptors++] = prp_list;
+ ret = nvme_pci_setup_prp_list(dma_addr, dma_len, &j, &prp_list,
+ iod, nvmeq);
+ if (ret)
+ return ret;
- prp_list[0] = old_prp_list[i - 1];
- old_prp_list[i - 1] = cpu_to_le64(prp_list_dma);
- i = 1;
- }
+ if (++i == entries)
+ break;
+ dma_addr = le64_to_cpu(sg_list[i].addr);
+ dma_len = le32_to_cpu(sg_list[i].length);
}
-
done:
- /*
- * nvme_unmap_data uses the DPT field in the SQE to tear down the
- * mapping, so initialize it even for failures.
- */
iod->cmd.common.dptr.prp1 = cpu_to_le64(prp1_dma);
iod->cmd.common.dptr.prp2 = cpu_to_le64(prp2_dma);
- if (unlikely(iter->status))
- nvme_unmap_data(req);
- return iter->status;
-
-bad_sgl:
- dev_err_once(nvmeq->dev->dev,
- "Incorrectly formed request for payload:%d nents:%d\n",
- blk_rq_payload_bytes(req), blk_rq_nr_phys_segments(req));
- return BLK_STS_IOERR;
+ return ret;
}
static void nvme_pci_sgl_set_data(struct nvme_sgl_desc *sge,
@@ -897,49 +839,50 @@ static void nvme_pci_sgl_set_seg(struct nvme_sgl_desc *sge,
}
static blk_status_t nvme_pci_setup_data_sgl(struct request *req,
- struct blk_dma_iter *iter)
+ struct nvme_iod *iod, struct blk_dma_iter *iter)
+
{
- struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
+ unsigned int entries = blk_rq_nr_phys_segments(req), mapped = 0;
struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
- unsigned int entries = blk_rq_nr_phys_segments(req);
struct nvme_sgl_desc *sg_list;
+ struct dma_pool *pool;
dma_addr_t sgl_dma;
- unsigned int mapped = 0;
-
- /* set the transfer type as SGL */
- iod->cmd.common.flags = NVME_CMD_SGL_METABUF;
if (entries == 1 || blk_rq_dma_map_coalesce(&iod->dma_state)) {
- nvme_pci_sgl_set_data(&iod->cmd.common.dptr.sgl, iter);
+ nvme_pci_sgl_set_data(&iod->sgl, iter);
iod->total_len += iter->len;
return BLK_STS_OK;
}
if (entries <= NVME_SMALL_POOL_SIZE / sizeof(*sg_list))
- iod->flags |= IOD_SMALL_DESCRIPTOR;
+ pool = nvmeq->descriptor_pools.small;
+ else
+ pool = nvmeq->descriptor_pools.large;
- sg_list = dma_pool_alloc(nvme_dma_pool(nvmeq, iod), GFP_ATOMIC,
- &sgl_dma);
+ sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma);
if (!sg_list)
return BLK_STS_RESOURCE;
- iod->descriptors[iod->nr_descriptors++] = sg_list;
+ iod->descriptors[iod->nr_descriptors++] = sg_list;
do {
if (WARN_ON_ONCE(mapped == entries)) {
iter->status = BLK_STS_IOERR;
break;
}
+
nvme_pci_sgl_set_data(&sg_list[mapped++], iter);
iod->total_len += iter->len;
} while (blk_rq_dma_map_iter_next(req, nvmeq->dev->dev, &iod->dma_state,
iter));
- nvme_pci_sgl_set_seg(&iod->cmd.common.dptr.sgl, sgl_dma, mapped);
if (unlikely(iter->status))
- nvme_free_sgls(req);
+ nvme_free_sgls(req, iod);
+ else
+ nvme_pci_sgl_set_seg(&iod->sgl, sgl_dma, mapped);
return iter->status;
}
+
static blk_status_t nvme_pci_setup_data_simple(struct request *req,
enum nvme_use_sgl use_sgl)
{
@@ -979,6 +922,13 @@ static blk_status_t nvme_pci_setup_data_simple(struct request *req,
return BLK_STS_OK;
}
+static inline bool nvme_check_sgl_threshold(struct request *req,
+ enum nvme_use_sgl use_sgl)
+{
+ return use_sgl == SGL_SUPPORTED && sgl_threshold &&
+ nvme_pci_avg_seg_size(req) >= sgl_threshold;
+}
+
static blk_status_t nvme_map_data(struct request *req)
{
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
@@ -1001,11 +951,18 @@ static blk_status_t nvme_map_data(struct request *req)
if (!blk_rq_dma_map_iter_start(req, dev->dev, &iod->dma_state, &iter))
return iter.status;
- if (use_sgl == SGL_FORCED ||
- (use_sgl == SGL_SUPPORTED &&
- (sgl_threshold && nvme_pci_avg_seg_size(req) >= sgl_threshold)))
- return nvme_pci_setup_data_sgl(req, &iter);
- return nvme_pci_setup_data_prp(req, &iter);
+ ret = nvme_pci_setup_data_sgl(req, iod, &iter);
+ if (ret)
+ return ret;
+
+ if (use_sgl == SGL_FORCED || nvme_check_sgl_threshold(req, use_sgl)) {
+ iod->cmd.common.flags = NVME_CMD_SGL_METABUF;
+ iod->cmd.common.dptr.sgl = iod->sgl;
+ } else {
+ ret = nvme_pci_sgl_to_prp(req, iod);
+ }
+
+ return ret;
}
static blk_status_t nvme_pci_setup_meta_sgls(struct request *req)
@@ -1075,6 +1032,7 @@ static blk_status_t nvme_prep_rq(struct request *req)
iod->flags = 0;
iod->nr_descriptors = 0;
iod->total_len = 0;
+ iod->sgl.type = 0;
ret = nvme_setup_cmd(req->q->queuedata, req);
if (ret)
--