[PATCHv2 0/7] dma mapping optimisations
Keith Busch
kbusch at kernel.org
Thu Aug 4 09:28:25 PDT 2022
On Wed, Aug 03, 2022 at 02:52:11PM -0600, Jens Axboe wrote:
> I ran this on my test box to see how we'd do. First the bad news:
> smaller block size IO seems slower. I ran with QD=8 and used 24 drives,
> and using t/io_uring (with registered buffers, polled, etc) and a 512b
> block size I get:
>
> IOPS=44.36M, BW=21.66GiB/s, IOS/call=1/1
> IOPS=44.64M, BW=21.80GiB/s, IOS/call=2/2
> IOPS=44.69M, BW=21.82GiB/s, IOS/call=1/1
> IOPS=44.55M, BW=21.75GiB/s, IOS/call=2/2
> IOPS=44.93M, BW=21.94GiB/s, IOS/call=1/1
> IOPS=44.79M, BW=21.87GiB/s, IOS/call=1/2
>
> and adding -D1 I get:
>
> IOPS=43.74M, BW=21.36GiB/s, IOS/call=1/1
> IOPS=44.04M, BW=21.50GiB/s, IOS/call=1/1
> IOPS=43.63M, BW=21.30GiB/s, IOS/call=2/2
> IOPS=43.67M, BW=21.32GiB/s, IOS/call=1/1
> IOPS=43.57M, BW=21.28GiB/s, IOS/call=1/2
> IOPS=43.53M, BW=21.25GiB/s, IOS/call=2/1
>
> which does regress that workload.
Bummer, I would expect -D1 to be no worse. My test isn't nearly as consistent
as yours, so I'm having some trouble measuring. I've only come up with a few
micro-optimizations that might help: calling bio_iov_dma_tag_set() directly
from the block layer instead of going through bio_iov_iter_get_pages(),
caching the dma_need_sync() result at map time instead of querying it on
every request, and checking the dma_tag case earlier in iov_iter_advance().
A diff on top of this series is below. I also created a branch with
everything folded in here:
git://git.kernel.org/pub/scm/linux/kernel/git/kbusch/linux.git io_uring/dma-register
https://git.kernel.org/pub/scm/linux/kernel/git/kbusch/linux.git/log/?h=io_uring/dma-register
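The dma_need_sync() change just hoists a per-request query into the one-time
registration, since the answer can't change for the lifetime of the mapping.
A minimal userspace model of that hoisting (plain C with hypothetical
stand-in types and a made-up sync policy, not the kernel code):

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for struct nvme_dma_mapping: the sync decision is
 * computed once at map time and cached alongside the PRP list. */
struct mapping {
	bool needs_sync;	/* cached result of the per-address query */
};

/* Stand-in for dma_need_sync(); the policy here is invented. */
static bool need_sync(unsigned long long dma_addr)
{
	return dma_addr >= 0x100000000ULL;
}

/* Map time: query once per registration instead of once per request. */
static void map_buffers(struct mapping *m,
			const unsigned long long *addrs, int n)
{
	m->needs_sync = false;
	for (int i = 0; i < n; i++)
		if (need_sync(addrs[i]))
			m->needs_sync = true;
}

/* I/O time: a cached flag test replaces the dma_need_sync() call. */
static void submit_io(const struct mapping *m)
{
	if (m->needs_sync)
		puts("sync for device before submit");
	else
		puts("fast path: no sync needed");
}

int main(void)
{
	unsigned long long addrs[] = { 0x1000, 0x2000 };
	struct mapping m;

	map_buffers(&m, addrs, 2);
	submit_io(&m);	/* prints the fast-path message */
	return 0;
}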
-- >8 --
diff --git a/block/bio.c b/block/bio.c
index 3b7accae8996..c1e97dff5e40 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1154,7 +1154,7 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
bio_set_flag(bio, BIO_CLONED);
}
-static void bio_iov_dma_tag_set(struct bio *bio, struct iov_iter *iter)
+void bio_iov_dma_tag_set(struct bio *bio, struct iov_iter *iter)
{
size_t size = iov_iter_count(iter);
@@ -1165,8 +1165,6 @@ static void bio_iov_dma_tag_set(struct bio *bio, struct iov_iter *iter)
bio->bi_opf |= REQ_NOMERGE;
bio_set_flag(bio, BIO_NO_PAGE_REF);
bio_set_flag(bio, BIO_DMA_TAGGED);
-
- iov_iter_advance(iter, bio->bi_iter.bi_size);
}
static int bio_iov_add_page(struct bio *bio, struct page *page,
@@ -1307,6 +1305,7 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
if (iov_iter_is_dma_tag(iter)) {
bio_iov_dma_tag_set(bio, iter);
+ iov_iter_advance(iter, bio->bi_iter.bi_size);
return 0;
}
diff --git a/block/fops.c b/block/fops.c
index db2d1e848f4b..1b3649c7eb17 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -325,7 +325,9 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
* bio_iov_iter_get_pages() and set the bvec directly.
*/
bio_iov_bvec_set(bio, iter);
- } else {
+ } else if (iov_iter_is_dma_tag(iter)) {
+ bio_iov_dma_tag_set(bio, iter);
+ } else {
ret = bio_iov_iter_get_pages(bio, iter);
if (unlikely(ret)) {
bio_put(bio);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index dbf73ab0877e..511cae2b7ce9 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -113,7 +113,8 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
struct nvme_dma_mapping {
int nr_pages;
u16 offset;
- u8 rsvd[2];
+ bool needs_sync;
+ u8 rsvd;
dma_addr_t prp_dma_addr;
__le64 *prps;
};
@@ -556,16 +557,9 @@ static void nvme_sync_dma(struct nvme_dev *dev, struct request *req,
struct nvme_dma_mapping *mapping)
{
int offset, i, j, length, nprps;
- bool needs_sync;
offset = blk_rq_dma_offset(req) + mapping->offset;
i = offset >> NVME_CTRL_PAGE_SHIFT;
- needs_sync = rq_data_dir(req) == READ &&
- dma_need_sync(dev->dev, le64_to_cpu(mapping->prps[i]));
-
- if (!needs_sync)
- return;
-
offset = offset & (NVME_CTRL_PAGE_SIZE - 1);
length = blk_rq_payload_bytes(req) - (NVME_CTRL_PAGE_SIZE - offset);
nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE);
@@ -643,7 +637,8 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req)
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
if (mapping) {
- nvme_sync_dma(dev, req, mapping);
+ if (mapping->needs_sync)
+ nvme_sync_dma(dev, req, mapping);
if (iod->npages >= 0)
nvme_free_prp_chain(dev, req, iod);
return;
@@ -894,16 +889,13 @@ static blk_status_t nvme_premapped(struct nvme_dev *dev, struct request *req,
int i, offset, j, length, nprps, nprps_left;
struct dma_pool *pool;
__le64 *prp_list;
- bool needs_sync;
void **list;
offset = blk_rq_dma_offset(req) + mapping->offset;
i = offset >> NVME_CTRL_PAGE_SHIFT;
offset = offset & (NVME_CTRL_PAGE_SIZE - 1);
- needs_sync = rq_data_dir(req) == WRITE &&
- dma_need_sync(dev->dev, le64_to_cpu(mapping->prps[i]));
- if (needs_sync) {
+ if (mapping->needs_sync) {
dma_sync_single_for_device(dev->dev,
le64_to_cpu(mapping->prps[i]),
NVME_CTRL_PAGE_SIZE - offset, DMA_TO_DEVICE);
@@ -916,7 +908,7 @@ static blk_status_t nvme_premapped(struct nvme_dev *dev, struct request *req,
return BLK_STS_OK;
if (length <= NVME_CTRL_PAGE_SIZE) {
- if (needs_sync)
+ if (mapping->needs_sync)
dma_sync_single_for_device(dev->dev,
le64_to_cpu(mapping->prps[i]),
NVME_CTRL_PAGE_SIZE, DMA_TO_DEVICE);
@@ -983,7 +975,7 @@ static blk_status_t nvme_premapped(struct nvme_dev *dev, struct request *req,
cmnd->dptr.prp2 = cpu_to_le64(iod->first_dma);
sync:
- if (!needs_sync)
+ if (!mapping->needs_sync)
return BLK_STS_OK;
i = offset >> NVME_CTRL_PAGE_SHIFT;
@@ -1931,6 +1923,7 @@ static void *nvme_pci_dma_map(struct request_queue *q,
if (!mapping->prps)
goto free_mapping;
+ mapping->needs_sync = false;
for (i = 0, k = 0; i < nr_vecs; i++) {
struct bio_vec *bv = bvec + i;
dma_addr_t dma_addr;
@@ -1959,6 +1952,9 @@ static void *nvme_pci_dma_map(struct request_queue *q,
if (i == 0)
dma_addr -= mapping->offset;
+ if (dma_need_sync(dev->dev, dma_addr))
+ mapping->needs_sync = true;
+
for (j = 0; j < ppv; j++)
mapping->prps[k++] = cpu_to_le64(dma_addr +
j * NVME_CTRL_PAGE_SIZE);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 649348bc03c2..b5277ec189e0 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -474,6 +474,7 @@ void __bio_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off);
int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter);
void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter);
+void bio_iov_dma_tag_set(struct bio *bio, struct iov_iter *iter);
void __bio_release_pages(struct bio *bio, bool mark_dirty);
extern void bio_set_pages_dirty(struct bio *bio);
extern void bio_check_pages_dirty(struct bio *bio);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index d370b45d7f1b..ebdf81473526 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1070,6 +1070,9 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
iov_iter_iovec_advance(i, size);
} else if (iov_iter_is_bvec(i)) {
iov_iter_bvec_advance(i, size);
+ } else if (iov_iter_is_dma_tag(i)) {
+ i->iov_offset += size;
+ i->count -= size;
} else if (iov_iter_is_pipe(i)) {
pipe_advance(i, size);
} else if (unlikely(iov_iter_is_xarray(i))) {
@@ -1077,9 +1080,6 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
i->count -= size;
} else if (iov_iter_is_discard(i)) {
i->count -= size;
- } else if (iov_iter_is_dma_tag(i)) {
- i->iov_offset += size;
- i->count -= size;
}
}
EXPORT_SYMBOL(iov_iter_advance);
@@ -1353,6 +1353,9 @@ bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
if (iov_iter_is_bvec(i))
return iov_iter_aligned_bvec(i, addr_mask, len_mask);
+ if (iov_iter_is_dma_tag(i))
+ return !(i->iov_offset & addr_mask);
+
if (iov_iter_is_pipe(i)) {
unsigned int p_mask = i->pipe->ring_size - 1;
size_t size = i->count;
-- 8< --
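For completeness: the lib/iov_iter.c hunks don't change behaviour. They move
the dma_tag case ahead of the pipe/xarray/discard checks so the hot type is
matched sooner in the if/else ladder, and teach iov_iter_is_aligned() about
dma_tag iterators. The advance itself stays trivial; a toy model with
hypothetical names mirroring the iov_iter fields:

#include <assert.h>
#include <stddef.h>

/* Toy stand-in for the dma_tag flavour of struct iov_iter. */
struct tag_iter {
	size_t iov_offset;	/* offset into the registered buffer */
	size_t count;		/* bytes remaining */
};

/* Mirrors the dma_tag branch of iov_iter_advance(): consume 'size'
 * bytes by moving the offset forward and shrinking the remainder. */
static void tag_advance(struct tag_iter *i, size_t size)
{
	i->iov_offset += size;
	i->count -= size;
}

int main(void)
{
	struct tag_iter it = { .iov_offset = 0, .count = 8192 };

	tag_advance(&it, 4096);
	assert(it.iov_offset == 4096 && it.count == 4096);
	return 0;
}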