[RFC 02/12] iov_iter: introduce iter type for pre-registered dma

Pavel Begunkov asml.silence at gmail.com
Fri Jun 27 08:10:29 PDT 2025


Introduce a new iterator type representing vectors with pre-registered
DMA addresses. It carries an array of struct dmavec, which is just a
{dma addr, dma len} pair. It'll be used to pass dmabuf buffers from
io_uring and other interfaces operating with iterators.

The vector is mapped for the device returned by the ->get_dma_device()
callback of the file, and the caller should only pass the iterator to
that file's methods. That also prevents ITER_DMAVEC iterators from
reaching files that are not aware of them.

Note: drivers are responsible for CPU-device memory synchronisation
and should use dma_sync_single_for_{device,cpu} when appropriate.

Suggested-by: Keith Busch <kbusch at kernel.org>
Signed-off-by: Pavel Begunkov <asml.silence at gmail.com>
---
 include/linux/uio.h | 14 +++++++++
 lib/iov_iter.c      | 70 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 83 insertions(+), 1 deletion(-)

diff --git a/include/linux/uio.h b/include/linux/uio.h
index 2e86c653186c..d68148508ef7 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -29,11 +29,17 @@ enum iter_type {
 	ITER_FOLIOQ,
 	ITER_XARRAY,
 	ITER_DISCARD,
+	ITER_DMAVEC,
 };
 
 #define ITER_SOURCE	1	// == WRITE
 #define ITER_DEST	0	// == READ
 
+struct dmavec {
+	dma_addr_t		addr;
+	int			len;
+};
+
 struct iov_iter_state {
 	size_t iov_offset;
 	size_t count;
@@ -71,6 +77,7 @@ struct iov_iter {
 				const struct folio_queue *folioq;
 				struct xarray *xarray;
 				void __user *ubuf;
+				const struct dmavec *dmavec;
 			};
 			size_t count;
 		};
@@ -155,6 +162,11 @@ static inline bool iov_iter_is_xarray(const struct iov_iter *i)
 	return iov_iter_type(i) == ITER_XARRAY;
 }
 
+static inline bool iov_iter_is_dma(const struct iov_iter *i)
+{
+	return iov_iter_type(i) == ITER_DMAVEC;
+}
+
 static inline unsigned char iov_iter_rw(const struct iov_iter *i)
 {
 	return i->data_source ? WRITE : READ;
@@ -302,6 +314,8 @@ void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction,
 			  unsigned int first_slot, unsigned int offset, size_t count);
 void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray,
 		     loff_t start, size_t count);
+void iov_iter_dma(struct iov_iter *i, unsigned int direction,
+		  struct dmavec *dmavec, unsigned nr_segs, size_t count);
 ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
 			size_t maxsize, unsigned maxpages, size_t *start);
 ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages,
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index f9193f952f49..b7740f9aa279 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -559,6 +559,26 @@ static void iov_iter_folioq_advance(struct iov_iter *i, size_t size)
 	i->folioq = folioq;
 }
 
+static void iov_iter_dma_advance(struct iov_iter *i, size_t size)
+{
+	const struct dmavec *dmav, *end;
+
+	if (!i->count)
+		return;
+	i->count -= size;
+
+	size += i->iov_offset;
+
+	for (dmav = i->dmavec, end = dmav + i->nr_segs; dmav < end; dmav++) {
+		if (likely(size < dmav->len))
+			break;
+		size -= dmav->len;
+	}
+	i->iov_offset = size;
+	i->nr_segs -= dmav - i->dmavec;
+	i->dmavec = dmav;
+}
+
 void iov_iter_advance(struct iov_iter *i, size_t size)
 {
 	if (unlikely(i->count < size))
@@ -575,6 +595,8 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
 		iov_iter_folioq_advance(i, size);
 	} else if (iov_iter_is_discard(i)) {
 		i->count -= size;
+	} else if (iov_iter_is_dma(i)) {
+		iov_iter_dma_advance(i, size);
 	}
 }
 EXPORT_SYMBOL(iov_iter_advance);
@@ -763,6 +785,20 @@ void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
 }
 EXPORT_SYMBOL(iov_iter_xarray);
 
+void iov_iter_dma(struct iov_iter *i, unsigned int direction,
+		  struct dmavec *dmavec, unsigned nr_segs, size_t count)
+{
+	WARN_ON(direction & ~(READ | WRITE));
+	*i = (struct iov_iter){
+		.iter_type = ITER_DMAVEC,
+		.data_source = direction,
+		.dmavec = dmavec,
+		.nr_segs = nr_segs,
+		.iov_offset = 0,
+		.count = count
+	};
+}
+
 /**
  * iov_iter_discard - Initialise an I/O iterator that discards data
  * @i: The iterator to initialise.
@@ -834,6 +870,32 @@ static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask,
 	return true;
 }
 
+static bool iov_iter_aligned_dma(const struct iov_iter *i, unsigned addr_mask,
+				  unsigned len_mask)
+{
+	const struct dmavec *dmav = i->dmavec;
+	unsigned skip = i->iov_offset;
+	size_t size = i->count;
+
+	do {
+		size_t len = dmav->len - skip;
+
+		if (len > size)
+			len = size;
+		if (len & len_mask)
+			return false;
+		if ((unsigned long)(dmav->addr + skip) & addr_mask)
+			return false;
+
+		dmav++;
+		size -= len;
+		skip = 0;
+	} while (size);
+
+	return true;
+}
+
+
 /**
  * iov_iter_is_aligned() - Check if the addresses and lengths of each segments
  * 	are aligned to the parameters.
@@ -875,6 +937,9 @@ bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
 			return false;
 	}
 
+	if (iov_iter_is_dma(i))
+		return iov_iter_aligned_dma(i, addr_mask, len_mask);
+
 	return true;
 }
 EXPORT_SYMBOL_GPL(iov_iter_is_aligned);
@@ -1552,7 +1617,8 @@ EXPORT_SYMBOL_GPL(import_ubuf);
 void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
 {
 	if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
-			 !iter_is_ubuf(i)) && !iov_iter_is_kvec(i))
+			 !iter_is_ubuf(i) && !iov_iter_is_dma(i)) &&
+			 !iov_iter_is_kvec(i))
 		return;
 	i->iov_offset = state->iov_offset;
 	i->count = state->count;
@@ -1570,6 +1636,8 @@ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
 	BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
 	if (iov_iter_is_bvec(i))
 		i->bvec -= state->nr_segs - i->nr_segs;
+	else if (iov_iter_is_dma(i))
+		i->dmavec -= state->nr_segs - i->nr_segs;
 	else
 		i->__iov -= state->nr_segs - i->nr_segs;
 	i->nr_segs = state->nr_segs;
-- 
2.49.0




More information about the Linux-nvme mailing list