[RFC 02/12] iov_iter: introduce iter type for pre-registered dma
Pavel Begunkov
asml.silence at gmail.com
Fri Jun 27 08:10:29 PDT 2025
Introduce a new iterator type representing vectors with pre-registered
DMA addresses. It carries an array of struct dmavec, which is just a
{dma addr, dma len} pair. It'll be used to pass dmabuf buffers from
io_uring and other interfaces operating with iterators.
The vector is mapped for the device returned by the ->get_dma_device()
callback of the file, and the caller should only pass the iterator to
that file's methods. That should also prevent ITER_DMAVEC iterators
from reaching unaware files.
Note: drivers are responsible for CPU-device memory synchronisation
and should use dma_sync_single_for_{device,cpu} when appropriate.
Suggested-by: Keith Busch <kbusch at kernel.org>
Signed-off-by: Pavel Begunkov <asml.silence at gmail.com>
---
include/linux/uio.h | 14 +++++++++
lib/iov_iter.c | 70 ++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 83 insertions(+), 1 deletion(-)
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 2e86c653186c..d68148508ef7 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -29,11 +29,17 @@ enum iter_type {
ITER_FOLIOQ,
ITER_XARRAY,
ITER_DISCARD,
+ ITER_DMAVEC,
};
#define ITER_SOURCE 1 // == WRITE
#define ITER_DEST 0 // == READ
+struct dmavec {
+ dma_addr_t addr;
+ int len;
+};
+
struct iov_iter_state {
size_t iov_offset;
size_t count;
@@ -71,6 +77,7 @@ struct iov_iter {
const struct folio_queue *folioq;
struct xarray *xarray;
void __user *ubuf;
+ const struct dmavec *dmavec;
};
size_t count;
};
@@ -155,6 +162,11 @@ static inline bool iov_iter_is_xarray(const struct iov_iter *i)
return iov_iter_type(i) == ITER_XARRAY;
}
+static inline bool iov_iter_is_dma(const struct iov_iter *i)
+{
+ return iov_iter_type(i) == ITER_DMAVEC;
+}
+
static inline unsigned char iov_iter_rw(const struct iov_iter *i)
{
return i->data_source ? WRITE : READ;
@@ -302,6 +314,8 @@ void iov_iter_folio_queue(struct iov_iter *i, unsigned int direction,
unsigned int first_slot, unsigned int offset, size_t count);
void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray,
loff_t start, size_t count);
+void iov_iter_dma(struct iov_iter *i, unsigned int direction,
+ struct dmavec *dmavec, unsigned nr_segs, size_t count);
ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
size_t maxsize, unsigned maxpages, size_t *start);
ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages,
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index f9193f952f49..b7740f9aa279 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -559,6 +559,26 @@ static void iov_iter_folioq_advance(struct iov_iter *i, size_t size)
i->folioq = folioq;
}
+static void iov_iter_dma_advance(struct iov_iter *i, size_t size)
+{
+ const struct dmavec *dmav, *end;
+
+ if (!i->count)
+ return;
+ i->count -= size;
+
+ size += i->iov_offset;
+
+ for (dmav = i->dmavec, end = dmav + i->nr_segs; dmav < end; dmav++) {
+ if (likely(size < dmav->len))
+ break;
+ size -= dmav->len;
+ }
+ i->iov_offset = size;
+ i->nr_segs -= dmav - i->dmavec;
+ i->dmavec = dmav;
+}
+
void iov_iter_advance(struct iov_iter *i, size_t size)
{
if (unlikely(i->count < size))
@@ -575,6 +595,8 @@ void iov_iter_advance(struct iov_iter *i, size_t size)
iov_iter_folioq_advance(i, size);
} else if (iov_iter_is_discard(i)) {
i->count -= size;
+ } else if (iov_iter_is_dma(i)) {
+ iov_iter_dma_advance(i, size);
}
}
EXPORT_SYMBOL(iov_iter_advance);
@@ -763,6 +785,20 @@ void iov_iter_xarray(struct iov_iter *i, unsigned int direction,
}
EXPORT_SYMBOL(iov_iter_xarray);
+void iov_iter_dma(struct iov_iter *i, unsigned int direction,
+ struct dmavec *dmavec, unsigned nr_segs, size_t count)
+{
+ WARN_ON(direction & ~(READ | WRITE));
+ *i = (struct iov_iter){
+ .iter_type = ITER_DMAVEC,
+ .data_source = direction,
+ .dmavec = dmavec,
+ .nr_segs = nr_segs,
+ .iov_offset = 0,
+ .count = count
+ };
+}
+
/**
* iov_iter_discard - Initialise an I/O iterator that discards data
* @i: The iterator to initialise.
@@ -834,6 +870,32 @@ static bool iov_iter_aligned_bvec(const struct iov_iter *i, unsigned addr_mask,
return true;
}
+static bool iov_iter_aligned_dma(const struct iov_iter *i, unsigned addr_mask,
+ unsigned len_mask)
+{
+ const struct dmavec *dmav = i->dmavec;
+ unsigned skip = i->iov_offset;
+ size_t size = i->count;
+
+ do {
+ size_t len = dmav->len - skip;
+
+ if (len > size)
+ len = size;
+ if (len & len_mask)
+ return false;
+ if ((unsigned long)(dmav->addr + skip) & addr_mask)
+ return false;
+
+ dmav++;
+ size -= len;
+ skip = 0;
+ } while (size);
+
+ return true;
+}
+
+
/**
* iov_iter_is_aligned() - Check if the addresses and lengths of each segments
* are aligned to the parameters.
@@ -875,6 +937,9 @@ bool iov_iter_is_aligned(const struct iov_iter *i, unsigned addr_mask,
return false;
}
+ if (iov_iter_is_dma(i))
+ return iov_iter_aligned_dma(i, addr_mask, len_mask);
+
return true;
}
EXPORT_SYMBOL_GPL(iov_iter_is_aligned);
@@ -1552,7 +1617,8 @@ EXPORT_SYMBOL_GPL(import_ubuf);
void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
{
if (WARN_ON_ONCE(!iov_iter_is_bvec(i) && !iter_is_iovec(i) &&
- !iter_is_ubuf(i)) && !iov_iter_is_kvec(i))
+ !iter_is_ubuf(i)) && !iov_iter_is_kvec(i) &&
+ !iov_iter_is_dma(i))
return;
i->iov_offset = state->iov_offset;
i->count = state->count;
@@ -1570,6 +1636,8 @@ void iov_iter_restore(struct iov_iter *i, struct iov_iter_state *state)
BUILD_BUG_ON(sizeof(struct iovec) != sizeof(struct kvec));
if (iov_iter_is_bvec(i))
i->bvec -= state->nr_segs - i->nr_segs;
+ else if (iov_iter_is_dma(i))
+ i->dmavec -= state->nr_segs - i->nr_segs;
else
i->__iov -= state->nr_segs - i->nr_segs;
i->nr_segs = state->nr_segs;
--
2.49.0
More information about the Linux-nvme
mailing list