[PATCH 4/5] io_uring: add support for dma pre-mapping
Keith Busch
kbusch at fb.com
Tue Jul 26 10:38:13 PDT 2022
From: Keith Busch <kbusch at kernel.org>
Provide a new register operation that asks the driver backing a given
file descriptor to pre-map the bvec of an already registered buffer.
If successful, io_uring will use the returned dma tag for future fixed
buffer requests to the same file.
Signed-off-by: Keith Busch <kbusch at kernel.org>
---
include/uapi/linux/io_uring.h | 12 ++++
io_uring/io_uring.c | 129 ++++++++++++++++++++++++++++++++++
io_uring/net.c | 2 +-
io_uring/rsrc.c | 13 +++-
io_uring/rsrc.h | 16 ++++-
io_uring/rw.c | 2 +-
6 files changed, 166 insertions(+), 8 deletions(-)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1463cfecb56b..daacbe899d1d 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -485,6 +485,10 @@ enum {
IORING_REGISTER_NOTIFIERS = 26,
IORING_UNREGISTER_NOTIFIERS = 27,
+ /* dma map registered buffers */
+ IORING_REGISTER_MAP_BUFFERS = 28,
+ IORING_REGISTER_UNMAP_BUFFERS = 29,
+
/* this goes last */
IORING_REGISTER_LAST
};
@@ -661,4 +665,12 @@ struct io_uring_recvmsg_out {
__u32 flags;
};
+/*
+ * Argument for IORING_REGISTER_MAP_BUFFERS / IORING_REGISTER_UNMAP_BUFFERS.
+ * Selects the half-open range [buf_start, buf_end) of registered buffers.
+ */
+struct io_uring_map_buffers {
+ /* file whose backing block device is used for the mapping (map only) */
+ __s32 fd;
+ /* first buffer index; must be >= 0 and below the registered buffer count */
+ __s32 buf_start;
+ /* one past the last index; clamped to the registered buffer count */
+ __s32 buf_end;
+ /* must be zero */
+ __u32 flags;
+ /* must be zero */
+ __u64 rsvd[2];
+};
+
#endif
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 1d600a63643b..12f7354e0423 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3704,6 +3704,123 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
return ret;
}
+#ifdef CONFIG_BLOCK
+/*
+ * Copy the user's struct io_uring_map_buffers from @arg into @map and
+ * validate it against the buffers registered with @ctx.
+ *
+ * flags and both reserved words must be zero, buf_start must name an
+ * existing buffer, and buf_end is clamped to the number of registered
+ * buffers.
+ *
+ * Returns the number of buffers in [buf_start, buf_end) on success,
+ * -EFAULT if the copy fails, or -EINVAL for a malformed or empty range.
+ */
+static int get_map_range(struct io_ring_ctx *ctx,
+			 struct io_uring_map_buffers *map, void __user *arg)
+{
+	int nr_bufs;
+
+	if (copy_from_user(map, arg, sizeof(*map)))
+		return -EFAULT;
+
+	if (map->flags)
+		return -EINVAL;
+	if (map->rsvd[0] || map->rsvd[1])
+		return -EINVAL;
+
+	if (map->buf_start < 0 || map->buf_start >= ctx->nr_user_bufs)
+		return -EINVAL;
+
+	/* An end past the table is trimmed rather than rejected. */
+	if (map->buf_end > ctx->nr_user_bufs)
+		map->buf_end = ctx->nr_user_bufs;
+
+	nr_bufs = map->buf_end - map->buf_start;
+	return nr_bufs > 0 ? nr_bufs : -EINVAL;
+}
+
+/*
+ * Release the dma mapping attached to @imu, if there is one.
+ *
+ * The mapping state is cleared afterwards so that:
+ *  - a later call (e.g. when the buffer itself is torn down) does not
+ *    unmap the same tag twice,
+ *  - io_import_fixed() stops handing out a tag that is no longer valid,
+ *  - the buffer can be mapped again instead of reporting -EBUSY.
+ */
+void io_dma_unmap(struct io_mapped_ubuf *imu)
+{
+	if (!imu->dma_tag)
+		return;
+
+	block_dma_unmap(imu->bdev, imu->dma_tag);
+	imu->dma_tag = NULL;
+	imu->dma_file = NULL;
+	imu->bdev = NULL;
+}
+
+/*
+ * IORING_REGISTER_UNMAP_BUFFERS: drop the dma mapping of every registered
+ * buffer in the user supplied range. Requires CAP_SYS_ADMIN.
+ *
+ * Returns 0 on success, -EPERM without the capability, or the error
+ * reported by get_map_range() for a bad argument.
+ */
+static int io_register_unmap_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_uring_map_buffers range;
+	int nr, idx;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	nr = get_map_range(ctx, &range, arg);
+	if (nr < 0)
+		return nr;
+
+	for (idx = range.buf_start; idx < range.buf_end; idx++)
+		io_dma_unmap(ctx->user_bufs[idx]);
+
+	return 0;
+}
+
+/*
+ * IORING_REGISTER_MAP_BUFFERS: pre-map a range of registered buffers for
+ * dma with the block device behind the file descriptor in the request.
+ * Requires CAP_SYS_ADMIN.
+ *
+ * The fd must refer to a block device or to a regular file on a
+ * filesystem that has a backing block device. On failure every buffer
+ * mapped by this call is unmapped again, so the operation is all or
+ * nothing.
+ *
+ * Returns 0 on success, -EPERM without the capability, -EBADF for a bad
+ * fd, -EOPNOTSUPP when no block device can be derived from the file,
+ * -EBUSY if a buffer in the range is already mapped, or the error from
+ * get_map_range() / block_dma_map().
+ */
+static int io_register_map_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_uring_map_buffers map;
+	struct block_device *bdev;
+	struct file *file;
+	int ret, i;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = get_map_range(ctx, &map, arg);
+	if (ret < 0)
+		return ret;
+
+	file = fget(map.fd);
+	if (!file)
+		return -EBADF;
+
+	if (S_ISBLK(file_inode(file)->i_mode)) {
+		bdev = I_BDEV(file->f_mapping->host);
+	} else if (S_ISREG(file_inode(file)->i_mode)) {
+		bdev = file->f_inode->i_sb->s_bdev;
+	} else {
+		/* Must not return directly: the fget() reference is held. */
+		ret = -EOPNOTSUPP;
+		goto out_put;
+	}
+
+	/* Regular files on filesystems without a block device have none. */
+	if (!bdev) {
+		ret = -EOPNOTSUPP;
+		goto out_put;
+	}
+
+	for (i = map.buf_start; i < map.buf_end; i++) {
+		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+		void *tag;
+
+		if (imu->dma_tag) {
+			ret = -EBUSY;
+			goto err_unmap;
+		}
+
+		tag = block_dma_map(bdev, imu->bvec, imu->nr_bvecs);
+		if (IS_ERR(tag)) {
+			ret = PTR_ERR(tag);
+			goto err_unmap;
+		}
+
+		imu->dma_tag = tag;
+		/*
+		 * NOTE(review): dma_file is kept only for pointer comparison
+		 * and no reference is held past the fput() below. Confirm
+		 * that a recycled struct file cannot produce a false match.
+		 */
+		imu->dma_file = file;
+		imu->bdev = bdev;
+	}
+
+	ret = 0;
+	goto out_put;
+
+err_unmap:
+	/* Undo only what this call mapped: slots [buf_start, i). */
+	while (--i >= map.buf_start) {
+		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+
+		io_dma_unmap(imu);
+		/* Make sure the slot does not keep a stale mapping. */
+		imu->dma_tag = NULL;
+		imu->dma_file = NULL;
+		imu->bdev = NULL;
+	}
+out_put:
+	fput(file);
+	return ret;
+}
+#else /* CONFIG_BLOCK */
+/*
+ * Without CONFIG_BLOCK there is no block device to map buffers with, so
+ * both the map and unmap register operations fail with -EOPNOTSUPP.
+ */
+static int io_register_map_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+ return -EOPNOTSUPP;
+}
+static int io_register_unmap_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+ return -EOPNOTSUPP;
+}
+#endif /* CONFIG_BLOCK */
+
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock)
@@ -3870,6 +3987,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_notif_unregister(ctx);
break;
+ case IORING_REGISTER_MAP_BUFFERS:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_register_map_buffers(ctx, arg);
+ break;
+ case IORING_REGISTER_UNMAP_BUFFERS:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_register_unmap_buffers(ctx, arg);
+ break;
default:
ret = -EINVAL;
break;
diff --git a/io_uring/net.c b/io_uring/net.c
index 8276b9537194..68a996318959 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -977,7 +977,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
ret = io_import_fixed(WRITE, &msg.msg_iter, req->imu,
- (u64)(uintptr_t)zc->buf, zc->len);
+ (u64)(uintptr_t)zc->buf, zc->len, NULL);
if (unlikely(ret))
return ret;
} else {
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 59704b9ac537..1a7a8dedbbd5 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -148,6 +148,7 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
unpin_user_page(imu->bvec[i].bv_page);
if (imu->acct_pages)
io_unaccount_mem(ctx, imu->acct_pages);
+ io_dma_unmap(imu);
kvfree(imu);
}
*slot = NULL;
@@ -1285,6 +1286,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
imu->ubuf = (unsigned long) iov->iov_base;
imu->ubuf_end = imu->ubuf + iov->iov_len;
imu->nr_bvecs = nr_pages;
+ imu->dma_tag = NULL;
*pimu = imu;
ret = 0;
done:
@@ -1359,9 +1361,8 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}
-int io_import_fixed(int ddir, struct iov_iter *iter,
- struct io_mapped_ubuf *imu,
- u64 buf_addr, size_t len)
+int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu,
+ u64 buf_addr, size_t len, struct file *file)
{
u64 buf_end;
size_t offset;
@@ -1379,6 +1380,12 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
* and advance us to the beginning.
*/
offset = buf_addr - imu->ubuf;
+ if (imu->dma_tag && file == imu->dma_file) {
+ unsigned long nr_segs = (buf_addr & (PAGE_SIZE - 1)) +
+ (len >> PAGE_SHIFT);
+ iov_iter_dma_tag(iter, ddir, imu->dma_tag, offset, nr_segs, len);
+ return 0;
+ }
iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);
if (offset) {
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index f3a9a177941f..6e63b7a57b34 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -50,6 +50,11 @@ struct io_mapped_ubuf {
u64 ubuf_end;
unsigned int nr_bvecs;
unsigned long acct_pages;
+ void *dma_tag;
+ struct file *dma_file;
+#ifdef CONFIG_BLOCK
+ struct block_device *bdev;
+#endif
struct bio_vec bvec[];
};
@@ -64,9 +69,14 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
void io_rsrc_node_switch(struct io_ring_ctx *ctx,
struct io_rsrc_data *data_to_kill);
-int io_import_fixed(int ddir, struct iov_iter *iter,
- struct io_mapped_ubuf *imu,
- u64 buf_addr, size_t len);
+int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu,
+ u64 buf_addr, size_t len, struct file *file);
+
+/*
+ * Drop the dma mapping attached to @imu, if any. Without CONFIG_BLOCK
+ * this is an empty inline so callers need no conditional compilation.
+ */
+#ifdef CONFIG_BLOCK
+void io_dma_unmap(struct io_mapped_ubuf *imu);
+#else
+static inline void io_dma_unmap(struct io_mapped_ubuf *imu) {}
+#endif
void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 2b784795103c..9e2164d09adb 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -359,7 +359,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
ssize_t ret;
if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
- ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len);
+ ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len, req->file);
if (ret)
return ERR_PTR(ret);
return NULL;
--
2.30.2
More information about the Linux-nvme
mailing list