[PATCHv2 6/7] io_uring: add support for dma pre-mapping

Keith Busch kbusch at fb.com
Tue Aug 2 12:36:32 PDT 2022


From: Keith Busch <kbusch at kernel.org>

Provide a new register operation that requests pre-mapping the bvecs of
a range of registered buffers through a fixed file's specific dma
mapping implementation. If successful, io_uring will use the returned
dma tag for future fixed buffer requests to the same file. A matching
operation unmaps a range of registered buffers again.

Signed-off-by: Keith Busch <kbusch at kernel.org>
---
 include/linux/io_uring_types.h |   2 +
 include/uapi/linux/io_uring.h  |  12 +++
 io_uring/filetable.c           |   7 +-
 io_uring/filetable.h           |   7 +-
 io_uring/io_uring.c            | 137 +++++++++++++++++++++++++++++++++
 io_uring/net.c                 |   2 +-
 io_uring/rsrc.c                |  21 +++--
 io_uring/rsrc.h                |  10 ++-
 io_uring/rw.c                  |   2 +-
 9 files changed, 185 insertions(+), 15 deletions(-)

diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index f7fab3758cb9..f62ea17cc480 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -23,6 +23,8 @@ struct io_wq_work {
 	int cancel_seq;
 };
 
+struct io_mapped_ubuf;
+
 struct io_fixed_file {
 	/* file * with additional FFS_* flags */
 	unsigned long file_ptr;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1463cfecb56b..daacbe899d1d 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -485,6 +485,10 @@ enum {
 	IORING_REGISTER_NOTIFIERS		= 26,
 	IORING_UNREGISTER_NOTIFIERS		= 27,
 
+	/* dma map registered buffers */
+	IORING_REGISTER_MAP_BUFFERS		= 28,
+	IORING_REGISTER_UNMAP_BUFFERS		= 29,
+
 	/* this goes last */
 	IORING_REGISTER_LAST
 };
@@ -661,4 +665,12 @@ struct io_uring_recvmsg_out {
 	__u32 flags;
 };
 
+struct io_uring_map_buffers {
+	__s32	fd;		/* index into the fixed file table */
+	__s32	buf_start;	/* first registered buffer index */
+	__s32	buf_end;	/* end of range, exclusive; clamped */
+	__u32	flags;		/* must be 0 */
+	__u64	rsvd[2];	/* must be 0 */
+};
+
 #endif
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index 1b8db1918678..5ca2f27f317f 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -189,9 +189,10 @@ int io_file_slot_queue_removal(struct io_ring_ctx *ctx,
 	struct file *file;
 	int ret;
 
-	file = (struct file *)(file_slot->file_ptr & FFS_MASK);
-	ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
-				    ctx->rsrc_node, file);
+	file = io_file_from_fixed(file_slot);
+	io_dma_unmap_file(ctx, file_slot);
+	ret = io_queue_rsrc_removal(ctx->file_data, slot_index, ctx->rsrc_node,
+				    file);
 	if (ret)
 		return ret;
 
diff --git a/io_uring/filetable.h b/io_uring/filetable.h
index e52ecf359199..3b2aae5bff76 100644
--- a/io_uring/filetable.h
+++ b/io_uring/filetable.h
@@ -58,12 +58,17 @@ io_fixed_file_slot(struct io_file_table *table, unsigned i)
 	return &table->files[i];
 }
 
+static inline struct file *io_file_from_fixed(struct io_fixed_file *f)
+{
+	return (struct file *) (f->file_ptr & FFS_MASK); /* drop FFS_* bits */
+}
+
 static inline struct file *io_file_from_index(struct io_file_table *table,
 					      int index)
 {
 	struct io_fixed_file *slot = io_fixed_file_slot(table, index);
 
-	return (struct file *) (slot->file_ptr & FFS_MASK);
+	return io_file_from_fixed(slot);
 }
 
 static inline void io_fixed_file_set(struct io_fixed_file *file_slot,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index b54218da075c..f5be488eaf21 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3681,6 +3681,131 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
 	return ret;
 }
 
+static int get_map_range(struct io_ring_ctx *ctx,
+			 struct io_uring_map_buffers *map, void __user *arg)
+{	/* Copy in and validate a map request; range length or -errno. */
+	int ret;
+
+	if (copy_from_user(map, arg, sizeof(*map)))
+		return -EFAULT;
+	if (map->flags || map->rsvd[0] || map->rsvd[1])	/* must all be 0 */
+		return -EINVAL;
+	if (map->fd >= ctx->nr_user_files)	/* fd is a fixed file index */
+		return -EINVAL;
+	if (map->buf_start < 0)
+		return -EINVAL;
+	if (map->buf_start >= ctx->nr_user_bufs)
+		return -EINVAL;
+	if (map->buf_end > ctx->nr_user_bufs)
+		map->buf_end = ctx->nr_user_bufs;	/* clamp, not reject */
+
+	ret = map->buf_end - map->buf_start;	/* buffers in [start, end) */
+	if (ret <= 0)
+		return -EINVAL;
+
+	return ret;	/* always > 0 here */
+}
+
+void io_dma_unmap(struct io_mapped_ubuf *imu)
+{	/* Drop imu's dma mapping if it has one; otherwise do nothing. */
+	struct file *file;
+
+	if (!imu->dma_tag)
+		return;
+
+	file = io_file_from_fixed(imu->dma_file);
+	file_dma_unmap(file, imu->dma_tag);
+	imu->dma_file = NULL;
+	imu->dma_tag = NULL;	/* a NULL tag marks the buffer as unmapped */
+}
+
+void io_dma_unmap_file(struct io_ring_ctx *ctx, struct io_fixed_file *file_slot)
+{	/* Unmap every registered buffer that is dma mapped for file_slot. */
+	int i;
+
+	for (i = 0; i < ctx->nr_user_bufs; i++) {
+		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+
+		if (!imu->dma_tag)	/* not dma mapped */
+			continue;
+
+		if (imu->dma_file == file_slot)	/* matched by slot address */
+			io_dma_unmap(imu);
+	}
+}
+
+static int io_register_unmap_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{	/* IORING_REGISTER_UNMAP_BUFFERS: unmap the requested buffer range. */
+	struct io_uring_map_buffers map;
+	int i, ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	ret = get_map_range(ctx, &map, arg);	/* map.fd is only range-checked */
+	if (ret < 0)
+		return ret;
+
+	for (i = map.buf_start; i < map.buf_end; i++) {
+		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+
+		io_dma_unmap(imu);	/* no-op for buffers without a tag */
+	}
+
+	return 0;
+}
+
+static int io_register_map_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{	/* IORING_REGISTER_MAP_BUFFERS: dma map a buffer range for a file. */
+	struct io_uring_map_buffers map;
+	struct io_fixed_file *file_slot;
+	struct file *file;
+	int ret, i;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = get_map_range(ctx, &map, arg);
+	if (ret < 0)
+		return ret;
+
+	file_slot = io_fixed_file_slot(&ctx->file_table,
+			array_index_nospec(map.fd, ctx->nr_user_files));
+	if (!file_slot || !file_slot->file_ptr)	/* nothing in this slot */
+		return -EBADF;
+
+	file = io_file_from_fixed(file_slot);
+	if (!(file->f_flags & O_DIRECT))	/* only O_DIRECT files accepted */
+		return -EOPNOTSUPP;
+
+	for (i = map.buf_start; i < map.buf_end; i++) {
+		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+		void *tag;
+
+		if (imu->dma_tag) {	/* already mapped: one tag per buffer */
+			ret = -EBUSY;
+			goto err;
+		}
+
+		tag = file_dma_map(file, imu->bvec, imu->nr_bvecs);
+		if (IS_ERR(tag)) {
+			ret = PTR_ERR(tag);
+			goto err;
+		}
+
+		imu->dma_tag = tag;
+		imu->dma_file = file_slot;	/* the slot, not the struct file */
+	}
+
+	return 0;
+err:
+	while (--i >= map.buf_start) {	/* undo only what this call mapped */
+		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+
+		io_dma_unmap(imu);
+	}
+	return ret;
+}
+
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			       void __user *arg, unsigned nr_args)
 	__releases(ctx->uring_lock)
@@ -3847,6 +3972,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_notif_unregister(ctx);
 		break;
+	case IORING_REGISTER_MAP_BUFFERS:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_map_buffers(ctx, arg);
+		break;
+	case IORING_REGISTER_UNMAP_BUFFERS:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_unmap_buffers(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/io_uring/net.c b/io_uring/net.c
index 32fc3da04e41..2793fd7d99d5 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -977,7 +977,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
 
 	if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
 		ret = io_import_fixed(WRITE, &msg.msg_iter, req->imu,
-					(u64)(uintptr_t)zc->buf, zc->len);
+					(u64)(uintptr_t)zc->buf, zc->len, NULL);
 		if (unlikely(ret))
 				return ret;
 	} else {
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 1f10eecad4d7..ee5e5284203d 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -148,6 +148,7 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
 			unpin_user_page(imu->bvec[i].bv_page);
 		if (imu->acct_pages)
 			io_unaccount_mem(ctx, imu->acct_pages);
+		io_dma_unmap(imu);
 		kvfree(imu);
 	}
 	*slot = NULL;
@@ -809,12 +810,16 @@ void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
 	int i;
 
 	for (i = 0; i < ctx->nr_user_files; i++) {
-		struct file *file = io_file_from_index(&ctx->file_table, i);
+		struct io_fixed_file *f = io_fixed_file_slot(&ctx->file_table, i);
+		struct file *file;
 
-		if (!file)
+		if (!f)
 			continue;
-		if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM)
+		if (f->file_ptr & FFS_SCM)
 			continue;
+
+		io_dma_unmap_file(ctx, f);
+		file = io_file_from_fixed(f);
 		io_file_bitmap_clear(&ctx->file_table, i);
 		fput(file);
 	}
@@ -1282,6 +1287,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	imu->ubuf = (unsigned long) iov->iov_base;
 	imu->ubuf_end = imu->ubuf + iov->iov_len;
 	imu->nr_bvecs = nr_pages;
+	imu->dma_tag = NULL;
 	*pimu = imu;
 	ret = 0;
 done:
@@ -1356,9 +1362,8 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 	return ret;
 }
 
-int io_import_fixed(int ddir, struct iov_iter *iter,
-			   struct io_mapped_ubuf *imu,
-			   u64 buf_addr, size_t len)
+int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu,
+		    u64 buf_addr, size_t len, struct file *file)
 {
 	u64 buf_end;
 	size_t offset;
@@ -1376,6 +1381,10 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 	 * and advance us to the beginning.
 	 */
 	offset = buf_addr - imu->ubuf;
+	if (imu->dma_tag && file == io_file_from_fixed(imu->dma_file)) {
+		iov_iter_dma_tag(iter, ddir, imu->dma_tag, offset, len);
+		return 0;
+	}
 	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);
 
 	if (offset) {
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index f3a9a177941f..47a2942aa537 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -50,6 +50,8 @@ struct io_mapped_ubuf {
 	u64		ubuf_end;
 	unsigned int	nr_bvecs;
 	unsigned long	acct_pages;
+	void		*dma_tag;
+	struct io_fixed_file	*dma_file;
 	struct bio_vec	bvec[];
 };
 
@@ -64,9 +66,11 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
 void io_rsrc_node_switch(struct io_ring_ctx *ctx,
 			 struct io_rsrc_data *data_to_kill);
 
-int io_import_fixed(int ddir, struct iov_iter *iter,
-			   struct io_mapped_ubuf *imu,
-			   u64 buf_addr, size_t len);
+int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu,
+		    u64 buf_addr, size_t len, struct file *file);
+
+void io_dma_unmap(struct io_mapped_ubuf *imu);
+void io_dma_unmap_file(struct io_ring_ctx *ctx, struct io_fixed_file *file_slot);
 
 void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 2b784795103c..9e2164d09adb 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -359,7 +359,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
 	ssize_t ret;
 
 	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
-		ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len);
+		ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len, req->file);
 		if (ret)
 			return ERR_PTR(ret);
 		return NULL;
-- 
2.30.2




More information about the Linux-nvme mailing list