[PATCH 4/5] io_uring: add support for dma pre-mapping

Keith Busch kbusch at fb.com
Tue Jul 26 10:38:13 PDT 2022


From: Keith Busch <kbusch at kernel.org>

Provide a new register operation that can request to pre-map a known
bvec to the driver of the requested file descriptor's specific
implementation. If successful, io_uring will use the returned dma tag
for future fixed buffer requests to the same file.

Signed-off-by: Keith Busch <kbusch at kernel.org>
---
 include/uapi/linux/io_uring.h |  12 ++++
 io_uring/io_uring.c           | 129 ++++++++++++++++++++++++++++++++++
 io_uring/net.c                |   2 +-
 io_uring/rsrc.c               |  13 +++-
 io_uring/rsrc.h               |  16 ++++-
 io_uring/rw.c                 |   2 +-
 6 files changed, 166 insertions(+), 8 deletions(-)

diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1463cfecb56b..daacbe899d1d 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -485,6 +485,10 @@ enum {
 	IORING_REGISTER_NOTIFIERS		= 26,
 	IORING_UNREGISTER_NOTIFIERS		= 27,
 
+	/* dma map registered buffers */
+	IORING_REGISTER_MAP_BUFFERS		= 28,
+	IORING_REGISTER_UNMAP_BUFFERS		= 29,
+
 	/* this goes last */
 	IORING_REGISTER_LAST
 };
@@ -661,4 +665,12 @@ struct io_uring_recvmsg_out {
 	__u32 flags;
 };
 
+struct io_uring_map_buffers {
+	__s32	fd;
+	__s32	buf_start;
+	__s32	buf_end;
+	__u32	flags;
+	__u64	rsvd[2];
+};
+
 #endif
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 1d600a63643b..12f7354e0423 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3704,6 +3704,123 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
 	return ret;
 }
 
+#ifdef CONFIG_BLOCK
+static int get_map_range(struct io_ring_ctx *ctx,
+			 struct io_uring_map_buffers *map, void __user *arg)
+{
+	int ret;
+
+	if (copy_from_user(map, arg, sizeof(*map)))
+		return -EFAULT;
+	if (map->flags || map->rsvd[0] || map->rsvd[1])
+		return -EINVAL;
+	if (map->buf_start < 0)
+		return -EINVAL;
+	if (map->buf_start >= ctx->nr_user_bufs)
+		return -EINVAL;
+	if (map->buf_end > ctx->nr_user_bufs)
+		map->buf_end = ctx->nr_user_bufs;
+
+	ret = map->buf_end - map->buf_start;
+	if (ret <= 0)
+		return -EINVAL;
+
+	return ret;
+}
+
+void io_dma_unmap(struct io_mapped_ubuf *imu)
+{
+	if (imu->dma_tag)
+		block_dma_unmap(imu->bdev, imu->dma_tag);
+}
+
+static int io_register_unmap_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_uring_map_buffers map;
+	int i, ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	ret = get_map_range(ctx, &map, arg);
+	if (ret < 0)
+		return ret;
+
+	for (i = map.buf_start; i < map.buf_end; i++) {
+		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+
+		io_dma_unmap(imu);
+	}
+
+	return 0;
+}
+
+static int io_register_map_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+	struct io_uring_map_buffers map;
+	struct block_device *bdev;
+	struct file *file;
+	int ret, i;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = get_map_range(ctx, &map, arg);
+	if (ret < 0)
+		return ret;
+
+	file = fget(map.fd);
+	if (!file)
+		return -EBADF;
+
+	if (S_ISBLK(file_inode(file)->i_mode))
+		bdev = I_BDEV(file->f_mapping->host);
+	else if (S_ISREG(file_inode(file)->i_mode))
+		bdev = file->f_inode->i_sb->s_bdev;
+	else
+		return -EOPNOTSUPP;
+
+	for (i = map.buf_start; i < map.buf_end; i++) {
+		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+		void *tag;
+
+		if (imu->dma_tag) {
+			ret = -EBUSY;
+			goto err;
+		}
+
+		tag = block_dma_map(bdev, imu->bvec, imu->nr_bvecs);
+		if (IS_ERR(tag)) {
+			ret = PTR_ERR(tag);
+			goto err;
+		}
+
+		imu->dma_tag = tag;
+		imu->dma_file = file;
+		imu->bdev = bdev;
+	}
+
+	fput(file);
+	return 0;
+err:
+	while (--i >= map.buf_start) {
+		struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+
+		io_dma_unmap(imu);
+	}
+	fput(file);
+	return ret;
+}
+#else /* CONFIG_BLOCK */
+static int io_register_map_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+	return -EOPNOTSUPP;
+}
+static int io_register_unmap_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+	return -EOPNOTSUPP;
+}
+#endif /* CONFIG_BLOCK */
+
 static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			       void __user *arg, unsigned nr_args)
 	__releases(ctx->uring_lock)
@@ -3870,6 +3987,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
 			break;
 		ret = io_notif_unregister(ctx);
 		break;
+	case IORING_REGISTER_MAP_BUFFERS:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_map_buffers(ctx, arg);
+		break;
+	case IORING_REGISTER_UNMAP_BUFFERS:
+		ret = -EINVAL;
+		if (!arg || nr_args != 1)
+			break;
+		ret = io_register_unmap_buffers(ctx, arg);
+		break;
 	default:
 		ret = -EINVAL;
 		break;
diff --git a/io_uring/net.c b/io_uring/net.c
index 8276b9537194..68a996318959 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -977,7 +977,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
 
 	if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
 		ret = io_import_fixed(WRITE, &msg.msg_iter, req->imu,
-					(u64)(uintptr_t)zc->buf, zc->len);
+					(u64)(uintptr_t)zc->buf, zc->len, NULL);
 		if (unlikely(ret))
 				return ret;
 	} else {
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 59704b9ac537..1a7a8dedbbd5 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -148,6 +148,7 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
 			unpin_user_page(imu->bvec[i].bv_page);
 		if (imu->acct_pages)
 			io_unaccount_mem(ctx, imu->acct_pages);
+		io_dma_unmap(imu);
 		kvfree(imu);
 	}
 	*slot = NULL;
@@ -1285,6 +1286,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
 	imu->ubuf = (unsigned long) iov->iov_base;
 	imu->ubuf_end = imu->ubuf + iov->iov_len;
 	imu->nr_bvecs = nr_pages;
+	imu->dma_tag = NULL;
 	*pimu = imu;
 	ret = 0;
 done:
@@ -1359,9 +1361,8 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
 	return ret;
 }
 
-int io_import_fixed(int ddir, struct iov_iter *iter,
-			   struct io_mapped_ubuf *imu,
-			   u64 buf_addr, size_t len)
+int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu,
+		    u64 buf_addr, size_t len, struct file *file)
 {
 	u64 buf_end;
 	size_t offset;
@@ -1379,6 +1380,12 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
 	 * and advance us to the beginning.
 	 */
 	offset = buf_addr - imu->ubuf;
+	if (imu->dma_tag && file == imu->dma_file) {
+		unsigned long nr_segs = (buf_addr & (PAGE_SIZE - 1)) +
+					(len >> PAGE_SHIFT);
+		iov_iter_dma_tag(iter, ddir, imu->dma_tag, offset, nr_segs, len);
+		return 0;
+	}
 	iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);
 
 	if (offset) {
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index f3a9a177941f..6e63b7a57b34 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -50,6 +50,11 @@ struct io_mapped_ubuf {
 	u64		ubuf_end;
 	unsigned int	nr_bvecs;
 	unsigned long	acct_pages;
+	void		*dma_tag;
+	struct file	*dma_file;
+#ifdef CONFIG_BLOCK
+	struct block_device *bdev;
+#endif
 	struct bio_vec	bvec[];
 };
 
@@ -64,9 +69,14 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
 void io_rsrc_node_switch(struct io_ring_ctx *ctx,
 			 struct io_rsrc_data *data_to_kill);
 
-int io_import_fixed(int ddir, struct iov_iter *iter,
-			   struct io_mapped_ubuf *imu,
-			   u64 buf_addr, size_t len);
+int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu,
+		    u64 buf_addr, size_t len, struct file *file);
+
+#ifdef CONFIG_BLOCK
+void io_dma_unmap(struct io_mapped_ubuf *imu);
+#else
+static inline void io_dma_unmap(struct io_mapped_ubuf *imu) {}
+#endif
 
 void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
 int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 2b784795103c..9e2164d09adb 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -359,7 +359,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
 	ssize_t ret;
 
 	if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
-		ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len);
+		ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len, req->file);
 		if (ret)
 			return ERR_PTR(ret);
 		return NULL;
-- 
2.30.2




More information about the Linux-nvme mailing list