[PATCHv2 6/7] io_uring: add support for dma pre-mapping
Keith Busch
kbusch at fb.com
Tue Aug 2 12:36:32 PDT 2022
From: Keith Busch <kbusch at kernel.org>
Provide a new register operation that requests pre-mapping of a registered
buffer's bvec through the targeted fixed file's driver-specific
implementation. On success, io_uring will use the returned DMA tag for
future fixed buffer requests to the same file.
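
A rough sketch of the intended userspace flow (the helper names below are
hypothetical and the raw io_uring_register(2) syscall is used directly;
this assumes the ring already has fixed files and fixed buffers registered,
the caller has CAP_SYS_ADMIN, and the target fixed file was opened with
O_DIRECT, per the checks added in this patch):

	#include <sys/syscall.h>
	#include <unistd.h>
	#include <linux/io_uring.h>

	/* Pre-map registered buffers [buf_start, buf_end) to the fixed file
	 * at index 'fixed_fd'.  The kernel clamps buf_end to nr_user_bufs. */
	static int ring_map_buffers(int ring_fd, int fixed_fd,
				    int buf_start, int buf_end)
	{
		struct io_uring_map_buffers map = {
			.fd		= fixed_fd,	/* index into the fixed file table */
			.buf_start	= buf_start,
			.buf_end	= buf_end,
		};

		return syscall(__NR_io_uring_register, ring_fd,
			       IORING_REGISTER_MAP_BUFFERS, &map, 1);
	}

	/* Drop the mappings again, e.g. before unregistering the file. */
	static int ring_unmap_buffers(int ring_fd, int fixed_fd,
				      int buf_start, int buf_end)
	{
		struct io_uring_map_buffers map = {
			.fd		= fixed_fd,
			.buf_start	= buf_start,
			.buf_end	= buf_end,
		};

		return syscall(__NR_io_uring_register, ring_fd,
			       IORING_REGISTER_UNMAP_BUFFERS, &map, 1);
	}
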
Signed-off-by: Keith Busch <kbusch at kernel.org>
---
include/linux/io_uring_types.h | 2 +
include/uapi/linux/io_uring.h | 12 +++
io_uring/filetable.c | 7 +-
io_uring/filetable.h | 7 +-
io_uring/io_uring.c | 137 +++++++++++++++++++++++++++++++++
io_uring/net.c | 2 +-
io_uring/rsrc.c | 21 +++--
io_uring/rsrc.h | 10 ++-
io_uring/rw.c | 2 +-
9 files changed, 185 insertions(+), 15 deletions(-)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h
index f7fab3758cb9..f62ea17cc480 100644
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -23,6 +23,8 @@ struct io_wq_work {
int cancel_seq;
};
+struct io_mapped_ubuf;
+
struct io_fixed_file {
/* file * with additional FFS_* flags */
unsigned long file_ptr;
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 1463cfecb56b..daacbe899d1d 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -485,6 +485,10 @@ enum {
IORING_REGISTER_NOTIFIERS = 26,
IORING_UNREGISTER_NOTIFIERS = 27,
+ /* dma map registered buffers */
+ IORING_REGISTER_MAP_BUFFERS = 28,
+ IORING_REGISTER_UNMAP_BUFFERS = 29,
+
/* this goes last */
IORING_REGISTER_LAST
};
@@ -661,4 +665,12 @@ struct io_uring_recvmsg_out {
__u32 flags;
};
+struct io_uring_map_buffers {
+ __s32 fd;
+ __s32 buf_start;
+ __s32 buf_end;
+ __u32 flags;
+ __u64 rsvd[2];
+};
+
#endif
diff --git a/io_uring/filetable.c b/io_uring/filetable.c
index 1b8db1918678..5ca2f27f317f 100644
--- a/io_uring/filetable.c
+++ b/io_uring/filetable.c
@@ -189,9 +189,10 @@ int io_file_slot_queue_removal(struct io_ring_ctx *ctx,
struct file *file;
int ret;
- file = (struct file *)(file_slot->file_ptr & FFS_MASK);
- ret = io_queue_rsrc_removal(ctx->file_data, slot_index,
- ctx->rsrc_node, file);
+ file = io_file_from_fixed(file_slot);
+ io_dma_unmap_file(ctx, file_slot);
+ ret = io_queue_rsrc_removal(ctx->file_data, slot_index, ctx->rsrc_node,
+ file);
if (ret)
return ret;
diff --git a/io_uring/filetable.h b/io_uring/filetable.h
index e52ecf359199..3b2aae5bff76 100644
--- a/io_uring/filetable.h
+++ b/io_uring/filetable.h
@@ -58,12 +58,17 @@ io_fixed_file_slot(struct io_file_table *table, unsigned i)
return &table->files[i];
}
+static inline struct file *io_file_from_fixed(struct io_fixed_file *f)
+{
+ return (struct file *) (f->file_ptr & FFS_MASK);
+}
+
static inline struct file *io_file_from_index(struct io_file_table *table,
int index)
{
struct io_fixed_file *slot = io_fixed_file_slot(table, index);
- return (struct file *) (slot->file_ptr & FFS_MASK);
+ return io_file_from_fixed(slot);
}
static inline void io_fixed_file_set(struct io_fixed_file *file_slot,
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index b54218da075c..f5be488eaf21 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3681,6 +3681,131 @@ static __cold int io_register_iowq_max_workers(struct io_ring_ctx *ctx,
return ret;
}
+static int get_map_range(struct io_ring_ctx *ctx,
+ struct io_uring_map_buffers *map, void __user *arg)
+{
+ int ret;
+
+ if (copy_from_user(map, arg, sizeof(*map)))
+ return -EFAULT;
+ if (map->flags || map->rsvd[0] || map->rsvd[1])
+ return -EINVAL;
+ if (map->fd >= ctx->nr_user_files)
+ return -EINVAL;
+ if (map->buf_start < 0)
+ return -EINVAL;
+ if (map->buf_start >= ctx->nr_user_bufs)
+ return -EINVAL;
+ if (map->buf_end > ctx->nr_user_bufs)
+ map->buf_end = ctx->nr_user_bufs;
+
+ ret = map->buf_end - map->buf_start;
+ if (ret <= 0)
+ return -EINVAL;
+
+ return ret;
+}
+
+void io_dma_unmap(struct io_mapped_ubuf *imu)
+{
+ struct file *file;
+
+ if (!imu->dma_tag)
+ return;
+
+ file = io_file_from_fixed(imu->dma_file);
+ file_dma_unmap(file, imu->dma_tag);
+ imu->dma_file = NULL;
+ imu->dma_tag = NULL;
+}
+
+void io_dma_unmap_file(struct io_ring_ctx *ctx, struct io_fixed_file *file_slot)
+{
+ int i;
+
+ for (i = 0; i < ctx->nr_user_bufs; i++) {
+ struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+
+ if (!imu->dma_tag)
+ continue;
+
+ if (imu->dma_file == file_slot)
+ io_dma_unmap(imu);
+ }
+}
+
+static int io_register_unmap_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+ struct io_uring_map_buffers map;
+ int i, ret;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ ret = get_map_range(ctx, &map, arg);
+ if (ret < 0)
+ return ret;
+
+ for (i = map.buf_start; i < map.buf_end; i++) {
+ struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+
+ io_dma_unmap(imu);
+ }
+
+ return 0;
+}
+
+static int io_register_map_buffers(struct io_ring_ctx *ctx, void __user *arg)
+{
+ struct io_uring_map_buffers map;
+ struct io_fixed_file *file_slot;
+ struct file *file;
+ int ret, i;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ ret = get_map_range(ctx, &map, arg);
+ if (ret < 0)
+ return ret;
+
+ file_slot = io_fixed_file_slot(&ctx->file_table,
+ array_index_nospec(map.fd, ctx->nr_user_files));
+ if (!file_slot || !file_slot->file_ptr)
+ return -EBADF;
+
+ file = io_file_from_fixed(file_slot);
+ if (!(file->f_flags & O_DIRECT))
+ return -EOPNOTSUPP;
+
+ for (i = map.buf_start; i < map.buf_end; i++) {
+ struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+ void *tag;
+
+ if (imu->dma_tag) {
+ ret = -EBUSY;
+ goto err;
+ }
+
+ tag = file_dma_map(file, imu->bvec, imu->nr_bvecs);
+ if (IS_ERR(tag)) {
+ ret = PTR_ERR(tag);
+ goto err;
+ }
+
+ imu->dma_tag = tag;
+ imu->dma_file = file_slot;
+ }
+
+ return 0;
+err:
+ while (--i >= map.buf_start) {
+ struct io_mapped_ubuf *imu = ctx->user_bufs[i];
+
+ io_dma_unmap(imu);
+ }
+ return ret;
+}
+
static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
void __user *arg, unsigned nr_args)
__releases(ctx->uring_lock)
@@ -3847,6 +3972,18 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
break;
ret = io_notif_unregister(ctx);
break;
+ case IORING_REGISTER_MAP_BUFFERS:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_register_map_buffers(ctx, arg);
+ break;
+ case IORING_REGISTER_UNMAP_BUFFERS:
+ ret = -EINVAL;
+ if (!arg || nr_args != 1)
+ break;
+ ret = io_register_unmap_buffers(ctx, arg);
+ break;
default:
ret = -EINVAL;
break;
diff --git a/io_uring/net.c b/io_uring/net.c
index 32fc3da04e41..2793fd7d99d5 100644
--- a/io_uring/net.c
+++ b/io_uring/net.c
@@ -977,7 +977,7 @@ int io_sendzc(struct io_kiocb *req, unsigned int issue_flags)
if (zc->flags & IORING_RECVSEND_FIXED_BUF) {
ret = io_import_fixed(WRITE, &msg.msg_iter, req->imu,
- (u64)(uintptr_t)zc->buf, zc->len);
+ (u64)(uintptr_t)zc->buf, zc->len, NULL);
if (unlikely(ret))
return ret;
} else {
diff --git a/io_uring/rsrc.c b/io_uring/rsrc.c
index 1f10eecad4d7..ee5e5284203d 100644
--- a/io_uring/rsrc.c
+++ b/io_uring/rsrc.c
@@ -148,6 +148,7 @@ static void io_buffer_unmap(struct io_ring_ctx *ctx, struct io_mapped_ubuf **slo
unpin_user_page(imu->bvec[i].bv_page);
if (imu->acct_pages)
io_unaccount_mem(ctx, imu->acct_pages);
+ io_dma_unmap(imu);
kvfree(imu);
}
*slot = NULL;
@@ -809,12 +810,16 @@ void __io_sqe_files_unregister(struct io_ring_ctx *ctx)
int i;
for (i = 0; i < ctx->nr_user_files; i++) {
- struct file *file = io_file_from_index(&ctx->file_table, i);
+ struct io_fixed_file *f = io_fixed_file_slot(&ctx->file_table, i);
+ struct file *file;
- if (!file)
+ if (!f)
continue;
- if (io_fixed_file_slot(&ctx->file_table, i)->file_ptr & FFS_SCM)
+ if (f->file_ptr & FFS_SCM)
continue;
+
+ io_dma_unmap_file(ctx, f);
+ file = io_file_from_fixed(f);
io_file_bitmap_clear(&ctx->file_table, i);
fput(file);
}
@@ -1282,6 +1287,7 @@ static int io_sqe_buffer_register(struct io_ring_ctx *ctx, struct iovec *iov,
imu->ubuf = (unsigned long) iov->iov_base;
imu->ubuf_end = imu->ubuf + iov->iov_len;
imu->nr_bvecs = nr_pages;
+ imu->dma_tag = NULL;
*pimu = imu;
ret = 0;
done:
@@ -1356,9 +1362,8 @@ int io_sqe_buffers_register(struct io_ring_ctx *ctx, void __user *arg,
return ret;
}
-int io_import_fixed(int ddir, struct iov_iter *iter,
- struct io_mapped_ubuf *imu,
- u64 buf_addr, size_t len)
+int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu,
+ u64 buf_addr, size_t len, struct file *file)
{
u64 buf_end;
size_t offset;
@@ -1376,6 +1381,10 @@ int io_import_fixed(int ddir, struct iov_iter *iter,
* and advance us to the beginning.
*/
offset = buf_addr - imu->ubuf;
+ if (imu->dma_tag && file == io_file_from_fixed(imu->dma_file)) {
+ iov_iter_dma_tag(iter, ddir, imu->dma_tag, offset, len);
+ return 0;
+ }
iov_iter_bvec(iter, ddir, imu->bvec, imu->nr_bvecs, offset + len);
if (offset) {
diff --git a/io_uring/rsrc.h b/io_uring/rsrc.h
index f3a9a177941f..47a2942aa537 100644
--- a/io_uring/rsrc.h
+++ b/io_uring/rsrc.h
@@ -50,6 +50,8 @@ struct io_mapped_ubuf {
u64 ubuf_end;
unsigned int nr_bvecs;
unsigned long acct_pages;
+ void *dma_tag;
+ struct io_fixed_file *dma_file;
struct bio_vec bvec[];
};
@@ -64,9 +66,11 @@ int io_queue_rsrc_removal(struct io_rsrc_data *data, unsigned idx,
void io_rsrc_node_switch(struct io_ring_ctx *ctx,
struct io_rsrc_data *data_to_kill);
-int io_import_fixed(int ddir, struct iov_iter *iter,
- struct io_mapped_ubuf *imu,
- u64 buf_addr, size_t len);
+int io_import_fixed(int ddir, struct iov_iter *iter, struct io_mapped_ubuf *imu,
+ u64 buf_addr, size_t len, struct file *file);
+
+void io_dma_unmap(struct io_mapped_ubuf *imu);
+void io_dma_unmap_file(struct io_ring_ctx *ctx, struct io_fixed_file *file_slot);
void __io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
int io_sqe_buffers_unregister(struct io_ring_ctx *ctx);
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 2b784795103c..9e2164d09adb 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -359,7 +359,7 @@ static struct iovec *__io_import_iovec(int ddir, struct io_kiocb *req,
ssize_t ret;
if (opcode == IORING_OP_READ_FIXED || opcode == IORING_OP_WRITE_FIXED) {
- ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len);
+ ret = io_import_fixed(ddir, iter, req->imu, rw->addr, rw->len, req->file);
if (ret)
return ERR_PTR(ret);
return NULL;
--
2.30.2