[PATCH v7 06/10] io_uring/rw: add support to send metadata along with read/write
Anuj Gupta
anuj20.g at samsung.com
Mon Nov 4 06:05:57 PST 2024
This patch adds the capability of passing integrity metadata along with
read/write. A new meta_type field is introduced in SQE which indicates
the type of metadata being passed. A new 'struct io_uring_sqe_ext'
represents the secondary SQE space for read/write. The last 32 bytes of
the secondary SQE are used to pass the following PI-related information:
- flags: integrity check flags namely
IO_INTEGRITY_CHK_{GUARD/APPTAG/REFTAG}
- len: length of the pi/metadata buffer
- addr: address of the metadata buffer
- seed: seed value for reftag remapping
- app_tag: application defined 16b value
The application sets up an SQE128 ring and prepares the PI information
within the second SQE. The patch processes this information to prepare a
uio_meta descriptor and passes it down using kiocb->private.
Meta exchange is supported only for direct IO.
Also, vectored read/write operations with meta are currently not
supported.
Signed-off-by: Anuj Gupta <anuj20.g at samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k at samsung.com>
---
include/uapi/linux/io_uring.h | 30 ++++++++++++
io_uring/io_uring.c | 8 ++++
io_uring/rw.c | 88 ++++++++++++++++++++++++++++++++++-
io_uring/rw.h | 14 +++++-
4 files changed, 137 insertions(+), 3 deletions(-)
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
index 024745283783..7f01124bedd5 100644
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -92,6 +92,10 @@ struct io_uring_sqe {
__u16 addr_len;
__u16 __pad3[1];
};
+ struct {
+ __u16 meta_type;
+ __u16 __pad4[1];
+ };
};
union {
struct {
@@ -107,6 +111,32 @@ struct io_uring_sqe {
};
};
+/*
+ * Bit positions used to build the META_TYPE_* flags carried in
+ * sqe->meta_type (each flag is 1U << <name>_BIT). io_uring_init() checks at
+ * build time that META_TYPE_LAST_BIT does not exceed the bit width of
+ * sqe->meta_type.
+ */
+enum io_uring_sqe_meta_type_bits {
+ META_TYPE_PI_BIT,
+ /* not a real meta type; just to make sure that we don't overflow */
+ META_TYPE_LAST_BIT,
+};
+
+/* meta type flags */
+#define META_TYPE_PI (1U << META_TYPE_PI_BIT)
+
+/*
+ * Second half of SQE128 for IORING_OP_READ/WRITE.
+ *
+ * io_uring_init() asserts at build time that this struct is the same size
+ * as struct io_uring_sqe. When sqe->meta_type requests PI, io_prep_rw_pi()
+ * rejects the request with -EINVAL if any rsvd0[] word or rw_pi.rsvd is
+ * non-zero.
+ */
+struct io_uring_sqe_ext {
+ __u64 rsvd0[4];
+ /* if sqe->meta_type is META_TYPE_PI, last 32 bytes are for PI */
+ union {
+ __u64 rsvd1[4];
+ struct {
+ __u16 flags; /* integrity check flags: IO_INTEGRITY_CHK_{GUARD/APPTAG/REFTAG} */
+ __u16 app_tag; /* application defined 16b value */
+ __u32 len; /* length of the pi/metadata buffer */
+ __u64 addr; /* user address of the metadata buffer */
+ __u64 seed; /* seed value for reftag remapping */
+ __u64 rsvd; /* must be zero */
+ } rw_pi;
+ };
+};
+
/*
* If sqe->file_index is set to this for opcodes that instantiate a new
* direct descriptor (like openat/openat2/accept), then io_uring will allocate
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
index 44a772013c09..116c93022985 100644
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -3875,7 +3875,9 @@ static int __init io_uring_init(void)
BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
BUILD_BUG_SQE_ELEM(44, __u32, file_index);
BUILD_BUG_SQE_ELEM(44, __u16, addr_len);
+ BUILD_BUG_SQE_ELEM(44, __u16, meta_type);
BUILD_BUG_SQE_ELEM(46, __u16, __pad3[0]);
+ BUILD_BUG_SQE_ELEM(46, __u16, __pad4[0]);
BUILD_BUG_SQE_ELEM(48, __u64, addr3);
BUILD_BUG_SQE_ELEM_SIZE(48, 0, cmd);
BUILD_BUG_SQE_ELEM(56, __u64, __pad2);
@@ -3902,6 +3904,12 @@ static int __init io_uring_init(void)
/* top 8bits are for internal use */
BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0);
+ BUILD_BUG_ON(sizeof(struct io_uring_sqe_ext) !=
+ sizeof(struct io_uring_sqe));
+
+ BUILD_BUG_ON(META_TYPE_LAST_BIT >
+ 8 * sizeof_field(struct io_uring_sqe, meta_type));
+
io_uring_optable_init();
/*
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 30448f343c7f..eb19b033df24 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -257,11 +257,64 @@ static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import)
return 0;
}
+/*
+ * Snapshot the current meta seed and the position of the meta iterator into
+ * io->meta_state so that io_meta_restore() can rewind them later.
+ */
+static inline void io_meta_save_state(struct io_async_rw *io)
+{
+ io->meta_state.seed = io->meta.seed;
+ iov_iter_save_state(&io->meta.iter, &io->meta_state.iter_meta);
+}
+
+/*
+ * Rewind the meta seed and meta iterator to the values recorded by
+ * io_meta_save_state(). Callers in this patch only invoke it when the kiocb
+ * has IOCB_HAS_METADATA set.
+ */
+static inline void io_meta_restore(struct io_async_rw *io)
+{
+ io->meta.seed = io->meta_state.seed;
+ iov_iter_restore(&io->meta.iter, &io->meta_state.iter_meta);
+}
+
+/*
+ * Return the address immediately following @sqe (sqe + 1), which is treated
+ * as the secondary SQE. This helper performs no validation itself; the
+ * caller in this patch checks IORING_SETUP_SQE128 before using the result.
+ */
+static inline const void *io_uring_sqe_ext(const struct io_uring_sqe *sqe)
+{
+ return (sqe + 1);
+}
+
+/*
+ * Parse the PI fields from the secondary SQE and populate io->meta for a
+ * read/write request.
+ *
+ * @req:  request being prepared; req->async_data is used as the
+ *        struct io_async_rw (presumably set up earlier by io_prep_rw_setup(),
+ *        which io_prep_rw() calls first -- confirm)
+ * @sqe:  primary SQE; the secondary SQE is taken to be sqe + 1
+ * @rw:   rw command whose kiocb gets IOCB_HAS_METADATA on success
+ * @ddir: data direction handed to import_ubuf() for the metadata buffer
+ *
+ * Returns -EINVAL if the ring is not IORING_SETUP_SQE128 or a reserved field
+ * in the secondary SQE is non-zero, -EOPNOTSUPP if the opcode is vectored,
+ * a negative error from import_ubuf(), or otherwise the value import_ubuf()
+ * returned.
+ */
+static int io_prep_rw_pi(struct io_kiocb *req, const struct io_uring_sqe *sqe,
+ struct io_rw *rw, int ddir)
+{
+ const struct io_uring_sqe_ext *sqe_ext;
+ const struct io_issue_def *def;
+ struct io_async_rw *io;
+ int ret;
+
+ /* the secondary SQE is only looked at on SQE128 rings */
+ if (!(req->ctx->flags & IORING_SETUP_SQE128))
+ return -EINVAL;
+
+ sqe_ext = io_uring_sqe_ext(sqe);
+ /* all reserved space in the secondary SQE must be zero */
+ if (READ_ONCE(sqe_ext->rsvd0[0]) || READ_ONCE(sqe_ext->rsvd0[1])
+ || READ_ONCE(sqe_ext->rsvd0[2]) || READ_ONCE(sqe_ext->rsvd0[3]))
+ return -EINVAL;
+ if (READ_ONCE(sqe_ext->rw_pi.rsvd))
+ return -EINVAL;
+
+ /* vectored read/write with meta is not supported */
+ def = &io_issue_defs[req->opcode];
+ if (def->vectored)
+ return -EOPNOTSUPP;
+
+ io = req->async_data;
+ io->meta.flags = READ_ONCE(sqe_ext->rw_pi.flags);
+ io->meta.app_tag = READ_ONCE(sqe_ext->rw_pi.app_tag);
+ io->meta.seed = READ_ONCE(sqe_ext->rw_pi.seed);
+ /* build io->meta.iter over the user metadata buffer (addr, len) */
+ ret = import_ubuf(ddir, u64_to_user_ptr(READ_ONCE(sqe_ext->rw_pi.addr)),
+ READ_ONCE(sqe_ext->rw_pi.len), &io->meta.iter);
+ if (unlikely(ret < 0))
+ return ret;
+ rw->kiocb.ki_flags |= IOCB_HAS_METADATA;
+ /* remember the initial seed/iter so a retry can start from scratch */
+ io_meta_save_state(io);
+ return ret;
+}
+
static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
int ddir, bool do_import)
{
struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
unsigned ioprio;
+ u16 meta_type;
int ret;
rw->kiocb.ki_pos = READ_ONCE(sqe->off);
@@ -279,11 +332,23 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
rw->kiocb.ki_ioprio = get_current_ioprio();
}
rw->kiocb.dio_complete = NULL;
+ rw->kiocb.ki_flags = 0;
rw->addr = READ_ONCE(sqe->addr);
rw->len = READ_ONCE(sqe->len);
rw->flags = READ_ONCE(sqe->rw_flags);
- return io_prep_rw_setup(req, ddir, do_import);
+ ret = io_prep_rw_setup(req, ddir, do_import);
+
+ if (unlikely(ret))
+ return ret;
+
+ meta_type = READ_ONCE(sqe->meta_type);
+ if (meta_type) {
+ if (READ_ONCE(sqe->__pad4[0]) || !(meta_type & META_TYPE_PI))
+ return -EINVAL;
+ ret = io_prep_rw_pi(req, sqe, rw, ddir);
+ }
+ return ret;
}
int io_prep_read(struct io_kiocb *req, const struct io_uring_sqe *sqe)
@@ -409,7 +474,10 @@ static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req)
+/*
+ * Rewind the request's iterators before it is resubmitted: the meta state is
+ * restored only when the kiocb carries IOCB_HAS_METADATA, the data iterator
+ * is always restored.
+ */
static void io_resubmit_prep(struct io_kiocb *req)
{
struct io_async_rw *io = req->async_data;
+ struct io_rw *rw = io_kiocb_to_cmd(req, struct io_rw);
+ if (rw->kiocb.ki_flags & IOCB_HAS_METADATA)
+ io_meta_restore(io);
iov_iter_restore(&io->iter, &io->iter_state);
}
@@ -794,7 +862,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
if (!(req->flags & REQ_F_FIXED_FILE))
req->flags |= io_file_get_flags(file);
- kiocb->ki_flags = file->f_iocb_flags;
+ kiocb->ki_flags |= file->f_iocb_flags;
ret = kiocb_set_rw_flags(kiocb, rw->flags, rw_type);
if (unlikely(ret))
return ret;
@@ -823,6 +891,18 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode, int rw_type)
kiocb->ki_complete = io_complete_rw;
}
+ if (kiocb->ki_flags & IOCB_HAS_METADATA) {
+ struct io_async_rw *io = req->async_data;
+
+ /*
+ * We have a union of meta fields with wpq used for buffered-io
+ * in io_async_rw, so fail it here.
+ */
+ if (!(req->file->f_flags & O_DIRECT))
+ return -EOPNOTSUPP;
+ kiocb->private = &io->meta;
+ }
+
return 0;
}
@@ -897,6 +977,8 @@ static int __io_read(struct io_kiocb *req, unsigned int issue_flags)
* manually if we need to.
*/
iov_iter_restore(&io->iter, &io->iter_state);
+ if (kiocb->ki_flags & IOCB_HAS_METADATA)
+ io_meta_restore(io);
do {
/*
@@ -1101,6 +1183,8 @@ int io_write(struct io_kiocb *req, unsigned int issue_flags)
} else {
ret_eagain:
iov_iter_restore(&io->iter, &io->iter_state);
+ if (kiocb->ki_flags & IOCB_HAS_METADATA)
+ io_meta_restore(io);
if (kiocb->ki_flags & IOCB_WRITE)
io_req_end_write(req);
return -EAGAIN;
diff --git a/io_uring/rw.h b/io_uring/rw.h
index 3f432dc75441..2d7656bd268d 100644
--- a/io_uring/rw.h
+++ b/io_uring/rw.h
@@ -2,6 +2,11 @@
#include <linux/pagemap.h>
+/*
+ * Saved copy of the meta seed and meta iterator position, written by
+ * io_meta_save_state() and read back by io_meta_restore().
+ *
+ * NOTE(review): seed is u32 here while rw_pi.seed in the UAPI struct is
+ * __u64 -- confirm that save/restore cannot truncate the seed.
+ */
+struct io_meta_state {
+ u32 seed;
+ struct iov_iter_state iter_meta;
+};
+
struct io_async_rw {
size_t bytes_done;
struct iov_iter iter;
@@ -9,7 +14,14 @@ struct io_async_rw {
struct iovec fast_iov;
struct iovec *free_iovec;
int free_iov_nr;
- struct wait_page_queue wpq;
+ /* wpq is for buffered io, while meta fields are used with direct io */
+ union {
+ struct wait_page_queue wpq;
+ struct {
+ struct uio_meta meta;
+ struct io_meta_state meta_state;
+ };
+ };
};
int io_prep_read_fixed(struct io_kiocb *req, const struct io_uring_sqe *sqe);
--
2.25.1
More information about the Linux-nvme
mailing list