[PATCH v2 08/10] io_uring/rw: add support to send meta along with read/write
Gabriel Krisman Bertazi
krisman at suse.de
Wed Jun 26 10:17:28 PDT 2024
Anuj Gupta <anuj20.g at samsung.com> writes:
> This patch adds the capability of sending meta along with read/write.
> This meta is represented by a newly introduced 'struct io_uring_meta'
> which specifies information such as meta type/flags/buffer/length and
> apptag.
> Application sets up a SQE128 ring, prepares io_uring_meta within the SQE
> at offset pointed by sqe->cmd.
> The patch processes the user-passed information to prepare uio_meta
> descriptor and passes it down using kiocb->private.
>
> Meta exchange is supported only for direct IO.
> Also vectored read/write operations with meta are not supported
> currently.
>
> Signed-off-by: Anuj Gupta <anuj20.g at samsung.com>
> Signed-off-by: Kanchan Joshi <joshi.k at samsung.com>
> ---
> include/linux/fs.h | 1 +
> include/uapi/linux/io_uring.h | 30 +++++++++++++++-
> io_uring/io_uring.c | 7 ++++
> io_uring/rw.c | 68 +++++++++++++++++++++++++++++++++--
> io_uring/rw.h | 9 ++++-
> 5 files changed, 110 insertions(+), 5 deletions(-)
>
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index db26b4a70c62..0132565288c2 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -330,6 +330,7 @@ struct readahead_control;
> #define IOCB_NOIO (1 << 20)
> /* can use bio alloc cache */
> #define IOCB_ALLOC_CACHE (1 << 21)
> +#define IOCB_HAS_META (1 << 22)
> /*
> * IOCB_DIO_CALLER_COMP can be set by the iocb owner, to indicate that the
> * iocb completion can be passed back to the owner for execution from a safe
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index 2aaf7ee256ac..9140c66b315b 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -101,12 +101,40 @@ struct io_uring_sqe {
> __u64 optval;
> /*
> * If the ring is initialized with IORING_SETUP_SQE128, then
> - * this field is used for 80 bytes of arbitrary command data
> + * this field is starting offset for 80 bytes of data.
> + * This data is opaque for uring command op. And for meta io,
> + * this contains 'struct io_uring_meta'.
> */
> __u8 cmd[0];
> };
> };
>
> +enum io_uring_sqe_meta_type_bits {
> + META_TYPE_INTEGRITY_BIT,
> + /* not a real meta type; just to make sure that we don't overflow */
> + META_TYPE_LAST_BIT,
> +};
> +
> +/* meta type flags */
> +#define META_TYPE_INTEGRITY (1U << META_TYPE_INTEGRITY_BIT)
> +
> +struct io_uring_meta {
> + __u16 meta_type;
> + __u16 meta_flags;
> + __u32 meta_len;
> + __u64 meta_addr;
> + /* the next 64 bytes goes to SQE128 */
> + __u16 apptag;
> + __u8 pad[62];
> +};
> +
> +/*
> + * flags for integrity meta
> + */
> +#define INTEGRITY_CHK_GUARD (1U << 0) /* enforce guard check */
> +#define INTEGRITY_CHK_APPTAG (1U << 1) /* enforce app tag check */
> +#define INTEGRITY_CHK_REFTAG (1U << 2) /* enforce ref tag check */
> +
> /*
> * If sqe->file_index is set to this for opcodes that instantiate a new
> * direct descriptor (like openat/openat2/accept), then io_uring will allocate
> diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
> index 7ed1e009aaec..0d26ee1193ca 100644
> --- a/io_uring/io_uring.c
> +++ b/io_uring/io_uring.c
> @@ -3704,6 +3704,13 @@ static int __init io_uring_init(void)
> /* top 8bits are for internal use */
> BUILD_BUG_ON((IORING_URING_CMD_MASK & 0xff000000) != 0);
>
> + BUILD_BUG_ON(sizeof(struct io_uring_meta) >
> + 2 * sizeof(struct io_uring_sqe) -
> + offsetof(struct io_uring_sqe, cmd));
> +
> + BUILD_BUG_ON(META_TYPE_LAST_BIT >
> + 8 * sizeof_field(struct io_uring_meta, meta_type));
> +
> io_uring_optable_init();
>
> /*
> diff --git a/io_uring/rw.c b/io_uring/rw.c
> index c004d21e2f12..e8f5b5af4d2f 100644
> --- a/io_uring/rw.c
> +++ b/io_uring/rw.c
> @@ -23,6 +23,8 @@
> #include "poll.h"
> #include "rw.h"
>
> +#define INTEGRITY_VALID_FLAGS (INTEGRITY_CHK_GUARD | INTEGRITY_CHK_APPTAG | \
> + INTEGRITY_CHK_REFTAG)
> struct io_rw {
> /* NOTE: kiocb has the file as the first member, so don't do it here */
> struct kiocb kiocb;
> @@ -247,6 +249,42 @@ static int io_prep_rw_setup(struct io_kiocb *req, int ddir, bool do_import)
> return 0;
> }
>
> +static int io_prep_rw_meta(struct io_kiocb *req, const struct io_uring_sqe *sqe,
> + struct io_rw *rw, int ddir)
> +{
> + const struct io_uring_meta *md = (struct io_uring_meta *)sqe->cmd;
> + u16 meta_type = READ_ONCE(md->meta_type);
> + const struct io_issue_def *def;
> + struct io_async_rw *io;
> + int ret;
> +
> + if (!meta_type)
> + return 0;
> + if (!(meta_type & META_TYPE_INTEGRITY))
> + return -EINVAL;
> +
> + /* should fit into two bytes */
> + BUILD_BUG_ON(INTEGRITY_VALID_FLAGS >= (1 << 16));
> +
> + def = &io_issue_defs[req->opcode];
> + if (def->vectored)
> + return -EOPNOTSUPP;
> +
> + io = req->async_data;
> + io->meta.flags = READ_ONCE(md->meta_flags);
> + if (io->meta.flags & ~INTEGRITY_VALID_FLAGS)
> + return -EINVAL;
> +
> + io->meta.apptag = READ_ONCE(md->apptag);
> + ret = import_ubuf(ddir, u64_to_user_ptr(READ_ONCE(md->meta_addr)),
> + READ_ONCE(md->meta_len), &io->meta.iter);
> + if (unlikely(ret < 0))
> + return ret;
> + rw->kiocb.ki_flags |= IOCB_HAS_META;
> + iov_iter_save_state(&io->meta.iter, &io->iter_meta_state);
> + return ret;
> +}
> +
> static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
> int ddir, bool do_import)
> {
> @@ -269,11 +307,16 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe,
> rw->kiocb.ki_ioprio = get_current_ioprio();
> }
> rw->kiocb.dio_complete = NULL;
> + rw->kiocb.ki_flags = 0;
>
> rw->addr = READ_ONCE(sqe->addr);
> rw->len = READ_ONCE(sqe->len);
> rw->flags = READ_ONCE(sqe->rw_flags);
> - return io_prep_rw_setup(req, ddir, do_import);
> + ret = io_prep_rw_setup(req, ddir, do_import);
> +
> + if (unlikely(req->ctx->flags & IORING_SETUP_SQE128 && !ret))
> + ret = io_prep_rw_meta(req, sqe, rw, ddir);
> + return ret;
Would it be useful to have a flag to differentiate a malformed SQE from
an SQE carrying io_uring_meta, instead of assuming sqe->cmd has it? We
don't check addr3 at the moment and, unlike uring_cmd, userspace will be
mixing write commands with and without metadata to different files, so
it would be useful to catch that.
Also, just a styling nit, but can you turn that !ret into a separate if
leg? We are bound to add more code here eventually, and the next patch
to touch this will end up doing it anyway. I mean:
ret = io_prep_rw_setup(req, ddir, do_import);
if (unlikely(ret))
return ret;
if (unlikely(req->ctx->flags & IORING_SETUP_SQE128))
ret = io_prep_rw_meta(req, sqe, rw, ddir);
return ret;
--
Gabriel Krisman Bertazi
More information about the Linux-nvme
mailing list