[PATCH v3 1/5] fs,io_uring: add infrastructure for uring-cmd
Kanchan Joshi
joshiiitr at gmail.com
Wed May 4 08:12:44 PDT 2022
On Tue, May 3, 2022 at 1:52 PM Christoph Hellwig <hch at lst.de> wrote:
>
> This mostly looks good, but a few comments an an (untested) patch
> to fix these complaints below:
>
> > +struct io_uring_cmd {
> > + struct file *file;
> > + void *cmd;
> > + /* for irq-completion - if driver requires doing stuff in task-context*/
> > + void (*driver_cb)(struct io_uring_cmd *cmd);
>
> I'd rename this task_Work_cb or similar, as driver is not very
> meaningful.
>
> > + u32 flags;
> > + u32 cmd_op;
> > + u32 unused;
> > + u8 pdu[28]; /* available inline for free use */
>
> The unused field here is not only unused, but also messed up the
> alignment so that any pointer in pdu will not be properly aligned
> on 64-bit platforms and tjus might cause crashes on some platforms.
Indeed, thanks. Since we are going to remove that, we can give that
space to pdu for now i.e. pdu[32].This will help in removing "packed"
attribute in nvme pdu (which is 32 bytes without packing).
> For now we can just remove it, but I really want to get rid of the pdu
> thing that causes these problems, but I will not hold the series off for
> that and plan to look into that later.
> Also a lot of the fields here were using spaces instead of tabs for
> indentation.
>
> > union {
> > __u64 addr; /* pointer to buffer or iovecs */
> > __u64 splice_off_in;
> > + __u16 cmd_len;
> > };
>
> This field isn't actually used and can be removed.
>
> > __u32 len; /* buffer size or number of iovecs */
> > union {
> > @@ -61,7 +63,10 @@ struct io_uring_sqe {
> > __s32 splice_fd_in;
> > __u32 file_index;
> > };
> > - __u64 addr3;
> > + union {
> > + __u64 addr3;
> > + __u64 cmd;
> > + };
> > __u64 __pad2[1];
>
> The way how the tail of the structure is handled here is a mess. I
> think something like a union of two structs for the small and large
> case would be much better, so that cmd can be the actual array for
> the data that can be used directly and without the nasty cast (that
> also casted away the constness):
>
> diff --git a/fs/io_uring.c b/fs/io_uring.c
> index b774e6eac5384..fe24d606ca306 100644
> --- a/fs/io_uring.c
> +++ b/fs/io_uring.c
> @@ -4544,7 +4544,7 @@ static int io_getxattr_prep(struct io_kiocb *req,
> if (ret)
> return ret;
>
> - path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
> + path = u64_to_user_ptr(READ_ONCE(sqe->small.addr3));
>
> ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
> if (IS_ERR(ix->filename)) {
> @@ -4645,7 +4645,7 @@ static int io_setxattr_prep(struct io_kiocb *req,
> if (ret)
> return ret;
>
> - path = u64_to_user_ptr(READ_ONCE(sqe->addr3));
> + path = u64_to_user_ptr(READ_ONCE(sqe->small.addr3));
>
> ix->filename = getname_flags(path, LOOKUP_FOLLOW, NULL);
> if (IS_ERR(ix->filename)) {
> @@ -4909,15 +4909,15 @@ static int io_linkat(struct io_kiocb *req, unsigned int issue_flags)
>
> static void io_uring_cmd_work(struct io_kiocb *req, bool *locked)
> {
> - req->uring_cmd.driver_cb(&req->uring_cmd);
> + req->uring_cmd.task_work_cb(&req->uring_cmd);
> }
>
> void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
> - void (*driver_cb)(struct io_uring_cmd *))
> + void (*task_work_cb)(struct io_uring_cmd *))
> {
> struct io_kiocb *req = container_of(ioucmd, struct io_kiocb, uring_cmd);
>
> - req->uring_cmd.driver_cb = driver_cb;
> + req->uring_cmd.task_work_cb = task_work_cb;
> req->io_task_work.func = io_uring_cmd_work;
> io_req_task_work_add(req, !!(req->ctx->flags & IORING_SETUP_SQPOLL));
> }
> @@ -4949,7 +4949,7 @@ static int io_uring_cmd_prep(struct io_kiocb *req,
> return -EOPNOTSUPP;
> if (!(req->ctx->flags & IORING_SETUP_CQE32))
> return -EOPNOTSUPP;
> - ioucmd->cmd = (void *) &sqe->cmd;
> + ioucmd->cmd = sqe->big.cmd;
> ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
> ioucmd->flags = 0;
> return 0;
> @@ -12735,7 +12735,7 @@ static int __init io_uring_init(void)
> BUILD_BUG_SQE_ELEM(42, __u16, personality);
> BUILD_BUG_SQE_ELEM(44, __s32, splice_fd_in);
> BUILD_BUG_SQE_ELEM(44, __u32, file_index);
> - BUILD_BUG_SQE_ELEM(48, __u64, addr3);
> + BUILD_BUG_SQE_ELEM(48, __u64, small.addr3);
>
> BUILD_BUG_ON(sizeof(struct io_uring_files_update) !=
> sizeof(struct io_uring_rsrc_update));
> diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
> index a4ff4696cbead..7274884859910 100644
> --- a/include/linux/io_uring.h
> +++ b/include/linux/io_uring.h
> @@ -13,20 +13,19 @@ enum io_uring_cmd_flags {
> };
>
> struct io_uring_cmd {
> - struct file *file;
> - void *cmd;
> - /* for irq-completion - if driver requires doing stuff in task-context*/
> - void (*driver_cb)(struct io_uring_cmd *cmd);
> - u32 flags;
> - u32 cmd_op;
> - u32 unused;
> + struct file *file;
> + const u8 *cmd;
> + /* callback to defer completions to task context */
> + void (*task_work_cb)(struct io_uring_cmd *cmd);
> + u32 flags;
> + u32 cmd_op;
> u8 pdu[28]; /* available inline for free use */
> };
>
> #if defined(CONFIG_IO_URING)
> void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret, ssize_t res2);
> void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
> - void (*driver_cb)(struct io_uring_cmd *));
> + void (*task_work_cb)(struct io_uring_cmd *));
> struct sock *io_uring_get_socket(struct file *file);
> void __io_uring_cancel(bool cancel_all);
> void __io_uring_free(struct task_struct *tsk);
> @@ -56,7 +55,7 @@ static inline void io_uring_cmd_done(struct io_uring_cmd *cmd, ssize_t ret,
> {
> }
> static inline void io_uring_cmd_complete_in_task(struct io_uring_cmd *ioucmd,
> - void (*driver_cb)(struct io_uring_cmd *))
> + void (*task_work_cb)(struct io_uring_cmd *))
> {
> }
> static inline struct sock *io_uring_get_socket(struct file *file)
> diff --git a/include/trace/events/io_uring.h b/include/trace/events/io_uring.h
> index 630982b3c34c5..b1f64c2412935 100644
> --- a/include/trace/events/io_uring.h
> +++ b/include/trace/events/io_uring.h
> @@ -539,8 +539,8 @@ TRACE_EVENT(io_uring_req_failed,
> __entry->buf_index = sqe->buf_index;
> __entry->personality = sqe->personality;
> __entry->file_index = sqe->file_index;
> - __entry->pad1 = sqe->__pad2[0];
> - __entry->addr3 = sqe->addr3;
> + __entry->addr3 = sqe->small.addr3;
> + __entry->pad1 = sqe->small.__pad2[0];
> __entry->error = error;
> ),
>
> diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h
> index c081511119bfa..bd4221184f594 100644
> --- a/include/uapi/linux/io_uring.h
> +++ b/include/uapi/linux/io_uring.h
> @@ -27,7 +27,6 @@ struct io_uring_sqe {
> union {
> __u64 addr; /* pointer to buffer or iovecs */
> __u64 splice_off_in;
> - __u16 cmd_len;
> };
> __u32 len; /* buffer size or number of iovecs */
> union {
> @@ -64,16 +63,19 @@ struct io_uring_sqe {
> __u32 file_index;
> };
> union {
> - __u64 addr3;
> - __u64 cmd;
> + struct {
> + __u64 addr3;
> + __u64 __pad2[1];
> + } small;
Thinking if this can cause any issue for existing users of addr3, i.e.
in the userspace side? Since it needs to access this field with
small.addr3.
Jens - is xattr infra already frozen?
> + /*
> + * If the ring is initialized with IORING_SETUP_SQE128, then
> + * this field is used for 80 bytes of arbitrary command data.
> + */
> + struct {
> + __u8 cmd[0];
> + } big;
> };
> - __u64 __pad2[1];
> -
> - /*
> - * If the ring is initialized with IORING_SETUP_SQE128, then this field
> - * contains 64-bytes of padding, doubling the size of the SQE.
> - */
> - __u64 __big_sqe_pad[0];
> };
>
> enum {
--
Joshi
More information about the Linux-nvme
mailing list