[PATCH v4 1/5] fs,io_uring: add infrastructure for uring-cmd
Jens Axboe
axboe at kernel.dk
Thu May 5 10:04:41 PDT 2022
On 5/5/22 10:17 AM, Jens Axboe wrote:
> On 5/5/22 12:06 AM, Kanchan Joshi wrote:
>> +static int io_uring_cmd_prep(struct io_kiocb *req,
>> +			      const struct io_uring_sqe *sqe)
>> +{
>> +	struct io_uring_cmd *ioucmd = &req->uring_cmd;
>> +	struct io_ring_ctx *ctx = req->ctx;
>> +
>> +	if (ctx->flags & IORING_SETUP_IOPOLL)
>> +		return -EOPNOTSUPP;
>> +	/* do not support uring-cmd without big SQE/CQE */
>> +	if (!(ctx->flags & IORING_SETUP_SQE128))
>> +		return -EOPNOTSUPP;
>> +	if (!(ctx->flags & IORING_SETUP_CQE32))
>> +		return -EOPNOTSUPP;
>> +	if (sqe->ioprio || sqe->rw_flags)
>> +		return -EINVAL;
>> +	ioucmd->cmd = sqe->cmd;
>> +	ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
>> +	return 0;
>> +}
>
> While looking at the other suggested changes, I noticed a more
> fundamental issue with the passthrough support. For any other command,
> SQE contents are stable once prep has been done. The above prep handler
> does that for the basic items, but this case is special in that the
> lower-level command itself resides in the SQE.
>
> For cases where the command needs deferral, it's problematic. There are
> two main cases where this can happen:
>
> - The issue attempt yields -EAGAIN (we ran out of requests, etc). If you
> look at other commands, when they have data that doesn't fit in the
> io_kiocb itself, they need to allocate room for that data and keep it
> persistent.
>
> - Deferral is specified by the application, using e.g. IOSQE_IO_LINK or
> IOSQE_ASYNC.
>
> We're totally missing support for both of these cases. Consider the case
> where the ring is set up with an SQ size of 1. You prep a passthrough
> command (request A) and issue it with io_uring_submit(). Due to one of
> the two conditions mentioned above, the internal request is deferred.
> Either it was sent to ->uring_cmd() but we got -EAGAIN, or it was
> deferred even before that happened. The application doesn't know this
> happened; it gets another SQE to submit a new request (request B), fills
> it in, and calls io_uring_submit(). Since we only have one SQE available
> in that ring, when request A gets re-issued, it's now happily reading
> SQE contents from request B. Oops.
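>
> To make that concrete, here is a minimal userspace sketch (liburing,
> plain NOPs rather than passthrough commands, and the variable names are
> mine) showing that with a single-entry SQ, the SQE handed out for
> request B is the very memory that request A's SQE occupied:
>
> 	#include <assert.h>
> 	#include <liburing.h>
>
> 	int main(void)
> 	{
> 		struct io_uring ring;
> 		struct io_uring_sqe *a, *b;
>
> 		assert(io_uring_queue_init(1, &ring, 0) == 0);
>
> 		a = io_uring_get_sqe(&ring);	/* request A */
> 		io_uring_prep_nop(a);
> 		io_uring_submit(&ring);		/* consumes the only SQ slot */
>
> 		b = io_uring_get_sqe(&ring);	/* request B reuses that slot */
> 		assert(a == b);			/* same SQE memory */
> 		io_uring_queue_exit(&ring);
> 		return 0;
> 	}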
>
> This is why prep handlers are the only ones that get an SQE passed to
> them. They are supposed to ensure that we no longer read from the SQE
> past that point. Applications can always rely on the fact that once
> io_uring_submit() has been done, which consumes the SQE in the SQ ring,
> no further reads are done from that SQE.
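>
> As a rough sketch of that contract (not code from this patch; "io_foo"
> is a hypothetical op):
>
> 	static int io_foo_prep(struct io_kiocb *req,
> 			       const struct io_uring_sqe *sqe)
> 	{
> 		struct io_foo *foo = &req->foo;
>
> 		/* copy everything we need into stable io_kiocb storage */
> 		foo->addr = READ_ONCE(sqe->addr);
> 		foo->len = READ_ONCE(sqe->len);
>
> 		/* past this point, the SQE slot may be recycled any time */
> 		return 0;
> 	}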
>
> IOW, we need io_req_prep_async() handling for URING_CMD, which needs to
> allocate the full 80 bytes of command payload (two 64-byte SQEs minus
> the 48-byte offset of the cmd field) and copy them over. Subsequent
> issue attempts will then use that memory rather than the SQE itself. We
> just need to find a sane way to do that without making the usual prep +
> direct issue path any slower.
The patch below should take care of that. Some parts should be folded
into patch 1, other bits into the nvme adoption patch.
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 8c3b15d3e86d..7daa95d50db1 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -411,8 +411,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 		struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool vec)
 {
-	struct nvme_uring_cmd *cmd =
-		(struct nvme_uring_cmd *)ioucmd->cmd;
+	struct nvme_uring_cmd *cmd = ioucmd->cmd;
 	struct request_queue *q = ns ? ns->queue : ctrl->admin_q;
 	struct nvme_command c;
 	struct request *req;
@@ -548,8 +547,8 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	return __nvme_ioctl(ns, cmd, (void __user *)arg);
 }
 
-static void nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd,
-		unsigned int issue_flags)
+static int nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd,
+		unsigned int issue_flags)
 {
 	int ret;
 
@@ -567,15 +566,15 @@ static void nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd,
 		ret = -ENOTTY;
 	}
 
-	if (ret != -EIOCBQUEUED)
-		io_uring_cmd_done(ioucmd, ret, 0);
+	return ret;
 }
 
-void nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
+int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
 {
 	struct nvme_ns *ns = container_of(file_inode(ioucmd->file)->i_cdev,
 			struct nvme_ns, cdev);
-	nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
+
+	return nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
 }
 
 #ifdef CONFIG_NVME_MULTIPATH
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 761ad6c629c4..26adc7660d28 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -783,7 +783,7 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
 		unsigned long arg);
 long nvme_dev_ioctl(struct file *file, unsigned int cmd,
 		unsigned long arg);
-void nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd,
+int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd,
 		unsigned int issue_flags);
 void nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
 		unsigned int issue_flags);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index cf7087dc12b3..cee7fc95f2c2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1251,6 +1251,8 @@ static const struct io_op_def io_op_defs[] = {
 	[IORING_OP_URING_CMD] = {
 		.needs_file = 1,
 		.plug = 1,
+		.async_size = 2 * sizeof(struct io_uring_sqe) -
+			offsetof(struct io_uring_sqe, cmd),
 	},
 };
@@ -4936,6 +4938,20 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2)
 }
 EXPORT_SYMBOL_GPL(io_uring_cmd_done);
 
+static int io_uring_cmd_prep_async(struct io_kiocb *req)
+{
+	struct io_uring_cmd *ioucmd = &req->uring_cmd;
+	struct io_ring_ctx *ctx = req->ctx;
+	size_t cmd_size = sizeof(struct io_uring_sqe) -
+				offsetof(struct io_uring_sqe, cmd);
+
+	if (ctx->flags & IORING_SETUP_SQE128)
+		cmd_size += sizeof(struct io_uring_sqe);
+
+	memcpy(req->async_data, ioucmd->cmd, cmd_size);
+	return 0;
+}
+
 static int io_uring_cmd_prep(struct io_kiocb *req,
 			     const struct io_uring_sqe *sqe)
 {
@@ -4951,7 +4967,7 @@ static int io_uring_cmd_prep(struct io_kiocb *req,
 		return -EOPNOTSUPP;
 	if (sqe->ioprio || sqe->rw_flags)
 		return -EINVAL;
-	ioucmd->cmd = sqe->cmd;
+	ioucmd->cmd = (void *) sqe->cmd;
 	ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
 	return 0;
 }
@@ -4960,10 +4976,18 @@ static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct file *file = req->file;
 	struct io_uring_cmd *ioucmd = &req->uring_cmd;
+	int ret;
 
 	if (!req->file->f_op->uring_cmd)
 		return -EOPNOTSUPP;
-	file->f_op->uring_cmd(ioucmd, issue_flags);
+
+	if (req_has_async_data(req))
+		ioucmd->cmd = req->async_data;
+
+	ret = file->f_op->uring_cmd(ioucmd, issue_flags);
+	if (ret != -EIOCBQUEUED)
+		io_uring_cmd_done(ioucmd, ret, 0);
+
 	return 0;
 }
@@ -7849,6 +7873,8 @@ static int io_req_prep_async(struct io_kiocb *req)
 		return io_recvmsg_prep_async(req);
 	case IORING_OP_CONNECT:
 		return io_connect_prep_async(req);
+	case IORING_OP_URING_CMD:
+		return io_uring_cmd_prep_async(req);
 	}
 	printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
 			req->opcode);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 30c98d9993df..87b5af1d9fbe 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1996,7 +1996,7 @@ struct file_operations {
 			struct file *file_out, loff_t pos_out,
 			loff_t len, unsigned int remap_flags);
 	int (*fadvise)(struct file *, loff_t, loff_t, int);
-	void (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
+	int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
 } __randomize_layout;
 
 struct inode_operations {
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index e4cbc80949ce..74371503478a 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -14,10 +14,11 @@ enum io_uring_cmd_flags {
 struct io_uring_cmd {
 	struct file *file;
-	const u8 *cmd;
+	void *cmd;
 	/* callback to defer completions to task context */
 	void (*task_work_cb)(struct io_uring_cmd *cmd);
 	u32 cmd_op;
+	u32 pad;
 	u8 pdu[32]; /* available inline for free use */
 };
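
For driver writers, the resulting contract is: ->uring_cmd() returns
-EIOCBQUEUED if the command was queued and will complete later via
io_uring_cmd_done(), and any other return value gets completed inline
by io_uring_cmd(). A hypothetical driver-side sketch (the foo_* names
are made up):

	static int foo_uring_cmd(struct io_uring_cmd *ioucmd,
				 unsigned int issue_flags)
	{
		switch (ioucmd->cmd_op) {
		case FOO_URING_CMD_IO:
			/* queues the command; -EIOCBQUEUED on success */
			return foo_queue_cmd(ioucmd);
		default:
			return -ENOTTY;
		}
	}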
--
Jens Axboe