[PATCH v4 1/5] fs,io_uring: add infrastructure for uring-cmd
Jens Axboe
axboe at kernel.dk
Thu May 5 10:04:41 PDT 2022
On 5/5/22 10:17 AM, Jens Axboe wrote:
> On 5/5/22 12:06 AM, Kanchan Joshi wrote:
>> +static int io_uring_cmd_prep(struct io_kiocb *req,
>> +			      const struct io_uring_sqe *sqe)
>> +{
>> +	struct io_uring_cmd *ioucmd = &req->uring_cmd;
>> +	struct io_ring_ctx *ctx = req->ctx;
>> +
>> +	if (ctx->flags & IORING_SETUP_IOPOLL)
>> +		return -EOPNOTSUPP;
>> +	/* do not support uring-cmd without big SQE/CQE */
>> +	if (!(ctx->flags & IORING_SETUP_SQE128))
>> +		return -EOPNOTSUPP;
>> +	if (!(ctx->flags & IORING_SETUP_CQE32))
>> +		return -EOPNOTSUPP;
>> +	if (sqe->ioprio || sqe->rw_flags)
>> +		return -EINVAL;
>> +	ioucmd->cmd = sqe->cmd;
>> +	ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
>> +	return 0;
>> +}
>
> While looking at the other suggested changes, I noticed a more
> fundamental issue with the passthrough support. For any other command,
> SQE contents are stable once prep has been done. The above prep handler
> does that for the basic items, but this case is special in that the
> lower-level command itself resides in the SQE.
>
> For cases where the command needs deferral, it's problematic. There are
> two main cases where this can happen:
>
> - The issue attempt yields -EAGAIN (we ran out of requests, etc). If you
> look at other commands, when they have data that doesn't fit in the
> io_kiocb itself, they need to allocate room for that data and keep it
> persistent.
>
> - Deferral is specified by the application, using e.g. IOSQE_IO_LINK or
> IOSQE_ASYNC.
>
> We're totally missing support for both of these cases. Consider the case
> where the ring is set up with an SQ size of 1. You prep a passthrough
> command (request A) and issue it with io_uring_submit(). Due to one of
> the two conditions mentioned above, the internal request is deferred.
> Either it was sent to ->uring_cmd() but we got -EAGAIN, or it was
> deferred even before that happened. The application doesn't know this
> happened; it gets another SQE to submit a new request (request B), fills
> it in, and calls io_uring_submit(). Since we only have one SQE available
> in that ring, when request A gets re-issued, it's now happily reading
> SQE contents from request B. Oops.
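>
> To make that concrete, here is a minimal userspace sketch (liburing,
> plain NOPs rather than passthrough commands, and the variable names are
> mine) showing that with a single-entry SQ, the SQE handed out for
> request B is the very memory that request A's SQE occupied:
>
> 	#include <assert.h>
> 	#include <liburing.h>
>
> 	int main(void)
> 	{
> 		struct io_uring ring;
> 		struct io_uring_sqe *a, *b;
>
> 		assert(io_uring_queue_init(1, &ring, 0) == 0);
>
> 		a = io_uring_get_sqe(&ring);	/* request A */
> 		io_uring_prep_nop(a);
> 		io_uring_submit(&ring);		/* consumes the only SQ slot */
>
> 		b = io_uring_get_sqe(&ring);	/* request B reuses that slot */
> 		assert(a == b);			/* same SQE memory */
> 		io_uring_queue_exit(&ring);
> 		return 0;
> 	}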
>
> This is why prep handlers are the only ones that get an SQE passed to
> them. They are supposed to ensure that we no longer read from the SQE
> past that point. Applications can always rely on the fact that once
> io_uring_submit() has been done, which consumes the SQE in the SQ ring,
> no further reads are done from that SQE.
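>
> As a rough sketch of that contract (not code from this patch; "io_foo"
> is a hypothetical op):
>
> 	static int io_foo_prep(struct io_kiocb *req,
> 			       const struct io_uring_sqe *sqe)
> 	{
> 		struct io_foo *foo = &req->foo;
>
> 		/* copy everything we need into stable io_kiocb storage */
> 		foo->addr = READ_ONCE(sqe->addr);
> 		foo->len = READ_ONCE(sqe->len);
>
> 		/* past this point, the SQE slot may be recycled any time */
> 		return 0;
> 	}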
>
> IOW, we need io_req_prep_async() handling for URING_CMD, which needs to
> allocate the full 80 bytes of command payload (two 64-byte SQEs minus
> the 48-byte offset of the cmd field) and copy them over. Subsequent
> issue attempts will then use that memory rather than the SQE itself. We
> just need to find a sane way to do that without making the usual prep +
> direct issue path any slower.
The patch below should take care of that. Some parts should be folded
into patch 1, other bits into the nvme adoption patch.
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 8c3b15d3e86d..7daa95d50db1 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -411,8 +411,7 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 		struct io_uring_cmd *ioucmd, unsigned int issue_flags, bool vec)
 {
-	struct nvme_uring_cmd *cmd =
-		(struct nvme_uring_cmd *)ioucmd->cmd;
+	struct nvme_uring_cmd *cmd = ioucmd->cmd;
 	struct request_queue *q = ns ? ns->queue : ctrl->admin_q;
 	struct nvme_command c;
 	struct request *req;
@@ -548,8 +547,8 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	return __nvme_ioctl(ns, cmd, (void __user *)arg);
 }
 
-static void nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd,
-		unsigned int issue_flags)
+static int nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd,
+		unsigned int issue_flags)
 {
 	int ret;
 
@@ -567,15 +566,15 @@ static void nvme_ns_uring_cmd(struct nvme_ns *ns, struct io_uring_cmd *ioucmd,
 		ret = -ENOTTY;
 	}
 
-	if (ret != -EIOCBQUEUED)
-		io_uring_cmd_done(ioucmd, ret, 0);
+	return ret;
 }
 
-void nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
+int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
 {
 	struct nvme_ns *ns = container_of(file_inode(ioucmd->file)->i_cdev,
 			struct nvme_ns, cdev);
-	nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
+
+	return nvme_ns_uring_cmd(ns, ioucmd, issue_flags);
 }
 
 #ifdef CONFIG_NVME_MULTIPATH
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 761ad6c629c4..26adc7660d28 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -783,7 +783,7 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
 		unsigned long arg);
 long nvme_dev_ioctl(struct file *file, unsigned int cmd,
 		unsigned long arg);
-void nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd,
+int nvme_ns_chr_uring_cmd(struct io_uring_cmd *ioucmd,
 		unsigned int issue_flags);
 void nvme_ns_head_chr_uring_cmd(struct io_uring_cmd *ioucmd,
 		unsigned int issue_flags);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index cf7087dc12b3..cee7fc95f2c2 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -1251,6 +1251,8 @@ static const struct io_op_def io_op_defs[] = {
 	[IORING_OP_URING_CMD] = {
 		.needs_file = 1,
 		.plug = 1,
+		.async_size = 2 * sizeof(struct io_uring_sqe) -
+			offsetof(struct io_uring_sqe, cmd),
 	},
 };
@@ -4936,6 +4938,20 @@ void io_uring_cmd_done(struct io_uring_cmd *ioucmd, ssize_t ret, ssize_t res2)
 }
 EXPORT_SYMBOL_GPL(io_uring_cmd_done);
 
+static int io_uring_cmd_prep_async(struct io_kiocb *req)
+{
+	struct io_uring_cmd *ioucmd = &req->uring_cmd;
+	struct io_ring_ctx *ctx = req->ctx;
+	size_t cmd_size = sizeof(struct io_uring_sqe) -
+				offsetof(struct io_uring_sqe, cmd);
+
+	if (ctx->flags & IORING_SETUP_SQE128)
+		cmd_size += sizeof(struct io_uring_sqe);
+
+	memcpy(req->async_data, ioucmd->cmd, cmd_size);
+	return 0;
+}
+
 static int io_uring_cmd_prep(struct io_kiocb *req,
 			     const struct io_uring_sqe *sqe)
 {
@@ -4951,7 +4967,7 @@ static int io_uring_cmd_prep(struct io_kiocb *req,
 		return -EOPNOTSUPP;
 	if (sqe->ioprio || sqe->rw_flags)
 		return -EINVAL;
-	ioucmd->cmd = sqe->cmd;
+	ioucmd->cmd = (void *) sqe->cmd;
 	ioucmd->cmd_op = READ_ONCE(sqe->cmd_op);
 	return 0;
 }
@@ -4960,10 +4976,18 @@ static int io_uring_cmd(struct io_kiocb *req, unsigned int issue_flags)
 {
 	struct file *file = req->file;
 	struct io_uring_cmd *ioucmd = &req->uring_cmd;
+	int ret;
 
 	if (!req->file->f_op->uring_cmd)
 		return -EOPNOTSUPP;
-	file->f_op->uring_cmd(ioucmd, issue_flags);
+
+	if (req_has_async_data(req))
+		ioucmd->cmd = req->async_data;
+
+	ret = file->f_op->uring_cmd(ioucmd, issue_flags);
+	if (ret != -EIOCBQUEUED)
+		io_uring_cmd_done(ioucmd, ret, 0);
+
 	return 0;
 }
@@ -7849,6 +7873,8 @@ static int io_req_prep_async(struct io_kiocb *req)
 		return io_recvmsg_prep_async(req);
 	case IORING_OP_CONNECT:
 		return io_connect_prep_async(req);
+	case IORING_OP_URING_CMD:
+		return io_uring_cmd_prep_async(req);
 	}
 	printk_once(KERN_WARNING "io_uring: prep_async() bad opcode %d\n",
 			req->opcode);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 30c98d9993df..87b5af1d9fbe 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1996,7 +1996,7 @@ struct file_operations {
 			struct file *file_out, loff_t pos_out,
 			loff_t len, unsigned int remap_flags);
 	int (*fadvise)(struct file *, loff_t, loff_t, int);
-	void (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
+	int (*uring_cmd)(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
 } __randomize_layout;
 
 struct inode_operations {
diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h
index e4cbc80949ce..74371503478a 100644
--- a/include/linux/io_uring.h
+++ b/include/linux/io_uring.h
@@ -14,10 +14,11 @@ enum io_uring_cmd_flags {
 struct io_uring_cmd {
 	struct file *file;
-	const u8 *cmd;
+	void *cmd;
 	/* callback to defer completions to task context */
 	void (*task_work_cb)(struct io_uring_cmd *cmd);
 	u32 cmd_op;
+	u32 pad;
 	u8 pdu[32]; /* available inline for free use */
 };
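
For driver writers, the resulting contract is: ->uring_cmd() returns
-EIOCBQUEUED if the command was queued and will complete later via
io_uring_cmd_done(), and any other return value gets completed inline
by io_uring_cmd(). A hypothetical driver-side sketch (the foo_* names
are made up):

	static int foo_uring_cmd(struct io_uring_cmd *ioucmd,
				 unsigned int issue_flags)
	{
		switch (ioucmd->cmd_op) {
		case FOO_URING_CMD_IO:
			/* queues the command; -EIOCBQUEUED on success */
			return foo_queue_cmd(ioucmd);
		default:
			return -ENOTTY;
		}
	}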
--
Jens Axboe