[PATCH 2/2] nvme: support fused NVME_IOCTL_SUBMIT_IO
klayph at gmail.com
Tue Jan 5 17:49:39 EST 2021
From: Clay Mayers <mayerc at kioxia.com>
Extend the NVME_IOCTL_SUBMIT_IO ioctl to support a pair of fused
nvme_user_io requests.

When submitting a fused pair, an array of two nvme_user_io structs is
supplied to the NVME_IOCTL_SUBMIT_IO ioctl (see the sketch below).
Rather than introduce a new ioctl code, the presence of a fused pair is
indicated by the first struct's nvme_user_io.flags having the value
NVME_CMD_FUSE_FIRST; a second nvme_user_io struct then follows, with
its flags set to NVME_CMD_FUSE_SECOND.

NVME_IOCTL_SUBMIT_IO32 is now handled directly by nvme_ioctl(), which
passes the ioctl's size down to nvme_submit_io(), so 32-bit programs
get the same fused support.

A fused pair may fail to submit with -EWOULDBLOCK. This indicates that
the device queue selected for the first command did not have a tag
available when the request for the second command was allocated.
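For illustration only (not part of this patch), a fused
compare-and-write could be driven from user space along the following
lines. The FUSE_* values mirror NVME_CMD_FUSE_FIRST/SECOND from
<linux/nvme.h>, which the uapi header does not export, and
fused_compare_write() is just a hypothetical helper:

  #include <errno.h>
  #include <stdint.h>
  #include <string.h>
  #include <sys/ioctl.h>
  #include <linux/nvme_ioctl.h>

  #define FUSE_FIRST  0x01  /* NVME_CMD_FUSE_FIRST */
  #define FUSE_SECOND 0x02  /* NVME_CMD_FUSE_SECOND */

  /* write 'data' only if the LBA range currently holds 'expect' */
  static int fused_compare_write(int fd, void *expect, void *data,
                                 __u64 slba, __u16 nblocks)
  {
          struct nvme_user_io ios[2];
          int ret;

          memset(ios, 0, sizeof(ios));
          ios[0].opcode  = 0x05;             /* nvme_cmd_compare */
          ios[0].flags   = FUSE_FIRST;
          ios[0].slba    = slba;
          ios[0].nblocks = nblocks;          /* 0-based: 0 means 1 block */
          ios[0].addr    = (uintptr_t)expect;

          ios[1].opcode  = 0x01;             /* nvme_cmd_write */
          ios[1].flags   = FUSE_SECOND;
          ios[1].slba    = slba;
          ios[1].nblocks = nblocks;
          ios[1].addr    = (uintptr_t)data;

          /* -EWOULDBLOCK: no tag was free for the second command; retry */
          do {
                  ret = ioctl(fd, NVME_IOCTL_SUBMIT_IO, ios);
          } while (ret < 0 && errno == EWOULDBLOCK);

          return ret;  /* 0, a positive NVMe status, or -1 with errno set */
  }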
Signed-off-by: Clay Mayers <clay.mayers at kioxia.com>
---
drivers/nvme/host/core.c | 258 ++++++++++++++++++++++++++++++---
1 file changed, 199 insertions(+), 59 deletions(-)
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index a498cf6a9eaf..ce5d2a9a08a8 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -1468,16 +1468,40 @@ static void __user *nvme_to_user_ptr(uintptr_t ptrval)
return (void __user *)ptrval;
}
-static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
+struct nvme_user_io_req {
+ struct nvme_command cmd;
+ struct request *rq;
+ struct bio *bio; /* bio in rq at the time of allocation */
+ void *meta;
+ void __user *udata;
+ void __user *umeta;
+ unsigned int len;
+ unsigned int mlen;
+ u32 mseed;
+};
+
+static void nvme_free_io(struct nvme_user_io_req *nrq)
+{
+ if (!nrq)
+ return;
+ kfree(nrq->meta);
+ if (nrq->bio)
+ blk_rq_unmap_user(nrq->bio);
+ if (nrq->rq)
+ blk_mq_free_request(nrq->rq);
+ nrq->meta = NULL;
+ nrq->bio = NULL;
+ nrq->rq = NULL;
+}
+
+static int nvme_prep_io(struct nvme_ns *ns, struct nvme_user_io_req *nrq,
+ struct nvme_user_io __user *uio, int size)
{
struct nvme_user_io io;
- struct nvme_command c;
- unsigned length, meta_len;
- void __user *metadata;
- if (copy_from_user(&io, uio, sizeof(io)))
+ if (unlikely(copy_from_user(&io, uio, size)))
return -EFAULT;
- if (io.flags)
+ if (unlikely(io.flags & ~(NVME_CMD_FUSE_FIRST|NVME_CMD_FUSE_SECOND)))
return -EINVAL;
switch (io.opcode) {
@@ -1489,33 +1513,160 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
return -EINVAL;
}
- length = (io.nblocks + 1) << ns->lba_shift;
- meta_len = (io.nblocks + 1) * ns->ms;
- metadata = nvme_to_user_ptr(io.metadata);
+ nrq->udata = nvme_to_user_ptr(io.addr);
+ nrq->len = (io.nblocks + 1) << ns->lba_shift;
+ nrq->umeta = nvme_to_user_ptr(io.metadata);
+ nrq->mlen = (io.nblocks + 1) * ns->ms;
+ nrq->mseed = lower_32_bits(io.slba);
+ nrq->bio = nrq->meta = NULL;
if (ns->features & NVME_NS_EXT_LBAS) {
- length += meta_len;
- meta_len = 0;
- } else if (meta_len) {
+ nrq->len += nrq->mlen;
+ nrq->mlen = 0;
+ } else if (nrq->mlen) {
if ((io.metadata & 3) || !io.metadata)
return -EINVAL;
}
- memset(&c, 0, sizeof(c));
- c.rw.opcode = io.opcode;
- c.rw.flags = io.flags;
- c.rw.nsid = cpu_to_le32(ns->head->ns_id);
- c.rw.slba = cpu_to_le64(io.slba);
- c.rw.length = cpu_to_le16(io.nblocks);
- c.rw.control = cpu_to_le16(io.control);
- c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
- c.rw.reftag = cpu_to_le32(io.reftag);
- c.rw.apptag = cpu_to_le16(io.apptag);
- c.rw.appmask = cpu_to_le16(io.appmask);
-
- return nvme_submit_user_cmd(ns->queue, &c,
- nvme_to_user_ptr(io.addr), length,
- metadata, meta_len, lower_32_bits(io.slba), NULL, 0);
+ memset(&nrq->cmd, 0, sizeof(nrq->cmd));
+ nrq->cmd.rw.opcode = io.opcode;
+ nrq->cmd.rw.flags = io.flags;
+ nrq->cmd.rw.nsid = cpu_to_le32(ns->head->ns_id);
+ nrq->cmd.rw.slba = cpu_to_le64(io.slba);
+ nrq->cmd.rw.length = cpu_to_le16(io.nblocks);
+ nrq->cmd.rw.control = cpu_to_le16(io.control);
+ nrq->cmd.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
+ nrq->cmd.rw.reftag = cpu_to_le32(io.reftag);
+ nrq->cmd.rw.apptag = cpu_to_le16(io.apptag);
+ nrq->cmd.rw.appmask = cpu_to_le16(io.appmask);
+
+ return 0;
+}
+
+static struct request *nvme_mk_req_io(struct nvme_ns *ns,
+ struct nvme_user_io_req *nrq,
+ blk_mq_req_flags_t flags, int qid,
+ int timeout)
+{
+ bool write = nvme_is_write(&nrq->cmd);
+ struct request_queue *q = ns->queue;
+ struct gendisk *disk = ns->disk;
+ struct request *rq;
+ struct bio *bio = NULL;
+ void *meta = NULL;
+ int ret;
+
+ rq = nvme_alloc_request(q, &nrq->cmd, flags, qid);
+ if (unlikely(IS_ERR(rq)))
+ return rq;
+
+ rq->timeout = timeout ? timeout : ADMIN_TIMEOUT;
+ nvme_req(rq)->flags |= NVME_REQ_USERCMD;
+
+ if (nrq->udata && nrq->len) {
+ ret = blk_rq_map_user(q, rq, NULL, nrq->udata, nrq->len,
+ GFP_KERNEL);
+ if (ret)
+ goto out;
+ bio = rq->bio;
+ bio->bi_disk = disk;
+ if (disk && nrq->umeta && nrq->mlen) {
+ meta = nvme_add_user_metadata(bio, nrq->umeta, nrq->mlen,
+ nrq->mseed, write);
+ if (IS_ERR(meta)) {
+ ret = PTR_ERR(meta);
+ goto out_unmap;
+ }
+ nrq->meta = meta;
+ }
+ }
+ nrq->bio = bio;
+ return rq;
+out_unmap:
+ if (bio)
+ blk_rq_unmap_user(bio);
+ out:
+ blk_mq_free_request(rq);
+ return ERR_PTR(ret);
+}
+
+static int nvme_unprep_io(struct nvme_user_io_req *nrq,
+ u64 *result)
+{
+ struct request *rq = nrq->rq;
+ int write = nvme_is_write(&nrq->cmd);
+ int ret;
+
+ if (unlikely(nvme_req(rq)->flags & NVME_REQ_CANCELLED))
+ ret = -EINTR;
+ else
+ ret = nvme_req(rq)->status;
+ if (result)
+ *result = le64_to_cpu(nvme_req(rq)->result.u64);
+ if (nrq->meta && !ret && !write) {
+ if (copy_to_user(nrq->umeta, nrq->meta, nrq->mlen))
+ ret = -EFAULT;
+ }
+ nvme_free_io(nrq);
+ return ret;
+}
+
+/* support both NVME_IOCTL_SUBMIT_IO and NVME_IOCTL_SUBMIT_IO32 */
+static int nvme_submit_io(struct nvme_ns *ns, void __user *uio,
+ int size)
+{
+ struct nvme_user_io_req nrq, nrq2;
+ struct request *rq, *rq2;
+ int ret, fused;
+
+ ret = nvme_prep_io(ns, &nrq, uio, size);
+ if (unlikely(ret))
+ return ret;
+ fused = (nrq.cmd.common.flags == NVME_CMD_FUSE_FIRST);
+ if (fused) {
+ ret = nvme_prep_io(ns, &nrq2, uio+size, size);
+ if (unlikely(ret))
+ return ret;
+ if (unlikely(nrq2.cmd.common.flags != NVME_CMD_FUSE_SECOND))
+ return -EINVAL;
+ } else if (unlikely(nrq.cmd.common.flags)) {
+ return -EINVAL;
+ }
+ rq = nvme_mk_req_io(ns, &nrq, 0, NVME_QID_ANY, 0);
+ if (unlikely(IS_ERR(rq)))
+ return PTR_ERR(rq);
+ nrq.rq = rq;
+ if (fused) {
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ rq2 = nvme_mk_req_io(ns, &nrq2, BLK_MQ_REQ_NOWAIT,
+ nvme_req_qid(rq), 0);
+ if (unlikely(IS_ERR(rq2))) {
+ nvme_free_io(&nrq);
+ return PTR_ERR(rq2);
+ }
+ nvme_req(rq)->nrq2 = nvme_req(rq2);
+ nrq2.rq = rq2;
+
+ rq->cmd_flags |= REQ_NOMERGE;
+ rq2->cmd_flags |= REQ_NOMERGE;
+ rq->end_io_data = &wait;
+ blk_execute_rq_nowait(rq->q, ns->disk, rq, false, nvme_end_sync_rq);
+ nvme_execute_passthru_rq(rq2);
+
+ /*
+ * Both commands should have completed at this point, but the NVMe
+ * spec doesn't specify CQE ordering for fused operations, so wait
+ * for the first command to complete as well.
+ */
+ wait_for_completion_io(&wait);
+ nvme_unprep_io(&nrq, NULL);
+ ret = nvme_unprep_io(&nrq2, NULL);
+ } else {
+ nvme_execute_passthru_rq(rq);
+ ret = nvme_unprep_io(&nrq, NULL);
+ }
+ return ret;
}
static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
@@ -1672,6 +1823,23 @@ static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
return ret;
}
+struct nvme_user_io32 {
+ __u8 opcode;
+ __u8 flags;
+ __u16 control;
+ __u16 nblocks;
+ __u16 rsvd;
+ __u64 metadata;
+ __u64 addr;
+ __u64 slba;
+ __u32 dsmgmt;
+ __u32 reftag;
+ __u16 apptag;
+ __u16 appmask;
+} __attribute__((__packed__));
+
+#define NVME_IOCTL_SUBMIT_IO32 _IOW('N', 0x42, struct nvme_user_io32)
+
static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
@@ -1700,8 +1868,10 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
case NVME_IOCTL_IO_CMD:
ret = nvme_user_cmd(ns->ctrl, ns, argp);
break;
+ case NVME_IOCTL_SUBMIT_IO32:
+ fallthrough; /* structures are identical except size */
case NVME_IOCTL_SUBMIT_IO:
- ret = nvme_submit_io(ns, argp);
+ ret = nvme_submit_io(ns, argp, _IOC_SIZE(cmd));
break;
case NVME_IOCTL_IO64_CMD:
ret = nvme_user_cmd64(ns->ctrl, ns, argp);
@@ -1717,41 +1887,11 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
return ret;
}
-#ifdef CONFIG_COMPAT
-struct nvme_user_io32 {
- __u8 opcode;
- __u8 flags;
- __u16 control;
- __u16 nblocks;
- __u16 rsvd;
- __u64 metadata;
- __u64 addr;
- __u64 slba;
- __u32 dsmgmt;
- __u32 reftag;
- __u16 apptag;
- __u16 appmask;
-} __attribute__((__packed__));
-
-#define NVME_IOCTL_SUBMIT_IO32 _IOW('N', 0x42, struct nvme_user_io32)
+#ifdef CONFIG_COMPAT
static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
- /*
- * Corresponds to the difference of NVME_IOCTL_SUBMIT_IO
- * between 32 bit programs and 64 bit kernel.
- * The cause is that the results of sizeof(struct nvme_user_io),
- * which is used to define NVME_IOCTL_SUBMIT_IO,
- * are not same between 32 bit compiler and 64 bit compiler.
- * NVME_IOCTL_SUBMIT_IO32 is for 64 bit kernel handling
- * NVME_IOCTL_SUBMIT_IO issued from 32 bit programs.
- * Other IOCTL numbers are same between 32 bit and 64 bit.
- * So there is nothing to do regarding to other IOCTL numbers.
- */
- if (cmd == NVME_IOCTL_SUBMIT_IO32)
- return nvme_ioctl(bdev, mode, NVME_IOCTL_SUBMIT_IO, arg);
-
return nvme_ioctl(bdev, mode, cmd, arg);
}
#else
--
2.27.0