[RFC PATCH 12/12] pci: implement submission/completion for rawq commands
Kanchan Joshi
joshi.k at samsung.com
Sat Apr 29 02:39:25 PDT 2023
Implement ->queue_uring_cmd and ->poll_uring_cmd for leaner submission
and completion. Both operate on "struct io_uring_cmd" directly, without
having to jump through hoops involving struct request and bio.
This is currently enabled only for small (single-segment), premapped
(fixedbufs) IOs.
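
For reference, a minimal userspace sketch of the kind of IO this path
targets (illustrative only, not part of this patch; the helper name and
parameters are made up): a single-segment read using a fixed
(pre-registered) buffer, issued as a big-SQE NVMe uring command. It
assumes the ring was created with IORING_SETUP_SQE128 | IORING_SETUP_CQE32
(plus IORING_SETUP_IOPOLL for polled completions), the buffer was
registered with io_uring_register_buffers(), the namespace uses a 512-byte
LBA format, and the raw-queue registration added earlier in this series
has already been done:

  #include <stdint.h>
  #include <string.h>
  #include <liburing.h>
  #include <linux/nvme_ioctl.h>

  static int submit_fixed_read(struct io_uring *ring, int nvme_ng_fd,
                               __u32 nsid, void *buf, __u64 slba)
  {
          struct io_uring_sqe *sqe;
          struct nvme_uring_cmd *cmd;

          sqe = io_uring_get_sqe(ring);
          if (!sqe)
                  return -EAGAIN;
          /* SQE128 ring: clear the full 128-byte entry */
          memset(sqe, 0, 2 * sizeof(*sqe));

          sqe->opcode = IORING_OP_URING_CMD;
          sqe->fd = nvme_ng_fd;                  /* /dev/ngXnY char device */
          sqe->cmd_op = NVME_URING_CMD_IO;
          /* fixedbufs: required for the leaner path implemented here */
          sqe->uring_cmd_flags = IORING_URING_CMD_FIXED;
          sqe->buf_index = 0;                    /* registered buffer index */

          cmd = (struct nvme_uring_cmd *)sqe->cmd;
          cmd->opcode = 0x02;                    /* NVMe read */
          cmd->nsid = nsid;
          cmd->addr = (__u64)(uintptr_t)buf;     /* within the registered buffer */
          cmd->data_len = 4096;                  /* small, single-segment IO */
          cmd->cdw10 = slba & 0xffffffff;
          cmd->cdw11 = slba >> 32;
          cmd->cdw12 = 7;                        /* 0-based NLB: 8 x 512b = 4096 */

          return io_uring_submit(ring);
  }

Completions are reaped through the normal CQE path (io_uring_wait_cqe()
or io_uring_peek_cqe()); on a polled ring this is what is expected to
drive the ->poll_uring_cmd hook added below.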
Signed-off-by: Kanchan Joshi <joshi.k at samsung.com>
Signed-off-by: Anuj Gupta <anuj20.g at samsung.com>
---
drivers/nvme/host/pci.c | 194 +++++++++++++++++++++++++++++++++++++++-
1 file changed, 192 insertions(+), 2 deletions(-)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 30d7a1a6eaab..ca0580c4e977 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -28,6 +28,8 @@
#include <linux/io-64-nonatomic-hi-lo.h>
#include <linux/sed-opal.h>
#include <linux/pci-p2pdma.h>
+#include <linux/nvme_ioctl.h>
+#include <linux/io_uring.h>
#include "trace.h"
#include "nvme.h"
@@ -210,6 +212,7 @@ struct nvme_queue {
unsigned long *cmdid_bmp;
spinlock_t cmdid_lock;
struct nvme_iod *iod;
+ u8 reg_id;
/* only used for poll queues: */
spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
struct nvme_completion *cqes;
@@ -249,7 +252,11 @@ union nvme_descriptor {
* to the actual struct scatterlist.
*/
struct nvme_iod {
- struct nvme_request req;
+ union {
+ struct nvme_request req;
+ /* for raw-queue */
+ struct io_uring_cmd *ioucmd;
+ };
struct nvme_command cmd;
bool aborted;
s8 nr_allocations; /* PRP list pool allocations. 0 means small
@@ -1025,6 +1032,13 @@ static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
}
+static void nvme_pci_put_cmdid(struct nvme_queue *nvmeq, int id)
+{
+ spin_lock(&nvmeq->cmdid_lock);
+ clear_bit(id, nvmeq->cmdid_bmp);
+ spin_unlock(&nvmeq->cmdid_lock);
+}
+
static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
{
if (!nvmeq->qid)
@@ -1032,6 +1046,37 @@ static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
}
+static inline struct nvme_uring_direct_pdu *nvme_uring_cmd_direct_pdu(
+ struct io_uring_cmd *ioucmd)
+{
+ return (struct nvme_uring_direct_pdu *)&ioucmd->pdu;
+}
+
+static inline void nvme_handle_cqe_rawq(struct nvme_queue *nvmeq,
+ struct nvme_completion *cqe,
+ __u16 command_id)
+{
+ struct nvme_dev *dev = nvmeq->dev;
+ u32 status = le16_to_cpu(cqe->status) >> 1;
+ u64 result = cqe->result.u64;
+ struct nvme_iod *iod;
+ int id, reg_id;
+
+ reg_id = nvme_genctr_from_cid(command_id);
+ /* we should not encounter completions from a past registration */
+ WARN_ON_ONCE(nvmeq->reg_id != reg_id);
+
+ id = nvme_tag_from_cid(command_id);
+ iod = &nvmeq->iod[id];
+
+ if (iod->dma_len)
+ dma_unmap_page(dev->dev, iod->first_dma, iod->dma_len,
+ nvme_is_write(&iod->cmd) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+
+ nvme_pci_put_cmdid(nvmeq, id);
+ io_uring_cmd_done(iod->ioucmd, status, result, IO_URING_F_UNLOCKED);
+}
+
static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
struct io_comp_batch *iob, u16 idx)
{
@@ -1039,6 +1084,9 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq,
__u16 command_id = READ_ONCE(cqe->command_id);
struct request *req;
+ if (test_bit(NVMEQ_RAW, &nvmeq->flags))
+ return nvme_handle_cqe_rawq(nvmeq, cqe, command_id);
+
/*
* AEN requests are special as they don't time out and can
* survive any kind of queue freeze and often don't respond to
@@ -1151,6 +1199,25 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
return found;
}
+static int nvme_poll_uring_cmd(struct io_uring_cmd *ioucmd, int qid,
+ struct io_comp_batch *iob)
+{
+ struct nvme_uring_direct_pdu *pdu = nvme_uring_cmd_direct_pdu(ioucmd);
+ struct nvme_dev *dev = to_nvme_dev(pdu->ns->ctrl);
+ struct nvme_queue *nvmeq = &dev->queues[qid];
+ bool found;
+
+ if (!nvme_cqe_pending(nvmeq))
+ return 0;
+
+ spin_lock(&nvmeq->cq_poll_lock);
+ found = nvme_poll_cq(nvmeq, iob);
+ spin_unlock(&nvmeq->cq_poll_lock);
+
+ return found;
+}
+
static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
{
struct nvme_dev *dev = to_nvme_dev(ctrl);
@@ -1762,6 +1829,22 @@ static int nvme_pci_alloc_cmdid_bmp(struct nvme_queue *nvmeq)
return 0;
}
+static int nvme_pci_get_cmdid(struct nvme_queue *nvmeq)
+{
+ int id = 0, size = nvmeq->q_depth - 1;
+
+ spin_lock(&nvmeq->cmdid_lock);
+ id = find_first_zero_bit(nvmeq->cmdid_bmp, size);
+ if (id >= size) {
+ id = -EBUSY;
+ goto unlock;
+ }
+ set_bit(id, nvmeq->cmdid_bmp);
+unlock:
+ spin_unlock(&nvmeq->cmdid_lock);
+ return id;
+}
+
static int nvme_pci_alloc_iod_array(struct nvme_queue *nvmeq)
{
if (!test_bit(NVMEQ_RAW, &nvmeq->flags))
@@ -1793,13 +1876,17 @@ static int nvme_pci_register_queue(void *data)
struct nvme_ns *ns = (struct nvme_ns *) data;
struct nvme_dev *dev = to_nvme_dev(ns->ctrl);
int qid, ret;
+ struct nvme_queue *nvmeq;
qid = nvme_pci_get_rawq(dev);
if (qid > 0) {
/* setup command-id bitmap and iod array */
- ret = nvme_pci_setup_rawq(&dev->queues[qid]);
+ nvmeq = &dev->queues[qid];
+ ret = nvme_pci_setup_rawq(nvmeq);
if (ret < 0)
qid = ret;
+ else
+ nvmeq->reg_id++;
}
return qid;
}
@@ -1812,6 +1899,107 @@ static int nvme_pci_unregister_queue(void *data, int qid)
return nvme_pci_put_rawq(dev, qid);
}
+static int nvme_map_io_fb(struct request_queue *q, struct io_uring_cmd *ioucmd,
+ struct nvme_dev *dev, struct nvme_iod *iod,
+ unsigned long addr, unsigned int buf_len)
+{
+ struct nvme_command *cmnd = &iod->cmd;
+ int ret, rw = nvme_is_write(cmnd);
+ struct iov_iter iter;
+ size_t nr_iter, nr_segs;
+ struct bio_vec *bv;
+
+ if (!(ioucmd->flags & IORING_URING_CMD_FIXED))
+ return -EINVAL;
+
+ ret = io_uring_cmd_import_fixed(addr, buf_len, rw, &iter, ioucmd);
+ if (ret < 0)
+ return ret;
+ nr_iter = iov_iter_count(&iter);
+ nr_segs = iter.nr_segs;
+
+ /* device will do these checks anyway, so why do duplicate work? */
+ if (!nr_iter || (nr_iter >> SECTOR_SHIFT) > queue_max_hw_sectors(q))
+ goto err;
+ if (nr_segs > queue_max_segments(q))
+ goto err;
+ /* this will be attempted via the regular path instead */
+ if (nr_segs > 1)
+ goto err;
+
+ bv = (struct bio_vec *)&iter.bvec[0];
+ if (bv->bv_offset + bv->bv_len <= NVME_CTRL_PAGE_SIZE * 2)
+ return nvme_setup_prp_simple(dev, iod,
+ (struct nvme_rw_command *)cmnd,
+ bv, rw ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
+err:
+ return -EINVAL;
+}
+
+static int nvme_alloc_cmd_from_q(struct nvme_queue *nvmeq, int *cmdid,
+ struct nvme_iod **iod)
+{
+ int id;
+
+ id = nvme_pci_get_cmdid(nvmeq);
+ if (id < 0)
+ return id;
+ *cmdid = id | nvme_cid_install_genctr(nvmeq->reg_id);
+ *iod = &nvmeq->iod[id];
+ return 0;
+}
+
+static int nvme_pci_queue_ucmd(struct io_uring_cmd *ioucmd, int qid)
+{
+ struct nvme_uring_direct_pdu *pdu = nvme_uring_cmd_direct_pdu(ioucmd);
+ struct nvme_ns *ns = pdu->ns;
+ struct nvme_ctrl *ctrl = ns->ctrl;
+ struct nvme_dev *dev = to_nvme_dev(ctrl);
+ struct nvme_command *nvme_cmd;
+ struct nvme_uring_data d;
+ int ret, cmdid;
+ struct nvme_iod *iod;
+ struct nvme_queue *nvmeq = &dev->queues[qid];
+
+ ret = nvme_alloc_cmd_from_q(nvmeq, &cmdid, &iod);
+ if (ret)
+ return ret;
+
+ iod->ioucmd = ioucmd;
+ nvme_cmd = &iod->cmd;
+ ret = nvme_prep_cmd_from_ioucmd(ctrl, ns, ioucmd, nvme_cmd, &d);
+ if (ret)
+ goto out;
+
+ nvme_cmd->common.command_id = cmdid;
+ iod->aborted = false;
+ iod->nr_allocations = -1;
+ iod->sgt.nents = 0;
+ ret = nvme_map_io_fb(ns->queue, ioucmd, dev, iod, d.addr, d.data_len);
+ if (ret)
+ goto out;
+
+ /*
+ * Since this nvmeq is exclusive to a single submitter (io_uring
+ * follows a one-thread-per-ring model), we would ideally not need
+ * the lock. But io_uring and NVMe SQEs have different lifetimes:
+ * an io_uring SQE can be reused right after submission, while an
+ * NVMe SQE must stay live until completion.
+ * So if we run out of space, submission gets deferred to the
+ * io_uring worker, and we cannot skip the lock.
+ * But not all is lost! With one thread per ring and polled
+ * completions, contention is uncommon in most cases.
+ */
+ spin_lock(&nvmeq->sq_lock);
+ nvme_sq_copy_cmd(nvmeq, &iod->cmd);
+ nvme_write_sq_db(nvmeq, true);
+ spin_unlock(&nvmeq->sq_lock);
+ return -EIOCBQUEUED;
+out:
+ nvme_pci_put_cmdid(nvmeq, cmdid);
+ return ret;
+}
+
static const struct blk_mq_ops nvme_mq_admin_ops = {
.queue_rq = nvme_queue_rq,
.complete = nvme_pci_complete_rq,
@@ -1832,6 +2020,8 @@ static const struct blk_mq_ops nvme_mq_ops = {
.poll = nvme_poll,
.register_queue = nvme_pci_register_queue,
.unregister_queue = nvme_pci_unregister_queue,
+ .queue_uring_cmd = nvme_pci_queue_ucmd,
+ .poll_uring_cmd = nvme_poll_uring_cmd,
};
static void nvme_dev_remove_admin(struct nvme_dev *dev)
--
2.25.1