[RFC PATCH 3/4] nvme: add async ioctl support
Kanchan Joshi
joshi.k at samsung.com
Wed Jan 27 10:00:28 EST 2021
Add an async_ioctl handler that implements asynchronous handling of
ioctl operations. If the requested ioctl opcode does not involve
submitting a command to the device (e.g. NVME_IOCTL_ID), it returns
instantly. Otherwise, ioctl completion is decoupled from submission,
and -EIOCBQUEUED is returned after submission. When the completion
arrives from the device, nvme calls the ioctl-completion handler
supplied by the upper layer.
There is one exception to this: an ioctl completion may also require
updating certain ioctl-specific user buffers/fields, which can be
accessed only in the context of the original submitter task. For such
an ioctl, nvme completion schedules a task-work that first updates the
ioctl-specific buffers/fields and then invokes the ioctl-completion
handler.
Signed-off-by: Kanchan Joshi <joshi.k at samsung.com>
Signed-off-by: Anuj Gupta <anuj20.g at samsung.com>
---
drivers/nvme/host/core.c | 347 +++++++++++++++++++++++++++++++--------
1 file changed, 280 insertions(+), 67 deletions(-)
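Note: the code below relies on struct pt_ioctl_ctx, which is
introduced earlier in this series and is not defined in this patch. A
minimal sketch of its presumed layout, inferred purely from how the
handlers below use it (treat the exact types and field ordering as an
assumption, not the authoritative definition):

struct pt_ioctl_ctx {
	struct task_struct *task;	/* submitter task, target of task-work */
	struct callback_head pt_work;	/* task-work item for user-buffer updates */
	void *ioc_data;			/* driver-private data; here a struct async_pt_desc */
	/* upper-layer completion handler, invoked with the final status */
	void (*pt_complete)(struct pt_ioctl_ctx *ptioc, int status);
};

The field names above match the accesses in this patch (ptioc->task,
ptioc->pt_work, ptioc->ioc_data, ptioc->pt_complete).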
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 200bdd672c28..57f3040bae34 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -21,6 +21,7 @@
#include <linux/nvme_ioctl.h>
#include <linux/pm_qos.h>
#include <asm/unaligned.h>
+#include <linux/task_work.h>
#include "nvme.h"
#include "fabrics.h"
@@ -1092,7 +1093,107 @@ static void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects)
}
}
-void nvme_execute_passthru_rq(struct request *rq)
+struct async_pt_desc {
+ struct bio *bio;
+ int status; /* command status */
+ u64 result; /* nvme cmd result */
+ void __user *res_ptr; /* may be NULL; points to a 32-bit or 64-bit user field */
+ void __user *meta_ptr;
+ void *meta; /* kernel-space resident buffer */
+ unsigned metalen; /* length of meta */
+ bool is_res64 : 1; /* res_ptr points to a 64-bit result field */
+ bool is_write : 1;
+ bool is_taskwork : 1;
+};
+
+static int nvme_add_task_work(struct task_struct *tsk,
+ struct callback_head *twork,
+ task_work_func_t work_func)
+{
+ int ret;
+
+ get_task_struct(tsk);
+ init_task_work(twork, work_func);
+ ret = task_work_add(tsk, twork, TWA_SIGNAL);
+ if (!ret)
+ wake_up_process(tsk);
+ return ret;
+}
+
+static void async_pt_update_work(struct callback_head *cbh)
+{
+ struct pt_ioctl_ctx *ptioc;
+ struct async_pt_desc *ptd;
+ struct task_struct *tsk;
+ int ret;
+
+ ptioc = container_of(cbh, struct pt_ioctl_ctx, pt_work);
+ ptd = ptioc->ioc_data;
+ tsk = ptioc->task;
+
+ /* handle meta update */
+ if (ptd->meta) {
+ if (!ptd->status && !ptd->is_write)
+ if (copy_to_user(ptd->meta_ptr, ptd->meta, ptd->metalen))
+ ptd->status = -EFAULT;
+ kfree(ptd->meta);
+ }
+ /* handle result update */
+ if (ptd->res_ptr) {
+ if (!ptd->is_res64)
+ ret = put_user(ptd->result, (u32 __user *)ptd->res_ptr);
+ else
+ ret = put_user(ptd->result, (u64 __user *)ptd->res_ptr);
+ if (ret)
+ ptd->status = -EFAULT;
+ }
+
+ ptioc->pt_complete(ptioc, ptd->status);
+ put_task_struct(tsk);
+ kfree(ptd);
+}
+
+static void nvme_end_async_pt(struct request *req, blk_status_t err)
+{
+ struct pt_ioctl_ctx *ptioc;
+ struct async_pt_desc *ptd;
+ struct bio *bio;
+
+ ptioc = req->end_io_data;
+ ptd = ptioc->ioc_data;
+
+ if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
+ ptd->status = -EINTR;
+ else
+ ptd->status = nvme_req(req)->status;
+
+ ptd->result = le64_to_cpu(nvme_req(req)->result.u64);
+ bio = ptd->bio;
+ /* setup task work if needed */
+ if (ptd->is_taskwork) {
+ int ret = nvme_add_task_work(ptioc->task, &ptioc->pt_work,
+ async_pt_update_work);
+ /* report failure if task-work could not be set up */
+ if (ret < 0) {
+ put_task_struct(ptioc->task);
+ ptioc->pt_complete(ptioc, ret);
+ kfree(ptd->meta);
+ kfree(ptd);
+ }
+ } else {
+ /* return status via callback, nothing else to update */
+ ptioc->pt_complete(ptioc, ptd->status);
+ kfree(ptd);
+ }
+
+ /* unmap pages, free bio, nvme command and request */
+ blk_rq_unmap_user(bio);
+ kfree(nvme_req(req)->cmd);
+ blk_mq_free_request(req);
+}
+
+
+void nvme_execute_passthru_rq_common(struct request *rq, int async)
{
struct nvme_command *cmd = nvme_req(rq)->cmd;
struct nvme_ctrl *ctrl = nvme_req(rq)->ctrl;
@@ -1101,15 +1202,52 @@ void nvme_execute_passthru_rq(struct request *rq)
u32 effects;
effects = nvme_passthru_start(ctrl, ns, cmd->common.opcode);
- blk_execute_rq(rq->q, disk, rq, 0);
+ if (!async)
+ blk_execute_rq(rq->q, disk, rq, 0);
+ else
+ blk_execute_rq_nowait(rq->q, disk, rq, 0, nvme_end_async_pt);
nvme_passthru_end(ctrl, effects);
}
+
+void nvme_execute_passthru_rq(struct request *rq)
+{
+ return nvme_execute_passthru_rq_common(rq, 0);
+}
EXPORT_SYMBOL_NS_GPL(nvme_execute_passthru_rq, NVME_TARGET_PASSTHRU);
+static int setup_async_pt_desc(struct request *rq, struct pt_ioctl_ctx *ptioc,
+ void __user *resptr, void __user *meta_buffer, void *meta,
+ unsigned meta_len, bool write, bool is_res64)
+{
+ struct async_pt_desc *ptd;
+
+ ptd = kzalloc(sizeof(struct async_pt_desc), GFP_KERNEL);
+ if (!ptd)
+ return -ENOMEM;
+
+ /* to free bio on completion, as req->bio will be null at that time */
+ ptd->bio = rq->bio;
+ ptd->res_ptr = resptr;
+ ptd->is_write = write;
+ ptd->is_res64 = is_res64;
+ if (meta) {
+ ptd->meta_ptr = meta_buffer;
+ ptd->meta = meta;
+ ptd->metalen = meta_len;
+ }
+ if (resptr)
+ ptd->is_taskwork = 1;
+
+ ptioc->ioc_data = ptd;
+ rq->end_io_data = ptioc;
+ return 0;
+}
+
static int nvme_submit_user_cmd(struct request_queue *q,
struct nvme_command *cmd, void __user *ubuffer,
unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
- u32 meta_seed, u64 *result, unsigned timeout)
+ u32 meta_seed, u64 *result, unsigned timeout,
+ struct pt_ioctl_ctx *ptioc, bool is_res64)
{
bool write = nvme_is_write(cmd);
struct nvme_ns *ns = q->queuedata;
@@ -1145,6 +1283,18 @@ static int nvme_submit_user_cmd(struct request_queue *q,
}
}
+ if (ptioc) { /* async handling */
+ ret = setup_async_pt_desc(req, ptioc, result, meta_buffer,
+ meta, meta_len, write, is_res64);
+ if (ret) {
+ kfree(meta);
+ goto out_unmap;
+ }
+ /* send request for async processing */
+ nvme_execute_passthru_rq_common(req, 1);
+ return ret;
+ }
+ /* sync handling */
nvme_execute_passthru_rq(req);
if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
ret = -EINTR;
@@ -1521,10 +1671,11 @@ static void __user *nvme_to_user_ptr(uintptr_t ptrval)
return (void __user *)ptrval;
}
-static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
+static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio,
+ struct pt_ioctl_ctx *ptioc)
{
struct nvme_user_io io;
- struct nvme_command c;
+ struct nvme_command c, *cptr;
unsigned length, meta_len;
void __user *metadata;
@@ -1554,31 +1705,42 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
return -EINVAL;
}
- memset(&c, 0, sizeof(c));
- c.rw.opcode = io.opcode;
- c.rw.flags = io.flags;
- c.rw.nsid = cpu_to_le32(ns->head->ns_id);
- c.rw.slba = cpu_to_le64(io.slba);
- c.rw.length = cpu_to_le16(io.nblocks);
- c.rw.control = cpu_to_le16(io.control);
- c.rw.dsmgmt = cpu_to_le32(io.dsmgmt);
- c.rw.reftag = cpu_to_le32(io.reftag);
- c.rw.apptag = cpu_to_le16(io.apptag);
- c.rw.appmask = cpu_to_le16(io.appmask);
-
- return nvme_submit_user_cmd(ns->queue, &c,
+ if (!ptioc)
+ cptr = &c;
+ else { /* for async - allocate cmd dynamically */
+ cptr = kmalloc(sizeof(struct nvme_command), GFP_KERNEL);
+ if (!cptr)
+ return -ENOMEM;
+ }
+
+ memset(cptr, 0, sizeof(c));
+ cptr->rw.opcode = io.opcode;
+ cptr->rw.flags = io.flags;
+ cptr->rw.nsid = cpu_to_le32(ns->head->ns_id);
+ cptr->rw.slba = cpu_to_le64(io.slba);
+ cptr->rw.length = cpu_to_le16(io.nblocks);
+ cptr->rw.control = cpu_to_le16(io.control);
+ cptr->rw.dsmgmt = cpu_to_le32(io.dsmgmt);
+ cptr->rw.reftag = cpu_to_le32(io.reftag);
+ cptr->rw.apptag = cpu_to_le16(io.apptag);
+ cptr->rw.appmask = cpu_to_le16(io.appmask);
+
+ return nvme_submit_user_cmd(ns->queue, cptr,
nvme_to_user_ptr(io.addr), length,
- metadata, meta_len, lower_32_bits(io.slba), NULL, 0);
+ metadata, meta_len, lower_32_bits(io.slba), NULL, 0,
+ ptioc, 0);
}
static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
- struct nvme_passthru_cmd __user *ucmd)
+ struct nvme_passthru_cmd __user *ucmd,
+ struct pt_ioctl_ctx *ptioc)
{
struct nvme_passthru_cmd cmd;
- struct nvme_command c;
+ struct nvme_command c, *cptr;
unsigned timeout = 0;
u64 result;
int status;
+ void *resptr;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
@@ -1586,43 +1748,61 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
return -EFAULT;
if (cmd.flags)
return -EINVAL;
+ if (!ptioc) {
+ cptr = &c;
+ resptr = &result;
+ } else {
+ /*
+ * for async - (a) allocate cmd dynamically
+ * (b) use user-space result addr
+ */
+ cptr = kmalloc(sizeof(struct nvme_command), GFP_KERNEL);
+ if (!cptr)
+ return -ENOMEM;
+ resptr = &ucmd->result;
+ }
- memset(&c, 0, sizeof(c));
- c.common.opcode = cmd.opcode;
- c.common.flags = cmd.flags;
- c.common.nsid = cpu_to_le32(cmd.nsid);
- c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
- c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
- c.common.cdw10 = cpu_to_le32(cmd.cdw10);
- c.common.cdw11 = cpu_to_le32(cmd.cdw11);
- c.common.cdw12 = cpu_to_le32(cmd.cdw12);
- c.common.cdw13 = cpu_to_le32(cmd.cdw13);
- c.common.cdw14 = cpu_to_le32(cmd.cdw14);
- c.common.cdw15 = cpu_to_le32(cmd.cdw15);
+ memset(cptr, 0, sizeof(c));
+ cptr->common.opcode = cmd.opcode;
+ cptr->common.flags = cmd.flags;
+ cptr->common.nsid = cpu_to_le32(cmd.nsid);
+ cptr->common.cdw2[0] = cpu_to_le32(cmd.cdw2);
+ cptr->common.cdw2[1] = cpu_to_le32(cmd.cdw3);
+ cptr->common.cdw10 = cpu_to_le32(cmd.cdw10);
+ cptr->common.cdw11 = cpu_to_le32(cmd.cdw11);
+ cptr->common.cdw12 = cpu_to_le32(cmd.cdw12);
+ cptr->common.cdw13 = cpu_to_le32(cmd.cdw13);
+ cptr->common.cdw14 = cpu_to_le32(cmd.cdw14);
+ cptr->common.cdw15 = cpu_to_le32(cmd.cdw15);
if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);
- status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
+ status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, cptr,
nvme_to_user_ptr(cmd.addr), cmd.data_len,
nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
- 0, &result, timeout);
+ 0, resptr, timeout, ptioc, 0);
- if (status >= 0) {
+ if (!ptioc && status >= 0) {
if (put_user(result, &ucmd->result))
return -EFAULT;
}
+ /* async case: free cmd on error */
+ if (ptioc && status < 0)
+ kfree(cptr);
return status;
}
static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
- struct nvme_passthru_cmd64 __user *ucmd)
+ struct nvme_passthru_cmd64 __user *ucmd,
+ struct pt_ioctl_ctx *ptioc)
{
struct nvme_passthru_cmd64 cmd;
- struct nvme_command c;
+ struct nvme_command c, *cptr;
unsigned timeout = 0;
int status;
+ void *resptr;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
@@ -1631,31 +1811,43 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
if (cmd.flags)
return -EINVAL;
- memset(&c, 0, sizeof(c));
- c.common.opcode = cmd.opcode;
- c.common.flags = cmd.flags;
- c.common.nsid = cpu_to_le32(cmd.nsid);
- c.common.cdw2[0] = cpu_to_le32(cmd.cdw2);
- c.common.cdw2[1] = cpu_to_le32(cmd.cdw3);
- c.common.cdw10 = cpu_to_le32(cmd.cdw10);
- c.common.cdw11 = cpu_to_le32(cmd.cdw11);
- c.common.cdw12 = cpu_to_le32(cmd.cdw12);
- c.common.cdw13 = cpu_to_le32(cmd.cdw13);
- c.common.cdw14 = cpu_to_le32(cmd.cdw14);
- c.common.cdw15 = cpu_to_le32(cmd.cdw15);
+ if (!ptioc) {
+ cptr = &c;
+ resptr = &cmd.result;
+ } else {
+ cptr = kmalloc(sizeof(struct nvme_command), GFP_KERNEL);
+ if (!cptr)
+ return -ENOMEM;
+ resptr = &ucmd->result;
+ }
+
+ memset(cptr, 0, sizeof(struct nvme_command));
+ cptr->common.opcode = cmd.opcode;
+ cptr->common.flags = cmd.flags;
+ cptr->common.nsid = cpu_to_le32(cmd.nsid);
+ cptr->common.cdw2[0] = cpu_to_le32(cmd.cdw2);
+ cptr->common.cdw2[1] = cpu_to_le32(cmd.cdw3);
+ cptr->common.cdw10 = cpu_to_le32(cmd.cdw10);
+ cptr->common.cdw11 = cpu_to_le32(cmd.cdw11);
+ cptr->common.cdw12 = cpu_to_le32(cmd.cdw12);
+ cptr->common.cdw13 = cpu_to_le32(cmd.cdw13);
+ cptr->common.cdw14 = cpu_to_le32(cmd.cdw14);
+ cptr->common.cdw15 = cpu_to_le32(cmd.cdw15);
if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);
- status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
+ status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, cptr,
nvme_to_user_ptr(cmd.addr), cmd.data_len,
nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
- 0, &cmd.result, timeout);
+ 0, resptr, timeout, ptioc, 1);
- if (status >= 0) {
+ if (!ptioc && status >= 0) {
if (put_user(cmd.result, &ucmd->result))
return -EFAULT;
}
+ if (ptioc && status < 0)
+ kfree(cptr);
return status;
}
@@ -1702,7 +1894,8 @@ static bool is_ctrl_ioctl(unsigned int cmd)
static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
void __user *argp,
struct nvme_ns_head *head,
- int srcu_idx)
+ int srcu_idx,
+ struct pt_ioctl_ctx *ptioc)
{
struct nvme_ctrl *ctrl = ns->ctrl;
int ret;
@@ -1712,21 +1905,24 @@ static int nvme_handle_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
switch (cmd) {
case NVME_IOCTL_ADMIN_CMD:
- ret = nvme_user_cmd(ctrl, NULL, argp);
+ ret = nvme_user_cmd(ctrl, NULL, argp, ptioc);
break;
case NVME_IOCTL_ADMIN64_CMD:
- ret = nvme_user_cmd64(ctrl, NULL, argp);
+ ret = nvme_user_cmd64(ctrl, NULL, argp, ptioc);
break;
default:
- ret = sed_ioctl(ctrl->opal_dev, cmd, argp);
+ if (!ptioc)
+ ret = sed_ioctl(ctrl->opal_dev, cmd, argp);
+ else
+ ret = -EOPNOTSUPP; /* RFC: no support for now */
break;
}
nvme_put_ctrl(ctrl);
return ret;
}
-static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned int cmd, unsigned long arg)
+static int nvme_async_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg, struct pt_ioctl_ctx *ptioc)
{
struct nvme_ns_head *head = NULL;
void __user *argp = (void __user *)arg;
@@ -1743,33 +1939,49 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
* deadlock when deleting namespaces using the passthrough interface.
*/
if (is_ctrl_ioctl(cmd))
- return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx);
+ return nvme_handle_ctrl_ioctl(ns, cmd, argp, head, srcu_idx, ptioc);
switch (cmd) {
case NVME_IOCTL_ID:
force_successful_syscall_return();
ret = ns->head->ns_id;
+ if (ptioc)
+ goto put_ns; /* always return synchronously */
break;
case NVME_IOCTL_IO_CMD:
- ret = nvme_user_cmd(ns->ctrl, ns, argp);
+ ret = nvme_user_cmd(ns->ctrl, ns, argp, ptioc);
break;
case NVME_IOCTL_SUBMIT_IO:
- ret = nvme_submit_io(ns, argp);
+ ret = nvme_submit_io(ns, argp, ptioc);
break;
case NVME_IOCTL_IO64_CMD:
- ret = nvme_user_cmd64(ns->ctrl, ns, argp);
+ ret = nvme_user_cmd64(ns->ctrl, ns, argp, ptioc);
break;
default:
+ if (ptioc) {
+ /* RFC: don't support this for now */
+ ret = -EOPNOTSUPP;
+ break;
+ }
if (ns->ndev)
ret = nvme_nvm_ioctl(ns, cmd, arg);
else
ret = -ENOTTY;
}
-
+ /* for async ioctl, return -EIOCBQUEUED if there was no error */
+ if (ptioc && ret >= 0)
+ ret = -EIOCBQUEUED;
+ put_ns:
nvme_put_ns_from_disk(head, srcu_idx);
return ret;
}
+static int nvme_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ return nvme_async_ioctl(bdev, mode, cmd, arg, NULL);
+}
+
#ifdef CONFIG_COMPAT
struct nvme_user_io32 {
__u8 opcode;
@@ -2324,6 +2536,7 @@ EXPORT_SYMBOL_GPL(nvme_sec_submit);
static const struct block_device_operations nvme_bdev_ops = {
.owner = THIS_MODULE,
.ioctl = nvme_ioctl,
+ .async_ioctl = nvme_async_ioctl,
.compat_ioctl = nvme_compat_ioctl,
.open = nvme_open,
.release = nvme_release,
@@ -3261,7 +3474,7 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
kref_get(&ns->kref);
up_read(&ctrl->namespaces_rwsem);
- ret = nvme_user_cmd(ctrl, ns, argp);
+ ret = nvme_user_cmd(ctrl, ns, argp, NULL);
nvme_put_ns(ns);
return ret;
@@ -3278,9 +3491,9 @@ static long nvme_dev_ioctl(struct file *file, unsigned int cmd,
switch (cmd) {
case NVME_IOCTL_ADMIN_CMD:
- return nvme_user_cmd(ctrl, NULL, argp);
+ return nvme_user_cmd(ctrl, NULL, argp, NULL);
case NVME_IOCTL_ADMIN64_CMD:
- return nvme_user_cmd64(ctrl, NULL, argp);
+ return nvme_user_cmd64(ctrl, NULL, argp, NULL);
case NVME_IOCTL_IO_CMD:
return nvme_dev_user_cmd(ctrl, argp);
case NVME_IOCTL_RESET:
--
2.25.1
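
To illustrate the calling contract established above, here is a
minimal sketch of how an upper layer (e.g. the io_uring plumbing later
in this series) might drive ->async_ioctl. my_ioc_done(), my_submit()
and finish_ioctl_request() are hypothetical names for illustration;
only the ->async_ioctl signature and the -EIOCBQUEUED convention come
from this patch:

static void my_ioc_done(struct pt_ioctl_ctx *ptioc, int status)
{
	/* called by nvme once the command, and any task-work driven
	 * user-buffer updates, have completed; status < 0 means error */
	finish_ioctl_request(ptioc, status);	/* hypothetical helper */
}

static int my_submit(struct block_device *bdev, fmode_t mode,
		unsigned int cmd, unsigned long arg,
		struct pt_ioctl_ctx *ptioc)
{
	int ret;

	ptioc->task = current;	/* task-work must run in submitter context */
	ptioc->pt_complete = my_ioc_done;

	ret = bdev->bd_disk->fops->async_ioctl(bdev, mode, cmd, arg, ptioc);
	if (ret != -EIOCBQUEUED)
		return ret;	/* completed synchronously (or failed early) */
	return 0;		/* queued; my_ioc_done() fires on completion */
}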