[PATCH] nvme: optimize passthrough IOPOLL completion for local ring context
Ming Lei
ming.lei at redhat.com
Thu Jan 15 00:59:52 PST 2026
When multiple io_uring rings poll on the same NVMe queue, one ring can
find completions that belong to another ring. The current code always
punts the completion to task_work to handle that case, which adds
overhead for the common single-ring setup.

Pass the polling io_ring_ctx through the iopoll callback chain via
io_comp_batch and store it in the request. In the NVMe end_io handler,
compare the polling context with the request's owning context: if they
match (local), complete inline; if they differ (remote), or this is a
non-IOPOLL completion, punt to task_work as before.
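
Condensed, the new completion-side decision in nvme_uring_cmd_end_io()
looks like this (a sketch mirroring the drivers/nvme/host/ioctl.c hunk
below, with the status/result bookkeeping elided):

  if (blk_rq_is_poll(req) &&
      req->poll_ctx == io_uring_cmd_ctx_handle(ioucmd)) {
          /* local: the polling ring owns this request, complete inline */
          if (pdu->bio)
                  blk_rq_unmap_user(pdu->bio);
          io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, 0);
  } else {
          /* remote ring or IRQ/softirq completion: punt via task_work */
          io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
  }
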
Changes (the poll_ctx hand-off is sketched after this list):
- Add poll_ctx field to struct io_comp_batch
- Add poll_ctx to struct request's hash/ipi_list union
- Set iob.poll_ctx in io_do_iopoll() before calling iopoll callbacks
- Store poll_ctx in request in nvme_ns_chr_uring_cmd_iopoll()
- Check local vs remote context in nvme_uring_cmd_end_io()
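
The hand-off of the polling context happens in two steps, roughly
(condensed from the io_uring/rw.c and ioctl.c hunks below):

  /* io_do_iopoll(): record which ring is doing the polling */
  iob.poll_ctx = ctx;

  /* nvme_ns_chr_uring_cmd_iopoll(): tag the request before reaping it */
  if (req && blk_rq_is_poll(req)) {
          req->poll_ctx = iob ? iob->poll_ctx : NULL;
          return blk_rq_poll(req, iob, poll_flags);
  }
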
~10% IOPS improvement is observed in the following benchmark:

fio/t/io_uring -b512 -d128 -c32 -s32 -p1 -F1 -B[0|1] -O0 -P1 -u1 -n1 /dev/ng0n1

Signed-off-by: Ming Lei <ming.lei at redhat.com>
---
drivers/nvme/host/ioctl.c | 36 ++++++++++++++++++++++++++++--------
include/linux/blk-mq.h | 4 +++-
include/linux/blkdev.h | 1 +
io_uring/rw.c | 7 +++++++
4 files changed, 39 insertions(+), 9 deletions(-)
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index a9c097dacad6..0b85378f7fbb 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -425,14 +425,28 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
pdu->result = le64_to_cpu(nvme_req(req)->result.u64);
/*
- * IOPOLL could potentially complete this request directly, but
- * if multiple rings are polling on the same queue, then it's possible
- * for one ring to find completions for another ring. Punting the
- * completion via task_work will always direct it to the right
- * location, rather than potentially complete requests for ringA
- * under iopoll invocations from ringB.
+ * For IOPOLL, check if this completion is happening in the context
+ * of the same io_ring that owns the request (local context). If so,
+ * we can complete inline without task_work overhead. Otherwise, we
+ * must punt to task_work to ensure completion happens in the correct
+ * ring's context.
*/
- io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
+ if (blk_rq_is_poll(req) && req->poll_ctx == io_uring_cmd_ctx_handle(ioucmd)) {
+ /*
+ * Local context: the polling ring owns this request.
+ * Complete inline for optimal performance.
+ */
+ if (pdu->bio)
+ blk_rq_unmap_user(pdu->bio);
+ io_uring_cmd_done32(ioucmd, pdu->status, pdu->result, 0);
+ } else {
+ /*
+ * Remote or non-IOPOLL context: either a different ring found
+ * this completion, or this is IRQ/softirq completion. Use
+ * task_work to direct completion to the correct location.
+ */
+ io_uring_cmd_do_in_task_lazy(ioucmd, nvme_uring_task_cb);
+ }
return RQ_END_IO_FREE;
}
@@ -677,8 +691,14 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
struct nvme_uring_cmd_pdu *pdu = nvme_uring_cmd_pdu(ioucmd);
struct request *req = pdu->req;
- if (req && blk_rq_is_poll(req))
+ if (req && blk_rq_is_poll(req)) {
+ /*
+ * Store the polling context in the request so end_io can
+ * detect if it's completing in the local ring's context.
+ */
+ req->poll_ctx = iob ? iob->poll_ctx : NULL;
return blk_rq_poll(req, iob, poll_flags);
+ }
return 0;
}
#ifdef CONFIG_NVME_MULTIPATH
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index cae9e857aea4..1975f5dd29f8 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -175,11 +175,13 @@ struct request {
* request reaches the dispatch list. The ipi_list is only used
* to queue the request for softirq completion, which is long
* after the request has been unhashed (and even removed from
- * the dispatch list).
+ * the dispatch list). poll_ctx is used during iopoll to track
+ * the io_ring_ctx that initiated the poll operation.
*/
union {
struct hlist_node hash; /* merge hash */
struct llist_node ipi_list;
+ void *poll_ctx; /* iopoll context */
};
/*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 72e34acd439c..4ed708912127 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1820,6 +1820,7 @@ void bdev_fput(struct file *bdev_file);
struct io_comp_batch {
struct rq_list req_list;
+ void *poll_ctx;
bool need_ts;
void (*complete)(struct io_comp_batch *);
};
diff --git a/io_uring/rw.c b/io_uring/rw.c
index c33c533a267e..27a49ce3de46 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -1321,6 +1321,13 @@ int io_do_iopoll(struct io_ring_ctx *ctx, bool force_nonspin)
struct io_kiocb *req, *tmp;
int nr_events = 0;
+ /*
+ * Store the polling ctx so drivers can detect if they're completing
+ * a request from the same ring that's polling (local) vs a different
+ * ring (remote). This enables optimizations for local completions.
+ */
+ iob.poll_ctx = ctx;
+
/*
* Only spin for completions if we don't have multiple devices hanging
* off our complete list.
--
2.47.1