[PATCH] nvme-tcp: align I/O cpu with blk-mq mapping

Hannes Reinecke <hare@kernel.org>
Tue Jun 18 05:03:45 PDT 2024


Add a new module parameter 'wq_affinity' to align the I/O cpu of
each queue with the blk-mq hctx cpu mapping, so that the queue's
io_work runs on one of the cpus blk-mq has mapped to it.  This
avoids bouncing I/O between cpus when there are fewer hardware
queues than cpus.

Signed-off-by: Hannes Reinecke <hare@kernel.org>
---
 drivers/nvme/host/tcp.c | 57 ++++++++++++++++++++++++++++++++---------
 1 file changed, 45 insertions(+), 12 deletions(-)
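
The core of the change is a lookup over blk-mq's per-cpu map:
mq_map[cpu] holds the hardware-context index that blk-mq assigned to
that cpu, and the queue's io_cpu becomes the first online cpu whose
entry matches the queue.  A minimal userspace sketch of that lookup
(NR_CPUS, the mq_map contents and the cpu_online() helper are invented
here for illustration; they are not the kernel data structures):

#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 8

/*
 * Toy stand-in for blk-mq's mq_map: the index is the cpu number, the
 * value is the hardware-context (queue) index that cpu is mapped to.
 * The contents are made up: 8 cpus spread over 4 hardware queues.
 */
static const unsigned int mq_map[NR_CPUS] = { 0, 0, 1, 1, 2, 2, 3, 3 };

/* Toy stand-in for cpu_online_mask: pretend every cpu is online. */
static bool cpu_online(unsigned int cpu)
{
	return cpu < NR_CPUS;
}

/*
 * Mirror the for_each_cpu() loop added by the patch: return the first
 * online cpu mapped to hardware queue 'qid', or -1 if none matches.
 */
static int pick_io_cpu(unsigned int qid)
{
	unsigned int cpu;

	for (cpu = 0; cpu < NR_CPUS; cpu++)
		if (cpu_online(cpu) && mq_map[cpu] == qid)
			return cpu;
	return -1;
}

int main(void)
{
	unsigned int qid;

	for (qid = 0; qid < 4; qid++)
		printf("hw queue %u -> io_cpu %d\n", qid, pick_io_cpu(qid));
	return 0;
}

With the example map above each hardware queue lands on a distinct cpu
(queue 0 on cpu 0, queue 1 on cpu 2, and so on), instead of the
round-robin pick derived from the qid alone.  The behaviour is opt-in:
load the module with wq_affinity=Y (it conflicts with wq_unbound, which
the module init check below rejects), and the new dev_dbg() reports the
cpu chosen for each queue.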

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 3be67c98c906..8c675ee50ccf 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -44,6 +44,13 @@ static bool wq_unbound;
 module_param(wq_unbound, bool, 0644);
 MODULE_PARM_DESC(wq_unbound, "Use unbound workqueue for nvme-tcp IO context (default false)");
 
+/*
+ * Select the I/O cpu for each queue from its blk-mq hctx cpumask
+ */
+static bool wq_affinity;
+module_param(wq_affinity, bool, 0644);
+MODULE_PARM_DESC(wq_affinity, "Match workqueues to blk-mq cpumask for nvme-tcp IO context (default false)");
+
 /*
  * TLS handshake timeout
  */
@@ -1550,20 +1557,40 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
 static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
 {
 	struct nvme_tcp_ctrl *ctrl = queue->ctrl;
-	int qid = nvme_tcp_queue_id(queue);
+	struct blk_mq_tag_set *set = &ctrl->tag_set;
+	int qid = nvme_tcp_queue_id(queue) - 1;
+	unsigned int *mq_map = NULL;
 	int n = 0;
 
-	if (nvme_tcp_default_queue(queue))
-		n = qid - 1;
-	else if (nvme_tcp_read_queue(queue))
-		n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
-	else if (nvme_tcp_poll_queue(queue))
+	if (nvme_tcp_default_queue(queue)) {
+		mq_map = set->map[HCTX_TYPE_DEFAULT].mq_map;
+		n = qid;
+	} else if (nvme_tcp_read_queue(queue)) {
+		mq_map = set->map[HCTX_TYPE_READ].mq_map;
+		n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT];
+	} else if (nvme_tcp_poll_queue(queue)) {
+		mq_map = set->map[HCTX_TYPE_POLL].mq_map;
 		n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
-				ctrl->io_queues[HCTX_TYPE_READ] - 1;
+				ctrl->io_queues[HCTX_TYPE_READ];
+	}
 	if (wq_unbound)
 		queue->io_cpu = WORK_CPU_UNBOUND;
-	else
+	else if (wq_affinity) {
+		int i;
+
+		if (WARN_ON(!mq_map))
+			return;
+		for_each_cpu(i, cpu_online_mask) {
+			if (mq_map[i] == qid) {
+				queue->io_cpu = i;
+				break;
+			}
+		}
+	} else
 		queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
+
+	dev_dbg(ctrl->ctrl.device, "queue %d: using cpu %d\n",
+		qid, queue->io_cpu);
 }
 
 static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid)
@@ -1704,7 +1731,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
 
 	queue->sock->sk->sk_allocation = GFP_ATOMIC;
 	queue->sock->sk->sk_use_task_frag = false;
-	nvme_tcp_set_queue_io_cpu(queue);
+	queue->io_cpu = WORK_CPU_UNBOUND;
 	queue->request = NULL;
 	queue->data_remaining = 0;
 	queue->ddgst_remaining = 0;
@@ -1858,9 +1885,10 @@ static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
 	nvme_tcp_init_recv_ctx(queue);
 	nvme_tcp_setup_sock_ops(queue);
 
-	if (idx)
+	if (idx) {
+		nvme_tcp_set_queue_io_cpu(queue);
 		ret = nvmf_connect_io_queue(nctrl, idx);
-	else
+	} else
 		ret = nvmf_connect_admin_queue(nctrl);
 
 	if (!ret) {
@@ -2837,8 +2865,13 @@ static int __init nvme_tcp_init_module(void)
 	BUILD_BUG_ON(sizeof(struct nvme_tcp_icresp_pdu) != 128);
 	BUILD_BUG_ON(sizeof(struct nvme_tcp_term_pdu) != 24);
 
-	if (wq_unbound)
+	if (wq_unbound) {
+		if (wq_affinity) {
+			pr_err("cannot specify both 'wq_unbound' and 'wq_affinity'\n");
+			return -EINVAL;
+		}
 		wq_flags |= WQ_UNBOUND;
+	}
 
 	nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", wq_flags, 0);
 	if (!nvme_tcp_wq)
-- 
2.35.3