[RFC PATCH 1/4] nvme-tcp: optionally limit I/O queue count based on NIC queues

Nilay Shroff nilay at linux.ibm.com
Mon Apr 20 04:49:33 PDT 2026


NVMe-TCP currently provisions I/O queues primarily based on CPU
availability. On systems where the number of CPUs significantly exceeds
the number of NIC hardware queues, this can lead to multiple I/O queues
sharing the same NIC TX/RX queues, resulting in increased lock
contention, cacheline bouncing, and inter-processor interrupts (IPIs).

In such configurations, limiting the number of NVMe-TCP I/O queues to
the number of NIC hardware queues can improve performance by reducing
contention and improving locality. Aligning NVMe-TCP worker threads with
NIC queue topology may also help reduce tail latency.

Add a new transport option "match_hw_queues" to allow users to
optionally limit the number of NVMe-TCP I/O queues to the number of NIC
TX/RX queues. When enabled, the number of I/O queues is set to:

    min(num_online_cpus, min(num_nic_tx_queues, num_nic_rx_queues))

This behavior is opt-in and does not change existing defaults.

Signed-off-by: Nilay Shroff <nilay at linux.ibm.com>
---
 drivers/nvme/host/fabrics.c |   4 ++
 drivers/nvme/host/fabrics.h |   3 +
 drivers/nvme/host/tcp.c     | 120 +++++++++++++++++++++++++++++++++++-
 3 files changed, 126 insertions(+), 1 deletion(-)

diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index ac3d4f400601..62ae998825e1 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -709,6 +709,7 @@ static const match_table_t opt_tokens = {
 	{ NVMF_OPT_TLS,			"tls"			},
 	{ NVMF_OPT_CONCAT,		"concat"		},
 #endif
+	{ NVMF_OPT_MATCH_HW_QUEUES,	"match_hw_queues"	},
 	{ NVMF_OPT_ERR,			NULL			}
 };
 
@@ -1064,6 +1065,9 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
 			}
 			opts->concat = true;
 			break;
+		case NVMF_OPT_MATCH_HW_QUEUES:
+			opts->match_hw_queues = true;
+			break;
 		default:
 			pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n",
 				p);
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index caf5503d0833..e8e3a2672832 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -67,6 +67,7 @@ enum {
 	NVMF_OPT_KEYRING	= 1 << 26,
 	NVMF_OPT_TLS_KEY	= 1 << 27,
 	NVMF_OPT_CONCAT		= 1 << 28,
+	NVMF_OPT_MATCH_HW_QUEUES = 1 << 29,
 };
 
 /**
@@ -106,6 +107,7 @@ enum {
  * @disable_sqflow: disable controller sq flow control
  * @hdr_digest: generate/verify header digest (TCP)
  * @data_digest: generate/verify data digest (TCP)
+ * @match_hw_queues: limit controller IO queue count based on NIC queues (TCP)
  * @nr_write_queues: number of queues for write I/O
  * @nr_poll_queues: number of queues for polling I/O
  * @tos: type of service
@@ -136,6 +138,7 @@ struct nvmf_ctrl_options {
 	bool			disable_sqflow;
 	bool			hdr_digest;
 	bool			data_digest;
+	bool			match_hw_queues;
 	unsigned int		nr_write_queues;
 	unsigned int		nr_poll_queues;
 	int			tos;
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 243dab830dc8..7102a7a54d78 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -16,6 +16,8 @@
 #include <net/tls.h>
 #include <net/tls_prot.h>
 #include <net/handshake.h>
+#include <net/ip6_route.h>
+#include <linux/in6.h>
 #include <linux/blk-mq.h>
 #include <net/busy_poll.h>
 #include <trace/events/sock.h>
@@ -1762,6 +1764,120 @@ static int nvme_tcp_start_tls(struct nvme_ctrl *nctrl,
 	return ret;
 }
 
+/*
+ * Resolve the net_device that carries traffic to the target.
+ *
+ * When the user pinned a host interface (host_iface), look it up by name.
+ * Otherwise perform a routing lookup (IPv4 or IPv6) on the target address,
+ * using an explicit host traddr as the source address when one was given.
+ *
+ * All lookups are done in init_net; NOTE(review): confirm this matches the
+ * network namespace the controller's sockets are created in.
+ *
+ * Returns a net_device with an elevated reference count (release it with
+ * nvme_tcp_put_netdev()), or NULL if no device could be determined.
+ */
+static struct net_device *nvme_tcp_get_netdev(struct nvme_ctrl *ctrl)
+{
+	struct net_device *dev = NULL;
+
+	if (ctrl->opts->mask & NVMF_OPT_HOST_IFACE) {
+		dev = dev_get_by_name(&init_net, ctrl->opts->host_iface);
+	} else {
+		struct nvme_tcp_ctrl *tctrl = to_tcp_ctrl(ctrl);
+
+		if (tctrl->addr.ss_family == AF_INET) {
+			struct rtable *rt;
+			struct flowi4 fl4 = {};
+			struct sockaddr_in *addr =
+					(struct sockaddr_in *)&tctrl->addr;
+
+			fl4.daddr = addr->sin_addr.s_addr;
+			if (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
+				addr = (struct sockaddr_in *)&tctrl->src_addr;
+				fl4.saddr = addr->sin_addr.s_addr;
+			}
+			fl4.flowi4_proto = IPPROTO_TCP;
+
+			rt = ip_route_output_key(&init_net, &fl4);
+			if (IS_ERR(rt))
+				return NULL;
+
+			dev = dst_dev(&rt->dst);
+			/*
+			 * The route holds the only reference we have to
+			 * the device; take our own before ip_rt_put()
+			 * drops it.
+			 */
+			if (dev)
+				dev_hold(dev);
+
+			ip_rt_put(rt);
+		} else if (tctrl->addr.ss_family == AF_INET6) {
+			struct dst_entry *dst;
+			struct flowi6 fl6 = {};
+			struct sockaddr_in6 *addr6 =
+					(struct sockaddr_in6 *)&tctrl->addr;
+
+			fl6.daddr = addr6->sin6_addr;
+			if (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
+				addr6 = (struct sockaddr_in6 *)&tctrl->src_addr;
+				fl6.saddr = addr6->sin6_addr;
+			}
+			fl6.flowi6_proto = IPPROTO_TCP;
+
+			dst = ip6_route_output(&init_net, NULL, &fl6);
+			if (dst->error) {
+				dst_release(dst);
+				return NULL;
+			}
+
+			dev = dst_dev(dst);
+			/*
+			 * The dst holds the only reference we have to
+			 * the device; take our own before dst_release()
+			 * drops it.
+			 */
+			if (dev)
+				dev_hold(dev);
+
+			dst_release(dst);
+		}
+	}
+
+	return dev;
+}
+
+/* Drop the reference taken by nvme_tcp_get_netdev(); dev_put(NULL) is a no-op. */
+static void nvme_tcp_put_netdev(struct net_device *dev)
+{
+	dev_put(dev);
+}
+
+/*
+ * Snapshot the number of usable hardware queues on the NIC that routes to
+ * the target: min(active TX queues, active RX queues). The counts are read
+ * without holding rtnl_lock, so this is a best-effort snapshot only.
+ *
+ * Returns the queue count, or 0 if the device cannot be determined.
+ */
+static int nvme_tcp_get_netdev_current_queue_count(struct nvme_ctrl *ctrl)
+{
+	struct net_device *dev;
+	int tx_queues, rx_queues;
+
+	dev = nvme_tcp_get_netdev(ctrl);
+	if (!dev)
+		return 0;
+
+	tx_queues = dev->real_num_tx_queues;
+	rx_queues = dev->real_num_rx_queues;
+
+	nvme_tcp_put_netdev(dev);
+
+	return min(tx_queues, rx_queues);
+}
+
 static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
 				key_serial_t pskid)
 {
@@ -2144,6 +2243,28 @@
 	unsigned int nr_io_queues;
 	int ret;
 
+	/*
+	 * When match_hw_queues is set and the user did not pin an explicit
+	 * queue count, cap the I/O queue count at the NIC's active hardware
+	 * queue count so NVMe queues do not contend for the same TX/RX rings.
+	 * If the NIC cannot be determined, fall back to the default sizing.
+	 */
+	if (!(ctrl->opts->mask & NVMF_OPT_NR_IO_QUEUES) &&
+	    (ctrl->opts->mask & NVMF_OPT_MATCH_HW_QUEUES)) {
+		int nr_hw_queues = nvme_tcp_get_netdev_current_queue_count(ctrl);
+
+		if (nr_hw_queues > 0) {
+			/* min_t: nr_hw_queues is int, num_online_cpus() is unsigned */
+			ctrl->opts->nr_io_queues = min_t(unsigned int,
+					nr_hw_queues, num_online_cpus());
+			if (ctrl->opts->nr_io_queues < num_online_cpus())
+				dev_info(ctrl->device,
+					"limiting I/O queues to %u (NIC queues %d, CPUs %u)\n",
+					ctrl->opts->nr_io_queues, nr_hw_queues,
+					num_online_cpus());
+		}
+	}
+
 	nr_io_queues = nvmf_nr_io_queues(ctrl->opts);
 	ret = nvme_set_queue_count(ctrl, &nr_io_queues);
 	if (ret)
@@ -3019,7 +3136,8 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
 			  NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
 			  NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
 			  NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE | NVMF_OPT_TLS |
-			  NVMF_OPT_KEYRING | NVMF_OPT_TLS_KEY | NVMF_OPT_CONCAT,
+			  NVMF_OPT_KEYRING | NVMF_OPT_TLS_KEY |
+			  NVMF_OPT_CONCAT | NVMF_OPT_MATCH_HW_QUEUES,
 	.create_ctrl	= nvme_tcp_create_ctrl,
 };
 
-- 
2.53.0




More information about the Linux-nvme mailing list