[PATCH 2/3] nvme-tcp: align I/O cpu with blk-mq mapping
Hannes Reinecke
hare at kernel.org
Mon Jul 8 00:10:12 PDT 2024
We should align the 'io_cpu' setting with the blk-mq cpu mapping
to ensure that I/O threads are not bounced between cpus.
To avoid cpu contention this patch also adds an atomic counter
of the number of queues per cpu and uses it to distribute queues
evenly across all CPUs in the blk-mq cpu set.
Additionally we should always set the 'io_cpu' value, as in the
WQ_UNBOUND case it will only be treated as a hint anyway.
Performance comparison:

                 baseline    rx/tx       blk-mq align
4k seq write:    449MiB/s    480MiB/s    524MiB/s
4k rand write:   410MiB/s    481MiB/s    524MiB/s
4k seq read:     478MiB/s    481MiB/s    566MiB/s
4k rand read:    547MiB/s    480MiB/s    511MiB/s
Signed-off-by: Hannes Reinecke <hare at kernel.org>
---
drivers/nvme/host/tcp.c | 65 +++++++++++++++++++++++++++++++----------
1 file changed, 49 insertions(+), 16 deletions(-)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index f621d3ba89b2..a5c42a7b4bee 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -26,6 +26,8 @@
struct nvme_tcp_queue;
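+/* number of nvme-tcp queues whose io_work is currently assigned to each cpu */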
+static atomic_t nvme_tcp_cpu_queues[NR_CPUS];
+
/* Define the socket priority to use for connections were it is desirable
* that the NIC consider performing optimized packet processing or filtering.
* A non-zero value being sufficient to indicate general consideration of any
@@ -1578,20 +1580,42 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
{
	struct nvme_tcp_ctrl *ctrl = queue->ctrl;
-	int qid = nvme_tcp_queue_id(queue);
-	int n = 0;
-
-	if (nvme_tcp_default_queue(queue))
-		n = qid - 1;
-	else if (nvme_tcp_read_queue(queue))
-		n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
-	else if (nvme_tcp_poll_queue(queue))
+	struct blk_mq_tag_set *set = &ctrl->tag_set;
+	int qid = nvme_tcp_queue_id(queue) - 1;
+	unsigned int *mq_map = NULL;
+	int n = 0, cpu, io_cpu = WORK_CPU_UNBOUND, min_queues = INT_MAX;
+
+	if (nvme_tcp_default_queue(queue)) {
+		mq_map = set->map[HCTX_TYPE_DEFAULT].mq_map;
+		n = qid;
+	} else if (nvme_tcp_read_queue(queue)) {
+		mq_map = set->map[HCTX_TYPE_READ].mq_map;
+		n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT];
+	} else if (nvme_tcp_poll_queue(queue)) {
+		mq_map = set->map[HCTX_TYPE_POLL].mq_map;
		n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
-			ctrl->io_queues[HCTX_TYPE_READ] - 1;
-	if (wq_unbound)
-		queue->io_cpu = WORK_CPU_UNBOUND;
-	else
-		queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
+			ctrl->io_queues[HCTX_TYPE_READ];
+	}
+
+	if (WARN_ON(!mq_map))
+		return;
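+
+	/* pick the least-used online cpu in this queue's blk-mq mapping */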
+	for_each_online_cpu(cpu) {
+		int num_queues;
+
+		if (mq_map[cpu] != qid)
+			continue;
+		num_queues = atomic_read(&nvme_tcp_cpu_queues[cpu]);
+		if (num_queues < min_queues) {
+			min_queues = num_queues;
+			io_cpu = cpu;
+		}
+	}
+	if (io_cpu != queue->io_cpu) {
+		queue->io_cpu = io_cpu;
+		atomic_inc(&nvme_tcp_cpu_queues[io_cpu]);
+	}
+	dev_dbg(ctrl->ctrl.device, "queue %d: using cpu %d\n",
+		qid, queue->io_cpu);
}
static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid)
@@ -1735,7 +1759,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
	queue->sock->sk->sk_allocation = GFP_ATOMIC;
	queue->sock->sk->sk_use_task_frag = false;
-	nvme_tcp_set_queue_io_cpu(queue);
+	queue->io_cpu = WORK_CPU_UNBOUND;
	queue->request = NULL;
	queue->data_remaining = 0;
	queue->ddgst_remaining = 0;
@@ -1847,6 +1871,10 @@ static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
	kernel_sock_shutdown(queue->sock, SHUT_RDWR);
	nvme_tcp_restore_sock_ops(queue);
	cancel_work_sync(&queue->io_work);
+	if (queue->io_cpu != WORK_CPU_UNBOUND) {
+		atomic_dec(&nvme_tcp_cpu_queues[queue->io_cpu]);
+		queue->io_cpu = WORK_CPU_UNBOUND;
+	}
}
static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
@@ -1891,9 +1919,10 @@ static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
	nvme_tcp_init_recv_ctx(queue);
	nvme_tcp_setup_sock_ops(queue);
-	if (idx)
+	if (idx) {
+		nvme_tcp_set_queue_io_cpu(queue);
		ret = nvmf_connect_io_queue(nctrl, idx);
-	else
+	} else
		ret = nvmf_connect_admin_queue(nctrl);

	if (!ret) {
@@ -2920,6 +2949,7 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
static int __init nvme_tcp_init_module(void)
{
	unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS;
+	int cpu;

	BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8);
	BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72);
@@ -2937,6 +2967,9 @@ static int __init nvme_tcp_init_module(void)
	if (!nvme_tcp_wq)
		return -ENOMEM;

+	for_each_possible_cpu(cpu)
+		atomic_set(&nvme_tcp_cpu_queues[cpu], 0);
+
	nvmf_register_transport(&nvme_tcp_transport);
	return 0;
}
--
2.35.3