[PATCH 2/3] nvme-tcp: align I/O cpu with blk-mq mapping
Sagi Grimberg
sagi at grimberg.me
Mon Jul 8 05:08:18 PDT 2024
On 08/07/2024 10:10, Hannes Reinecke wrote:
> We should align the 'io_cpu' setting with the blk-mq
> cpu mapping to ensure that we're not bouncing threads
> when doing I/O. To avoid cpu contention this patch also
> adds an atomic counter for the number of queues on each
> cpu to distribute the load across all CPUs in the blk-mq cpu set.
> Additionally we should always set the 'io_cpu' value, as
> in the WQ_UNBOUND case it'll be treated as a hint anyway.
>
> Performance comparison:
>                  baseline   rx/tx      blk-mq align
> 4k seq write:    449MiB/s   480MiB/s   524MiB/s
> 4k rand write:   410MiB/s   481MiB/s   524MiB/s
> 4k seq read:     478MiB/s   481MiB/s   566MiB/s
> 4k rand read:    547MiB/s   480MiB/s   511MiB/s
>
> Signed-off-by: Hannes Reinecke <hare at kernel.org>
> ---
> drivers/nvme/host/tcp.c | 65 +++++++++++++++++++++++++++++++----------
> 1 file changed, 49 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index f621d3ba89b2..a5c42a7b4bee 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -26,6 +26,8 @@
>
> struct nvme_tcp_queue;
>
> +static atomic_t nvme_tcp_cpu_queues[NR_CPUS];
> +
> /* Define the socket priority to use for connections were it is desirable
> * that the NIC consider performing optimized packet processing or filtering.
> * A non-zero value being sufficient to indicate general consideration of any
> @@ -1578,20 +1580,42 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
> static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
> {
> struct nvme_tcp_ctrl *ctrl = queue->ctrl;
> - int qid = nvme_tcp_queue_id(queue);
> - int n = 0;
> -
> - if (nvme_tcp_default_queue(queue))
> - n = qid - 1;
> - else if (nvme_tcp_read_queue(queue))
> - n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
> - else if (nvme_tcp_poll_queue(queue))
> + struct blk_mq_tag_set *set = &ctrl->tag_set;
> + int qid = nvme_tcp_queue_id(queue) - 1;
umm, it's not the qid anymore, so why change it? I mean it looks harmless, but
I don't see why.
> + unsigned int *mq_map = NULL;
> + int n = 0, cpu, io_cpu, min_queues = WORK_CPU_UNBOUND;
The min_queues initialization looks very weird - initializing what is supposed to
be a queue count to WORK_CPU_UNBOUND doesn't parse.
Besides, why is it declared in this scope?
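Something along these lines would be easier to read (untested sketch, using
INT_MAX instead of abusing WORK_CPU_UNBOUND as a count, and giving io_cpu a
sane default):

	int cpu, io_cpu = WORK_CPU_UNBOUND;
	int min_queues = INT_MAX;

	for_each_online_cpu(cpu) {
		int nr;

		if (mq_map[cpu] != qid)
			continue;
		/* pick the mapped cpu with the fewest queues on it */
		nr = atomic_read(&nvme_tcp_cpu_queues[cpu]);
		if (nr < min_queues) {
			min_queues = nr;
			io_cpu = cpu;
		}
	}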
> +
> + if (nvme_tcp_default_queue(queue)) {
> + mq_map = set->map[HCTX_TYPE_DEFAULT].mq_map;
> + n = qid;
> + } else if (nvme_tcp_read_queue(queue)) {
> + mq_map = set->map[HCTX_TYPE_READ].mq_map;
> + n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT];
> + } else if (nvme_tcp_poll_queue(queue)) {
> + mq_map = set->map[HCTX_TYPE_POLL].mq_map;
> n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
> - ctrl->io_queues[HCTX_TYPE_READ] - 1;
> - if (wq_unbound)
> - queue->io_cpu = WORK_CPU_UNBOUND;
> - else
> - queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
> + ctrl->io_queues[HCTX_TYPE_READ];
> + }
> +
> + if (WARN_ON(!mq_map))
> + return;
> + for_each_online_cpu(cpu) {
> + int num_queues;
> +
> + if (mq_map[cpu] != qid)
> + continue;
> + num_queues = atomic_read(&nvme_tcp_cpu_queues[cpu]);
> + if (num_queues < min_queues) {
> + min_queues = num_queues;
> + io_cpu = cpu;
> + }
> + }
> + if (io_cpu != queue->io_cpu) {
> + queue->io_cpu = io_cpu;
> + atomic_inc(&nvme_tcp_cpu_queues[io_cpu]);
> + }
Why is that conditional? Why not always assign and increment the counter?
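i.e. something like (untested; keeping a check only so we never index the
counter array with WORK_CPU_UNBOUND):

	queue->io_cpu = io_cpu;
	if (io_cpu != WORK_CPU_UNBOUND)
		atomic_inc(&nvme_tcp_cpu_queues[io_cpu]);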
> + dev_dbg(ctrl->ctrl.device, "queue %d: using cpu %d\n",
> + qid, queue->io_cpu);
> }
>
> static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid)
> @@ -1735,7 +1759,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
>
> queue->sock->sk->sk_allocation = GFP_ATOMIC;
> queue->sock->sk->sk_use_task_frag = false;
> - nvme_tcp_set_queue_io_cpu(queue);
> + queue->io_cpu = WORK_CPU_UNBOUND;
> queue->request = NULL;
> queue->data_remaining = 0;
> queue->ddgst_remaining = 0;
> @@ -1847,6 +1871,10 @@ static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
> kernel_sock_shutdown(queue->sock, SHUT_RDWR);
> nvme_tcp_restore_sock_ops(queue);
> cancel_work_sync(&queue->io_work);
> + if (queue->io_cpu != WORK_CPU_UNBOUND) {
> + atomic_dec(&nvme_tcp_cpu_queues[queue->io_cpu]);
> + queue->io_cpu = WORK_CPU_UNBOUND;
> + }
Does anything change if we still set io_cpu in the wq_unbound case?
If not, I think we should always set it and simplify the code.
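The only consumer is the io_work scheduling, i.e. (existing call site, from
memory):

	queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);

and your commit message already says that with WQ_UNBOUND the cpu is only
taken as a hint, so always setting io_cpu should be harmless.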
> }
>
> static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
> @@ -1891,9 +1919,10 @@ static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
> nvme_tcp_init_recv_ctx(queue);
> nvme_tcp_setup_sock_ops(queue);
>
> - if (idx)
> + if (idx) {
> + nvme_tcp_set_queue_io_cpu(queue);
> ret = nvmf_connect_io_queue(nctrl, idx);
> - else
> + } else
> ret = nvmf_connect_admin_queue(nctrl);
>
> if (!ret) {
> @@ -2920,6 +2949,7 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
> static int __init nvme_tcp_init_module(void)
> {
> unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS;
> + int cpu;
>
> BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8);
> BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72);
> @@ -2937,6 +2967,9 @@ static int __init nvme_tcp_init_module(void)
> if (!nvme_tcp_wq)
> return -ENOMEM;
>
> + for_each_possible_cpu(cpu)
> + atomic_set(&nvme_tcp_cpu_queues[cpu], 0);
> +
Why? The array is static, so it is already zero-initialized anyway.