[PATCH 2/3] nvme-tcp: align I/O cpu with blk-mq mapping

Sagi Grimberg sagi at grimberg.me
Mon Jul 8 05:08:18 PDT 2024



On 08/07/2024 10:10, Hannes Reinecke wrote:
> We should align the 'io_cpu' setting with the blk-mq
> cpu mapping to ensure that we're not bouncing threads
> when doing I/O. To avoid cpu contention this patch also
> adds an atomic counter for the number of queues on each
> cpu to distribute the load across all CPUs in the blk-mq cpu set.
> Additionally we should always set the 'io_cpu' value, as
> in the WQ_UNBOUND case it'll be treated as a hint anyway.
>
> Performance comparison:
>                 baseline rx/tx    blk-mq align
> 4k seq write:  449MiB/s 480MiB/s 524MiB/s
> 4k rand write: 410MiB/s 481MiB/s 524MiB/s
> 4k seq read:   478MiB/s 481MiB/s 566MiB/s
> 4k rand read:  547MiB/s 480MiB/s 511MiB/s
>
> Signed-off-by: Hannes Reinecke <hare at kernel.org>
> ---
>   drivers/nvme/host/tcp.c | 65 +++++++++++++++++++++++++++++++----------
>   1 file changed, 49 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index f621d3ba89b2..a5c42a7b4bee 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -26,6 +26,8 @@
>   
>   struct nvme_tcp_queue;
>   
> +static atomic_t nvme_tcp_cpu_queues[NR_CPUS];
> +
>   /* Define the socket priority to use for connections were it is desirable
>    * that the NIC consider performing optimized packet processing or filtering.
>    * A non-zero value being sufficient to indicate general consideration of any
> @@ -1578,20 +1580,42 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
>   static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
>   {
>   	struct nvme_tcp_ctrl *ctrl = queue->ctrl;
> -	int qid = nvme_tcp_queue_id(queue);
> -	int n = 0;
> -
> -	if (nvme_tcp_default_queue(queue))
> -		n = qid - 1;
> -	else if (nvme_tcp_read_queue(queue))
> -		n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
> -	else if (nvme_tcp_poll_queue(queue))
> +	struct blk_mq_tag_set *set = &ctrl->tag_set;
> +	int qid = nvme_tcp_queue_id(queue) - 1;

Umm, that's not the qid anymore, so why change it? I mean, it looks
harmless, but I don't see the reason.
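
To illustrate what I mean (just a sketch, not a requirement), I would
have kept:

	int qid = nvme_tcp_queue_id(queue);

and matched on the hctx index where it is actually consumed:

	if (mq_map[cpu] != qid - 1)
		continue;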

> +	unsigned int *mq_map = NULL;;
> +	int n = 0, cpu, io_cpu, min_queues = WORK_CPU_UNBOUND;

The min_queues initialization looks very odd; initializing a queue count
with WORK_CPU_UNBOUND doesn't parse for me. Besides, why is it declared
in this scope?
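
Something like the below would read better to me (untested sketch,
keeping your qid convention here and using INT_MAX as the initial
minimum):

	int io_cpu = WORK_CPU_UNBOUND;
	int min_queues = INT_MAX;
	int cpu;

	for_each_online_cpu(cpu) {
		int num_queues;

		if (mq_map[cpu] != qid)
			continue;
		num_queues = atomic_read(&nvme_tcp_cpu_queues[cpu]);
		if (num_queues < min_queues) {
			min_queues = num_queues;
			io_cpu = cpu;
		}
	}

Initializing io_cpu also avoids using it uninitialized if no online cpu
happens to map to this queue.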

> +
> +	if (nvme_tcp_default_queue(queue)) {
> +		mq_map = set->map[HCTX_TYPE_DEFAULT].mq_map;
> +		n = qid;
> +	} else if (nvme_tcp_read_queue(queue)) {
> +		mq_map = set->map[HCTX_TYPE_READ].mq_map;
> +		n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT];
> +	} else if (nvme_tcp_poll_queue(queue)) {
> +		mq_map = set->map[HCTX_TYPE_POLL].mq_map;
>   		n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
> -				ctrl->io_queues[HCTX_TYPE_READ] - 1;
> -	if (wq_unbound)
> -		queue->io_cpu = WORK_CPU_UNBOUND;
> -	else
> -		queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
> +				ctrl->io_queues[HCTX_TYPE_READ];
> +	}
> +
> +	if (WARN_ON(!mq_map))
> +		return;
> +	for_each_online_cpu(cpu) {
> +		int num_queues;
> +
> +		if (mq_map[cpu] != qid)
> +			continue;
> +		num_queues = atomic_read(&nvme_tcp_cpu_queues[cpu]);
> +		if (num_queues < min_queues) {
> +			min_queues = num_queues;
> +			io_cpu = cpu;
> +		}
> +	}
> +	if (io_cpu != queue->io_cpu) {
> +		queue->io_cpu = io_cpu;
> +		atomic_inc(&nvme_tcp_cpu_queues[io_cpu]);
> +	}

Why is that conditional? Why not always assign io_cpu and increment the
counter?
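
i.e. just:

	queue->io_cpu = io_cpu;
	atomic_inc(&nvme_tcp_cpu_queues[io_cpu]);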

> +	dev_dbg(ctrl->ctrl.device, "queue %d: using cpu %d\n",
> +		qid, queue->io_cpu);
>   }
>   
>   static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid)
> @@ -1735,7 +1759,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
>   
>   	queue->sock->sk->sk_allocation = GFP_ATOMIC;
>   	queue->sock->sk->sk_use_task_frag = false;
> -	nvme_tcp_set_queue_io_cpu(queue);
> +	queue->io_cpu = WORK_CPU_UNBOUND;
>   	queue->request = NULL;
>   	queue->data_remaining = 0;
>   	queue->ddgst_remaining = 0;
> @@ -1847,6 +1871,10 @@ static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
>   	kernel_sock_shutdown(queue->sock, SHUT_RDWR);
>   	nvme_tcp_restore_sock_ops(queue);
>   	cancel_work_sync(&queue->io_work);
> +	if (queue->io_cpu != WORK_CPU_UNBOUND) {
> +		atomic_dec(&nvme_tcp_cpu_queues[queue->io_cpu]);
> +		queue->io_cpu = WORK_CPU_UNBOUND;
> +	}

Does anything change if we still set io_cpu in the wq_unbound case?
If not, I think we should always set it and simplify the code.

>   }
>   
>   static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
> @@ -1891,9 +1919,10 @@ static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
>   	nvme_tcp_init_recv_ctx(queue);
>   	nvme_tcp_setup_sock_ops(queue);
>   
> -	if (idx)
> +	if (idx) {
> +		nvme_tcp_set_queue_io_cpu(queue);
>   		ret = nvmf_connect_io_queue(nctrl, idx);
> -	else
> +	} else
>   		ret = nvmf_connect_admin_queue(nctrl);
>   
>   	if (!ret) {
> @@ -2920,6 +2949,7 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
>   static int __init nvme_tcp_init_module(void)
>   {
>   	unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS;
> +	int cpu;
>   
>   	BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8);
>   	BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72);
> @@ -2937,6 +2967,9 @@ static int __init nvme_tcp_init_module(void)
>   	if (!nvme_tcp_wq)
>   		return -ENOMEM;
>   
> +	for_each_possible_cpu(cpu)
> +		atomic_set(&nvme_tcp_cpu_queues[cpu], 0);
> +

Why? The array is static, so it is already zero-initialized.


