[PATCH 8/8] nvme-tcp: align I/O cpu with blk-mq mapping
Sagi Grimberg
sagi at grimberg.me
Wed Jul 17 14:34:51 PDT 2024
On 16/07/2024 10:36, Hannes Reinecke wrote:
> We should align the 'io_cpu' setting with the blk-mq
> cpu mapping to ensure that we're not bouncing threads
> when doing I/O. To avoid cpu contention this patch also
> adds an atomic counter for the number of queues on each
> cpu to distribute the load across all CPUs in the blk-mq cpu set.
> Additionally we should always set the 'io_cpu' value, as
> in the WQ_UNBOUND case it'll be treated as a hint anyway.
>
> Signed-off-by: Hannes Reinecke <hare at kernel.org>
> ---
> drivers/nvme/host/tcp.c | 65 +++++++++++++++++++++++++++++++----------
> 1 file changed, 49 insertions(+), 16 deletions(-)
>
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index f3a94168b2c3..a391a3f7c4d7 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -28,6 +28,8 @@
>
> struct nvme_tcp_queue;
>
> +static atomic_t nvme_tcp_cpu_queues[NR_CPUS];
> +
> /* Define the socket priority to use for connections where it is desirable
> * that the NIC consider performing optimized packet processing or filtering.
> * A non-zero value being sufficient to indicate general consideration of any
> @@ -1799,20 +1801,42 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
> static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
> {
> struct nvme_tcp_ctrl *ctrl = queue->ctrl;
> - int qid = nvme_tcp_queue_id(queue);
> - int n = 0;
> -
> - if (nvme_tcp_default_queue(queue))
> - n = qid - 1;
> - else if (nvme_tcp_read_queue(queue))
> - n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
> - else if (nvme_tcp_poll_queue(queue))
> + struct blk_mq_tag_set *set = &ctrl->tag_set;
> + int qid = nvme_tcp_queue_id(queue) - 1;
> + unsigned int *mq_map = NULL;
> + int n = 0, cpu, io_cpu, min_queues = WORK_CPU_UNBOUND;
Again, min_queues is a minimum quantity, not an id. It makes zero sense
to use WORK_CPU_UNBOUND as the initializer. Just set it to INT_MAX or
something.
> +
> + if (nvme_tcp_default_queue(queue)) {
> + mq_map = set->map[HCTX_TYPE_DEFAULT].mq_map;
> + n = qid;
> + } else if (nvme_tcp_read_queue(queue)) {
> + mq_map = set->map[HCTX_TYPE_READ].mq_map;
> + n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT];
> + } else if (nvme_tcp_poll_queue(queue)) {
> + mq_map = set->map[HCTX_TYPE_POLL].mq_map;
> n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
> - ctrl->io_queues[HCTX_TYPE_READ] - 1;
> - if (wq_unbound)
> - queue->io_cpu = WORK_CPU_UNBOUND;
> - else
> - queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
> + ctrl->io_queues[HCTX_TYPE_READ];
> + }
> +
> + if (WARN_ON(!mq_map))
> + return;
> + for_each_online_cpu(cpu) {
> + int num_queues;
> +
> + if (mq_map[cpu] != qid)
> + continue;
> + num_queues = atomic_read(&nvme_tcp_cpu_queues[cpu]);
> + if (num_queues < min_queues) {
> + min_queues = num_queues;
> + io_cpu = cpu;
> + }
> + }
> + if (io_cpu != queue->io_cpu) {
> + queue->io_cpu = io_cpu;
Hannes, the code may make sense to you, but not to me.
Please do not add code like:
if (a != b) {
b = a;
}
I think it is a sign that we are doing something wrong here.
> + atomic_inc(&nvme_tcp_cpu_queues[io_cpu]);
> + }
Again, why can't we always set io_cpu and increment the counter?
If the wq is unbound, it makes no difference, and if the wq is bound, that
is actually what you want to do. What am I missing?
> + dev_dbg(ctrl->ctrl.device, "queue %d: using cpu %d\n",
> + qid, queue->io_cpu);
> }
>
> static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid)
> @@ -1957,7 +1981,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
>
> queue->sock->sk->sk_allocation = GFP_ATOMIC;
> queue->sock->sk->sk_use_task_frag = false;
> - nvme_tcp_set_queue_io_cpu(queue);
> + queue->io_cpu = WORK_CPU_UNBOUND;
> queue->request = NULL;
> queue->data_remaining = 0;
> queue->ddgst_remaining = 0;
> @@ -2088,6 +2112,10 @@ static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
> kernel_sock_shutdown(queue->sock, SHUT_RDWR);
> nvme_tcp_restore_sock_ops(queue);
> cancel_work_sync(&queue->io_work);
> + if (queue->io_cpu != WORK_CPU_UNBOUND) {
I think that we can safely always set queue->io_cpu to a cpu. If the
unbound_wq only operates on a subset of the cores, it doesn't matter anyways...
The rest of the patch looks good though.