[PATCH 2/3] nvme-tcp: align I/O cpu with blk-mq mapping
Hannes Reinecke
hare at suse.de
Mon Jul 8 05:43:34 PDT 2024
On 7/8/24 14:08, Sagi Grimberg wrote:
>
>
> On 08/07/2024 10:10, Hannes Reinecke wrote:
>> We should align the 'io_cpu' setting with the blk-mq
>> cpu mapping to ensure that we're not bouncing threads
>> when doing I/O. To avoid cpu contention this patch also
>> adds an atomic counter for the number of queues on each
>> cpu to distribute the load across all CPUs in the blk-mq cpu set.
>> Additionally we should always set the 'io_cpu' value, as
>> in the WQ_UNBOUND case it'll be treated as a hint anyway.
>>
>> Performance comparison:
>>                   baseline    rx/tx       blk-mq align
>> 4k seq write:     449MiB/s    480MiB/s    524MiB/s
>> 4k rand write:    410MiB/s    481MiB/s    524MiB/s
>> 4k seq read:      478MiB/s    481MiB/s    566MiB/s
>> 4k rand read:     547MiB/s    480MiB/s    511MiB/s
>>
>> Signed-off-by: Hannes Reinecke <hare at kernel.org>
>> ---
>> drivers/nvme/host/tcp.c | 65 +++++++++++++++++++++++++++++++----------
>> 1 file changed, 49 insertions(+), 16 deletions(-)
>>
>> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
>> index f621d3ba89b2..a5c42a7b4bee 100644
>> --- a/drivers/nvme/host/tcp.c
>> +++ b/drivers/nvme/host/tcp.c
>> @@ -26,6 +26,8 @@
>> struct nvme_tcp_queue;
>> +static atomic_t nvme_tcp_cpu_queues[NR_CPUS];
>> +
>> /* Define the socket priority to use for connections were it is desirable
>> * that the NIC consider performing optimized packet processing or filtering.
>> * A non-zero value being sufficient to indicate general consideration of any
>> @@ -1578,20 +1580,42 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
>> static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
>> {
>> struct nvme_tcp_ctrl *ctrl = queue->ctrl;
>> - int qid = nvme_tcp_queue_id(queue);
>> - int n = 0;
>> -
>> - if (nvme_tcp_default_queue(queue))
>> - n = qid - 1;
>> - else if (nvme_tcp_read_queue(queue))
>> - n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
>> - else if (nvme_tcp_poll_queue(queue))
>> + struct blk_mq_tag_set *set = &ctrl->tag_set;
>> + int qid = nvme_tcp_queue_id(queue) - 1;
>
> umm, it's not the qid, why change it? I mean it looks harmless, but
> I don't see why.
>
>> + unsigned int *mq_map = NULL;
>> + int n = 0, cpu, io_cpu, min_queues = WORK_CPU_UNBOUND;
>
> min_queues initialization looks very very weird. I can't even parse it.
> besides, why is it declared in this scope?
>
Because it's evaluated in the loop, with the value carried over across
loop iterations.
WORK_CPU_UNBOUND is just the initial value, so we can detect whether it
has been set at all.
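Spelled out, the idea is roughly this (untested sketch; I'm using INT_MAX
as the 'nothing found yet' sentinel here purely for illustration, the
patch reuses WORK_CPU_UNBOUND for that):

	int cpu, io_cpu = WORK_CPU_UNBOUND, min_queues = INT_MAX;

	/* pick the least loaded online cpu among those blk-mq maps to this queue */
	for_each_online_cpu(cpu) {
		int num_queues;

		if (mq_map[cpu] != qid)
			continue;
		num_queues = atomic_read(&nvme_tcp_cpu_queues[cpu]);
		if (num_queues < min_queues) {
			min_queues = num_queues;
			io_cpu = cpu;
		}
	}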
>> +
>> + if (nvme_tcp_default_queue(queue)) {
>> + mq_map = set->map[HCTX_TYPE_DEFAULT].mq_map;
>> + n = qid;
>> + } else if (nvme_tcp_read_queue(queue)) {
>> + mq_map = set->map[HCTX_TYPE_READ].mq_map;
>> + n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT];
>> + } else if (nvme_tcp_poll_queue(queue)) {
>> + mq_map = set->map[HCTX_TYPE_POLL].mq_map;
>> n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
>> - ctrl->io_queues[HCTX_TYPE_READ] - 1;
>> - if (wq_unbound)
>> - queue->io_cpu = WORK_CPU_UNBOUND;
>> - else
>> - queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
>> + ctrl->io_queues[HCTX_TYPE_READ];
>> + }
>> +
>> + if (WARN_ON(!mq_map))
>> + return;
>> + for_each_online_cpu(cpu) {
>> + int num_queues;
>> +
>> + if (mq_map[cpu] != qid)
>> + continue;
>> + num_queues = atomic_read(&nvme_tcp_cpu_queues[cpu]);
>> + if (num_queues < min_queues) {
>> + min_queues = num_queues;
>> + io_cpu = cpu;
>> + }
>> + }
>> + if (io_cpu != queue->io_cpu) {
>> + queue->io_cpu = io_cpu;
>> + atomic_inc(&nvme_tcp_cpu_queues[io_cpu]);
>> + }
>
> Why is that conditioned ? why not always assign and inc the counter?
>
Because of the conditional
    if (mq_map[cpu] != qid)
above; technically we might never find a matching cpu, in which case
'io_cpu' would remain unassigned.
Might be worthwhile doing a WARN_ON() here, but it might trigger
with CPU hotplug...
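E.g. (untested, and assuming 'io_cpu' is pre-initialized to
WORK_CPU_UNBOUND as in the sketch above) we could simply leave the
queue unbound when no online cpu maps to it, instead of warning:

	if (io_cpu == WORK_CPU_UNBOUND) {
		/* no online cpu maps to this qid (cpu hotplug); stay unbound */
		return;
	}
	queue->io_cpu = io_cpu;
	atomic_inc(&nvme_tcp_cpu_queues[io_cpu]);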
>> + dev_dbg(ctrl->ctrl.device, "queue %d: using cpu %d\n", qid, queue->io_cpu);
>> }
>> static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid)
>> @@ -1735,7 +1759,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
>> queue->sock->sk->sk_allocation = GFP_ATOMIC;
>> queue->sock->sk->sk_use_task_frag = false;
>> - nvme_tcp_set_queue_io_cpu(queue);
>> + queue->io_cpu = WORK_CPU_UNBOUND;
>> queue->request = NULL;
>> queue->data_remaining = 0;
>> queue->ddgst_remaining = 0;
>> @@ -1847,6 +1871,10 @@ static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
>> kernel_sock_shutdown(queue->sock, SHUT_RDWR);
>> nvme_tcp_restore_sock_ops(queue);
>> cancel_work_sync(&queue->io_work);
>> + if (queue->io_cpu != WORK_CPU_UNBOUND) {
>> + atomic_dec(&nvme_tcp_cpu_queues[queue->io_cpu]);
>> + queue->io_cpu = WORK_CPU_UNBOUND;
>> + }
>
> Does anything change if we still set the io_cpu in wq_unbound case?
> If not, I think we should always do it and simplify the code.
>
See above. WORK_CPU_UNBOUND means that we failed to find the cpu
in the mq_map. Yes, I know, unlikely, but not impossible.
>> }
>> static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
>> @@ -1891,9 +1919,10 @@ static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
>> nvme_tcp_init_recv_ctx(queue);
>> nvme_tcp_setup_sock_ops(queue);
>> - if (idx)
>> + if (idx) {
>> + nvme_tcp_set_queue_io_cpu(queue);
>> ret = nvmf_connect_io_queue(nctrl, idx);
>> - else
>> + } else
>> ret = nvmf_connect_admin_queue(nctrl);
>> if (!ret) {
>> @@ -2920,6 +2949,7 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
>> static int __init nvme_tcp_init_module(void)
>> {
>> unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS;
>> + int cpu;
>> BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8);
>> BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72);
>> @@ -2937,6 +2967,9 @@ static int __init nvme_tcp_init_module(void)
>> if (!nvme_tcp_wq)
>> return -ENOMEM;
>> + for_each_possible_cpu(cpu)
>> + atomic_set(&nvme_tcp_cpu_queues[cpu], 0);
>> +
>
> Why?
Don't we need to initialize the atomic counters?
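(Although, strictly speaking, a file-scope

	static atomic_t nvme_tcp_cpu_queues[NR_CPUS];

is zero-initialized anyway, so the atomic_set() loop is redundant and
mainly documents the intent.)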
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare at suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich