[PATCH 2/3] nvme-tcp: align I/O cpu with blk-mq mapping

Sagi Grimberg sagi at grimberg.me
Mon Jul 8 07:38:58 PDT 2024



On 08/07/2024 15:43, Hannes Reinecke wrote:
> On 7/8/24 14:08, Sagi Grimberg wrote:
>>
>>
>> On 08/07/2024 10:10, Hannes Reinecke wrote:
>>> We should align the 'io_cpu' setting with the blk-mq
>>> cpu mapping to ensure that we're not bouncing threads
>>> when doing I/O. To avoid cpu contention this patch also
>>> adds an atomic counter for the number of queues on each
>>> cpu to distribute the load across all CPUs in the blk-mq cpu set.
>>> Additionally we should always set the 'io_cpu' value, as
>>> in the WQ_UNBOUND case it'll be treated as a hint anyway.
>>>
>>> Performance comparison:
>>>                 baseline rx/tx    blk-mq align
>>> 4k seq write:  449MiB/s 480MiB/s 524MiB/s
>>> 4k rand write: 410MiB/s 481MiB/s 524MiB/s
>>> 4k seq read:   478MiB/s 481MiB/s 566MiB/s
>>> 4k rand read:  547MiB/s 480MiB/s 511MiB/s
>>>
>>> Signed-off-by: Hannes Reinecke <hare at kernel.org>
>>> ---
>>>   drivers/nvme/host/tcp.c | 65 +++++++++++++++++++++++++++++++----------
>>>   1 file changed, 49 insertions(+), 16 deletions(-)
>>>
>>> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
>>> index f621d3ba89b2..a5c42a7b4bee 100644
>>> --- a/drivers/nvme/host/tcp.c
>>> +++ b/drivers/nvme/host/tcp.c
>>> @@ -26,6 +26,8 @@
>>>   struct nvme_tcp_queue;
>>> +static atomic_t nvme_tcp_cpu_queues[NR_CPUS];
>>> +
>>>   /* Define the socket priority to use for connections were it is desirable
>>>    * that the NIC consider performing optimized packet processing or filtering.
>>>    * A non-zero value being sufficient to indicate general consideration of any
>>> @@ -1578,20 +1580,42 @@ static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
>>>   static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
>>>   {
>>>       struct nvme_tcp_ctrl *ctrl = queue->ctrl;
>>> -    int qid = nvme_tcp_queue_id(queue);
>>> -    int n = 0;
>>> -
>>> -    if (nvme_tcp_default_queue(queue))
>>> -        n = qid - 1;
>>> -    else if (nvme_tcp_read_queue(queue))
>>> -        n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
>>> -    else if (nvme_tcp_poll_queue(queue))
>>> +    struct blk_mq_tag_set *set = &ctrl->tag_set;
>>> +    int qid = nvme_tcp_queue_id(queue) - 1;
>>
>> umm, it's not the qid, why change it? I mean it looks harmless, but
>> I don't see why.
>>
>>> +    unsigned int *mq_map = NULL;
>>> +    int n = 0, cpu, io_cpu, min_queues = WORK_CPU_UNBOUND;
>>
>> min_queues initialization looks very, very weird. I can't even parse it.
>> Besides, why is it declared in this scope?
>>
> Because it's evaluated in the loop, with the value carried over across
> loop iterations.

Right, missed that.

> WORK_CPU_UNBOUND is the init value, to detect if it had been set at all.

As far as I read this patch, min_queues and num_queues are a quantity of
queues, so it is bizarre to see it initialized to the completely unrelated
value WORK_CPU_UNBOUND. It's really confusing; I'm still not sure what it
is, tbh.
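
If all you need is a "no candidate seen yet" sentinel, something like this
(just a sketch, using INT_MAX instead of a cpu id constant) would read a
lot better:

    int cpu, io_cpu = WORK_CPU_UNBOUND;
    int min_queues = INT_MAX;

Then min_queues always holds a queue count, and io_cpu is the only variable
carrying the "unassigned" meaning.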

>
>>> +
>>> +    if (nvme_tcp_default_queue(queue)) {
>>> +        mq_map = set->map[HCTX_TYPE_DEFAULT].mq_map;
>>> +        n = qid;
>>> +    } else if (nvme_tcp_read_queue(queue)) {
>>> +        mq_map = set->map[HCTX_TYPE_READ].mq_map;
>>> +        n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT];
>>> +    } else if (nvme_tcp_poll_queue(queue)) {
>>> +        mq_map = set->map[HCTX_TYPE_POLL].mq_map;
>>>           n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
>>> -                ctrl->io_queues[HCTX_TYPE_READ] - 1;
>>> -    if (wq_unbound)
>>> -        queue->io_cpu = WORK_CPU_UNBOUND;
>>> -    else
>>> -        queue->io_cpu = cpumask_next_wrap(n - 1, cpu_online_mask, -1, false);
>>> +                ctrl->io_queues[HCTX_TYPE_READ];
>>> +    }
>>> +
>>> +    if (WARN_ON(!mq_map))
>>> +        return;
>>> +    for_each_online_cpu(cpu) {
>>> +        int num_queues;
>>> +
>>> +        if (mq_map[cpu] != qid)
>>> +            continue;
>>> +        num_queues = atomic_read(&nvme_tcp_cpu_queues[cpu]);
>>> +        if (num_queues < min_queues) {
>>> +            min_queues = num_queues;
>>> +            io_cpu = cpu;
>>> +        }
>>> +    }
>>> +    if (io_cpu != queue->io_cpu) {
>>> +        queue->io_cpu = io_cpu;
>>> +        atomic_inc(&nvme_tcp_cpu_queues[io_cpu]);
>>> +    }
>>
>> Why is that conditioned ? why not always assign and inc the counter?
>>
> Because we have the conditional
>
> if (mq_map[cpu] != qid)
>
> above, so technically we might not find the correct mapping.
> Might be worthwhile doing a WARN_ON() here, but it might trigger
> with CPU hotplug...

Writing code that looks like:
     if (a != b) {
         a = b;
     }

is silly.

IIUC, it is probably because you initialized queue->io_cpu to
WORK_CPU_UNBOUND and now you don't want to touch it. But it is
still kinda ugly.

I suggest that you always select an io_cpu and always account it.
You said yourself that if the wq is unbound, it doesn't really matter what
the io_cpu is...

And I think that a code comment would help here; it's not the most trivial
code...
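
Something along these lines is what I have in mind (completely untested
sketch, reusing the names from your patch):

static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
{
    /* ... mq_map/qid selection as in your patch ... */
    int cpu, io_cpu = WORK_CPU_UNBOUND, min_queues = INT_MAX;

    /* pick the least loaded online cpu that blk-mq maps to this queue */
    for_each_online_cpu(cpu) {
        int num_queues;

        if (mq_map[cpu] != qid)
            continue;
        num_queues = atomic_read(&nvme_tcp_cpu_queues[cpu]);
        if (num_queues < min_queues) {
            min_queues = num_queues;
            io_cpu = cpu;
        }
    }

    /*
     * Always assign; io_cpu stays WORK_CPU_UNBOUND only if the mapping
     * had no online cpu for this queue.
     */
    queue->io_cpu = io_cpu;
    if (io_cpu != WORK_CPU_UNBOUND)
        atomic_inc(&nvme_tcp_cpu_queues[io_cpu]);
}

That way the asymmetric 'if (io_cpu != queue->io_cpu)' goes away, and
__nvme_tcp_stop_queue() can keep its WORK_CPU_UNBOUND check before the
atomic_dec() (the admin queue never runs this selection).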

>
>>> +    dev_dbg(ctrl->ctrl.device, "queue %d: using cpu %d\n",
>>> +        qid, queue->io_cpu);
>>>   }
>>>   static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid)
>>> @@ -1735,7 +1759,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
>>>       queue->sock->sk->sk_allocation = GFP_ATOMIC;
>>>       queue->sock->sk->sk_use_task_frag = false;
>>> -    nvme_tcp_set_queue_io_cpu(queue);
>>> +    queue->io_cpu = WORK_CPU_UNBOUND;
>>>       queue->request = NULL;
>>>       queue->data_remaining = 0;
>>>       queue->ddgst_remaining = 0;
>>> @@ -1847,6 +1871,10 @@ static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
>>>       kernel_sock_shutdown(queue->sock, SHUT_RDWR);
>>>       nvme_tcp_restore_sock_ops(queue);
>>>       cancel_work_sync(&queue->io_work);
>>> +    if (queue->io_cpu != WORK_CPU_UNBOUND) {
>>> +        atomic_dec(&nvme_tcp_cpu_queues[queue->io_cpu]);
>>> +        queue->io_cpu = WORK_CPU_UNBOUND;
>>> +    }
>>
>> Does anything change if we still set the io_cpu in wq_unbound case?
>> If not, I think we should always do it and simplify the code.
>>
> See above. WORK_CPU_UNBOUND means that we failed to find the cpu
> in the mq_map. Yes, I know, unlikely, but not impossible.

If you always select a cpu, it should be fine.

>
>>>   }
>>>   static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
>>> @@ -1891,9 +1919,10 @@ static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
>>>       nvme_tcp_init_recv_ctx(queue);
>>>       nvme_tcp_setup_sock_ops(queue);
>>> -    if (idx)
>>> +    if (idx) {
>>> +        nvme_tcp_set_queue_io_cpu(queue);
>>>           ret = nvmf_connect_io_queue(nctrl, idx);
>>> -    else
>>> +    } else
>>>           ret = nvmf_connect_admin_queue(nctrl);
>>>       if (!ret) {
>>> @@ -2920,6 +2949,7 @@ static struct nvmf_transport_ops nvme_tcp_transport = {
>>>   static int __init nvme_tcp_init_module(void)
>>>   {
>>>       unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS;
>>> +    int cpu;
>>>       BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8);
>>>       BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72);
>>> @@ -2937,6 +2967,9 @@ static int __init nvme_tcp_init_module(void)
>>>       if (!nvme_tcp_wq)
>>>           return -ENOMEM;
>>> +    for_each_possible_cpu(cpu)
>>> +        atomic_set(&nvme_tcp_cpu_queues[cpu], 0);
>>> +
>>
>> Why?
>
> Don't we need to initialize the atomic counters?

I thought this was the module __exit function, sorry.


