[PATCH v12 07/26] nvme-tcp: Add DDP offload control path

Aurelien Aptel aaptel at nvidia.com
Mon Aug 14 09:11:48 PDT 2023


Sagi Grimberg <sagi at grimberg.me> writes:
>> @@ -187,6 +205,9 @@ struct nvme_tcp_ctrl {
>>       struct delayed_work     connect_work;
>>       struct nvme_tcp_request async_req;
>>       u32                     io_queues[HCTX_MAX_TYPES];
>> +
>> +     struct net_device       *offloading_netdev;
>> +     u32                     offload_io_threshold;
>
> ddp_netdev
> ddp_io_threshold

Sure, will change it.
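
i.e. the fields will become:

	struct net_device	*ddp_netdev;
	u32			ddp_io_threshold;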

>> +static bool nvme_tcp_ddp_query_limits(struct net_device *netdev,
>> +                                   struct nvme_tcp_queue *queue)
>> +{
>> +     int ret;
>> +
>> +     if (!netdev->netdev_ops->ulp_ddp_ops->limits)
>> +             return false;
>
> Can we expose all of these ops in wrappers ala:
>
> netdev_ulp_ddp_limits(netdev, &limits)
> netdev_ulp_ddp_sk_add(netdev, sk, &nvme_tcp_ddp_ulp_ops)
> netdev_ulp_ddp_sk_del(netdev, sk)
> netdev_ulp_ddp_resync(netdev, skb, seq)
>
> etc...

Sure, we will add simple wrappers in ulp_ddp.h to check for the function
pointers.
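
Something like this (a sketch; the final names and the limits()
signature may differ):

	static inline bool netdev_ulp_ddp_limits(struct net_device *netdev,
						 struct ulp_ddp_limits *limits)
	{
		const struct ulp_ddp_dev_ops *ops = netdev->netdev_ops->ulp_ddp_ops;

		/* only query devices that implement the limits() hook */
		if (!ops || !ops->limits)
			return false;
		return ops->limits(netdev, limits) == 0;
	}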

>> +static inline bool is_netdev_ulp_offload_active(struct net_device *netdev,
>> +                                             struct nvme_tcp_queue *queue)
>> +{
>> +     if (!netdev || !queue)
>> +             return false;
>
> Is it reasonable to be called here with !netdev or !queue ?

The check is needed only for the IO queue case, but we can move it
earlier in nvme_tcp_start_queue().

>> +
>> +     /* If we cannot query the netdev limitations, do not offload */
>> +     if (!nvme_tcp_ddp_query_limits(netdev, queue))
>> +             return false;
>> +
>> +     /* If netdev supports nvme-tcp ddp offload, we can offload */
>> +     if (test_bit(ULP_DDP_C_NVME_TCP_BIT, netdev->ulp_ddp_caps.active))
>> +             return true;
>
> This should be coming from the API itself, have the limits query
> api fail if this is off.

We can move the function to the ULP DDP layer.

> btw, what is the active thing? is this driven from ethtool enable?
> what happens if the user disables it while there is a ulp using it?

The active bits are indeed driven by ethtool, following the design
Jakub suggested.
Disabling a bit does not affect established connections: the nvme-tcp
connection has to be reconnected for the change to take effect.

>> +
>> +     return false;
>
> This can be folded to the above function.

We won't be able to check for TLS in a common wrapper, so we think this
function should be kept.

>> +static int nvme_tcp_offload_socket(struct nvme_tcp_queue *queue)
>> +{
>> +     struct net_device *netdev = queue->ctrl->offloading_netdev;
>> +     struct ulp_ddp_config config = {.type = ULP_DDP_NVME};
>> +     int ret;
>> +
>> +     config.nvmeotcp.pfv = NVME_TCP_PFV_1_0;
>
> Question, what happens if the pfv changes, is the ddp guaranteed to
> work?

The existing HW supports only NVME_TCP_PFV_1_0.
If a new version is ever used, the device driver should fail the
sk_add().
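
i.e. something like this in the driver's sk_add() (a sketch):

	if (config->nvmeotcp.pfv != NVME_TCP_PFV_1_0)
		return -EOPNOTSUPP;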

>> +     config.nvmeotcp.cpda = 0;
>> +     config.nvmeotcp.dgst =
>> +             queue->hdr_digest ? NVME_TCP_HDR_DIGEST_ENABLE : 0;
>> +     config.nvmeotcp.dgst |=
>> +             queue->data_digest ? NVME_TCP_DATA_DIGEST_ENABLE : 0;
>> +     config.nvmeotcp.queue_size = queue->ctrl->ctrl.sqsize + 1;
>> +     config.nvmeotcp.queue_id = nvme_tcp_queue_id(queue);
>> +     config.nvmeotcp.io_cpu = queue->io_cpu;
>> +
>> +     /* Socket ops keep a netdev reference. It is put in
>> +      * nvme_tcp_unoffload_socket().  This ref is dropped on
>> +      * NETDEV_GOING_DOWN events to allow the device to go down
>> +      */
>> +     dev_hold(netdev);
>> +     ret = netdev->netdev_ops->ulp_ddp_ops->sk_add(netdev,
>> +                                                   queue->sock->sk,
>> +                                                   &config);
>
> It would be preferred if dev_hold would be taken in sk_add
> and released in sk_del so that the ulp does not need to worry
> acount it.

Sure, we will move the refcount accounting to the ulp ddp wrapper.

>> +     inet_csk(queue->sock->sk)->icsk_ulp_ddp_ops = &nvme_tcp_ddp_ulp_ops;
> can also be folded inside an api.

Sure, we will move this to sk_add() and add a parameter for the ops.
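
Roughly like this, with the dev_hold() from above folded in as you
suggested (a sketch, names not final):

	static inline int netdev_ulp_ddp_sk_add(struct net_device *netdev,
						struct sock *sk,
						struct ulp_ddp_config *config,
						const struct ulp_ddp_ulp_ops *ops)
	{
		int ret;

		/* the wrapper keeps the netdev reference; dropped in sk_del() */
		dev_hold(netdev);
		ret = netdev->netdev_ops->ulp_ddp_ops->sk_add(netdev, sk, config);
		if (ret) {
			dev_put(netdev);
			return ret;
		}
		inet_csk(sk)->icsk_ulp_ddp_ops = ops;
		return 0;
	}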

>> +static void nvme_tcp_unoffload_socket(struct nvme_tcp_queue *queue)
>> +{
>> +     struct net_device *netdev = queue->ctrl->offloading_netdev;
>> +
>> +     if (!netdev) {
>> +             dev_info_ratelimited(queue->ctrl->ctrl.device, "netdev not found\n");
>> +             return;
>> +     }
>
> Again, is it reasonable to be called here with !netdev?

No, it's redundant; we will remove it.

>> +
>> +     clear_bit(NVME_TCP_Q_OFF_DDP, &queue->flags);
>> +
>> +     netdev->netdev_ops->ulp_ddp_ops->sk_del(netdev, queue->sock->sk);
>> +
>> +     inet_csk(queue->sock->sk)->icsk_ulp_ddp_ops = NULL;
>> +     dev_put(netdev); /* held by offload_socket */
>
> Both can be done by the api instead of the ulp itself.

Sure, we will move it.
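
Sketch of the matching wrapper:

	static inline void netdev_ulp_ddp_sk_del(struct net_device *netdev,
						 struct sock *sk)
	{
		netdev->netdev_ops->ulp_ddp_ops->sk_del(netdev, sk);
		inet_csk(sk)->icsk_ulp_ddp_ops = NULL;
		dev_put(netdev); /* held in netdev_ulp_ddp_sk_add() */
	}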

>> +}
>> +
>> +static void nvme_tcp_offload_apply_limits(struct nvme_tcp_queue *queue,
>> +                                       struct net_device *netdev)
>> +{
>> +     queue->ctrl->offloading_netdev = netdev;
>> +     queue->ctrl->ctrl.max_segments = queue->ddp_limits.max_ddp_sgl_len;
>> +     queue->ctrl->ctrl.max_hw_sectors =
>> +             queue->ddp_limits.max_ddp_sgl_len << (ilog2(SZ_4K) - 9);
>
> this is SECTOR_SHIFT?

Yes it is, we will use it.
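
i.e.:

	queue->ctrl->ctrl.max_hw_sectors =
		queue->ddp_limits.max_ddp_sgl_len << (ilog2(SZ_4K) - SECTOR_SHIFT);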

>> +/* In presence of packet drops or network packet reordering, the device may lose
>> + * synchronization between the TCP stream and the L5P framing, and require a
>> + * resync with the kernel's TCP stack.
>> + *
>> + * - NIC HW identifies a PDU header at some TCP sequence number,
>> + *   and asks NVMe-TCP to confirm it.
>> + * - When NVMe-TCP observes the requested TCP sequence, it will compare
>> + *   it with the PDU header TCP sequence, and report the result to the
>> + *   NIC driver
>> + */
>> +static void nvme_tcp_resync_response(struct nvme_tcp_queue *queue,
>> +                                  struct sk_buff *skb, unsigned int offset)
>> +{
>> +     u64 pdu_seq = TCP_SKB_CB(skb)->seq + offset - queue->pdu_offset;
>> +     struct net_device *netdev = queue->ctrl->offloading_netdev;
>> +     u64 pdu_val = (pdu_seq << 32) | ULP_DDP_RESYNC_PENDING;
>> +     u64 resync_val;
>> +     u32 resync_seq;
>> +
>> +     resync_val = atomic64_read(&queue->resync_req);
>> +     /* Lower 32 bit flags. Check validity of the request */
>> +     if ((resync_val & ULP_DDP_RESYNC_PENDING) == 0)
>> +             return;
>> +
>> +     /*
>> +      * Obtain and check requested sequence number: is this PDU header
>> +      * before the request?
>> +      */
>> +     resync_seq = resync_val >> 32;
>> +     if (before(pdu_seq, resync_seq))
>> +             return;
>> +
>> +     /*
>> +      * The atomic operation guarantees that we don't miss any NIC driver
>> +      * resync requests submitted after the above checks.
>> +      */
>> +     if (atomic64_cmpxchg(&queue->resync_req, pdu_val,
>> +                          pdu_val & ~ULP_DDP_RESYNC_PENDING) !=
>> +                          atomic64_read(&queue->resync_req))
>> +             netdev->netdev_ops->ulp_ddp_ops->resync(netdev,
>> +                                                     queue->sock->sk,
>> +                                                     pdu_seq);
>
> Who else is doing an atomic on this value? and what happens
> if the cmpxchg fails?

The driver thread can set queue->resync_req concurrently; see
nvmeotcp_update_resync() in patch "net/mlx5e: NVMEoTCP, data-path for
DDP+DDGST offload".

If the cmpxchg fails, it means a new resync request was triggered by
the HW in the meantime; the old request is dropped and the new one will
be processed by a later PDU.
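
For reference, atomic64_cmpxchg() returns the value that was in
queue->resync_req, so the conventional way to express "respond only if
no new request raced in" is to compare against the expected old value
(a sketch):

	if (atomic64_cmpxchg(&queue->resync_req, pdu_val,
			     pdu_val & ~ULP_DDP_RESYNC_PENDING) == pdu_val)
		netdev->netdev_ops->ulp_ddp_ops->resync(netdev,
							queue->sock->sk,
							pdu_seq);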

>> +}
>> +
>> +static bool nvme_tcp_resync_request(struct sock *sk, u32 seq, u32 flags)
>> +{
>> +     struct nvme_tcp_queue *queue = sk->sk_user_data;
>> +
>> +     /*
>> +      * "seq" (TCP seq number) is what the HW assumes is the
>> +      * beginning of a PDU.  The nvme-tcp layer needs to store the
>> +      * number along with the "flags" (ULP_DDP_RESYNC_PENDING) to
>> +      * indicate that a request is pending.
>> +      */
>> +     atomic64_set(&queue->resync_req, (((uint64_t)seq << 32) | flags));
>
> Question, is this coming from multiple contexts? what contexts are
> competing here that make it an atomic operation? It is unclear what is
> going on here tbh.

The driver can receive a resync request from the HW and set
queue->resync_req concurrently while processing CQEs, as you can see in
nvmeotcp_update_resync() in patch "net/mlx5e: NVMEoTCP, data-path for
DDP+DDGST offload".

The resync flow is:

     nvme-tcp                           mlx5                     hw
        |                                |                        |
        |                                |              sends CQE with
        |                                |              resync request
        |                                | <---------------------'
        |                    nvmeotcp_update_resync()
  nvme_tcp_resync_request() <-----------'
  we store the request

Later, while receiving PDUs, we check for pending requests. If there is
one, we call nvme_tcp_resync_response(), which calls into mlx5 to send
the response to the HW.

>> @@ -1831,25 +2055,55 @@ static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
>>   {
>>       struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
>>       struct nvme_tcp_queue *queue = &ctrl->queues[idx];
>> +     struct net_device *netdev;
>>       int ret;
>>
>>       queue->rd_enabled = true;
>>       nvme_tcp_init_recv_ctx(queue);
>>       nvme_tcp_setup_sock_ops(queue);
>>
>> -     if (idx)
>> +     if (idx) {
>>               ret = nvmf_connect_io_queue(nctrl, idx);
>> -     else
>> +             if (ret)
>> +                     goto err;
>> +
>> +             netdev = queue->ctrl->offloading_netdev;
>> +             if (is_netdev_ulp_offload_active(netdev, queue)) {
>
> Seems redundant to pass netdev as an argument here.

Thanks, we will remove it.

>> +                     ret = nvme_tcp_offload_socket(queue);
>> +                     if (ret) {
>> +                             dev_info(nctrl->device,
>> +                                      "failed to setup offload on queue %d ret=%d\n",
>> +                                      idx, ret);
>> +                     }
>> +             }
>> +     } else {
>>               ret = nvmf_connect_admin_queue(nctrl);
>> +             if (ret)
>> +                     goto err;
>>
>> -     if (!ret) {
>> -             set_bit(NVME_TCP_Q_LIVE, &queue->flags);
>> -     } else {
>> -             if (test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
>> -                     __nvme_tcp_stop_queue(queue);
>> -             dev_err(nctrl->device,
>> -                     "failed to connect queue: %d ret=%d\n", idx, ret);
>> +             netdev = get_netdev_for_sock(queue->sock->sk);
>
> Is there any chance that this is a different netdev than what is
> already recorded? doesn't make sense to me.

The idea is that the admin queue is started first; it looks up the
netdev associated with the socket and stores it on the controller.
Later, when the IO queues are started, we use the recorded netdev.

In cases of bonding or vlan, a netdev can have lower device links, which
get_netdev_for_sock() will look up.
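
For reference, the tls_device.c implementation of get_netdev_for_sock()
is roughly:

	static struct net_device *get_netdev_for_sock(struct sock *sk)
	{
		struct dst_entry *dst = sk_dst_get(sk);
		struct net_device *netdev = NULL;

		if (likely(dst)) {
			/* walk vlan/bond links down to the lowest device */
			netdev = netdev_sk_get_lowest_dev(dst->dev, sk);
			dev_hold(netdev);
		}
		dst_release(dst);
		return netdev;
	}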

Do you see any gap?

>> +             if (!netdev) {
>> +                     dev_info_ratelimited(ctrl->ctrl.device, "netdev not found\n");
>> +                     ctrl->offloading_netdev = NULL;
>> +                     goto done;
>> +             }
>> +             if (is_netdev_ulp_offload_active(netdev, queue))
>> +                     nvme_tcp_offload_apply_limits(queue, netdev);
>> +             /*
>> +              * release the device as no offload context is
>> +              * established yet.
>> +              */
>> +             dev_put(netdev);
>
> the put is unclear, what does it pair with? the get_netdev_for_sock?

Yes, get_netdev_for_sock() takes a reference, which we don't need at
that point, so we put it.


