[PATCH v12 01/26] net: Introduce direct data placement tcp offload
Sagi Grimberg
sagi at grimberg.me
Wed Aug 9 00:15:31 PDT 2023
On 7/12/23 19:14, Aurelien Aptel wrote:
> From: Boris Pismenny <borisp at nvidia.com>
>
> This commit introduces direct data placement (DDP) offload for TCP.
>
> The motivation is to save the compute resources/cycles spent
> copying data from SKBs to block layer buffers, and on CRC
> calculation/verification for received PDUs (Protocol Data Units).
>
> The DDP capability is accompanied by new net_device operations that
> configure hardware contexts.
>
> There is a context per socket, and a context per DDP operation.
> Additionally, a resynchronization routine is used to assist the
> hardware in handling TCP out-of-order (OOO) segments and resuming
> the offload. Furthermore, we let the offloading driver advertise
> its max hw sectors/segments.
>
> The interface includes the following net-device ddp operations:
>
> 1. sk_add - add offload for the queue represented by socket+config pair
> 2. sk_del - remove the offload for the socket/queue
> 3. ddp_setup - request copy offload for buffers associated with an IO
> 4. ddp_teardown - release offload resources for that IO
> 5. limits - query NIC driver for quirks and limitations (e.g.
> max number of scatter gather entries per IO)
> 6. set_caps - request ULP DDP capabilities enablement
> 7. get_stats - query NIC driver for ULP DDP stats
>
> Using this interface, the NIC hardware will scatter TCP payload
> directly to the BIO pages according to the command_id.
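
If I'm reading the interface right, the ULP-side lifetime is roughly
the following (error handling elided; config, io and ddp_ctx are
locals of a hypothetical caller, numbers match the list above):

        const struct ulp_ddp_dev_ops *ops = netdev->netdev_ops->ulp_ddp_ops;
        struct ulp_ddp_limits limits = { .type = ULP_DDP_NVME };

        ops->limits(netdev, &limits);             /* (5) quirks/limits      */
        ops->sk_add(netdev, sk, &config);         /* (1) offload the queue  */

        /* per IO request: */
        ops->setup(netdev, sk, &io);              /* (3) map the IO buffers */
        /* ... payload lands directly in the BIO pages ... */
        ops->teardown(netdev, sk, &io, ddp_ctx);  /* (4) release IO state   */

        ops->sk_del(netdev, sk);                  /* (2) remove the offload */
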
>
> To maintain the correctness of the network stack, the driver is
> expected to construct SKBs that point to the BIO pages.
>
> The SKB passed to the network stack from the driver represents
> data as it is on the wire, while pointing directly to data in
> the destination buffers.
>
> As a result, data from the page frags should not be copied out to
> the linear part. To avoid needless copies, such as when using
> skb_condense(), we set the skb->ulp_ddp bit.
> In addition, the skb->ulp_crc bit is used by the upper layers to
> determine whether CRC re-calculation is required. Two separate skb
> indications are needed to avoid false-positive GRO flush events.
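
On that note, I assume the tcp_offload.c hunk boils down to comparing
the crc bit of the two candidate skbs in the GRO receive path, i.e.
something like:

        /* don't coalesce segments with different crc-offload state */
        flush |= skb_is_ulp_crc(p) ^ skb_is_ulp_crc(skb);

which keeps the ddp-placement bit out of the coalescing decision
entirely.
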
>
> Follow-up patches will use this interface for DDP in NVMe-TCP.
>
> Capability bits stored in net_device allow drivers to report which
> ULP DDP capabilities a device supports. Control over these
> capabilities will be exposed to userspace in later patches.
>
> Signed-off-by: Boris Pismenny <borisp at nvidia.com>
> Signed-off-by: Ben Ben-Ishay <benishay at nvidia.com>
> Signed-off-by: Or Gerlitz <ogerlitz at nvidia.com>
> Signed-off-by: Yoray Zack <yorayz at nvidia.com>
> Signed-off-by: Shai Malin <smalin at nvidia.com>
> Signed-off-by: Aurelien Aptel <aaptel at nvidia.com>
> ---
>  include/linux/netdevice.h          |  15 +++
>  include/linux/skbuff.h             |  25 +++-
>  include/net/inet_connection_sock.h |   6 +
>  include/net/ulp_ddp.h              | 191 +++++++++++++++++++++++++++++
>  include/net/ulp_ddp_caps.h         |  35 ++++++
>  net/Kconfig                        |  20 +++
>  net/core/skbuff.c                  |   3 +-
>  net/ipv4/tcp_input.c               |  13 +-
>  net/ipv4/tcp_ipv4.c                |   3 +
>  net/ipv4/tcp_offload.c             |   3 +
>  10 files changed, 311 insertions(+), 3 deletions(-)
> create mode 100644 include/net/ulp_ddp.h
> create mode 100644 include/net/ulp_ddp_caps.h
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index b828c7a75be2..26e25b2df6fa 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -54,6 +54,10 @@
> #include <net/net_debug.h>
> #include <net/dropreason-core.h>
>
> +#ifdef CONFIG_ULP_DDP
> +#include <net/ulp_ddp_caps.h>
> +#endif
> +
> struct netpoll_info;
> struct device;
> struct ethtool_ops;
> @@ -1418,6 +1422,8 @@ struct netdev_net_notifier {
> * Get hardware timestamp based on normal/adjustable time or free running
> * cycle counter. This function is required if physical clock supports a
> * free running cycle counter.
> + * const struct ulp_ddp_dev_ops *ulp_ddp_ops;
> + * ULP DDP operations (see include/net/ulp_ddp.h)
> */
> struct net_device_ops {
> int (*ndo_init)(struct net_device *dev);
> @@ -1652,6 +1658,9 @@ struct net_device_ops {
> ktime_t (*ndo_get_tstamp)(struct net_device *dev,
> const struct skb_shared_hwtstamps *hwtstamps,
> bool cycles);
> +#if IS_ENABLED(CONFIG_ULP_DDP)
> + const struct ulp_ddp_dev_ops *ulp_ddp_ops;
> +#endif
> };
>
> struct xdp_metadata_ops {
> @@ -1825,6 +1834,9 @@ enum netdev_ml_priv_type {
> * @mpls_features: Mask of features inheritable by MPLS
> * @gso_partial_features: value(s) from NETIF_F_GSO\*
> *
> + * @ulp_ddp_caps: Bitflags keeping track of supported and enabled
> + * ULP DDP capabilities.
> + *
> * @ifindex: interface index
> * @group: The group the device belongs to
> *
> @@ -2121,6 +2133,9 @@ struct net_device {
> netdev_features_t mpls_features;
> netdev_features_t gso_partial_features;
>
> +#ifdef CONFIG_ULP_DDP
> + struct ulp_ddp_netdev_caps ulp_ddp_caps;
> +#endif
> unsigned int min_mtu;
> unsigned int max_mtu;
> unsigned short type;
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 91ed66952580..cfa65945874d 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -813,6 +813,8 @@ typedef unsigned char *sk_buff_data_t;
> * delivery_time in mono clock base (i.e. EDT). Otherwise, the
> * skb->tstamp has the (rcv) timestamp at ingress and
> * delivery_time at egress.
> + * @ulp_ddp: ULP DDP placed the payload of this skb directly
> + * @ulp_crc: CRC was already verified by the ULP DDP offload
> * @napi_id: id of the NAPI struct this skb came from
> * @sender_cpu: (aka @napi_id) source CPU in XPS
> * @alloc_cpu: CPU which did the skb allocation.
> @@ -992,7 +994,10 @@ struct sk_buff {
> #if IS_ENABLED(CONFIG_IP_SCTP)
> __u8 csum_not_inet:1;
> #endif
> -
> +#ifdef CONFIG_ULP_DDP
> + __u8 ulp_ddp:1;
> + __u8 ulp_crc:1;
> +#endif
> #ifdef CONFIG_NET_SCHED
> __u16 tc_index; /* traffic control index */
> #endif
> @@ -5040,5 +5045,23 @@ static inline void skb_mark_for_recycle(struct sk_buff *skb)
> ssize_t skb_splice_from_iter(struct sk_buff *skb, struct iov_iter *iter,
> ssize_t maxsize, gfp_t gfp);
>
> +static inline bool skb_is_ulp_ddp(struct sk_buff *skb)
> +{
> +#ifdef CONFIG_ULP_DDP
> + return skb->ulp_ddp;
> +#else
> + return false;
> +#endif
> +}
> +
> +static inline bool skb_is_ulp_crc(struct sk_buff *skb)
> +{
> +#ifdef CONFIG_ULP_DDP
> + return skb->ulp_crc;
> +#else
> + return false;
> +#endif
> +}
> +
> #endif /* __KERNEL__ */
> #endif /* _LINUX_SKBUFF_H */
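
For the helpers above: I take it the net/core/skbuff.c change in the
diffstat is essentially guarding the frag pull in skb_condense() with
skb_is_ulp_ddp(), i.e.:

        /* skb_condense(): leave directly-placed payload in the frags */
        if (skb->data_len > skb->end - skb->tail ||
            skb_cloned(skb) || skb_is_ulp_ddp(skb))
                return;
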
> diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
> index c2b15f7e5516..b11fbbc95541 100644
> --- a/include/net/inet_connection_sock.h
> +++ b/include/net/inet_connection_sock.h
> @@ -68,6 +68,8 @@ struct inet_connection_sock_af_ops {
> * @icsk_ulp_ops Pluggable ULP control hook
> * @icsk_ulp_data ULP private data
> * @icsk_clean_acked Clean acked data hook
> + * @icsk_ulp_ddp_ops Pluggable ULP direct data placement control hook
> + * @icsk_ulp_ddp_data ULP direct data placement private data
> * @icsk_ca_state: Congestion control state
> * @icsk_retransmits: Number of unrecovered [RTO] timeouts
> * @icsk_pending: Scheduled timer event
> @@ -98,6 +100,10 @@ struct inet_connection_sock {
> const struct tcp_ulp_ops *icsk_ulp_ops;
> void __rcu *icsk_ulp_data;
> void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq);
> +#ifdef CONFIG_ULP_DDP
> + const struct ulp_ddp_ulp_ops *icsk_ulp_ddp_ops;
> + void __rcu *icsk_ulp_ddp_data;
> +#endif
> unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
> __u8 icsk_ca_state:5,
> icsk_ca_initialized:1,
> diff --git a/include/net/ulp_ddp.h b/include/net/ulp_ddp.h
> new file mode 100644
> index 000000000000..b85fda4450b4
> --- /dev/null
> +++ b/include/net/ulp_ddp.h
> @@ -0,0 +1,191 @@
> +/* SPDX-License-Identifier: GPL-2.0
> + *
> + * ulp_ddp.h
> + * Author: Boris Pismenny <borisp at nvidia.com>
> + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
> + */
> +#ifndef _ULP_DDP_H
> +#define _ULP_DDP_H
> +
> +#include <linux/netdevice.h>
> +#include <net/inet_connection_sock.h>
> +#include <net/sock.h>
> +
> +#include "ulp_ddp_caps.h"
> +
> +enum ulp_ddp_type {
> + ULP_DDP_NVME = 1,
> +};
> +
> +/**
> + * struct nvme_tcp_ddp_limits - nvme tcp driver limitations
> + *
> + * @full_ccid_range: true if the driver supports the full CCID range
> + */
> +struct nvme_tcp_ddp_limits {
> + bool full_ccid_range;
> +};
> +
> +/**
> + * struct ulp_ddp_limits - Generic ulp ddp limits for a tcp ddp
> + * protocol.
> + * Add new instances of ulp_ddp_limits in the union below (nvme-tcp, etc.).
> + *
> + * @type: type of this limits struct
> + * @max_ddp_sgl_len: maximum sgl size supported (zero means no limit)
> + * @io_threshold: minimum payload size required to offload
> + * @tls: support for ULP over TLS
> + * @nvmeotcp: NVMe-TCP specific limits
> + */
> +struct ulp_ddp_limits {
> + enum ulp_ddp_type type;
> + int max_ddp_sgl_len;
> + int io_threshold;
> + bool tls:1;
Is this a catch-all for tls 1.2/1.3 or future ones?
> + union {
> + struct nvme_tcp_ddp_limits nvmeotcp;
> + };
> +};
> +
> +/**
> + * struct nvme_tcp_ddp_config - nvme tcp ddp configuration for an IO queue
> + *
> + * @pfv: pdu version (e.g., NVME_TCP_PFV_1_0)
> + * @cpda: controller pdu data alignment (dwords, 0's based)
> + * @dgst: digest types enabled (header or data, see enum nvme_tcp_digest_option).
> + * The netdev will offload crc if it is supported.
> + * @queue_size: number of nvme-tcp IO queue elements
> + * @queue_id: queue identifier
> + * @io_cpu: cpu core running the IO thread for this queue
> + */
> +struct nvme_tcp_ddp_config {
> + u16 pfv;
> + u8 cpda;
> + u8 dgst;
> + int queue_size;
> + int queue_id;
> + int io_cpu;
> +};
> +
> +/**
> + * struct ulp_ddp_config - Generic ulp ddp configuration
> + * Add new instances of ulp_ddp_config in the union below (nvme-tcp, etc.).
> + *
> + * @type: type of this config struct
> + * @nvmeotcp: NVMe-TCP specific config
> + */
> +struct ulp_ddp_config {
> + enum ulp_ddp_type type;
> + union {
> + struct nvme_tcp_ddp_config nvmeotcp;
> + };
> +};
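
Just to confirm the intended use: I'd expect the nvme-tcp side (in the
follow-up patches) to fill this in roughly as below, where the
queue->* fields and nvme_tcp_queue_id() are the driver-local names I'm
guessing at:

        struct ulp_ddp_config config = {
                .type = ULP_DDP_NVME,
                .nvmeotcp = {
                        .pfv        = NVME_TCP_PFV_1_0,
                        .cpda       = 0,
                        .dgst       = (queue->hdr_digest ?
                                       NVME_TCP_HDR_DIGEST_ENABLE : 0) |
                                      (queue->data_digest ?
                                       NVME_TCP_DATA_DIGEST_ENABLE : 0),
                        .queue_size = queue->queue_size,
                        .queue_id   = nvme_tcp_queue_id(queue),
                        .io_cpu     = queue->io_cpu,
                },
        };
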
> +
> +/**
> + * struct ulp_ddp_io - ulp ddp configuration for an IO request.
> + *
> + * @command_id: identifier on the wire associated with these buffers
> + * @nents: number of entries in the sg_table
> + * @sg_table: scatter-gather table describing the buffers for this IO request
> + * @first_sgl: first SGL in sg_table
> + */
> +struct ulp_ddp_io {
> + u32 command_id;
> + int nents;
> + struct sg_table sg_table;
> + struct scatterlist first_sgl[SG_CHUNK_SIZE];
> +};
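
The inline first_sgl[SG_CHUNK_SIZE] array suggests the usual chained
sgl pattern, i.e. callers would allocate/free with something like:

        struct ulp_ddp_io io = { .command_id = cmd_id };

        /* the inline chunk spares an allocation for small IOs */
        if (sg_alloc_table_chained(&io.sg_table, nents,
                                   io.first_sgl, SG_CHUNK_SIZE))
                return -ENOMEM;
        io.nents = nents;
        ...
        sg_free_table_chained(&io.sg_table, SG_CHUNK_SIZE);

Might be worth spelling that out in the kdoc.
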
> +
> +struct ethtool_ulp_ddp_stats;
> +struct netlink_ext_ack;
> +
> +/**
> + * struct ulp_ddp_dev_ops - operations used by an upper layer protocol
> + * to configure ddp offload
> + *
> + * @limits: query ulp driver limitations and quirks.
> + * @sk_add: add offload for the queue represented by the socket+config
> + * pair. This function is used to configure either copy, crc,
> + * or both offloads.
> + * @sk_del: remove offload from the socket, and release any device
> + * related resources.
> + * @setup: request copy offload for buffers associated with a
> + * command_id in ulp_ddp_io.
> + * @teardown: release the offload resources that associate buffers
> + * with a command_id in ulp_ddp_io.
> + * @resync: respond to the driver's resync_request. Called only if
> + * resync is successful.
> + * @set_caps: set device ULP DDP capabilities.
> + * Returns a negative error code or zero.
> + * @get_stats: query ULP DDP statistics.
> + */
> +struct ulp_ddp_dev_ops {
> + int (*limits)(struct net_device *netdev,
> + struct ulp_ddp_limits *limits);
> + int (*sk_add)(struct net_device *netdev,
> + struct sock *sk,
> + struct ulp_ddp_config *config);
> + void (*sk_del)(struct net_device *netdev,
> + struct sock *sk);
> + int (*setup)(struct net_device *netdev,
> + struct sock *sk,
> + struct ulp_ddp_io *io);
> + void (*teardown)(struct net_device *netdev,
> + struct sock *sk,
> + struct ulp_ddp_io *io,
> + void *ddp_ctx);
> + void (*resync)(struct net_device *netdev,
> + struct sock *sk, u32 seq);
> + int (*set_caps)(struct net_device *dev, unsigned long *bits,
> + struct netlink_ext_ack *extack);
> + int (*get_stats)(struct net_device *dev,
> + struct ethtool_ulp_ddp_stats *stats);
> +};
It would be beneficial to have proper wrappers around these ops that
can also do some housekeeping work around the callbacks.
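
As a strawman (hypothetical helper name; the ops checks stand in for
the housekeeping):

static inline int ulp_ddp_sk_add(struct net_device *netdev,
                                 struct sock *sk,
                                 struct ulp_ddp_config *config)
{
        const struct ulp_ddp_dev_ops *ops = netdev->netdev_ops->ulp_ddp_ops;

        /* central place for capability checks, refcounting, stats... */
        if (!ops || !ops->sk_add)
                return -EOPNOTSUPP;

        return ops->sk_add(netdev, sk, config);
}

Same for sk_del/setup/teardown, so ULPs never touch the ndo directly.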