[PATCH v9 01/25] net: Introduce direct data placement tcp offload
Paolo Abeni
pabeni at redhat.com
Fri Jan 20 00:52:34 PST 2023
On Tue, 2023-01-17 at 17:35 +0200, Aurelien Aptel wrote:
> From: Boris Pismenny <borisp at nvidia.com>
>
> This commit introduces direct data placement (DDP) offload for TCP.
>
> The motivation is saving compute resources/cycles that are spent
> to copy data from SKBs to the block layer buffers and CRC
> calculation/verification for received PDUs (Protocol Data Units).
>
> The DDP capability is accompanied by new net_device operations that
> configure hardware contexts.
>
> There is a context per socket, and a context per DDP operation.
> Additionally, a resynchronization routine is used to help the
> hardware handle TCP out-of-order (OOO) segments and continue the
> offload. Furthermore, we let the offloading driver advertise the
> maximum number of hw sectors/segments it supports.
>
> The interface includes five net-device ddp operations:
>
> 1. sk_add - add offload for the queue represented by socket+config pair
> 2. sk_del - remove the offload for the socket/queue
> 3. ddp_setup - request copy offload for buffers associated with an IO
> 4. ddp_teardown - release offload resources for that IO
> 5. limits - query NIC driver for quirks and limitations (e.g.
> max number of scatter gather entries per IO)
>
> Using this interface, the NIC hardware will scatter TCP payload
> directly to the BIO pages according to the command_id.
>
> To maintain the correctness of the network stack, the driver is
> expected to construct SKBs that point to the BIO pages.
>
> The SKB passed to the network stack from the driver represents
> data as it is on the wire, while it is pointing directly to data
> in destination buffers.
>
> As a result, data from page frags should not be copied out to
> the linear part. To avoid needless copies, such as when using
> skb_condense, we mark the skb->ulp_ddp bit.
> In addition, the skb->ulp_crc bit will be used by the upper layers to
> determine if CRC re-calculation is required. The two separate skb
> indications are needed to avoid false-positive GRO flushing events.
>
> Follow-up patches will use this interface for DDP in NVMe-TCP.
>
> Capability bits stored in net_device allow drivers to report which
> ULP DDP capabilities a device supports. Control over these
> capabilities will be exposed to userspace in later patches.
>
> Signed-off-by: Boris Pismenny <borisp at nvidia.com>
> Signed-off-by: Ben Ben-Ishay <benishay at nvidia.com>
> Signed-off-by: Or Gerlitz <ogerlitz at nvidia.com>
> Signed-off-by: Yoray Zack <yorayz at nvidia.com>
> Signed-off-by: Shai Malin <smalin at nvidia.com>
> Signed-off-by: Aurelien Aptel <aaptel at nvidia.com>
> ---
> include/linux/netdevice.h | 15 +++
> include/linux/skbuff.h | 24 ++++
> include/net/inet_connection_sock.h | 4 +
> include/net/ulp_ddp.h | 173 +++++++++++++++++++++++++++++
> include/net/ulp_ddp_caps.h | 41 +++++++
> net/Kconfig | 20 ++++
> net/core/skbuff.c | 3 +-
> net/ipv4/tcp_input.c | 8 ++
> net/ipv4/tcp_ipv4.c | 3 +
> net/ipv4/tcp_offload.c | 3 +
> 10 files changed, 293 insertions(+), 1 deletion(-)
> create mode 100644 include/net/ulp_ddp.h
> create mode 100644 include/net/ulp_ddp_caps.h
>
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index aad12a179e54..289cfdade177 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -52,6 +52,10 @@
> #include <net/net_trackers.h>
> #include <net/net_debug.h>
>
> +#ifdef CONFIG_ULP_DDP
> +#include <net/ulp_ddp_caps.h>
> +#endif
> +
> struct netpoll_info;
> struct device;
> struct ethtool_ops;
> @@ -1392,6 +1396,8 @@ struct netdev_net_notifier {
> * Get hardware timestamp based on normal/adjustable time or free running
> * cycle counter. This function is required if physical clock supports a
> * free running cycle counter.
> + * struct ulp_ddp_dev_ops *ulp_ddp_ops;
> + * ULP DDP operations (see include/net/ulp_ddp.h)
> */
> struct net_device_ops {
> int (*ndo_init)(struct net_device *dev);
> @@ -1616,6 +1622,9 @@ struct net_device_ops {
> ktime_t (*ndo_get_tstamp)(struct net_device *dev,
> const struct skb_shared_hwtstamps *hwtstamps,
> bool cycles);
> +#if IS_ENABLED(CONFIG_ULP_DDP)
> + const struct ulp_ddp_dev_ops *ulp_ddp_ops;
> +#endif
> };
>
> /**
> @@ -1783,6 +1792,9 @@ enum netdev_ml_priv_type {
> * @mpls_features: Mask of features inheritable by MPLS
> * @gso_partial_features: value(s) from NETIF_F_GSO\*
> *
> + * @ulp_ddp_caps: Bitflags keeping track of supported and enabled
> + * ULP DDP capabilities.
> + *
> * @ifindex: interface index
> * @group: The group the device belongs to
> *
> @@ -2071,6 +2083,9 @@ struct net_device {
> netdev_features_t mpls_features;
> netdev_features_t gso_partial_features;
>
> +#ifdef CONFIG_ULP_DDP
> + struct ulp_ddp_netdev_caps ulp_ddp_caps;
> +#endif
> unsigned int min_mtu;
> unsigned int max_mtu;
> unsigned short type;
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 4c8492401a10..8708c5935e89 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -811,6 +811,8 @@ typedef unsigned char *sk_buff_data_t;
> * delivery_time in mono clock base (i.e. EDT). Otherwise, the
> * skb->tstamp has the (rcv) timestamp at ingress and
> * delivery_time at egress.
> + * @ulp_ddp: DDP offloaded
> + * @ulp_crc: CRC offloaded
> * @napi_id: id of the NAPI struct this skb came from
> * @sender_cpu: (aka @napi_id) source CPU in XPS
> * @alloc_cpu: CPU which did the skb allocation.
> @@ -983,6 +985,10 @@ struct sk_buff {
> __u8 slow_gro:1;
> __u8 csum_not_inet:1;
> __u8 scm_io_uring:1;
> +#ifdef CONFIG_ULP_DDP
> + __u8 ulp_ddp:1;
> + __u8 ulp_crc:1;
> +#endif
>
> #ifdef CONFIG_NET_SCHED
> __u16 tc_index; /* traffic control index */
> @@ -5053,5 +5059,23 @@ static inline void skb_mark_for_recycle(struct sk_buff *skb)
> }
> #endif
>
> +static inline bool skb_is_ulp_ddp(struct sk_buff *skb)
> +{
> +#ifdef CONFIG_ULP_DDP
> + return skb->ulp_ddp;
> +#else
> + return 0;
> +#endif
> +}
> +
> +static inline bool skb_is_ulp_crc(struct sk_buff *skb)
> +{
> +#ifdef CONFIG_ULP_DDP
> + return skb->ulp_crc;
> +#else
> + return 0;
> +#endif
> +}
> +
> #endif /* __KERNEL__ */
> #endif /* _LINUX_SKBUFF_H */
> diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
> index c2b15f7e5516..2ba73167b3bb 100644
> --- a/include/net/inet_connection_sock.h
> +++ b/include/net/inet_connection_sock.h
> @@ -68,6 +68,8 @@ struct inet_connection_sock_af_ops {
> * @icsk_ulp_ops Pluggable ULP control hook
> * @icsk_ulp_data ULP private data
> * @icsk_clean_acked Clean acked data hook
> + * @icsk_ulp_ddp_ops Pluggable ULP direct data placement control hook
> + * @icsk_ulp_ddp_data ULP direct data placement private data
> * @icsk_ca_state: Congestion control state
> * @icsk_retransmits: Number of unrecovered [RTO] timeouts
> * @icsk_pending: Scheduled timer event
> @@ -98,6 +100,8 @@ struct inet_connection_sock {
> const struct tcp_ulp_ops *icsk_ulp_ops;
> void __rcu *icsk_ulp_data;
> void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq);
> + const struct ulp_ddp_ulp_ops *icsk_ulp_ddp_ops;
> + void __rcu *icsk_ulp_ddp_data;
The above fields probably need a
#if IS_ENABLED(CONFIG_ULP_DDP)
compiler guard.
Have you considered not adding the above fields here, and instead
passing them as arguments to the setup() H/W offload operation?
I feel like such fields belong more naturally to the DDP offload
context/queue, and currently the icsk DDP ops are only used by the
offloading driver. Additionally, it looks strange to me to have 2
consecutive, different sets of ULP fields inside the same object (sock).
Thanks,
Paolo
More information about the Linux-nvme
mailing list