[PATCH v9 01/25] net: Introduce direct data placement tcp offload

Paolo Abeni pabeni at redhat.com
Fri Jan 20 00:52:34 PST 2023


On Tue, 2023-01-17 at 17:35 +0200, Aurelien Aptel wrote:
> From: Boris Pismenny <borisp at nvidia.com>
> 
> This commit introduces direct data placement (DDP) offload for TCP.
> 
> The motivation is saving compute resources/cycles that are spent
> to copy data from SKBs to the block layer buffers and CRC
> calculation/verification for received PDUs (Protocol Data Units).
> 
> The DDP capability is accompanied by new net_device operations that
> configure hardware contexts.
> 
> There is a context per socket, and a context per DDP operation.
> Additionally, a resynchronization routine is used to assist
> hardware handle TCP OOO, and continue the offload. Furthermore,
> we let the offloading driver advertise what is the max hw
> sectors/segments.
> 
> The interface includes five net-device ddp operations:
> 
>  1. sk_add - add offload for the queue represented by socket+config pair
>  2. sk_del - remove the offload for the socket/queue
>  3. ddp_setup - request copy offload for buffers associated with an IO
>  4. ddp_teardown - release offload resources for that IO
>  5. limits - query NIC driver for quirks and limitations (e.g.
>              max number of scatter gather entries per IO)
> 
> Using this interface, the NIC hardware will scatter TCP payload
> directly to the BIO pages according to the command_id.
> 
> To maintain the correctness of the network stack, the driver is
> expected to construct SKBs that point to the BIO pages.
> 
> The SKB passed to the network stack from the driver represents
> data as it is on the wire, while it is pointing directly to data
> in destination buffers.
> 
> As a result, data from page frags should not be copied out to
> the linear part. To avoid needless copies, such as when using
> skb_condense, we mark the skb->ulp_ddp bit.
> In addition, the skb->ulp_crc will be used by the upper layers to
> determine if CRC re-calculation is required. The two separated skb
> indications are needed to avoid false positives GRO flushing events.
> 
> Follow-up patches will use this interface for DDP in NVMe-TCP.
> 
> Capability bits stored in net_device allow drivers to report which
> ULP DDP capabilities a device supports. Control over these
> capabilities will be exposed to userspace in later patches.
> 
> Signed-off-by: Boris Pismenny <borisp at nvidia.com>
> Signed-off-by: Ben Ben-Ishay <benishay at nvidia.com>
> Signed-off-by: Or Gerlitz <ogerlitz at nvidia.com>
> Signed-off-by: Yoray Zack <yorayz at nvidia.com>
> Signed-off-by: Shai Malin <smalin at nvidia.com>
> Signed-off-by: Aurelien Aptel <aaptel at nvidia.com>
> ---
>  include/linux/netdevice.h          |  15 +++
>  include/linux/skbuff.h             |  24 ++++
>  include/net/inet_connection_sock.h |   4 +
>  include/net/ulp_ddp.h              | 173 +++++++++++++++++++++++++++++
>  include/net/ulp_ddp_caps.h         |  41 +++++++
>  net/Kconfig                        |  20 ++++
>  net/core/skbuff.c                  |   3 +-
>  net/ipv4/tcp_input.c               |   8 ++
>  net/ipv4/tcp_ipv4.c                |   3 +
>  net/ipv4/tcp_offload.c             |   3 +
>  10 files changed, 293 insertions(+), 1 deletion(-)
>  create mode 100644 include/net/ulp_ddp.h
>  create mode 100644 include/net/ulp_ddp_caps.h
> 
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
> index aad12a179e54..289cfdade177 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -52,6 +52,10 @@
>  #include <net/net_trackers.h>
>  #include <net/net_debug.h>
>  
> +#ifdef CONFIG_ULP_DDP
> +#include <net/ulp_ddp_caps.h>
> +#endif
> +
>  struct netpoll_info;
>  struct device;
>  struct ethtool_ops;
> @@ -1392,6 +1396,8 @@ struct netdev_net_notifier {
>   *	Get hardware timestamp based on normal/adjustable time or free running
>   *	cycle counter. This function is required if physical clock supports a
>   *	free running cycle counter.
> + * struct ulp_ddp_dev_ops *ulp_ddp_ops;
> + *	ULP DDP operations (see include/net/ulp_ddp.h)
>   */
>  struct net_device_ops {
>  	int			(*ndo_init)(struct net_device *dev);
> @@ -1616,6 +1622,9 @@ struct net_device_ops {
>  	ktime_t			(*ndo_get_tstamp)(struct net_device *dev,
>  						  const struct skb_shared_hwtstamps *hwtstamps,
>  						  bool cycles);
> +#if IS_ENABLED(CONFIG_ULP_DDP)
> +	const struct ulp_ddp_dev_ops	*ulp_ddp_ops;
> +#endif
>  };
>  
>  /**
> @@ -1783,6 +1792,9 @@ enum netdev_ml_priv_type {
>   *	@mpls_features:	Mask of features inheritable by MPLS
>   *	@gso_partial_features: value(s) from NETIF_F_GSO\*
>   *
> + *	@ulp_ddp_caps:	Bitflags keeping track of supported and enabled
> + *			ULP DDP capabilities.
> + *
>   *	@ifindex:	interface index
>   *	@group:		The group the device belongs to
>   *
> @@ -2071,6 +2083,9 @@ struct net_device {
>  	netdev_features_t	mpls_features;
>  	netdev_features_t	gso_partial_features;
>  
> +#ifdef CONFIG_ULP_DDP
> +	struct ulp_ddp_netdev_caps ulp_ddp_caps;
> +#endif
>  	unsigned int		min_mtu;
>  	unsigned int		max_mtu;
>  	unsigned short		type;
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 4c8492401a10..8708c5935e89 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -811,6 +811,8 @@ typedef unsigned char *sk_buff_data_t;
>   *		delivery_time in mono clock base (i.e. EDT).  Otherwise, the
>   *		skb->tstamp has the (rcv) timestamp at ingress and
>   *		delivery_time at egress.
> + *	@ulp_ddp: DDP offloaded
> + *	@ulp_crc: CRC offloaded
>   *	@napi_id: id of the NAPI struct this skb came from
>   *	@sender_cpu: (aka @napi_id) source CPU in XPS
>   *	@alloc_cpu: CPU which did the skb allocation.
> @@ -983,6 +985,10 @@ struct sk_buff {
>  	__u8			slow_gro:1;
>  	__u8			csum_not_inet:1;
>  	__u8			scm_io_uring:1;
> +#ifdef CONFIG_ULP_DDP
> +	__u8                    ulp_ddp:1;
> +	__u8			ulp_crc:1;
> +#endif
>  
>  #ifdef CONFIG_NET_SCHED
>  	__u16			tc_index;	/* traffic control index */
> @@ -5053,5 +5059,23 @@ static inline void skb_mark_for_recycle(struct sk_buff *skb)
>  }
>  #endif
>  
> +static inline bool skb_is_ulp_ddp(struct sk_buff *skb)
> +{
> +#ifdef CONFIG_ULP_DDP
> +	return skb->ulp_ddp;
> +#else
> +	return 0;
> +#endif
> +}
> +
> +static inline bool skb_is_ulp_crc(struct sk_buff *skb)
> +{
> +#ifdef CONFIG_ULP_DDP
> +	return skb->ulp_crc;
> +#else
> +	return 0;
> +#endif
> +}
> +
>  #endif	/* __KERNEL__ */
>  #endif	/* _LINUX_SKBUFF_H */
> diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
> index c2b15f7e5516..2ba73167b3bb 100644
> --- a/include/net/inet_connection_sock.h
> +++ b/include/net/inet_connection_sock.h
> @@ -68,6 +68,8 @@ struct inet_connection_sock_af_ops {
>   * @icsk_ulp_ops	   Pluggable ULP control hook
>   * @icsk_ulp_data	   ULP private data
>   * @icsk_clean_acked	   Clean acked data hook
> + * @icsk_ulp_ddp_ops	   Pluggable ULP direct data placement control hook
> + * @icsk_ulp_ddp_data	   ULP direct data placement private data
>   * @icsk_ca_state:	   Congestion control state
>   * @icsk_retransmits:	   Number of unrecovered [RTO] timeouts
>   * @icsk_pending:	   Scheduled timer event
> @@ -98,6 +100,8 @@ struct inet_connection_sock {
>  	const struct tcp_ulp_ops  *icsk_ulp_ops;
>  	void __rcu		  *icsk_ulp_data;
>  	void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq);
> +	const struct ulp_ddp_ulp_ops  *icsk_ulp_ddp_ops;
> +	void __rcu		  *icsk_ulp_ddp_data;

The above probably need a

#if IS_ENABLED(CONFIG_ULP_DDP)

compiler guard.

Have you considered avoiding adding the above fields here, and instead
pass them as argument for the setup() H/W offload operation? 

I feel like such fields belong more naturally to the DDP offload
context/queue and currently the icsk DDP ops are only used by the
offloading driver. Additionally it looks strange to me 2 consecutive
different set of ULPs inside the same object (sock).

Thanks,

Paolo




More information about the Linux-nvme mailing list