[PATCH v1 net-next 02/15] net: Introduce direct data placement tcp offload

Boris Pismenny borispismenny at gmail.com
Wed Dec 9 03:25:24 EST 2020


On 09/12/2020 2:57, David Ahern wrote:
> On 12/7/20 2:06 PM, Boris Pismenny wrote:
>> diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h
>> index 934de56644e7..fb35dcac03d2 100644
>> --- a/include/linux/netdev_features.h
>> +++ b/include/linux/netdev_features.h
>> @@ -84,6 +84,7 @@ enum {
>>  	NETIF_F_GRO_FRAGLIST_BIT,	/* Fraglist GRO */
>>  
>>  	NETIF_F_HW_MACSEC_BIT,		/* Offload MACsec operations */
>> +	NETIF_F_HW_TCP_DDP_BIT,		/* TCP direct data placement offload */
>>  
>>  	/*
>>  	 * Add your fresh new feature above and remember to update
>> @@ -157,6 +158,7 @@ enum {
>>  #define NETIF_F_GRO_FRAGLIST	__NETIF_F(GRO_FRAGLIST)
>>  #define NETIF_F_GSO_FRAGLIST	__NETIF_F(GSO_FRAGLIST)
>>  #define NETIF_F_HW_MACSEC	__NETIF_F(HW_MACSEC)
>> +#define NETIF_F_HW_TCP_DDP	__NETIF_F(HW_TCP_DDP)
> 
> All of the DDP naming seems wrong to me. I realize the specific use case
> is targeted payloads of a ULP, but it is still S/W handing H/W specific
> buffers for a payload of a flow.
> 
> 

This is intended to be used strictly by ULPs. DDP is how such things were
called in the past. It is more than zerocopy as explained before, so naming
it zerocopy will be misleading at best. I can propose another name. How about:
"Autonomous copy offload (ACO)" and "Autonomous crc offload (ACRC)"?
This will indicate that it is independent of other offloads, i.e. autonomous,
while also informing users of the functionality (copy/crc). Other names are
welcome too.


>>   * @icsk_listen_portaddr_node	hash to the portaddr listener hashtable
>>   * @icsk_ca_state:	   Congestion control state
>>   * @icsk_retransmits:	   Number of unrecovered [RTO] timeouts
>> @@ -94,6 +96,8 @@ struct inet_connection_sock {
>>  	const struct tcp_ulp_ops  *icsk_ulp_ops;
>>  	void __rcu		  *icsk_ulp_data;
>>  	void (*icsk_clean_acked)(struct sock *sk, u32 acked_seq);
>> +	const struct tcp_ddp_ulp_ops  *icsk_ulp_ddp_ops;
>> +	void __rcu		  *icsk_ulp_ddp_data;
>>  	struct hlist_node         icsk_listen_portaddr_node;
>>  	unsigned int		  (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
>>  	__u8			  icsk_ca_state:5,
>> diff --git a/include/net/tcp_ddp.h b/include/net/tcp_ddp.h
>> new file mode 100644
>> index 000000000000..df3264be4600
>> --- /dev/null
>> +++ b/include/net/tcp_ddp.h
>> @@ -0,0 +1,129 @@
>> +/* SPDX-License-Identifier: GPL-2.0
>> + *
>> + * tcp_ddp.h
>> + *	Author:	Boris Pismenny <borisp at mellanox.com>
>> + *	Copyright (C) 2020 Mellanox Technologies.
>> + */
>> +#ifndef _TCP_DDP_H
>> +#define _TCP_DDP_H
>> +
>> +#include <linux/netdevice.h>
>> +#include <net/inet_connection_sock.h>
>> +#include <net/sock.h>
>> +
>> +/* limits returned by the offload driver, zero means don't care */
>> +struct tcp_ddp_limits {
>> +	int	 max_ddp_sgl_len;
>> +};
>> +
>> +enum tcp_ddp_type {
>> +	TCP_DDP_NVME = 1,
>> +};
>> +
>> +/**
>> + * struct tcp_ddp_config - Generic tcp ddp configuration: tcp ddp IO queue
>> + * config implementations must use this as the first member.
>> + * Add new instances of tcp_ddp_config below (nvme-tcp, etc.).
>> + */
>> +struct tcp_ddp_config {
>> +	enum tcp_ddp_type    type;
>> +	unsigned char        buf[];
> 
> you have this variable length buf, but it is not used (as far as I can
> tell). But then ...
> 
> 

True. This buf[] is here to indicate that users are expected to extend it with
the ULP specific data as in nvme_tcp_config. We can remove it and leave a comment
if you prefer that.

>> +};
>> +
>> +/**
>> + * struct nvme_tcp_ddp_config - nvme tcp ddp configuration for an IO queue
>> + *
>> + * @pfv:        pdu version (e.g., NVME_TCP_PFV_1_0)
>> + * @cpda:       controller pdu data alignmend (dwords, 0's based)
>> + * @dgst:       digest types enabled.
>> + *              The netdev will offload crc if ddp_crc is supported.
>> + * @queue_size: number of nvme-tcp IO queue elements
>> + * @queue_id:   queue identifier
>> + * @cpu_io:     cpu core running the IO thread for this queue
>> + */
>> +struct nvme_tcp_ddp_config {
>> +	struct tcp_ddp_config   cfg;
> 
> ... how would you use it within another struct like this?
> 

You don't.

>> +
>> +	u16			pfv;
>> +	u8			cpda;
>> +	u8			dgst;
>> +	int			queue_size;
>> +	int			queue_id;
>> +	int			io_cpu;
>> +};
>> +
>> +/**
>> + * struct tcp_ddp_io - tcp ddp configuration for an IO request.
>> + *
>> + * @command_id:  identifier on the wire associated with these buffers
>> + * @nents:       number of entries in the sg_table
>> + * @sg_table:    describing the buffers for this IO request
>> + * @first_sgl:   first SGL in sg_table
>> + */
>> +struct tcp_ddp_io {
>> +	u32			command_id;
>> +	int			nents;
>> +	struct sg_table		sg_table;
>> +	struct scatterlist	first_sgl[SG_CHUNK_SIZE];
>> +};
>> +
>> +/* struct tcp_ddp_dev_ops - operations used by an upper layer protocol to configure ddp offload
>> + *
>> + * @tcp_ddp_limits:    limit the number of scatter gather entries per IO.
>> + *                     the device driver can use this to limit the resources allocated per queue.
>> + * @tcp_ddp_sk_add:    add offload for the queue represennted by the socket+config pair.
>> + *                     this function is used to configure either copy, crc or both offloads.
>> + * @tcp_ddp_sk_del:    remove offload from the socket, and release any device related resources.
>> + * @tcp_ddp_setup:     request copy offload for buffers associated with a command_id in tcp_ddp_io.
>> + * @tcp_ddp_teardown:  release offload resources association between buffers and command_id in
>> + *                     tcp_ddp_io.
>> + * @tcp_ddp_resync:    respond to the driver's resync_request. Called only if resync is successful.
>> + */
>> +struct tcp_ddp_dev_ops {
>> +	int (*tcp_ddp_limits)(struct net_device *netdev,
>> +			      struct tcp_ddp_limits *limits);
>> +	int (*tcp_ddp_sk_add)(struct net_device *netdev,
>> +			      struct sock *sk,
>> +			      struct tcp_ddp_config *config);
>> +	void (*tcp_ddp_sk_del)(struct net_device *netdev,
>> +			       struct sock *sk);
>> +	int (*tcp_ddp_setup)(struct net_device *netdev,
>> +			     struct sock *sk,
>> +			     struct tcp_ddp_io *io);
>> +	int (*tcp_ddp_teardown)(struct net_device *netdev,
>> +				struct sock *sk,
>> +				struct tcp_ddp_io *io,
>> +				void *ddp_ctx);
>> +	void (*tcp_ddp_resync)(struct net_device *netdev,
>> +			       struct sock *sk, u32 seq);
>> +};
>> +
>> +#define TCP_DDP_RESYNC_REQ (1 << 0)
>> +
>> +/**
>> + * struct tcp_ddp_ulp_ops - Interface to register uppper layer Direct Data Placement (DDP) TCP offload
>> + */
>> +struct tcp_ddp_ulp_ops {
>> +	/* NIC requests ulp to indicate if @seq is the start of a message */
>> +	bool (*resync_request)(struct sock *sk, u32 seq, u32 flags);
>> +	/* NIC driver informs the ulp that ddp teardown is done - used for async completions*/
>> +	void (*ddp_teardown_done)(void *ddp_ctx);
>> +};
>> +
>> +/**
>> + * struct tcp_ddp_ctx - Generic tcp ddp context: device driver per queue contexts must
>> + * use this as the first member.
>> + */
>> +struct tcp_ddp_ctx {
>> +	enum tcp_ddp_type    type;
>> +	unsigned char        buf[];
> 
> similar to my comment above, I did not see any uses of the buf element.
> 

Same idea, we will remove it an leave a comment.



More information about the Linux-nvme mailing list