[PATCH] nvme-tcp: send H2CData PDUs based on MAXH2CDATA

Wed Nov 17 01:01:52 PST 2021

Hi Varun,

Sorry for the delay, thanks for the patch, I have a few
comments.

On 10/27/21 3:23 PM, Varun Prakash wrote:
> As per NVMe/TCP specification (revision 1.0a, section 3.6.2.3)
> Maximum Host to Controller Data length (MAXH2CDATA): Specifies the
> maximum number of PDU-Data bytes per H2CData PDU in bytes. This value
> is a multiple of dwords and should be no less than 4,096.
> 
> Current code sets H2CData PDU data_length to r2t_length,
> it does not check MAXH2CDATA value. Fix this by setting H2CData PDU
> data_length to min(req->rem_r2t_len, queue->maxh2cdata).
> 
> Also validate MAXH2CDATA value returned by target in ICResp PDU,
> if it is not a multiple of dword or if it is less than 4096 return
> -EINVAL from nvme_tcp_init_connection().
> 
> Signed-off-by: Varun Prakash <varun at chelsio.com>
> ---
>   drivers/nvme/host/tcp.c  | 86 ++++++++++++++++++++++++++++++++++++++++--------
>   include/linux/nvme-tcp.h |  1 +
>   2 files changed, 74 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 22da856..d7a7d85 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -42,6 +42,7 @@ struct nvme_tcp_request {
>   	void			*pdu;
>   	struct nvme_tcp_queue	*queue;
>   	u32			data_len;
> +	u32			rem_r2t_len;

I suggest that given this is a decrementing value we should
call it h2cdata_left.

>   	u32			pdu_len;
>   	u32			pdu_sent;
>   	u16			ttag;
> @@ -95,6 +96,7 @@ struct nvme_tcp_queue {
>   	struct nvme_tcp_request *request;
>   
>   	int			queue_size;
> +	u32			maxh2cdata;
>   	size_t			cmnd_capsule_len;
>   	struct nvme_tcp_ctrl	*ctrl;
>   	unsigned long		flags;
> @@ -578,23 +580,21 @@ static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req,
>   	struct nvme_tcp_data_pdu *data = req->pdu;
>   	struct nvme_tcp_queue *queue = req->queue;
>   	struct request *rq = blk_mq_rq_from_pdu(req);
> +	u32 r2t_length = le32_to_cpu(pdu->r2t_length);
>   	u8 hdgst = nvme_tcp_hdgst_len(queue);
>   	u8 ddgst = nvme_tcp_ddgst_len(queue);
>   
> -	req->pdu_len = le32_to_cpu(pdu->r2t_length);
> -	req->pdu_sent = 0;
> -
> -	if (unlikely(!req->pdu_len)) {
> +	if (unlikely(!r2t_length)) {
>   		dev_err(queue->ctrl->ctrl.device,
>   			"req %d r2t len is %u, probably a bug...\n",
> -			rq->tag, req->pdu_len);
> +			rq->tag, r2t_length);
>   		return -EPROTO;
>   	}
>   
> -	if (unlikely(req->data_sent + req->pdu_len > req->data_len)) {
> +	if (unlikely(req->data_sent + r2t_length > req->data_len)) {
>   		dev_err(queue->ctrl->ctrl.device,
>   			"req %d r2t len %u exceeded data len %u (%zu sent)\n",
> -			rq->tag, req->pdu_len, req->data_len,
> +			rq->tag, r2t_length, req->data_len,
>   			req->data_sent);
>   		return -EPROTO;
>   	}

I think we should move these checks to nvme_tcp_handle_r2t anyways, this
should only be setting the h2cdata pdu. This should probably be a prep
patch.

> @@ -607,9 +607,15 @@ static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req,
>   		return -EPROTO;
>   	}
>   
> +	req->rem_r2t_len = r2t_length;
> +	req->pdu_len = min(req->rem_r2t_len, queue->maxh2cdata);
> +	req->pdu_sent = 0;
> +	req->rem_r2t_len -= req->pdu_len;
> +
>   	memset(data, 0, sizeof(*data));
>   	data->hdr.type = nvme_tcp_h2c_data;
> -	data->hdr.flags = NVME_TCP_F_DATA_LAST;
> +	if (!req->rem_r2t_len)
> +		data->hdr.flags = NVME_TCP_F_DATA_LAST;
>   	if (queue->hdr_digest)
>   		data->hdr.flags |= NVME_TCP_F_HDGST;
>   	if (queue->data_digest)
> @@ -923,9 +929,34 @@ static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
>   	nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_HOST_PATH_ERROR);
>   }
>   
> +static void nvme_tcp_update_h2c_data_pdu(struct nvme_tcp_request *req)
> +{
> +	struct nvme_tcp_queue *queue = req->queue;
> +	struct nvme_tcp_data_pdu *data = req->pdu;
> +	u8 hdgst = nvme_tcp_hdgst_len(queue);
> +	u8 ddgst = nvme_tcp_ddgst_len(queue);
> +	u32 pdu_sent = req->pdu_len;
> +
> +	req->pdu_len = min(req->rem_r2t_len, queue->maxh2cdata);
> +	req->pdu_sent = 0;
> +	req->rem_r2t_len -= req->pdu_len;
> +
> +	if (!req->rem_r2t_len)
> +		data->hdr.flags |= NVME_TCP_F_DATA_LAST;
> +	data->hdr.plen =
> +		cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
> +	data->data_offset =
> +		cpu_to_le32(le32_to_cpu(data->data_offset) + pdu_sent);
> +	data->data_length = cpu_to_le32(req->pdu_len);
> +
> +	req->state = NVME_TCP_SEND_H2C_PDU;
> +	req->offset = 0;
> +}

Ideally we don't have two functions one for setup and one for update,
they should be the same.

This can probably be acheived when the r2t checks will move out of
this function, and we can record the r2t_legnth and r2t_offset in the
request and use that in nvme_tcp_setup_h2c_data_pdu.

> +
>   static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
>   {
>   	struct nvme_tcp_queue *queue = req->queue;
> +	u32 rem_r2t_len = req->rem_r2t_len;
>   
>   	while (true) {
>   		struct page *page = nvme_tcp_req_cur_page(req);
> @@ -969,7 +1000,10 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
>   				req->state = NVME_TCP_SEND_DDGST;
>   				req->offset = 0;
>   			} else {
> -				nvme_tcp_done_send_req(queue);
> +				if (rem_r2t_len)
> +					nvme_tcp_update_h2c_data_pdu(req);
> +				else
> +					nvme_tcp_done_send_req(queue);
>   			}
>   			return 1;
>   		}
> @@ -1022,14 +1056,26 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
>   	struct nvme_tcp_data_pdu *pdu = req->pdu;
>   	u8 hdgst = nvme_tcp_hdgst_len(queue);
>   	int len = sizeof(*pdu) - req->offset + hdgst;
> +	int flags = MSG_DONTWAIT | MSG_MORE;
>   	int ret;
>   
>   	if (queue->hdr_digest && !req->offset)
>   		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
>   
> -	ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
> -			offset_in_page(pdu) + req->offset, len,
> -			MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
> +	if (req->rem_r2t_len) {
> +		struct msghdr msg = { .msg_flags = flags };
> +		struct kvec iov = {
> +			.iov_base = (u8 *)pdu + req->offset,
> +			.iov_len = len
> +		};
> +
> +		ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
> +	} else {
> +		ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
> +				      offset_in_page(pdu) + req->offset, len,
> +				      flags | MSG_SENDPAGE_NOTLAST);
> +	}

Why is this needed? Seems out-of-place to me...