[PATCH 08/17] nvme-tcp: fixup MSG_SENDPAGE_NOTLAST

Sagi Grimberg sagi at grimberg.me
Wed Apr 19 02:08:40 PDT 2023


Please CC Jakub and Vadim in the next iterations.
I'd also CC netdev to let the networking folks know this is going on...

> We can only set MSG_SENDPAGE_NOTLAST when sending the command PDU if the
> actual data is transferred via sendpage(), too.
> If we use sendmsg() to transfer the data we end up with a TX stall on TLS
> as it implements different code paths for sendmsg() and sendpage().
> So check in nvme_tcp_try_send_pdu() if inline data is present and revert
> to sendmsg if the inline data cannot be sent via sendpage().

afaiu, the TX stalls are not related to sendpage and sendmsg being
separate code paths. The stalls are just a bug in an untested scenario
of sendpage(NOTLAST)+sendmsg...

It needs to be documented in the change log:
"Due to a bug in the tls code, we do xxx to workaround it"

In a separate patch.

> 
> Signed-off-by: Hannes Reinecke <hare at suse.de>
> ---
>   drivers/nvme/host/tcp.c | 18 ++++++++++++++----
>   1 file changed, 14 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 273c1f2760a4..9ecd7db2cd98 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -995,9 +995,11 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
>   		if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
>   			flags |= MSG_EOR;
>   		else
> -			flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
> +			flags |= MSG_MORE;
>   
>   		if (sendpage_ok(page)) {
> +			if (flags & MSG_MORE)
> +				flags |= MSG_SENDPAGE_NOTLAST;
>   			ret = kernel_sendpage(queue->sock, page, offset, len,
>   					flags);
>   		} else {

This is fine, lets move this to its own patch.

> @@ -1049,15 +1051,23 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
>   	int ret;
>   
>   	if (inline_data || nvme_tcp_queue_more(queue))
> -		flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
> +		flags |= MSG_MORE;
>   	else
>   		flags |= MSG_EOR;
>   
>   	if (queue->hdr_digest && !req->offset)
>   		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
>   
> -	ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
> -			offset_in_page(pdu) + req->offset, len,  flags);
> +	if (!inline_data || sendpage_ok(nvme_tcp_req_cur_page(req))) {

This needs a big fat comment as to why we are doing this bizarre and
non-trivial condition:

Something like:
--
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 7723a4989524..cd8610836de9 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1046,15 +1046,32 @@ static int nvme_tcp_try_send_cmd_pdu(struct 
nvme_tcp_request *req)
         int ret;

         if (inline_data || nvme_tcp_queue_more(queue))
-               flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+               flags |= MSG_MORE;
         else
                 flags |= MSG_EOR;

         if (queue->hdr_digest && !req->offset)
                 nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));

-       ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
-                       offset_in_page(pdu) + req->offset, len,  flags);
+       /**
+        * FIXME: Due to a bug in the tls code, issuing sendpage with msg
+        * flag MSG_SENDPAGE_NOTLAST and then issuing sendmsg, we can hit
+        * some TX stalls. Our connect data (which is the request 
inline_data)
+        * is a slab allocated page and hence we must not use sendpage to
+        * send it, and this hits exactly the tls issue. work around it by
+        * sending sendmsg on the command itself as if we are running with
+        * a tls socket and the inline data cannot be sendpage'd.
+        * When the tls issue is resolved, this workaround needs to be removed.
+        */
+       if (nvme_tcp_tls_enabled(queue) && inline_data &&
+           sendpage_ok(nvme_tcp_req_cur_page(req))) {
+               ret = sock_no_sendpage(queue->sock, virt_to_page(pdu),
+                               offset_in_page(pdu) + req->offset, len, 
flags);
+       } else {
+               flags |= MSG_SENDPAGE_NOTLAST;
+               ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
+                               offset_in_page(pdu) + req->offset, len, 
flags);
+       }
         if (unlikely(ret <= 0))
                 return ret;
--

> +		if (flags & MSG_MORE)
> +			flags |= MSG_SENDPAGE_NOTLAST;
> +		ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
> +				offset_in_page(pdu) + req->offset, len,  flags);
> +	} else {
> +		ret = sock_no_sendpage(queue->sock, virt_to_page(pdu),
> +				offset_in_page(pdu) + req->offset,
> +				len, flags);
> +	}
>   	if (unlikely(ret <= 0))
>   		return ret;
>   



More information about the Linux-nvme mailing list