[PATCH 08/17] nvme-tcp: fixup MSG_SENDPAGE_NOTLAST
Sagi Grimberg
sagi at grimberg.me
Wed Apr 19 02:08:40 PDT 2023
Please CC Jakub and Vadim in the next iterations.
I'd also CC netdev to let the networking folks know this is going on...
> We can only set MSG_SENDPAGE_NOTLAST when sending the command PDU if the
> actual data is transferred via sendpage(), too.
> If we use sendmsg() to transfer the data we end up with a TX stall on TLS
> as it implements different code paths for sendmsg() and sendpage().
> So check in nvme_tcp_try_send_pdu() if inline data is present and revert
> to sendmsg if the inline data cannot be sent via sendpage().
afaiu, the TX stalls are not related to sendpage and sendmsg being
separate code paths. The stalls are just a bug in an untested scenario
of sendpage(NOTLAST)+sendmsg.
It needs to be documented in the change log:
"Due to a bug in the tls code, we do xxx to workaround it"
In a separate patch.
>
> Signed-off-by: Hannes Reinecke <hare at suse.de>
> ---
> drivers/nvme/host/tcp.c | 18 ++++++++++++++----
> 1 file changed, 14 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 273c1f2760a4..9ecd7db2cd98 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -995,9 +995,11 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
> if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
> flags |= MSG_EOR;
> else
> - flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
> + flags |= MSG_MORE;
>
> if (sendpage_ok(page)) {
> + if (flags & MSG_MORE)
> + flags |= MSG_SENDPAGE_NOTLAST;
> ret = kernel_sendpage(queue->sock, page, offset, len,
> flags);
> } else {
This is fine, lets move this to its own patch.
> @@ -1049,15 +1051,23 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
> int ret;
>
> if (inline_data || nvme_tcp_queue_more(queue))
> - flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
> + flags |= MSG_MORE;
> else
> flags |= MSG_EOR;
>
> if (queue->hdr_digest && !req->offset)
> nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
>
> - ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
> - offset_in_page(pdu) + req->offset, len, flags);
> + if (!inline_data || sendpage_ok(nvme_tcp_req_cur_page(req))) {
This needs a big fat comment explaining why we are doing this bizarre and
non-trivial condition:
Something like:
--
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 7723a4989524..cd8610836de9 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1046,15 +1046,32 @@ static int nvme_tcp_try_send_cmd_pdu(struct
nvme_tcp_request *req)
int ret;
if (inline_data || nvme_tcp_queue_more(queue))
- flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
+ flags |= MSG_MORE;
else
flags |= MSG_EOR;
if (queue->hdr_digest && !req->offset)
nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
- ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
- offset_in_page(pdu) + req->offset, len, flags);
+ /**
+ * FIXME: Due to a bug in the tls code, issuing sendpage with msg
+ * flag MSG_SENDPAGE_NOTLAST and then issuing sendmsg, we can hit
+ * some TX stalls. Our connect data (which is the request
inline_data)
+ * is a slab allocated page and hence we must not use sendpage to
+ * send it, and this hits exactly the tls issue. workaround it by
+ * sending sendmsg on the command itself as if we are running with
+ * a tls socket and the inline data cannot be sendpage'd.
+	 * When the tls code is resolved, this workaround needs to be
removed.
+ */
+ if (nvme_tcp_tls_enabled(queue) && inline_data &&
+ sendpage_ok(nvme_tcp_req_cur_page(req))) {
+ ret = sock_no_sendpage(queue->sock, virt_to_page(pdu),
+ offset_in_page(pdu) + req->offset, len,
flags);
+ } else {
+ flags |= MSG_SENDPAGE_NOTLAST;
+ ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
+ offset_in_page(pdu) + req->offset, len,
flags);
+ }
if (unlikely(ret <= 0))
return ret;
--
> + if (flags & MSG_MORE)
> + flags |= MSG_SENDPAGE_NOTLAST;
> + ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
> + offset_in_page(pdu) + req->offset, len, flags);
> + } else {
> + ret = sock_no_sendpage(queue->sock, virt_to_page(pdu),
> + offset_in_page(pdu) + req->offset,
> + len, flags);
> + }
> if (unlikely(ret <= 0))
> return ret;
>
More information about the Linux-nvme
mailing list