[PATCH RFC] nvme-tcp: Implement recvmsg() receive flow

Alistair Francis Alistair.Francis at wdc.com
Wed Feb 25 05:15:12 PST 2026


On Wed, 2026-02-25 at 12:41 +0100, Hannes Reinecke wrote:
> On 2/25/26 11:56, Alistair Francis wrote:
> > On Fri, 2025-09-12 at 13:58 +0200, Hannes Reinecke wrote:
> > > Switch to use recvmsg() so that we get access to TLS control
> > > messages eg for handling TLS KeyUpdate.
> > > 
> > > Signed-off-by: Hannes Reinecke <hare at kernel.org>
> > > ---
> > >   drivers/nvme/host/tcp.c | 204 ++++++++++++++++++++++-----------
> > > -----
> > > --
> > >   1 file changed, 111 insertions(+), 93 deletions(-)
> > > 
> > > diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> > > index c0fe8cfb7229..9ef1d4aea838 100644
> > > --- a/drivers/nvme/host/tcp.c
> > > +++ b/drivers/nvme/host/tcp.c
> > > @@ -17,6 +17,7 @@
> > >   #include <net/tls_prot.h>
> > >   #include <net/handshake.h>
> > >   #include <linux/blk-mq.h>
> > > +#include <linux/iov_iter.h>
> > >   #include <net/busy_poll.h>
> > >   #include <trace/events/sock.h>
> > >   
> > > @@ -476,6 +477,28 @@ static inline void nvme_tcp_ddgst_update(u32
> > > *crcp,
> > >   	}
> > >   }
> > >   
> > > +static size_t nvme_tcp_ddgst_step(void *iter_base, size_t
> > > progress,
> > > size_t len,
> > > +				  void *priv, void *priv2)
> > > +{
> > > +	u32 *crcp = priv;
> > > +
> > > +	*crcp = crc32c(*crcp, iter_base, len);
> > > +        return 0;
> > > +}
> > > +
> > > +static int nvme_tcp_ddgst_calc(struct nvme_tcp_request *req, u32
> > > *crcp,
> > > +			       size_t maxsize)
> > > +{
> > > +	struct iov_iter tmp = req->iter;
> > > +	int err = 0;
> > > +
> > > +	tmp.count = maxsize;
> > > +	if (iterate_and_advance_kernel(&tmp, maxsize, crcp,
> > > &err,
> > > +				       nvme_tcp_ddgst_step) !=
> > > maxsize)
> > > +		return err;
> > > +	return 0;
> > > +}
> > > +
> > >   static inline __le32 nvme_tcp_ddgst_final(u32 crc)
> > >   {
> > >   	return cpu_to_le32(~crc);
> > > @@ -827,23 +850,26 @@ static void nvme_tcp_handle_c2h_term(struct
> > > nvme_tcp_queue *queue,
> > >   		"Received C2HTermReq (FES = %s)\n", msg);
> > >   }
> > >   
> > > -static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue,
> > > struct
> > > sk_buff *skb,
> > > -		unsigned int *offset, size_t *len)
> > > +static int nvme_tcp_recvmsg_pdu(struct nvme_tcp_queue *queue)
> > >   {
> > > -	struct nvme_tcp_hdr *hdr;
> > >   	char *pdu = queue->pdu;
> > > -	size_t rcv_len = min_t(size_t, *len, queue-
> > > >pdu_remaining);
> > > +	struct msghdr msg = {
> > > +		.msg_flags = MSG_DONTWAIT,
> > > +	};
> > > +	struct kvec iov = {
> > > +		.iov_base = pdu + queue->pdu_offset,
> > > +		.iov_len = queue->pdu_remaining,
> > > +	};
> > > +	struct nvme_tcp_hdr *hdr;
> > >   	int ret;
> > >   
> > > -	ret = skb_copy_bits(skb, *offset,
> > > -		&pdu[queue->pdu_offset], rcv_len);
> > > -	if (unlikely(ret))
> > > +	ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
> > > +			     iov.iov_len, msg.msg_flags);
> > > +	if (ret <= 0)
> > >   		return ret;
> > >   
> > > -	queue->pdu_remaining -= rcv_len;
> > > -	queue->pdu_offset += rcv_len;
> > > -	*offset += rcv_len;
> > > -	*len -= rcv_len;
> > > +	queue->pdu_remaining -= ret;
> > > +	queue->pdu_offset += ret;
> > >   	if (queue->pdu_remaining)
> > >   		return 0;
> > >   
> > > @@ -907,20 +933,19 @@ static inline void
> > > nvme_tcp_end_request(struct
> > > request *rq, u16 status)
> > >   		nvme_complete_rq(rq);
> > >   }
> > >   
> > > -static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue,
> > > struct
> > > sk_buff *skb,
> > > -			      unsigned int *offset, size_t *len)
> > > +static int nvme_tcp_recvmsg_data(struct nvme_tcp_queue *queue)
> > >   {
> > >   	struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
> > >   	struct request *rq =
> > >   		nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu-
> > > > command_id);
> > >   	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
> > >   
> > > -	while (true) {
> > > -		int recv_len, ret;
> > > +	if (nvme_tcp_recv_state(queue) != NVME_TCP_RECV_DATA)
> > > +		return 0;
> > >   
> > > -		recv_len = min_t(size_t, *len, queue-
> > > > data_remaining);
> > > -		if (!recv_len)
> > > -			break;
> > > +	while (queue->data_remaining) {
> > > +		struct msghdr msg;
> > > +		int ret;
> > >   
> > >   		if (!iov_iter_count(&req->iter)) {
> > >   			req->curr_bio = req->curr_bio->bi_next;
> > > @@ -940,25 +965,22 @@ static int nvme_tcp_recv_data(struct
> > > nvme_tcp_queue *queue, struct sk_buff *skb,
> > >   		}
> > >   
> > >   		/* we can read only from what is left in this
> > > bio */
> > > -		recv_len = min_t(size_t, recv_len,
> > > -				iov_iter_count(&req->iter));
> > > +		memset(&msg, 0, sizeof(msg));
> > > +		msg.msg_iter = req->iter;
> > > +		msg.msg_flags = MSG_DONTWAIT;
> > >   
> > > -		if (queue->data_digest)
> > > -			ret =
> > > skb_copy_and_crc32c_datagram_iter(skb,
> > > *offset,
> > > -				&req->iter, recv_len, &queue-
> > > > rcv_crc);
> > > -		else
> > > -			ret = skb_copy_datagram_iter(skb,
> > > *offset,
> > > -					&req->iter, recv_len);
> > > -		if (ret) {
> > > +		ret = sock_recvmsg(queue->sock, &msg,
> > > msg.msg_flags);
> > 
> > This doesn't work unfortunately.
> > 
> > The problem is what happens if queue->data_remaining is smaller
> > than
> > iov_iter_count(&req->iter)?
> > 
> > queue->data_remaining is set by the data length in the c2h, while
> > the
> > length of the request iov_iter_count(&req->iter) is set when the
> > request is submitted.
> > 
> > If queue->data_remaining ends up being smaller than
> > iov_iter_count(&req->iter) then we need to read less data than the
> > actual count of req->iter.
> > 
> > So we need a iov_iter_truncate(), but then we end up overwriting
> > the
> > data on the next iteration as we have no way to keep...
> > 
> Question is, though: what _is_ in the remaining iov?

Which remaining iov?

> The most reasonable explanation would be that it's the start of the
> next PDU (which we haven't accounted for, and hence haven't set
> up pointers correctly).

The next PDU seems fine, it's just the next data that goes on the
current (and correct) IOV, just overwriting the previous data as there
is no offset.

> I can see this happening for TLS when the sender doesn't space
> the records correctly (ie if the PDU end is not falling on a
> TLS record boundary).
> 
> But yeah, I can see the issue. While we can (and do)
> advance the iterator to complete the request, we still
> have the remaining data in the iterator.

> What we can do, though, is to copy the remaining data over
> to 'queue->pdu' (as we assume it's the start of the next PDU),
> set up pointers, and let it rip.

Ah, I think that's the other way around of what I'm talking about. That
sounds more like you overflow the current iterator. I'm talking about
an underflow, where we don't transfer enough data to complete a
request. So when we continue the transfer we overwrite the previous
data.

> 
> Sounds okay-ish.
> 
> And it would certainly do away with the restriction of PDUs
> having to be spaced on TLS record boundaries...
> 
> 
> > > +		if (ret < 0) {
> > >   			dev_err(queue->ctrl->ctrl.device,
> > > -				"queue %d failed to copy request
> > > %#x
> > > data",
> > > +				"queue %d failed to receive
> > > request
> > > %#x data",
> > >   				nvme_tcp_queue_id(queue), rq-
> > > >tag);
> > >   			return ret;
> > >   		}
> > > -
> > > -		*len -= recv_len;
> > > -		*offset += recv_len;
> > 
> > ...this offset.
> > 
> Yikes. That is wrong, indeed.
> 
> > PDU and ddgst in this patch continue using an offset as they use
> > kernel_recvmsg() instead of sock_recvmsg() directly.
> > 
> Argl. Rather not. I don't want to repackage iterators ...
> 
> > So I think either:
> > 
> >   1. I'm missing something and there is a nice way to do this,
> > please
> > let me know if that's the case.
> See above. We could copy the remaining iter data onto 'queue->pdu',
> and restart parsing.

I don't think that addresses the issue. The next PDU seems fine, we
don't lose that, it's the next data that we end up putting in the wrong
place.

> (It _must_ be the start of a PDU anyway, otherwise queue->remaining 
> would've covered it).
> 
> >   2. We need to convert this sock_recvmsg() to kernel_recvmsg(),
> > with
> > the overhead of an extra iov_iter (I don't know if that is high or
> > not)
> 
> Rather not.

Agreed, but unless 1 works then we need 3.

> 
> >   3. We use Chuck's read_sock with cmsg approach [1]
> 
> Another possibility.
> Especially as now Chuck has worked on improving the TLS record
> spacing,
> so this might be a possible avenue.

I do think this is the way to go, albeit a little slower than I was
hoping for.

Alistair

> 
> Cheers,
> 
> Hannes


More information about the Linux-nvme mailing list