[PATCH RFC] nvme-tcp: Implement recvmsg() receive flow
Hannes Reinecke
hare at suse.de
Wed Feb 25 03:41:20 PST 2026
On 2/25/26 11:56, Alistair Francis wrote:
> On Fri, 2025-09-12 at 13:58 +0200, Hannes Reinecke wrote:
>> Switch to use recvmsg() so that we get access to TLS control
>> messages eg for handling TLS KeyUpdate.
>>
>> Signed-off-by: Hannes Reinecke <hare at kernel.org>
>> ---
>> drivers/nvme/host/tcp.c | 204 ++++++++++++++++++++++----------------
>> --
>> 1 file changed, 111 insertions(+), 93 deletions(-)
>>
>> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
>> index c0fe8cfb7229..9ef1d4aea838 100644
>> --- a/drivers/nvme/host/tcp.c
>> +++ b/drivers/nvme/host/tcp.c
>> @@ -17,6 +17,7 @@
>> #include <net/tls_prot.h>
>> #include <net/handshake.h>
>> #include <linux/blk-mq.h>
>> +#include <linux/iov_iter.h>
>> #include <net/busy_poll.h>
>> #include <trace/events/sock.h>
>>
>> @@ -476,6 +477,28 @@ static inline void nvme_tcp_ddgst_update(u32
>> *crcp,
>> }
>> }
>>
>> +static size_t nvme_tcp_ddgst_step(void *iter_base, size_t progress,
>> size_t len,
>> + void *priv, void *priv2)
>> +{
>> + u32 *crcp = priv;
>> +
>> + *crcp = crc32c(*crcp, iter_base, len);
>> + return 0;
>> +}
>> +
>> +static int nvme_tcp_ddgst_calc(struct nvme_tcp_request *req, u32
>> *crcp,
>> + size_t maxsize)
>> +{
>> + struct iov_iter tmp = req->iter;
>> + int err = 0;
>> +
>> + tmp.count = maxsize;
>> + if (iterate_and_advance_kernel(&tmp, maxsize, crcp, &err,
>> + nvme_tcp_ddgst_step) !=
>> maxsize)
>> + return err;
>> + return 0;
>> +}
>> +
>> static inline __le32 nvme_tcp_ddgst_final(u32 crc)
>> {
>> return cpu_to_le32(~crc);
>> @@ -827,23 +850,26 @@ static void nvme_tcp_handle_c2h_term(struct
>> nvme_tcp_queue *queue,
>> "Received C2HTermReq (FES = %s)\n", msg);
>> }
>>
>> -static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct
>> sk_buff *skb,
>> - unsigned int *offset, size_t *len)
>> +static int nvme_tcp_recvmsg_pdu(struct nvme_tcp_queue *queue)
>> {
>> - struct nvme_tcp_hdr *hdr;
>> char *pdu = queue->pdu;
>> - size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
>> + struct msghdr msg = {
>> + .msg_flags = MSG_DONTWAIT,
>> + };
>> + struct kvec iov = {
>> + .iov_base = pdu + queue->pdu_offset,
>> + .iov_len = queue->pdu_remaining,
>> + };
>> + struct nvme_tcp_hdr *hdr;
>> int ret;
>>
>> - ret = skb_copy_bits(skb, *offset,
>> - &pdu[queue->pdu_offset], rcv_len);
>> - if (unlikely(ret))
>> + ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
>> + iov.iov_len, msg.msg_flags);
>> + if (ret <= 0)
>> return ret;
>>
>> - queue->pdu_remaining -= rcv_len;
>> - queue->pdu_offset += rcv_len;
>> - *offset += rcv_len;
>> - *len -= rcv_len;
>> + queue->pdu_remaining -= ret;
>> + queue->pdu_offset += ret;
>> if (queue->pdu_remaining)
>> return 0;
>>
>> @@ -907,20 +933,19 @@ static inline void nvme_tcp_end_request(struct
>> request *rq, u16 status)
>> nvme_complete_rq(rq);
>> }
>>
>> -static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct
>> sk_buff *skb,
>> - unsigned int *offset, size_t *len)
>> +static int nvme_tcp_recvmsg_data(struct nvme_tcp_queue *queue)
>> {
>> struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
>> struct request *rq =
>> nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu-
>>> command_id);
>> struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
>>
>> - while (true) {
>> - int recv_len, ret;
>> + if (nvme_tcp_recv_state(queue) != NVME_TCP_RECV_DATA)
>> + return 0;
>>
>> - recv_len = min_t(size_t, *len, queue-
>>> data_remaining);
>> - if (!recv_len)
>> - break;
>> + while (queue->data_remaining) {
>> + struct msghdr msg;
>> + int ret;
>>
>> if (!iov_iter_count(&req->iter)) {
>> req->curr_bio = req->curr_bio->bi_next;
>> @@ -940,25 +965,22 @@ static int nvme_tcp_recv_data(struct
>> nvme_tcp_queue *queue, struct sk_buff *skb,
>> }
>>
>> /* we can read only from what is left in this bio */
>> - recv_len = min_t(size_t, recv_len,
>> - iov_iter_count(&req->iter));
>> + memset(&msg, 0, sizeof(msg));
>> + msg.msg_iter = req->iter;
>> + msg.msg_flags = MSG_DONTWAIT;
>>
>> - if (queue->data_digest)
>> - ret = skb_copy_and_crc32c_datagram_iter(skb,
>> *offset,
>> - &req->iter, recv_len, &queue-
>>> rcv_crc);
>> - else
>> - ret = skb_copy_datagram_iter(skb, *offset,
>> - &req->iter, recv_len);
>> - if (ret) {
>> + ret = sock_recvmsg(queue->sock, &msg,
>> msg.msg_flags);
>
> This doesn't work unfortunately.
>
> The problem is what happens if queue->data_remaining is smaller than
> iov_iter_count(&req->iter)?
>
> queue->data_remaining is set by the data length in the c2h, while the
> length of the request iov_iter_count(&req->iter) is set when the
> request is submitted.
>
> If queue->data_remaining ends up being smaller than
> iov_iter_count(&req->iter) then we need to read less data than the
> actual count of req->iter.
>
> So we need a iov_iter_truncate(), but then we end up overwriting the
> data on the next iteration as we have no way to keep...
>
Question is, though: what _is_ in the remaining iov?
The most reasonable explanation would be that it's the start of the
next PDU (which we haven't accounted for, and hence haven't set
up pointers correctly).
I can see this happening for TLS when the sender doesn't space
the records correctly (ie if the PDU end is not falling on a
TLS record boundary).
But yeah, I can see the issue. While we can (and do)
advance the iterator to complete the request, we still
have the remaining data in the iterator.
What we can do, though, is to copy the remaining data over
to 'queue->pdu' (as we assume it's the start of the next PDU),
set up pointers, and let it rip.
Sounds okay-ish.
And it would certainly do away with the restriction of PDUs
having to be spaced on TLS record boundaries...
>> + if (ret < 0) {
>> dev_err(queue->ctrl->ctrl.device,
>> - "queue %d failed to copy request %#x
>> data",
>> + "queue %d failed to receive request
>> %#x data",
>> nvme_tcp_queue_id(queue), rq->tag);
>> return ret;
>> }
>> -
>> - *len -= recv_len;
>> - *offset += recv_len;
>
> ...this offset.
>
Yikes. This is wrong, indeed.
> PDU and ddgst in this patch continue using an offset as they use
> kernel_recvmsg() instead of sock_recvmsg() directly.
>
Argl. Rather not. I don't want to repackage iterators ...
> So I think either:
>
> 1. I'm missing something and there is a nice way to do this, please
> let me know if that's the case.
See above. We could copy the remaining iter data onto 'queue->pdu',
and restart parsing.
(It _must_ be the start of a PDU anyway, otherwise queue->remaining
would've covered it).
> 2. We need to convert this sock_recvmsg() to kernel_recvmsg(), with
> the overhead of an extra iov_iter (I don't know if that is high or not)
Rather not.
> 3. We use Chuck's read_sock with cmsg approach [1]
Another possibility.
Especially as now Chuck has worked on improving the TLS record spacing,
so this might be a possible avenue.
Cheers,
Hannes
--
Dr. Hannes Reinecke Kernel Storage Architect
hare at suse.de +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich
More information about the Linux-nvme
mailing list