[PATCHv2] nvme-tcp: Implement recvmsg() receive flow
Sagi Grimberg
sagi at grimberg.me
Wed Nov 26 00:58:26 PST 2025
On 26/11/2025 9:32, Sagi Grimberg wrote:
>
>
> On 20/10/2025 11:58, Hannes Reinecke wrote:
>> The nvme-tcp code uses the ->read_sock() interface to read data
>> from the wire. While this interface gives us access to the skbs
>> themselves (and so might be able to reduce latency), it does not
>> interpret them.
>> Additionally, for TLS these skbs have to be reconstructed from the
>> TLS stream data, rendering any advantage questionable.
>> But the main drawback for TLS is that we do not get access to the
>> TLS control messages, so if we receive any of those messages the
>> only choice we have is to tear down the connection and restart.
>> This patch switches the receive side over to recvmsg(), which gives
>> us full access to the TLS control messages and is also more
>> efficient when working with TLS, as skbs do not need to be
>> artificially constructed.
>
> Hannes,
>
> I generally agree with this approach. I'd like to point out, though,
> that this is going to give up running RX directly from softirq
> context.
> I've gone back and forth on whether nvme-tcp should do that, but never
> got to do a thorough comparison between the two. This probably shuts
> the door on that option.
>
> Having said that, I am fine with this approach. recvmsg is a saner
> interface.
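
One concrete upside worth spelling out: with recvmsg() we can attach a
control buffer and look at the TLS record type, which ->read_sock()
never exposed. Rough sketch of the pattern (tls_get_record_type() and
TLS_RECORD_TYPE_DATA come from net/tls_prot.h, same as what nvmet-tcp
does today; the function name and the -ENOTCONN policy below are mine,
just for illustration):

static int nvme_tcp_recvmsg_one(struct nvme_tcp_queue *queue,
                void *buf, size_t len)
{
        char cbuf[CMSG_LEN(sizeof(char))] = {};
        struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
        struct kvec iov = { .iov_base = buf, .iov_len = len };
        u8 ctype;
        int ret;

        msg.msg_control = cbuf;
        msg.msg_controllen = sizeof(cbuf);
        ret = kernel_recvmsg(queue->sock, &msg, &iov, 1, len,
                             msg.msg_flags);
        if (ret <= 0)
                return ret;

        /* on a TLS socket the record type arrives as a cmsg */
        ctype = tls_get_record_type(queue->sock->sk,
                                    (struct cmsghdr *)cbuf);
        if (ctype && ctype != TLS_RECORD_TYPE_DATA)
                /* alert/handshake record: now we can react to it
                 * instead of tearing down the connection */
                return -ENOTCONN;

        return ret;
}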
>
>>
>> Signed-off-by: Hannes Reinecke <hare at kernel.org>
>> ---
>> drivers/nvme/host/tcp.c | 203 ++++++++++++++++++++++------------------
>> 1 file changed, 110 insertions(+), 93 deletions(-)
>>
>> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
>> index 9a96df1a511c..081f53fa9fc4 100644
>> --- a/drivers/nvme/host/tcp.c
>> +++ b/drivers/nvme/host/tcp.c
>> @@ -17,6 +17,7 @@
>> #include <net/tls_prot.h>
>> #include <net/handshake.h>
>> #include <linux/blk-mq.h>
>> +#include <linux/iov_iter.h>
>> #include <net/busy_poll.h>
>> #include <trace/events/sock.h>
>> @@ -476,6 +477,28 @@ static inline void nvme_tcp_ddgst_update(u32 *crcp,
>> }
>> }
>> +static size_t nvme_tcp_ddgst_step(void *iter_base, size_t progress, size_t len,
>> + void *priv, void *priv2)
>> +{
>> + u32 *crcp = priv;
>> +
>> + *crcp = crc32c(*crcp, iter_base, len);
>> + return 0;
>> +}
>> +
>> +static int nvme_tcp_ddgst_calc(struct nvme_tcp_request *req, u32 *crcp,
>> + size_t maxsize)
>> +{
>> + struct iov_iter tmp = req->iter;
>> + int err = 0;
>> +
>> + tmp.count = maxsize;
>> + if (iterate_and_advance_kernel(&tmp, maxsize, crcp, &err,
>> + nvme_tcp_ddgst_step) != maxsize)
>> + return err;
>> + return 0;
>> +}
>> +
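
For readers not familiar with the iov_iter helpers: the step callback
returns the number of bytes it did _not_ process (so returning 0 means
"all of len handled, keep walking"), and working on a copy (tmp) of
req->iter means the CRC pass does not advance the real request
iterator. A stripped-down illustration of the same pattern, with
made-up names:

static size_t sum_step(void *base, size_t progress, size_t len,
                void *priv, void *priv2)
{
        u64 *sum = priv;
        size_t i;

        for (i = 0; i < len; i++)
                *sum += ((u8 *)base)[i];
        return 0;       /* processed everything, continue the walk */
}

        /* walk 'bytes' bytes of req->iter without advancing it */
        struct iov_iter tmp = req->iter;
        u64 sum = 0;

        tmp.count = bytes;
        iterate_and_advance_kernel(&tmp, bytes, &sum, NULL, sum_step);
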
>> static inline __le32 nvme_tcp_ddgst_final(u32 crc)
>> {
>> return cpu_to_le32(~crc);
>> @@ -827,23 +850,26 @@ static void nvme_tcp_handle_c2h_term(struct nvme_tcp_queue *queue,
>> "Received C2HTermReq (FES = %s)\n", msg);
>> }
>> -static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
>> - unsigned int *offset, size_t *len)
>> +static int nvme_tcp_recvmsg_pdu(struct nvme_tcp_queue *queue)
>> {
>> - struct nvme_tcp_hdr *hdr;
>> char *pdu = queue->pdu;
>> - size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
>> + struct msghdr msg = {
>> + .msg_flags = MSG_DONTWAIT,
>> + };
>> + struct kvec iov = {
>> + .iov_base = pdu + queue->pdu_offset,
>> + .iov_len = queue->pdu_remaining,
>> + };
>> + struct nvme_tcp_hdr *hdr;
>> int ret;
>> - ret = skb_copy_bits(skb, *offset,
>> - &pdu[queue->pdu_offset], rcv_len);
>> - if (unlikely(ret))
>> + ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
>> + iov.iov_len, msg.msg_flags);
>> + if (ret <= 0)
>> return ret;
>> - queue->pdu_remaining -= rcv_len;
>> - queue->pdu_offset += rcv_len;
>> - *offset += rcv_len;
>> - *len -= rcv_len;
>> + queue->pdu_remaining -= ret;
>> + queue->pdu_offset += ret;
>> if (queue->pdu_remaining)
>> return 0;
>> @@ -907,20 +933,19 @@ static inline void nvme_tcp_end_request(struct request *rq, u16 status)
>> nvme_complete_rq(rq);
>> }
>> -static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
>> - unsigned int *offset, size_t *len)
>> +static int nvme_tcp_recvmsg_data(struct nvme_tcp_queue *queue)
>> {
>> struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
>> struct request *rq =
>> nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu->command_id);
>> struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
>> - while (true) {
>> - int recv_len, ret;
>> + if (nvme_tcp_recv_state(queue) != NVME_TCP_RECV_DATA)
>> + return 0;
>> - recv_len = min_t(size_t, *len, queue->data_remaining);
>> - if (!recv_len)
>> - break;
>> + while (queue->data_remaining) {
>> + struct msghdr msg;
>> + int ret;
>> if (!iov_iter_count(&req->iter)) {
>> req->curr_bio = req->curr_bio->bi_next;
>> @@ -940,25 +965,22 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
>> }
>> /* we can read only from what is left in this bio */
>> - recv_len = min_t(size_t, recv_len,
>> - iov_iter_count(&req->iter));
>> + memset(&msg, 0, sizeof(msg));
>> + msg.msg_iter = req->iter;
>> + msg.msg_flags = MSG_DONTWAIT;
Shouldn't this be initialized outside of the loop? Then the loop
becomes while (msg_data_left(&msg))?
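
Something like this, untested (and the mid-loop refresh of req->iter
when crossing into the next bio would still have to update
msg.msg_iter):

        struct msghdr msg = {
                .msg_flags = MSG_DONTWAIT,
                .msg_iter = req->iter,
        };
        int ret;

        while (msg_data_left(&msg)) {
                ret = sock_recvmsg(queue->sock, &msg, msg.msg_flags);
                if (ret < 0)
                        return ret;
                if (queue->data_digest)
                        nvme_tcp_ddgst_calc(req, &queue->rcv_crc, ret);
                queue->data_remaining -= ret;
        }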
>> - if (queue->data_digest)
>> - ret = skb_copy_and_crc32c_datagram_iter(skb, *offset,
>> - &req->iter, recv_len, &queue->rcv_crc);
>> - else
>> - ret = skb_copy_datagram_iter(skb, *offset,
>> - &req->iter, recv_len);
>> - if (ret) {
>> + ret = sock_recvmsg(queue->sock, &msg, msg.msg_flags);
>> + if (ret < 0) {
>> dev_err(queue->ctrl->ctrl.device,
>> - "queue %d failed to copy request %#x data",
>> + "queue %d failed to receive request %#x data",
>> nvme_tcp_queue_id(queue), rq->tag);
>> return ret;
>> }
>> -
>> - *len -= recv_len;
>> - *offset += recv_len;
>> - queue->data_remaining -= recv_len;
>> + if (queue->data_digest)
>> + nvme_tcp_ddgst_calc(req, &queue->rcv_crc, ret);
>> + queue->data_remaining -= ret;
>> + if (queue->data_remaining)
>> + nvme_tcp_advance_req(req, ret);
>> }
>> if (!queue->data_remaining) {
>> @@ -968,7 +990,7 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
>> } else {
>> if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
>> nvme_tcp_end_request(rq,
>> - le16_to_cpu(req->status));
>> + le16_to_cpu(req->status));
>> queue->nr_cqe++;
>> }
>> nvme_tcp_init_recv_ctx(queue);
>> @@ -978,24 +1000,9 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
>> return 0;
>> }
>> -static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
>> - struct sk_buff *skb, unsigned int *offset, size_t *len)
>> +static int __nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue)
>> {
>> struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
>> - char *ddgst = (char *)&queue->recv_ddgst;
>> - size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
>> - off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
>> - int ret;
>> -
>> - ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
>> - if (unlikely(ret))
>> - return ret;
>> -
>> - queue->ddgst_remaining -= recv_len;
>> - *offset += recv_len;
>> - *len -= recv_len;
>> - if (queue->ddgst_remaining)
>> - return 0;
>> if (queue->recv_ddgst != queue->exp_ddgst) {
>> struct request *rq = nvme_cid_to_rq(nvme_tcp_tagset(queue),
>> @@ -1023,40 +1030,32 @@ static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
>> return 0;
>> }
>> -static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
>> - unsigned int offset, size_t len)
>> +static int nvme_tcp_recvmsg_ddgst(struct nvme_tcp_queue *queue)
>> {
>> - struct nvme_tcp_queue *queue = desc->arg.data;
>> - size_t consumed = len;
>> - int result;
>> + char *ddgst = (char *)&queue->recv_ddgst;
>> + off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
>> + struct msghdr msg = {
>> + .msg_flags = MSG_WAITALL,
>> + };
>
> I don't think we want to use MSG_WAITALL ever. Why are you opting to
> do that?
>
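
What I would expect here is the same nonblocking pattern as the PDU
path, letting the ddgst_remaining accounting absorb short reads;
something like:

        struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
        struct kvec iov = {
                .iov_base = (u8 *)ddgst + off,
                .iov_len = queue->ddgst_remaining,
        };
        int ret;

        ret = kernel_recvmsg(queue->sock, &msg, &iov, 1, iov.iov_len,
                             msg.msg_flags);
        if (ret <= 0)
                return ret;
        queue->ddgst_remaining -= ret;
        if (queue->ddgst_remaining)
                return 0;       /* come back when more data arrives */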
>> + struct kvec iov = {
>> + .iov_base = (u8 *)ddgst + off,
>> + .iov_len = queue->ddgst_remaining,
>> + };
>> + int ret;
>> - if (unlikely(!queue->rd_enabled))
>> - return -EFAULT;
>> + if (nvme_tcp_recv_state(queue) != NVME_TCP_RECV_DDGST)
>> + return 0;
>> - while (len) {
>> - switch (nvme_tcp_recv_state(queue)) {
>> - case NVME_TCP_RECV_PDU:
>> - result = nvme_tcp_recv_pdu(queue, skb, &offset, &len);
>> - break;
>> - case NVME_TCP_RECV_DATA:
>> - result = nvme_tcp_recv_data(queue, skb, &offset, &len);
>> - break;
>> - case NVME_TCP_RECV_DDGST:
>> - result = nvme_tcp_recv_ddgst(queue, skb, &offset, &len);
>> - break;
>> - default:
>> - result = -EFAULT;
>> - }
>> - if (result) {
>> - dev_err(queue->ctrl->ctrl.device,
>> - "receive failed: %d\n", result);
>> - queue->rd_enabled = false;
>> - nvme_tcp_error_recovery(&queue->ctrl->ctrl);
>> - return result;
>> - }
>> - }
>> + ret = kernel_recvmsg(queue->sock, &msg, &iov, 1, iov.iov_len,
>> + msg.msg_flags);
>> + if (ret <= 0)
>> + return ret;
>> - return consumed;
>> + queue->ddgst_remaining -= ret;
>> + if (queue->ddgst_remaining)
>> + return 0;
>> +
>> + return __nvme_tcp_recv_ddgst(queue);
>> }
>> static void nvme_tcp_data_ready(struct sock *sk)
>> @@ -1356,20 +1355,38 @@ static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
>> return ret;
>> }
>> -static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
>> +static int nvme_tcp_try_recvmsg(struct nvme_tcp_queue *queue)
>> {
>> - struct socket *sock = queue->sock;
>> - struct sock *sk = sock->sk;
>> - read_descriptor_t rd_desc;
>> - int consumed;
>> + int result;
>> + int nr_cqe = queue->nr_cqe;
>> +
>> + if (unlikely(!queue->rd_enabled))
>> + return -EFAULT;
>> +
>> + do {
>> + switch (nvme_tcp_recv_state(queue)) {
>> + case NVME_TCP_RECV_PDU:
>> + result = nvme_tcp_recvmsg_pdu(queue);
>> + break;
>> + case NVME_TCP_RECV_DATA:
>> + result = nvme_tcp_recvmsg_data(queue);
>> + break;
>> + case NVME_TCP_RECV_DDGST:
>> + result = nvme_tcp_recvmsg_ddgst(queue);
>> + break;
>> + default:
>> + result = -EFAULT;
>> + }
>> + } while (result >= 0);
>> +
>> + if (result < 0 && result != -EAGAIN) {
>> + dev_err(queue->ctrl->ctrl.device,
>> + "receive failed: %d\n", result);
>> + queue->rd_enabled = false;
>> + nvme_tcp_error_recovery(&queue->ctrl->ctrl);
>> + }
>> - rd_desc.arg.data = queue;
>> - rd_desc.count = 1;
>> - lock_sock(sk);
>> - queue->nr_cqe = 0;
>> - consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
>> - release_sock(sk);
>> - return consumed == -EAGAIN ? 0 : consumed;
>> + return result < 0 ? result : (queue->nr_cqe = nr_cqe);
>
> This is changing the semantics of the retcode - can you please
> document the change in the commit msg?
>
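
For anyone following along, my reading of the two retcodes: the old
nvme_tcp_try_recv() returned the number of bytes consumed by
->read_sock() (with -EAGAIN mapped to 0), while nvme_tcp_try_recvmsg()
returns a negative errno (now including -EAGAIN) or a completion
count. A comment on the function along these lines would help:

        /*
         * Returns a negative errno if receiving failed, or the number
         * of completions processed; nvme_tcp_io_work() uses a positive
         * return to decide that more work may be pending.
         */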
>> }
>> static void nvme_tcp_io_work(struct work_struct *w)
>> @@ -1391,7 +1408,7 @@ static void nvme_tcp_io_work(struct work_struct *w)
>> break;
>> }
>> - result = nvme_tcp_try_recv(queue);
>> + result = nvme_tcp_try_recvmsg(queue);
>> if (result > 0)
>> pending = true;
>> else if (unlikely(result < 0))
>> @@ -2800,7 +2817,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
>> set_bit(NVME_TCP_Q_POLLING, &queue->flags);
>> if (sk_can_busy_loop(sk) &&
>> skb_queue_empty_lockless(&sk->sk_receive_queue))
>> sk_busy_loop(sk, true);
>> - ret = nvme_tcp_try_recv(queue);
>> + ret = nvme_tcp_try_recvmsg(queue);
>> clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
>> return ret < 0 ? ret : queue->nr_cqe;
>> }
>