[PATCH RFC] nvme-tcp: Implement recvmsg() receive flow
Alistair Francis
Alistair.Francis at wdc.com
Wed Feb 25 02:56:55 PST 2026
On Fri, 2025-09-12 at 13:58 +0200, Hannes Reinecke wrote:
> Switch to use recvmsg() so that we get access to TLS control
> messages eg for handling TLS KeyUpdate.
>
> Signed-off-by: Hannes Reinecke <hare at kernel.org>
> ---
> drivers/nvme/host/tcp.c | 204 ++++++++++++++++++++++----------------
> --
> 1 file changed, 111 insertions(+), 93 deletions(-)
>
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index c0fe8cfb7229..9ef1d4aea838 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -17,6 +17,7 @@
> #include <net/tls_prot.h>
> #include <net/handshake.h>
> #include <linux/blk-mq.h>
> +#include <linux/iov_iter.h>
> #include <net/busy_poll.h>
> #include <trace/events/sock.h>
>
> @@ -476,6 +477,28 @@ static inline void nvme_tcp_ddgst_update(u32
> *crcp,
> }
> }
>
> +static size_t nvme_tcp_ddgst_step(void *iter_base, size_t progress,
> size_t len,
> + void *priv, void *priv2)
> +{
> + u32 *crcp = priv;
> +
> + *crcp = crc32c(*crcp, iter_base, len);
> + return 0;
> +}
> +
> +static int nvme_tcp_ddgst_calc(struct nvme_tcp_request *req, u32
> *crcp,
> + size_t maxsize)
> +{
> + struct iov_iter tmp = req->iter;
> + int err = 0;
> +
> + tmp.count = maxsize;
> + if (iterate_and_advance_kernel(&tmp, maxsize, crcp, &err,
> + nvme_tcp_ddgst_step) !=
> maxsize)
> + return err;
> + return 0;
> +}
> +
> static inline __le32 nvme_tcp_ddgst_final(u32 crc)
> {
> return cpu_to_le32(~crc);
> @@ -827,23 +850,26 @@ static void nvme_tcp_handle_c2h_term(struct
> nvme_tcp_queue *queue,
> "Received C2HTermReq (FES = %s)\n", msg);
> }
>
> -static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct
> sk_buff *skb,
> - unsigned int *offset, size_t *len)
> +static int nvme_tcp_recvmsg_pdu(struct nvme_tcp_queue *queue)
> {
> - struct nvme_tcp_hdr *hdr;
> char *pdu = queue->pdu;
> - size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
> + struct msghdr msg = {
> + .msg_flags = MSG_DONTWAIT,
> + };
> + struct kvec iov = {
> + .iov_base = pdu + queue->pdu_offset,
> + .iov_len = queue->pdu_remaining,
> + };
> + struct nvme_tcp_hdr *hdr;
> int ret;
>
> - ret = skb_copy_bits(skb, *offset,
> - &pdu[queue->pdu_offset], rcv_len);
> - if (unlikely(ret))
> + ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
> + iov.iov_len, msg.msg_flags);
> + if (ret <= 0)
> return ret;
>
> - queue->pdu_remaining -= rcv_len;
> - queue->pdu_offset += rcv_len;
> - *offset += rcv_len;
> - *len -= rcv_len;
> + queue->pdu_remaining -= ret;
> + queue->pdu_offset += ret;
> if (queue->pdu_remaining)
> return 0;
>
> @@ -907,20 +933,19 @@ static inline void nvme_tcp_end_request(struct
> request *rq, u16 status)
> nvme_complete_rq(rq);
> }
>
> -static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct
> sk_buff *skb,
> - unsigned int *offset, size_t *len)
> +static int nvme_tcp_recvmsg_data(struct nvme_tcp_queue *queue)
> {
> struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
> struct request *rq =
> nvme_cid_to_rq(nvme_tcp_tagset(queue), pdu-
> >command_id);
> struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
>
> - while (true) {
> - int recv_len, ret;
> + if (nvme_tcp_recv_state(queue) != NVME_TCP_RECV_DATA)
> + return 0;
>
> - recv_len = min_t(size_t, *len, queue-
> >data_remaining);
> - if (!recv_len)
> - break;
> + while (queue->data_remaining) {
> + struct msghdr msg;
> + int ret;
>
> if (!iov_iter_count(&req->iter)) {
> req->curr_bio = req->curr_bio->bi_next;
> @@ -940,25 +965,22 @@ static int nvme_tcp_recv_data(struct
> nvme_tcp_queue *queue, struct sk_buff *skb,
> }
>
> /* we can read only from what is left in this bio */
> - recv_len = min_t(size_t, recv_len,
> - iov_iter_count(&req->iter));
> + memset(&msg, 0, sizeof(msg));
> + msg.msg_iter = req->iter;
> + msg.msg_flags = MSG_DONTWAIT;
>
> - if (queue->data_digest)
> - ret = skb_copy_and_crc32c_datagram_iter(skb,
> *offset,
> - &req->iter, recv_len, &queue-
> >rcv_crc);
> - else
> - ret = skb_copy_datagram_iter(skb, *offset,
> - &req->iter, recv_len);
> - if (ret) {
> + ret = sock_recvmsg(queue->sock, &msg,
> msg.msg_flags);
This doesn't work unfortunately.
The problem is what happens if queue->data_remaining is smaller then
iov_iter_count(&req->iter)?
queue->data_remaining is set by the data length in the c2h, while the
length of the request iov_iter_count(&req->iter) is set when the
request is submitted.
If queue->data_remaining ends up being smaller then
iov_iter_count(&req->iter) then we need to read less data then the
actual count of req->iter.
So we need a iov_iter_truncate(), but then we end up overwriting the
data on the next iteration as we have no way to keep...
> + if (ret < 0) {
> dev_err(queue->ctrl->ctrl.device,
> - "queue %d failed to copy request %#x
> data",
> + "queue %d failed to receive request
> %#x data",
> nvme_tcp_queue_id(queue), rq->tag);
> return ret;
> }
> -
> - *len -= recv_len;
> - *offset += recv_len;
...this offset.
PDU and ddgst in this patch continue using an offset as they use
kernel_recvmsg() instead of sock_recvmsg() directly.
So I think either:
1. I'm missing something and there is a nice way to do this, please
let me know if that's the case.
2. We need to convert this sock_recvmsg() to kernel_recvmsg(), with
the overhead of an extra iov_iter (I don't know if that is high or not)
3. We use Chuck's read_sock with cmsg approach [1]
1:
https://lore.kernel.org/kernel-tls-handshake/20260217222033.1929211-1-cel@kernel.org/T/#t
Alistair
> - queue->data_remaining -= recv_len;
> + if (queue->data_digest)
> + nvme_tcp_ddgst_calc(req, &queue->rcv_crc,
> ret);
> + queue->data_remaining -= ret;
> + if (queue->data_remaining)
> + nvme_tcp_advance_req(req, ret);
> }
>
> if (!queue->data_remaining) {
> @@ -968,7 +990,7 @@ static int nvme_tcp_recv_data(struct
> nvme_tcp_queue *queue, struct sk_buff *skb,
> } else {
> if (pdu->hdr.flags &
> NVME_TCP_F_DATA_SUCCESS) {
> nvme_tcp_end_request(rq,
> - le16_to_cpu(req-
> >status));
> +
> le16_to_cpu(req->status));
> queue->nr_cqe++;
> }
> nvme_tcp_init_recv_ctx(queue);
> @@ -978,24 +1000,9 @@ static int nvme_tcp_recv_data(struct
> nvme_tcp_queue *queue, struct sk_buff *skb,
> return 0;
> }
>
> -static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
> - struct sk_buff *skb, unsigned int *offset, size_t
> *len)
> +static int __nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue)
> {
> struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
> - char *ddgst = (char *)&queue->recv_ddgst;
> - size_t recv_len = min_t(size_t, *len, queue-
> >ddgst_remaining);
> - off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
> - int ret;
> -
> - ret = skb_copy_bits(skb, *offset, &ddgst[off], recv_len);
> - if (unlikely(ret))
> - return ret;
> -
> - queue->ddgst_remaining -= recv_len;
> - *offset += recv_len;
> - *len -= recv_len;
> - if (queue->ddgst_remaining)
> - return 0;
>
> if (queue->recv_ddgst != queue->exp_ddgst) {
> struct request *rq =
> nvme_cid_to_rq(nvme_tcp_tagset(queue),
> @@ -1023,40 +1030,32 @@ static int nvme_tcp_recv_ddgst(struct
> nvme_tcp_queue *queue,
> return 0;
> }
>
> -static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff
> *skb,
> - unsigned int offset, size_t len)
> +static int nvme_tcp_recvmsg_ddgst(struct nvme_tcp_queue *queue)
> {
> - struct nvme_tcp_queue *queue = desc->arg.data;
> - size_t consumed = len;
> - int result;
> + char *ddgst = (char *)&queue->recv_ddgst;
> + off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
> + struct msghdr msg = {
> + .msg_flags = MSG_WAITALL,
> + };
> + struct kvec iov = {
> + .iov_base = (u8 *)ddgst + off,
> + .iov_len = queue->ddgst_remaining,
> + };
> + int ret;
>
> - if (unlikely(!queue->rd_enabled))
> - return -EFAULT;
> + if (nvme_tcp_recv_state(queue) != NVME_TCP_RECV_DDGST)
> + return 0;
>
> - while (len) {
> - switch (nvme_tcp_recv_state(queue)) {
> - case NVME_TCP_RECV_PDU:
> - result = nvme_tcp_recv_pdu(queue, skb,
> &offset, &len);
> - break;
> - case NVME_TCP_RECV_DATA:
> - result = nvme_tcp_recv_data(queue, skb,
> &offset, &len);
> - break;
> - case NVME_TCP_RECV_DDGST:
> - result = nvme_tcp_recv_ddgst(queue, skb,
> &offset, &len);
> - break;
> - default:
> - result = -EFAULT;
> - }
> - if (result) {
> - dev_err(queue->ctrl->ctrl.device,
> - "receive failed: %d\n", result);
> - queue->rd_enabled = false;
> - nvme_tcp_error_recovery(&queue->ctrl->ctrl);
> - return result;
> - }
> - }
> + ret = kernel_recvmsg(queue->sock, &msg, &iov, 1,
> iov.iov_len,
> + msg.msg_flags);
> + if (ret <= 0)
> + return ret;
> +
> + queue->ddgst_remaining -= ret;
> + if (queue->ddgst_remaining)
> + return 0;
>
> - return consumed;
> + return __nvme_tcp_recv_ddgst(queue);
> }
>
> static void nvme_tcp_data_ready(struct sock *sk)
> @@ -1353,20 +1352,39 @@ static int nvme_tcp_try_send(struct
> nvme_tcp_queue *queue)
> return ret;
> }
>
> -static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
> +static int nvme_tcp_try_recvmsg(struct nvme_tcp_queue *queue)
> {
> - struct socket *sock = queue->sock;
> - struct sock *sk = sock->sk;
> - read_descriptor_t rd_desc;
> - int consumed;
> + int result;
> + int nr_cqe = queue->nr_cqe;
> +
> + if (unlikely(!queue->rd_enabled))
> + return -EFAULT;
> +
> + do {
> + switch (nvme_tcp_recv_state(queue)) {
> + case NVME_TCP_RECV_PDU:
> + result = nvme_tcp_recvmsg_pdu(queue);
> + break;
> + case NVME_TCP_RECV_DATA:
> + result = nvme_tcp_recvmsg_data(queue);
> + break;
> + case NVME_TCP_RECV_DDGST:
> + result = nvme_tcp_recvmsg_ddgst(queue);
> + break;
> + default:
> + result = -EFAULT;
> + }
> + } while (result >= 0);
> +
> + if (result < 0 && result != -EAGAIN) {
> + dev_err(queue->ctrl->ctrl.device,
> + "receive failed: %d\n", result);
> + queue->rd_enabled = false;
> + nvme_tcp_error_recovery(&queue->ctrl->ctrl);
> + } else if (result == -EAGAIN)
> + result = 0;
>
> - rd_desc.arg.data = queue;
> - rd_desc.count = 1;
> - lock_sock(sk);
> - queue->nr_cqe = 0;
> - consumed = sock->ops->read_sock(sk, &rd_desc,
> nvme_tcp_recv_skb);
> - release_sock(sk);
> - return consumed == -EAGAIN ? 0 : consumed;
> + return result < 0 ? result : (queue->nr_cqe = nr_cqe);
> }
>
> static void nvme_tcp_io_work(struct work_struct *w)
> @@ -1388,7 +1406,7 @@ static void nvme_tcp_io_work(struct work_struct
> *w)
> break;
> }
>
> - result = nvme_tcp_try_recv(queue);
> + result = nvme_tcp_try_recvmsg(queue);
> if (result > 0)
> pending = true;
> else if (unlikely(result < 0))
> @@ -2794,7 +2812,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx
> *hctx, struct io_comp_batch *iob)
> set_bit(NVME_TCP_Q_POLLING, &queue->flags);
> if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk-
> >sk_receive_queue))
> sk_busy_loop(sk, true);
> - ret = nvme_tcp_try_recv(queue);
> + ret = nvme_tcp_try_recvmsg(queue);
> clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
> return ret < 0 ? ret : queue->nr_cqe;
> }
More information about the Linux-nvme
mailing list