[PATCH] nvme-tcp: add command tracking trace events

Sagi Grimberg sagi at grimberg.me
Fri May 7 10:46:30 PDT 2021


> Debugging nvme-tcp issues can be difficult due to lack of visibility
> into the driver's execution flow for any particular command. Add trace
> events that detail the command cycle. Also add details to existing error
> messages so they can more readily be matched up with those traces.
> 
> There is no functional change in this patch.

Thanks!

> 
> Signed-off-by: Keith Busch <kbusch at kernel.org>
> ---
>   drivers/nvme/host/tcp.c       |  46 +++++---
>   drivers/nvme/host/trace-tcp.h | 209 ++++++++++++++++++++++++++++++++++
>   2 files changed, 240 insertions(+), 15 deletions(-)
>   create mode 100644 drivers/nvme/host/trace-tcp.h
> 
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 0222e23f5936..bbcf48f6345e 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -218,6 +218,9 @@ static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
>   	return nvme_tcp_pdu_data_left(req) <= len;
>   }
>   
> +#define CREATE_TRACE_POINTS
> +#include "trace-tcp.h"
> +
>   static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
>   		unsigned int dir)
>   {
> @@ -525,6 +528,7 @@ static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
>   
>   	queue->data_remaining = le32_to_cpu(pdu->data_length);
>   
> +	trace_nvme_tcp_c2h(rq, pdu);
>   	if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS &&
>   	    unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) {
>   		dev_err(queue->ctrl->ctrl.device,
> @@ -567,10 +571,13 @@ static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req,
>   	struct request *rq = blk_mq_rq_from_pdu(req);
>   	u8 hdgst = nvme_tcp_hdgst_len(queue);
>   	u8 ddgst = nvme_tcp_ddgst_len(queue);
> +	unsigned r2t_offset = le32_to_cpu(pdu->r2t_offset);
>   
>   	req->pdu_len = le32_to_cpu(pdu->r2t_length);
>   	req->pdu_sent = 0;
>   
> +	trace_nvme_tcp_r2t(rq, req, r2t_offset);
> +
>   	if (unlikely(!req->pdu_len)) {
>   		dev_err(queue->ctrl->ctrl.device,
>   			"req %d r2t len is %u, probably a bug...\n",
> @@ -580,17 +587,16 @@ static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req,
>   
>   	if (unlikely(req->data_sent + req->pdu_len > req->data_len)) {
>   		dev_err(queue->ctrl->ctrl.device,
> -			"req %d r2t len %u exceeded data len %u (%zu sent)\n",
> -			rq->tag, req->pdu_len, req->data_len,
> -			req->data_sent);
> +			"req %d op %d r2t len %u exceeded data len %u (%zu sent) pdu offset %u\n",

Better to log it "pdu r2t_offset".

> +			rq->tag, req_op(rq), req->pdu_len, req->data_len,
> +			req->data_sent, r2t_offset);
>   		return -EPROTO;
>   	}
>   
> -	if (unlikely(le32_to_cpu(pdu->r2t_offset) < req->data_sent)) {
> +	if (unlikely(r2t_offset < req->data_sent)) {
>   		dev_err(queue->ctrl->ctrl.device,
>   			"req %d unexpected r2t offset %u (expected %zu)\n",
> -			rq->tag, le32_to_cpu(pdu->r2t_offset),
> -			req->data_sent);
> +			rq->tag, r2t_offset, req->data_sent);
>   		return -EPROTO;
>   	}
>   
> @@ -714,6 +720,7 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
>   	}
>   	req = blk_mq_rq_to_pdu(rq);
>   
> +	trace_nvme_tcp_recv_data(rq, queue, pdu, *offset, *len);

This is the skb offset,len is that what you wanted to log?

>   	while (true) {
>   		int recv_len, ret;
>   
> @@ -730,8 +737,9 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
>   			 */
>   			if (!req->curr_bio) {
>   				dev_err(queue->ctrl->ctrl.device,
> -					"queue %d no space in request %#x",
> -					nvme_tcp_queue_id(queue), rq->tag);
> +					"queue %d no space in request %#x data_remaining %lu",

Better to log it "queue data remaining" because its the queue recv state
machine and not the request.

> +					nvme_tcp_queue_id(queue), rq->tag,
> +					queue->data_remaining);
>   				nvme_tcp_init_recv_ctx(queue);
>   				return -EIO;
>   			}
> @@ -920,6 +928,7 @@ static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
>   static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
>   {
>   	struct nvme_tcp_queue *queue = req->queue;
> +	struct request *rq = blk_mq_rq_from_pdu(req);
>   
>   	while (true) {
>   		struct page *page = nvme_tcp_req_cur_page(req);
> @@ -933,6 +942,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
>   		else
>   			flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
>   
> +		trace_nvme_tcp_data(rq, req, offset, len, last);

Maybe call the trace nvme_tcp_send_data?

>   		if (sendpage_ok(page)) {
>   			ret = kernel_sendpage(queue->sock, page, offset, len,
>   					flags);
> @@ -972,7 +982,8 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
>   	u8 hdgst = nvme_tcp_hdgst_len(queue);
>   	int len = sizeof(*pdu) + hdgst - req->offset;
>   	int flags = MSG_DONTWAIT;
> -	int ret;
> +	struct request *rq = blk_mq_rq_from_pdu(req);
> +	int offset, ret;
>   
>   	if (inline_data || nvme_tcp_queue_more(queue))
>   		flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
> @@ -982,8 +993,10 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
>   	if (queue->hdr_digest && !req->offset)
>   		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
>   
> -	ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
> -			offset_in_page(pdu) + req->offset, len,  flags);
> +	offset = offset_in_page(pdu) + req->offset;
> +	trace_nvme_tcp_cmd_pdu(rq, req, offset, len);
> +	ret = kernel_sendpage(queue->sock, virt_to_page(pdu), offset,
> +			len,  flags);
>   	if (unlikely(ret <= 0))
>   		return ret;
>   
> @@ -1009,14 +1022,16 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
>   	struct nvme_tcp_data_pdu *pdu = req->pdu;
>   	u8 hdgst = nvme_tcp_hdgst_len(queue);
>   	int len = sizeof(*pdu) - req->offset + hdgst;
> -	int ret;
> +	struct request *rq = blk_mq_rq_from_pdu(req);
> +	int offset, ret;
>   
>   	if (queue->hdr_digest && !req->offset)
>   		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));

I can add in the digest pieces as well.

>   
> -	ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
> -			offset_in_page(pdu) + req->offset, len,
> -			MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
> +	offset = offset_in_page(pdu) + req->offset;
> +	trace_nvme_tcp_h2c_pdu(rq, req, offset, len);
> +	ret = kernel_sendpage(queue->sock, virt_to_page(pdu), offset,
> +			len, MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
>   	if (unlikely(ret <= 0))
>   		return ret;
>   
> @@ -2341,6 +2356,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
>   	if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
>   		return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);
>   
> +	trace_nvme_tcp_queue_rq(rq);
>   	ret = nvme_tcp_setup_cmd_pdu(ns, rq);
>   	if (unlikely(ret))
>   		return ret;
> diff --git a/drivers/nvme/host/trace-tcp.h b/drivers/nvme/host/trace-tcp.h
> new file mode 100644
> index 000000000000..4c720c8d32c3
> --- /dev/null
> +++ b/drivers/nvme/host/trace-tcp.h
> @@ -0,0 +1,209 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM nvme_tcp
> +
> +#if !defined(_TRACE_NVME_TCP_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_NVME_TCP_H
> +
> +#include <linux/nvme.h>
> +#include <linux/tracepoint.h>
> +#include <linux/trace_seq.h>
> +
> +#include "nvme.h"
> +
> +TRACE_EVENT(nvme_tcp_queue_rq,
> +	TP_PROTO(struct request *rq),
> +	TP_ARGS(rq),
> +	TP_STRUCT__entry(
> +		__field(unsigned, ctrl_id)
> +		__field(unsigned, qid)
> +		__field(unsigned, op)
> +		__field(unsigned, tag)
> +		__field(unsigned, len)
> +	),
> +	TP_fast_assign(
> +		__entry->ctrl_id = nvme_req(rq)->ctrl->instance;
> +		__entry->qid = nvme_req_qid(rq);
> +		__entry->op = req_op(rq);
> +		__entry->tag = rq->tag;
> +		__entry->len = blk_rq_payload_bytes(rq);
> +	),
> +	TP_printk("nvme%u: qid=%u tag=%x op=%x data_len=%u",
> +		__entry->ctrl_id, __entry->qid, __entry->tag, __entry->op,
> +		__entry->len
> +	)
> +);
> +
> +TRACE_EVENT(nvme_tcp_cmd_pdu,
> +	TP_PROTO(struct request *rq, struct nvme_tcp_request *req, unsigned offset, unsigned len),
> +	TP_ARGS(rq, req, offset, len),
> +	TP_STRUCT__entry(
> +		__field(unsigned, ctrl_id)
> +		__field(unsigned, qid)
> +		__field(unsigned, op)
> +		__field(unsigned, tag)
> +		__field(unsigned, page_offset)
> +		__field(unsigned, len)
> +	),
> +	TP_fast_assign(
> +		__entry->ctrl_id = nvme_req(rq)->ctrl->instance;
> +		__entry->qid = nvme_req_qid(rq);
> +		__entry->op = req_op(rq);
> +		__entry->tag = rq->tag;
> +		__entry->page_offset = offset;
> +		__entry->len = len;
> +	),
> +	TP_printk("nvme%u: qid=%u tag=%x op=%x page_offset=%u send_len=%u",
> +		__entry->ctrl_id, __entry->qid, __entry->tag, __entry->op,
> +		__entry->page_offset, __entry->len
> +	)
> +);
> +
> +TRACE_EVENT(nvme_tcp_h2c_pdu,
> +	TP_PROTO(struct request *rq, struct nvme_tcp_request *req, unsigned offset, unsigned len),
> +	TP_ARGS(rq, req, offset, len),
> +	TP_STRUCT__entry(
> +		__field(unsigned, ctrl_id)
> +		__field(unsigned, qid)
> +		__field(unsigned, op)
> +		__field(unsigned, tag)
> +		__field(unsigned, page_offset)
> +		__field(unsigned, len)
> +	),
> +	TP_fast_assign(
> +		__entry->ctrl_id = nvme_req(rq)->ctrl->instance;
> +		__entry->qid = nvme_req_qid(rq);
> +		__entry->op = req_op(rq);
> +		__entry->tag = rq->tag;
> +		__entry->page_offset = offset;
> +		__entry->len = len;
> +	),
> +	TP_printk("nvme%u: qid=%u tag=%x op=%x page_offset=%u send_len=%u",
> +		__entry->ctrl_id, __entry->qid, __entry->tag, __entry->op,
> +		__entry->page_offset, __entry->len
> +	)
> +);
> +
> +TRACE_EVENT(nvme_tcp_data,
> +	TP_PROTO(struct request *rq, struct nvme_tcp_request *req, unsigned offset, unsigned len, bool last),
> +	TP_ARGS(rq, req, offset, len, last),
> +	TP_STRUCT__entry(
> +		__field(unsigned, ctrl_id)
> +		__field(unsigned, qid)
> +		__field(unsigned, op)
> +		__field(unsigned, tag)
> +		__field(unsigned, page_offset)
> +		__field(unsigned, len)
> +		__field(unsigned, sent)
> +		__field(bool, last)
> +	),
> +	TP_fast_assign(
> +		__entry->ctrl_id = nvme_req(rq)->ctrl->instance;
> +		__entry->qid = nvme_req_qid(rq);
> +		__entry->op = req_op(rq);
> +		__entry->tag = rq->tag;
> +		__entry->page_offset = offset;
> +		__entry->len = len;
> +		__entry->sent = req->pdu_sent;
> +		__entry->last = last;
> +	),
> +	TP_printk("nvme%u: qid=%u tag=%x op=%x page_offset=%u send_len=%u pdu_sent=%u last=%u",
> +		__entry->ctrl_id, __entry->qid, __entry->tag, __entry->op,
> +		__entry->page_offset, __entry->len, __entry->sent, __entry->last
> +	)
> +);
> +
> +TRACE_EVENT(nvme_tcp_r2t,
> +	TP_PROTO(struct request *rq, struct nvme_tcp_request *req, unsigned offset),
> +	TP_ARGS(rq, req, offset),
> +	TP_STRUCT__entry(
> +		__field(unsigned, ctrl_id)
> +		__field(unsigned, qid)
> +		__field(unsigned, op)
> +		__field(unsigned, tag)
> +		__field(unsigned, len)
> +		__field(unsigned, offset)
> +		__field(unsigned, sent)
> +		__field(unsigned, data_len)
> +	),
> +	TP_fast_assign(
> +		__entry->ctrl_id = nvme_req(rq)->ctrl->instance;
> +		__entry->qid = nvme_req_qid(rq);
> +		__entry->op = req_op(rq);
> +		__entry->tag = rq->tag;
> +		__entry->len = req->pdu_len;
> +		__entry->offset = offset;
> +		__entry->sent = req->data_sent;
> +		__entry->data_len = req->data_len;
> +	),
> +	TP_printk("nvme%u: qid=%u tag=%x op=%x r2t_len=%u r2t_offset=%u data_sent=%u data_len=%u",
> +		__entry->ctrl_id, __entry->qid, __entry->tag, __entry->op,
> +		__entry->len, __entry->offset, __entry->sent, __entry->data_len
> +	)

What may be useful here (and probably for all incoming pdu types) is to
also log the pdu header details.

Something like: "... ch: type=%x flags=%x hlen=%u plen=%u"

Thoughts?

> +);
> +
> +TRACE_EVENT(nvme_tcp_c2h,
> +	TP_PROTO(struct request *rq, struct nvme_tcp_data_pdu *pdu),
> +	TP_ARGS(rq, pdu),
> +	TP_STRUCT__entry(
> +		__field(unsigned, ctrl_id)
> +		__field(unsigned, qid)
> +		__field(unsigned, op)
> +		__field(unsigned, tag)
> +		__field(unsigned, flags)
> +		__field(unsigned, data_len)
> +	),
> +	TP_fast_assign(
> +		__entry->ctrl_id = nvme_req(rq)->ctrl->instance;
> +		__entry->qid = nvme_req_qid(rq);
> +		__entry->op = req_op(rq);
> +		__entry->tag = rq->tag;
> +		__entry->flags = pdu->hdr.flags;
> +		__entry->data_len = le32_to_cpu(pdu->data_length);
> +	),
> +	TP_printk("nvme%u: qid=%u tag=%x op=%x data_len=%u flags=%#x",
> +		__entry->ctrl_id, __entry->qid, __entry->tag, __entry->op,
> +		__entry->data_len, __entry->flags
> +	)

I see you did capture the flags here...

> +);
> +
> +TRACE_EVENT(nvme_tcp_recv_data,
> +	TP_PROTO(struct request *rq, struct nvme_tcp_queue *queue, struct nvme_tcp_data_pdu *pdu, unsigned offset, unsigned len),
> +	TP_ARGS(rq, queue, pdu, offset, len),
> +	TP_STRUCT__entry(
> +		__field(unsigned, ctrl_id)
> +		__field(unsigned, qid)
> +		__field(unsigned, op)
> +		__field(unsigned, tag)
> +		__field(unsigned, flags)
> +		__field(unsigned, data_len)
> +		__field(unsigned, offset)
> +		__field(unsigned, data_remaining)
> +	),
> +	TP_fast_assign(
> +		__entry->ctrl_id = nvme_req(rq)->ctrl->instance;
> +		__entry->qid = nvme_req_qid(rq);
> +		__entry->op = req_op(rq);
> +		__entry->tag = rq->tag;
> +		__entry->flags = pdu->hdr.flags;
> +		__entry->data_len = len;
> +		__entry->offset = offset;
> +		__entry->data_remaining = queue->data_remaining;
> +	),
> +	TP_printk("nvme%u: qid=%u tag=%x op=%x data_len=%u offset=%u flags=%#x data_remaining=%u",
> +		__entry->ctrl_id, __entry->qid, __entry->tag, __entry->op,
> +		__entry->data_len, __entry->offset, __entry->flags,
> +		__entry->data_remaining
> +	)
> +);
> +
> +#endif /* _TRACE_NVME_TCP_H */
> +
> +#undef TRACE_INCLUDE_PATH
> +#define TRACE_INCLUDE_PATH .
> +#undef TRACE_INCLUDE_FILE
> +#define TRACE_INCLUDE_FILE trace-tcp
> +
> +/* This part must be outside protection */
> +#include <trace/define_trace.h>
> 



More information about the Linux-nvme mailing list