[PATCH] nvme-tcp: add command tracking trace events
Sagi Grimberg
sagi at grimberg.me
Fri May 7 10:46:30 PDT 2021
> Debugging nvme-tcp issues can be difficult due to lack of visibility
> into the driver's execution flow for any particular command. Add trace
> events that detail the command cycle. Also add details to existing error
> messages so they can more readily be matched up with those traces.
>
> There is no functional change in this patch.
Thanks!
>
> Signed-off-by: Keith Busch <kbusch at kernel.org>
> ---
> drivers/nvme/host/tcp.c | 46 +++++---
> drivers/nvme/host/trace-tcp.h | 209 ++++++++++++++++++++++++++++++++++
> 2 files changed, 240 insertions(+), 15 deletions(-)
> create mode 100644 drivers/nvme/host/trace-tcp.h
>
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 0222e23f5936..bbcf48f6345e 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -218,6 +218,9 @@ static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
> return nvme_tcp_pdu_data_left(req) <= len;
> }
>
> +#define CREATE_TRACE_POINTS
> +#include "trace-tcp.h"
> +
> static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
> unsigned int dir)
> {
> @@ -525,6 +528,7 @@ static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
>
> queue->data_remaining = le32_to_cpu(pdu->data_length);
>
> + trace_nvme_tcp_c2h(rq, pdu);
> if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS &&
> unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) {
> dev_err(queue->ctrl->ctrl.device,
> @@ -567,10 +571,13 @@ static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req,
> struct request *rq = blk_mq_rq_from_pdu(req);
> u8 hdgst = nvme_tcp_hdgst_len(queue);
> u8 ddgst = nvme_tcp_ddgst_len(queue);
> + unsigned r2t_offset = le32_to_cpu(pdu->r2t_offset);
>
> req->pdu_len = le32_to_cpu(pdu->r2t_length);
> req->pdu_sent = 0;
>
> + trace_nvme_tcp_r2t(rq, req, r2t_offset);
> +
> if (unlikely(!req->pdu_len)) {
> dev_err(queue->ctrl->ctrl.device,
> "req %d r2t len is %u, probably a bug...\n",
> @@ -580,17 +587,16 @@ static int nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req,
>
> if (unlikely(req->data_sent + req->pdu_len > req->data_len)) {
> dev_err(queue->ctrl->ctrl.device,
> - "req %d r2t len %u exceeded data len %u (%zu sent)\n",
> - rq->tag, req->pdu_len, req->data_len,
> - req->data_sent);
> + "req %d op %d r2t len %u exceeded data len %u (%zu sent) pdu offset %u\n",
Better to log it "pdu r2t_offset".
> + rq->tag, req_op(rq), req->pdu_len, req->data_len,
> + req->data_sent, r2t_offset);
> return -EPROTO;
> }
>
> - if (unlikely(le32_to_cpu(pdu->r2t_offset) < req->data_sent)) {
> + if (unlikely(r2t_offset < req->data_sent)) {
> dev_err(queue->ctrl->ctrl.device,
> "req %d unexpected r2t offset %u (expected %zu)\n",
> - rq->tag, le32_to_cpu(pdu->r2t_offset),
> - req->data_sent);
> + rq->tag, r2t_offset, req->data_sent);
> return -EPROTO;
> }
>
> @@ -714,6 +720,7 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
> }
> req = blk_mq_rq_to_pdu(rq);
>
> + trace_nvme_tcp_recv_data(rq, queue, pdu, *offset, *len);
This is the skb offset,len is that what you wanted to log?
> while (true) {
> int recv_len, ret;
>
> @@ -730,8 +737,9 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
> */
> if (!req->curr_bio) {
> dev_err(queue->ctrl->ctrl.device,
> - "queue %d no space in request %#x",
> - nvme_tcp_queue_id(queue), rq->tag);
> + "queue %d no space in request %#x data_remaining %lu",
Better to log it "queue data remaining" because its the queue recv state
machine and not the request.
> + nvme_tcp_queue_id(queue), rq->tag,
> + queue->data_remaining);
> nvme_tcp_init_recv_ctx(queue);
> return -EIO;
> }
> @@ -920,6 +928,7 @@ static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
> static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
> {
> struct nvme_tcp_queue *queue = req->queue;
> + struct request *rq = blk_mq_rq_from_pdu(req);
>
> while (true) {
> struct page *page = nvme_tcp_req_cur_page(req);
> @@ -933,6 +942,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
> else
> flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
>
> + trace_nvme_tcp_data(rq, req, offset, len, last);
Maybe call the trace nvme_tcp_send_data?
> if (sendpage_ok(page)) {
> ret = kernel_sendpage(queue->sock, page, offset, len,
> flags);
> @@ -972,7 +982,8 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
> u8 hdgst = nvme_tcp_hdgst_len(queue);
> int len = sizeof(*pdu) + hdgst - req->offset;
> int flags = MSG_DONTWAIT;
> - int ret;
> + struct request *rq = blk_mq_rq_from_pdu(req);
> + int offset, ret;
>
> if (inline_data || nvme_tcp_queue_more(queue))
> flags |= MSG_MORE | MSG_SENDPAGE_NOTLAST;
> @@ -982,8 +993,10 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
> if (queue->hdr_digest && !req->offset)
> nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
>
> - ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
> - offset_in_page(pdu) + req->offset, len, flags);
> + offset = offset_in_page(pdu) + req->offset;
> + trace_nvme_tcp_cmd_pdu(rq, req, offset, len);
> + ret = kernel_sendpage(queue->sock, virt_to_page(pdu), offset,
> + len, flags);
> if (unlikely(ret <= 0))
> return ret;
>
> @@ -1009,14 +1022,16 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
> struct nvme_tcp_data_pdu *pdu = req->pdu;
> u8 hdgst = nvme_tcp_hdgst_len(queue);
> int len = sizeof(*pdu) - req->offset + hdgst;
> - int ret;
> + struct request *rq = blk_mq_rq_from_pdu(req);
> + int offset, ret;
>
> if (queue->hdr_digest && !req->offset)
> nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
I can add in the digest pieces as well.
>
> - ret = kernel_sendpage(queue->sock, virt_to_page(pdu),
> - offset_in_page(pdu) + req->offset, len,
> - MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
> + offset = offset_in_page(pdu) + req->offset;
> + trace_nvme_tcp_h2c_pdu(rq, req, offset, len);
> + ret = kernel_sendpage(queue->sock, virt_to_page(pdu), offset,
> + len, MSG_DONTWAIT | MSG_MORE | MSG_SENDPAGE_NOTLAST);
> if (unlikely(ret <= 0))
> return ret;
>
> @@ -2341,6 +2356,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
> if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
> return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);
>
> + trace_nvme_tcp_queue_rq(rq);
> ret = nvme_tcp_setup_cmd_pdu(ns, rq);
> if (unlikely(ret))
> return ret;
> diff --git a/drivers/nvme/host/trace-tcp.h b/drivers/nvme/host/trace-tcp.h
> new file mode 100644
> index 000000000000..4c720c8d32c3
> --- /dev/null
> +++ b/drivers/nvme/host/trace-tcp.h
> @@ -0,0 +1,209 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +
> +#undef TRACE_SYSTEM
> +#define TRACE_SYSTEM nvme_tcp
> +
> +#if !defined(_TRACE_NVME_TCP_H) || defined(TRACE_HEADER_MULTI_READ)
> +#define _TRACE_NVME_TCP_H
> +
> +#include <linux/nvme.h>
> +#include <linux/tracepoint.h>
> +#include <linux/trace_seq.h>
> +
> +#include "nvme.h"
> +
> +TRACE_EVENT(nvme_tcp_queue_rq,
> + TP_PROTO(struct request *rq),
> + TP_ARGS(rq),
> + TP_STRUCT__entry(
> + __field(unsigned, ctrl_id)
> + __field(unsigned, qid)
> + __field(unsigned, op)
> + __field(unsigned, tag)
> + __field(unsigned, len)
> + ),
> + TP_fast_assign(
> + __entry->ctrl_id = nvme_req(rq)->ctrl->instance;
> + __entry->qid = nvme_req_qid(rq);
> + __entry->op = req_op(rq);
> + __entry->tag = rq->tag;
> + __entry->len = blk_rq_payload_bytes(rq);
> + ),
> + TP_printk("nvme%u: qid=%u tag=%x op=%x data_len=%u",
> + __entry->ctrl_id, __entry->qid, __entry->tag, __entry->op,
> + __entry->len
> + )
> +);
> +
> +TRACE_EVENT(nvme_tcp_cmd_pdu,
> + TP_PROTO(struct request *rq, struct nvme_tcp_request *req, unsigned offset, unsigned len),
> + TP_ARGS(rq, req, offset, len),
> + TP_STRUCT__entry(
> + __field(unsigned, ctrl_id)
> + __field(unsigned, qid)
> + __field(unsigned, op)
> + __field(unsigned, tag)
> + __field(unsigned, page_offset)
> + __field(unsigned, len)
> + ),
> + TP_fast_assign(
> + __entry->ctrl_id = nvme_req(rq)->ctrl->instance;
> + __entry->qid = nvme_req_qid(rq);
> + __entry->op = req_op(rq);
> + __entry->tag = rq->tag;
> + __entry->page_offset = offset;
> + __entry->len = len;
> + ),
> + TP_printk("nvme%u: qid=%u tag=%x op=%x page_offset=%u send_len=%u",
> + __entry->ctrl_id, __entry->qid, __entry->tag, __entry->op,
> + __entry->page_offset, __entry->len
> + )
> +);
> +
> +TRACE_EVENT(nvme_tcp_h2c_pdu,
> + TP_PROTO(struct request *rq, struct nvme_tcp_request *req, unsigned offset, unsigned len),
> + TP_ARGS(rq, req, offset, len),
> + TP_STRUCT__entry(
> + __field(unsigned, ctrl_id)
> + __field(unsigned, qid)
> + __field(unsigned, op)
> + __field(unsigned, tag)
> + __field(unsigned, page_offset)
> + __field(unsigned, len)
> + ),
> + TP_fast_assign(
> + __entry->ctrl_id = nvme_req(rq)->ctrl->instance;
> + __entry->qid = nvme_req_qid(rq);
> + __entry->op = req_op(rq);
> + __entry->tag = rq->tag;
> + __entry->page_offset = offset;
> + __entry->len = len;
> + ),
> + TP_printk("nvme%u: qid=%u tag=%x op=%x page_offset=%u send_len=%u",
> + __entry->ctrl_id, __entry->qid, __entry->tag, __entry->op,
> + __entry->page_offset, __entry->len
> + )
> +);
> +
> +TRACE_EVENT(nvme_tcp_data,
> + TP_PROTO(struct request *rq, struct nvme_tcp_request *req, unsigned offset, unsigned len, bool last),
> + TP_ARGS(rq, req, offset, len, last),
> + TP_STRUCT__entry(
> + __field(unsigned, ctrl_id)
> + __field(unsigned, qid)
> + __field(unsigned, op)
> + __field(unsigned, tag)
> + __field(unsigned, page_offset)
> + __field(unsigned, len)
> + __field(unsigned, sent)
> + __field(bool, last)
> + ),
> + TP_fast_assign(
> + __entry->ctrl_id = nvme_req(rq)->ctrl->instance;
> + __entry->qid = nvme_req_qid(rq);
> + __entry->op = req_op(rq);
> + __entry->tag = rq->tag;
> + __entry->page_offset = offset;
> + __entry->len = len;
> + __entry->sent = req->pdu_sent;
> + __entry->last = last;
> + ),
> + TP_printk("nvme%u: qid=%u tag=%x op=%x page_offset=%u send_len=%u pdu_sent=%u last=%u",
> + __entry->ctrl_id, __entry->qid, __entry->tag, __entry->op,
> + __entry->page_offset, __entry->len, __entry->sent, __entry->last
> + )
> +);
> +
> +TRACE_EVENT(nvme_tcp_r2t,
> + TP_PROTO(struct request *rq, struct nvme_tcp_request *req, unsigned offset),
> + TP_ARGS(rq, req, offset),
> + TP_STRUCT__entry(
> + __field(unsigned, ctrl_id)
> + __field(unsigned, qid)
> + __field(unsigned, op)
> + __field(unsigned, tag)
> + __field(unsigned, len)
> + __field(unsigned, offset)
> + __field(unsigned, sent)
> + __field(unsigned, data_len)
> + ),
> + TP_fast_assign(
> + __entry->ctrl_id = nvme_req(rq)->ctrl->instance;
> + __entry->qid = nvme_req_qid(rq);
> + __entry->op = req_op(rq);
> + __entry->tag = rq->tag;
> + __entry->len = req->pdu_len;
> + __entry->offset = offset;
> + __entry->sent = req->data_sent;
> + __entry->data_len = req->data_len;
> + ),
> + TP_printk("nvme%u: qid=%u tag=%x op=%x r2t_len=%u r2t_offset=%u data_sent=%u data_len=%u",
> + __entry->ctrl_id, __entry->qid, __entry->tag, __entry->op,
> + __entry->len, __entry->offset, __entry->sent, __entry->data_len
> + )
What may be useful here (and probably for all incoming pdu types) is to
also log the pdu header details.
Something like: "... ch: type=%x flags=%x hlen=%u plen=%u"
Thoughts?
> +);
> +
> +TRACE_EVENT(nvme_tcp_c2h,
> + TP_PROTO(struct request *rq, struct nvme_tcp_data_pdu *pdu),
> + TP_ARGS(rq, pdu),
> + TP_STRUCT__entry(
> + __field(unsigned, ctrl_id)
> + __field(unsigned, qid)
> + __field(unsigned, op)
> + __field(unsigned, tag)
> + __field(unsigned, flags)
> + __field(unsigned, data_len)
> + ),
> + TP_fast_assign(
> + __entry->ctrl_id = nvme_req(rq)->ctrl->instance;
> + __entry->qid = nvme_req_qid(rq);
> + __entry->op = req_op(rq);
> + __entry->tag = rq->tag;
> + __entry->flags = pdu->hdr.flags;
> + __entry->data_len = le32_to_cpu(pdu->data_length);
> + ),
> + TP_printk("nvme%u: qid=%u tag=%x op=%x data_len=%u flags=%#x",
> + __entry->ctrl_id, __entry->qid, __entry->tag, __entry->op,
> + __entry->data_len, __entry->flags
> + )
I see you did capture the flags here...
> +);
> +
> +TRACE_EVENT(nvme_tcp_recv_data,
> + TP_PROTO(struct request *rq, struct nvme_tcp_queue *queue, struct nvme_tcp_data_pdu *pdu, unsigned offset, unsigned len),
> + TP_ARGS(rq, queue, pdu, offset, len),
> + TP_STRUCT__entry(
> + __field(unsigned, ctrl_id)
> + __field(unsigned, qid)
> + __field(unsigned, op)
> + __field(unsigned, tag)
> + __field(unsigned, flags)
> + __field(unsigned, data_len)
> + __field(unsigned, offset)
> + __field(unsigned, data_remaining)
> + ),
> + TP_fast_assign(
> + __entry->ctrl_id = nvme_req(rq)->ctrl->instance;
> + __entry->qid = nvme_req_qid(rq);
> + __entry->op = req_op(rq);
> + __entry->tag = rq->tag;
> + __entry->flags = pdu->hdr.flags;
> + __entry->data_len = len;
> + __entry->offset = offset;
> + __entry->data_remaining = queue->data_remaining;
> + ),
> + TP_printk("nvme%u: qid=%u tag=%x op=%x data_len=%u offset=%u flags=%#x data_remaining=%u",
> + __entry->ctrl_id, __entry->qid, __entry->tag, __entry->op,
> + __entry->data_len, __entry->offset, __entry->flags,
> + __entry->data_remaining
> + )
> +);
> +
> +#endif /* _TRACE_NVME_TCP_H */
> +
> +#undef TRACE_INCLUDE_PATH
> +#define TRACE_INCLUDE_PATH .
> +#undef TRACE_INCLUDE_FILE
> +#define TRACE_INCLUDE_FILE trace-tcp
> +
> +/* This part must be outside protection */
> +#include <trace/define_trace.h>
>
More information about the Linux-nvme
mailing list