[PATCH 5/8] nvme-tcp: debugfs entries for latency statistics

Hannes Reinecke hare at kernel.org
Tue Jul 16 00:36:13 PDT 2024


From: Hannes Reinecke <hare at suse.de>

Add debugfs entries to display latency statistics:
'send_lat' displays the average latency for sendmsg() calls,
'recv_lat' the average latency for '->read_sock()' calls,
'write_space' the number of 'write_space' callbacks,
'data_ready' the number of 'data_ready' callbacks,
and 'queue_busy' the number of -EAGAIN returns from sendmsg().

Signed-off-by: Hannes Reinecke <hare at kernel.org>
---
 drivers/nvme/host/tcp.c | 253 +++++++++++++++++++++++++++++++++++++++-
 1 file changed, 249 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 9caee99955c2..a758fbb3f9bb 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -21,6 +21,8 @@
 #include <net/busy_poll.h>
 #include <trace/events/sock.h>
 
+#include <linux/debugfs.h>
+
 #include "nvme.h"
 #include "fabrics.h"
 
@@ -139,6 +141,13 @@ enum nvme_tcp_recv_state {
 	NVME_TCP_RECV_DDGST,
 };
 
+struct nvme_tcp_stat {
+	u64 samples;
+	u64 batch;
+	s64 mean;
+	u64 sqmean;
+};
+
 struct nvme_tcp_ctrl;
 struct nvme_tcp_queue {
 	struct socket		*sock;
@@ -164,6 +173,14 @@ struct nvme_tcp_queue {
 	/* send state */
 	struct nvme_tcp_request *request;
 
+	/* statistics */
+	struct dentry		*debugfs_dir;
+	u64			data_ready_cnt;
+	u64			write_space_cnt;
+	u64			queue_busy_cnt;
+	struct nvme_tcp_stat	send_lat;
+	struct nvme_tcp_stat	recv_lat;
+
 	u32			maxh2cdata;
 	size_t			cmnd_capsule_len;
 	struct nvme_tcp_ctrl	*ctrl;
@@ -198,6 +215,7 @@ struct nvme_tcp_ctrl {
 	struct sockaddr_storage src_addr;
 	struct nvme_ctrl	ctrl;
 
+	struct dentry           *debugfs_dir;
 	struct work_struct	err_work;
 	struct delayed_work	connect_work;
 	struct nvme_tcp_request async_req;
@@ -207,10 +225,163 @@ struct nvme_tcp_ctrl {
 static LIST_HEAD(nvme_tcp_ctrl_list);
 static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
 static struct workqueue_struct *nvme_tcp_wq;
+static struct dentry *nvme_tcp_debugfs;
 static const struct blk_mq_ops nvme_tcp_mq_ops;
 static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
 static int nvme_tcp_try_send(struct nvme_tcp_queue *queue);
 
+static void nvme_tcp_stat_add(struct nvme_tcp_stat *stat, u64 start)
+{
+	u64 usec;
+	s64 delta, delta2;
+
+	usec = ktime_to_us(ktime_get());
+	if (usec < start)
+		return;
+	usec -= start;
+	/* Count in nanoseconds to improve precision */
+	usec *= 1000;
+	stat->batch += usec;
+	/* Welford's algorithm */
+	stat->samples++;
+	delta = (s64)usec - stat->mean;
+	stat->mean += div_s64(delta, (s64)stat->samples);
+	delta2 = (s64)usec - stat->mean;
+	stat->sqmean += (u64)(delta * delta2);
+}
+
+#define NVME_TCP_DEBUGFS_ATTR(field)				       \
+       static int field##_open(struct inode *inode, struct file *file) \
+       { return single_open(file, field##_show, inode->i_private); }   \
+								       \
+       static const struct file_operations field##_fops = {	       \
+	       .open = field##_open,				       \
+	       .read = seq_read,				       \
+	       .release = single_release,			       \
+       }
+
+#define NVME_TCP_DEBUGFS_RW_ATTR(field)					\
+	static int field##_open(struct inode *inode, struct file *file) \
+	{ return single_open(file, field##_show, inode->i_private); }	\
+									\
+	static const struct file_operations field##_fops = {		\
+		.open = field##_open,					\
+		.read = seq_read,					\
+		.write = field##_write,					\
+		.release = single_release,				\
+	}
+
+static int nvme_tcp_queue_data_ready_show(struct seq_file *m, void *p)
+{
+	struct nvme_tcp_queue *queue = m->private;
+
+	seq_printf(m, "%llu\n", queue->data_ready_cnt);
+	return 0;
+}
+NVME_TCP_DEBUGFS_ATTR(nvme_tcp_queue_data_ready);
+
+static int nvme_tcp_queue_write_space_show(struct seq_file *m, void *p)
+{
+	struct nvme_tcp_queue *queue = m->private;
+
+	seq_printf(m, "%llu\n", queue->write_space_cnt);
+	return 0;
+}
+NVME_TCP_DEBUGFS_ATTR(nvme_tcp_queue_write_space);
+
+static int nvme_tcp_queue_queue_busy_show(struct seq_file *m, void *p)
+{
+	struct nvme_tcp_queue *queue = m->private;
+
+	seq_printf(m, "%llu\n", queue->queue_busy_cnt);
+	return 0;
+}
+NVME_TCP_DEBUGFS_ATTR(nvme_tcp_queue_queue_busy);
+
+static int nvme_tcp_queue_recv_lat_show(struct seq_file *m, void *p)
+{
+	struct nvme_tcp_queue *queue = m->private;
+	struct nvme_tcp_stat *st = &queue->recv_lat;
+	u64 var = 0;
+
+	if (st->samples > 1)
+		var = div_u64(st->sqmean, st->samples * 1000);
+
+	seq_printf(m, "%llu %llu %u\n",
+		   st->samples, div_u64(st->batch, st->samples * 1000), int_sqrt64(var));
+	return 0;
+}
+NVME_TCP_DEBUGFS_ATTR(nvme_tcp_queue_recv_lat);
+
+static int nvme_tcp_queue_send_lat_show(struct seq_file *m, void *p)
+{
+	struct nvme_tcp_queue *queue = m->private;
+	struct nvme_tcp_stat *st = &queue->send_lat;
+	u64 var = 0;
+
+	if (st->samples > 1)
+		var = div_u64(st->sqmean, st->samples * 1000);
+
+	seq_printf(m, "%llu %llu %u\n",
+		   st->samples, div_u64(st->batch, st->samples * 1000), int_sqrt64(var));
+	return 0;
+}
+NVME_TCP_DEBUGFS_ATTR(nvme_tcp_queue_send_lat);
+
+static int nvme_tcp_ctrl_recv_lat_show(struct seq_file *m, void *p)
+{
+	struct nvme_tcp_ctrl *ctrl = m->private;
+	struct nvme_tcp_stat *st, lat;
+	u64 mean = 0, var = 0;
+	int i;
+
+	lat.samples = 0;
+	lat.mean = 0;
+	lat.sqmean = 0;
+	for (i = 1; i < ctrl->ctrl.queue_count; i++) {
+		st = &ctrl->queues[i].recv_lat;
+		lat.mean += st->mean;
+		lat.sqmean += st->sqmean;
+		lat.samples += st->samples;
+	}
+	if (lat.samples > 1) {
+		mean = lat.mean;
+		var = int_sqrt(div_u64(lat.sqmean, lat.samples));
+	}
+
+	seq_printf(m, "%llu %llu %llu\n",
+		   lat.samples, mean, var);
+	return 0;
+}
+NVME_TCP_DEBUGFS_ATTR(nvme_tcp_ctrl_recv_lat);
+
+static int nvme_tcp_ctrl_send_lat_show(struct seq_file *m, void *p)
+{
+	struct nvme_tcp_ctrl *ctrl = m->private;
+	struct nvme_tcp_stat *st, lat;
+	u64 mean = 0, var = 0;
+	int i;
+
+	lat.samples = 0;
+	lat.mean = 0;
+	lat.sqmean = 0;
+	for (i = 1; i < ctrl->ctrl.queue_count; i++) {
+		st = &ctrl->queues[i].send_lat;
+		lat.mean += st->mean;
+		lat.sqmean += st->sqmean;
+		lat.samples += st->samples;
+	}
+	if (lat.samples > 1) {
+		mean = lat.mean;
+		var = int_sqrt(div_u64(lat.sqmean, lat.samples));
+	}
+
+	seq_printf(m, "%llu %llu %llu\n",
+		   lat.samples, mean, var);
+	return 0;
+}
+NVME_TCP_DEBUGFS_ATTR(nvme_tcp_ctrl_send_lat);
+
 static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
 {
 	return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
@@ -985,8 +1156,10 @@ static void nvme_tcp_data_ready(struct sock *sk)
 	read_lock_bh(&sk->sk_callback_lock);
 	queue = sk->sk_user_data;
 	if (likely(queue && queue->rd_enabled) &&
-	    !test_bit(NVME_TCP_Q_POLLING, &queue->flags))
+	    !test_bit(NVME_TCP_Q_POLLING, &queue->flags)) {
 		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+		queue->data_ready_cnt++;
+	}
 	read_unlock_bh(&sk->sk_callback_lock);
 }
 
@@ -999,6 +1172,7 @@ static void nvme_tcp_write_space(struct sock *sk)
 	if (likely(queue && sk_stream_is_writeable(sk))) {
 		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+		queue->write_space_cnt++;
 	}
 	read_unlock_bh(&sk->sk_callback_lock);
 }
@@ -1067,6 +1241,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
 		bool last = nvme_tcp_pdu_last_send(req, len);
 		int req_data_sent = req->data_sent;
 		int ret;
+		u64 start;
 
 		if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
 			msg.msg_flags |= MSG_EOR;
@@ -1079,7 +1254,9 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
 		bvec_set_page(&bvec, page, len, offset);
 		iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
 		queue->nr_send++;
+		start = ktime_to_us(ktime_get());
 		ret = sock_sendmsg(queue->sock, &msg);
+		nvme_tcp_stat_add(&queue->send_lat, start);
 		if (ret <= 0)
 			return ret;
 
@@ -1124,6 +1301,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 	int len = sizeof(*pdu) + hdgst - req->offset;
 	int ret;
+	u64 start;
 
 	if (inline_data || nvme_tcp_queue_more(queue))
 		msg.msg_flags |= MSG_MORE;
@@ -1136,7 +1314,9 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
 	bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
 	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
 	queue->nr_send++;
+	start = ktime_to_us(ktime_get());
 	ret = sock_sendmsg(queue->sock, &msg);
+	nvme_tcp_stat_add(&queue->send_lat, start);
 	if (unlikely(ret <= 0))
 		return ret;
 
@@ -1165,6 +1345,7 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
 	u8 hdgst = nvme_tcp_hdgst_len(queue);
 	int len = sizeof(*pdu) - req->offset + hdgst;
 	int ret;
+	u64 start;
 
 	if (queue->hdr_digest && !req->offset)
 		nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
@@ -1175,7 +1356,9 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
 	bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
 	iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
 	queue->nr_send++;
+	start = ktime_to_us(ktime_get());
 	ret = sock_sendmsg(queue->sock, &msg);
+	nvme_tcp_stat_add(&queue->send_lat, start);
 	if (unlikely(ret <= 0))
 		return ret;
 
@@ -1202,6 +1385,7 @@ static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
 		.iov_base = (u8 *)&req->ddgst + req->offset,
 		.iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
 	};
+	u64 start;
 
 	if (nvme_tcp_queue_more(queue))
 		msg.msg_flags |= MSG_MORE;
@@ -1209,7 +1393,9 @@ static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
 		msg.msg_flags |= MSG_EOR;
 
 	queue->nr_send++;
+	start = ktime_to_us(ktime_get());
 	ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
+	nvme_tcp_stat_add(&queue->send_lat, start);
 	if (unlikely(ret <= 0))
 		return ret;
 
@@ -1263,6 +1449,7 @@ static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
 		ret = nvme_tcp_try_send_ddgst(req);
 done:
 	if (ret == -EAGAIN) {
+		queue->queue_busy_cnt++;
 		ret = 0;
 	} else if (ret < 0) {
 		dev_err(queue->ctrl->ctrl.device,
@@ -1281,13 +1468,16 @@ static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
 	struct sock *sk = sock->sk;
 	read_descriptor_t rd_desc;
 	int consumed;
+	u64 start;
 
 	rd_desc.arg.data = queue;
 	rd_desc.count = 1;
 	lock_sock(sk);
 	queue->nr_cqe = 0;
 	queue->nr_recv = 0;
+	start = ktime_to_us(ktime_get());
 	consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
+	nvme_tcp_stat_add(&queue->recv_lat, start);
 	release_sock(sk);
 	return consumed;
 }
@@ -1331,11 +1521,18 @@ static void nvme_tcp_io_work(struct work_struct *w)
 check:
 	overrun = ktime_to_us(ktime_get()) - start;
 	if (overrun > 10 * deadline) {
+		u64 slat = 0, clat = 0;
+
+		if (queue->send_lat.samples)
+			slat = queue->send_lat.mean;
+		if (queue->recv_lat.samples)
+			clat = queue->recv_lat.mean;
 		dev_dbg(queue->ctrl->ctrl.device,
-			"queue %d: stall (%llu msecs) send %u sqe %u recv %u cqe %u%s%s\n",
+			"queue %d: stall (%llu msecs), send %u lat %llu %u sqes, recv %u lat %llu %u cqes%s%s\n",
 			nvme_tcp_queue_id(queue), div_u64(overrun, 1000),
-			queue->nr_send, queue->nr_sqe, queue->nr_recv, queue->nr_cqe,
-			list_empty(&queue->send_list) ? " empty" : "", queue->request ? " pending" : "");
+			queue->nr_send, slat, queue->nr_sqe, queue->nr_recv, clat, queue->nr_cqe,
+			list_empty(&queue->send_list) ? " empty" : "",
+			queue->request ? " pending" : "");
 	}
 	if (pending)
 		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
@@ -1408,6 +1605,11 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
 	if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
 		return;
 
+	if (queue->debugfs_dir) {
+		debugfs_remove_recursive(queue->debugfs_dir);
+		queue->debugfs_dir = NULL;
+	}
+
 	if (queue->hdr_digest || queue->data_digest)
 		nvme_tcp_free_crypto(queue);
 
@@ -1696,6 +1898,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
 	struct nvme_tcp_queue *queue = &ctrl->queues[qid];
 	int ret, rcv_pdu_size;
 	struct file *sock_file;
+	char queue_name[32];
 
 	mutex_init(&queue->queue_lock);
 	queue->ctrl = ctrl;
@@ -1756,6 +1959,11 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
 	queue->ddgst_remaining = 0;
 	queue->pdu_remaining = 0;
 	queue->pdu_offset = 0;
+	queue->write_space_cnt = 0;
+	queue->data_ready_cnt = 0;
+	queue->queue_busy_cnt = 0;
+	memset(&queue->recv_lat, 0, sizeof(struct nvme_tcp_stat));
+	memset(&queue->send_lat, 0, sizeof(struct nvme_tcp_stat));
 	sk_set_memalloc(queue->sock->sk);
 
 	if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
@@ -1802,6 +2010,19 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
 		goto err_crypto;
 	}
 
+	sprintf(queue_name, "%d", nvme_tcp_queue_id(queue));
+	queue->debugfs_dir = debugfs_create_dir(queue_name, ctrl->debugfs_dir);
+	debugfs_create_file("recv_lat", S_IRUSR, queue->debugfs_dir,
+			    queue, &nvme_tcp_queue_recv_lat_fops);
+	debugfs_create_file("send_lat", S_IRUSR, queue->debugfs_dir,
+			    queue, &nvme_tcp_queue_send_lat_fops);
+	debugfs_create_file("data_ready", S_IRUSR, queue->debugfs_dir,
+			    queue, &nvme_tcp_queue_data_ready_fops);
+	debugfs_create_file("write_space", S_IRUSR, queue->debugfs_dir,
+			    queue, &nvme_tcp_queue_write_space_fops);
+	debugfs_create_file("queue_busy", S_IRUSR, queue->debugfs_dir,
+			    queue, &nvme_tcp_queue_queue_busy_fops);
+
 	dev_dbg(nctrl->device, "connecting queue %d\n",
 			nvme_tcp_queue_id(queue));
 
@@ -2420,7 +2641,14 @@ static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
 
 static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
 {
+	struct nvme_tcp_ctrl *nctrl = to_tcp_ctrl(ctrl);
+
 	nvme_tcp_teardown_ctrl(ctrl, true);
+
+	if (nctrl->debugfs_dir) {
+		debugfs_remove_recursive(nctrl->debugfs_dir);
+		nctrl->debugfs_dir = NULL;
+	}
 }
 
 static void nvme_reset_ctrl_work(struct work_struct *work)
@@ -2890,6 +3118,13 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
 	if (ret)
 		goto out_put_ctrl;
 
+	ctrl->debugfs_dir = debugfs_create_dir(dev_name(ctrl->ctrl.device),
+					       nvme_tcp_debugfs);
+	debugfs_create_file("recv_lat", S_IRUSR, ctrl->debugfs_dir,
+			    ctrl, &nvme_tcp_ctrl_recv_lat_fops);
+	debugfs_create_file("send_lat", S_IRUSR, ctrl->debugfs_dir,
+			    ctrl, &nvme_tcp_ctrl_send_lat_fops);
+
 	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
 		WARN_ON_ONCE(1);
 		ret = -EINTR;
@@ -2910,6 +3145,10 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
 	return &ctrl->ctrl;
 
 out_uninit_ctrl:
+	if (ctrl->debugfs_dir) {
+		debugfs_remove_recursive(ctrl->debugfs_dir);
+		ctrl->debugfs_dir = NULL;
+	}
 	nvme_uninit_ctrl(&ctrl->ctrl);
 out_put_ctrl:
 	nvme_put_ctrl(&ctrl->ctrl);
@@ -2951,6 +3190,10 @@ static int __init nvme_tcp_init_module(void)
 	if (!nvme_tcp_wq)
 		return -ENOMEM;
 
+	nvme_tcp_debugfs = debugfs_create_dir("nvme_tcp", NULL);
+	if (!nvme_tcp_debugfs)
+		return -ENOMEM;
+
 	nvmf_register_transport(&nvme_tcp_transport);
 	return 0;
 }
@@ -2961,6 +3204,8 @@ static void __exit nvme_tcp_cleanup_module(void)
 
 	nvmf_unregister_transport(&nvme_tcp_transport);
 
+	debugfs_remove_recursive(nvme_tcp_debugfs);
+
 	mutex_lock(&nvme_tcp_ctrl_mutex);
 	list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
 		nvme_delete_ctrl(&ctrl->ctrl);
-- 
2.35.3




More information about the Linux-nvme mailing list