[PATCH 5/8] nvme-tcp: debugfs entries for latency statistics
Sagi Grimberg
sagi at grimberg.me
Wed Jul 17 14:14:01 PDT 2024
On 16/07/2024 10:36, Hannes Reinecke wrote:
> From: Hannes Reinecke <hare at suse.de>
>
> Add debugfs entries to display latency statistics:
> 'send_lat' displays the average latency for sendmsg() calls,
> 'recv_lat' the average latency for '->read_sock()' calls,
> 'write_space' the number of 'write_space' callbacks,
> 'data_ready' the number of 'data_ready' callbacks,
> and 'queue_busy' the number of -EAGAIN returns from sendmsg().
This is for sure something that ebpf is better suited for IMO.
What I'd recommend is that we add tracepoints and move this
stats logic to an ebpf program (the iovisor project would probably
welcome such a program).
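
Something along these lines could do the accounting out-of-band
(a rough, untested sketch -- the two tracepoints are hypothetical
and would need to be added to the driver first, the names below are
made up for illustration):

/* nvme_tcp_lat.bpf.c: per-thread sendmsg latency, aggregated in userspace */
#include "vmlinux.h"
#include <bpf/bpf_helpers.h>

struct {
        __uint(type, BPF_MAP_TYPE_HASH);
        __uint(max_entries, 10240);
        __type(key, u32);       /* tid */
        __type(value, u64);     /* sendmsg start timestamp, ns */
} start SEC(".maps");

SEC("tp/nvme_tcp/send_start")   /* hypothetical tracepoint */
int send_start(void *ctx)
{
        u32 tid = (u32)bpf_get_current_pid_tgid();
        u64 ts = bpf_ktime_get_ns();

        bpf_map_update_elem(&start, &tid, &ts, BPF_ANY);
        return 0;
}

SEC("tp/nvme_tcp/send_end")     /* hypothetical tracepoint */
int send_end(void *ctx)
{
        u32 tid = (u32)bpf_get_current_pid_tgid();
        u64 *tsp = bpf_map_lookup_elem(&start, &tid);

        if (!tsp)
                return 0;
        /* emit the raw delta; the mean/variance math moves to userspace */
        bpf_printk("nvme_tcp send_lat_ns=%llu", bpf_ktime_get_ns() - *tsp);
        bpf_map_delete_elem(&start, &tid);
        return 0;
}

char LICENSE[] SEC("license") = "GPL";

That keeps the aggregation (and the debugfs plumbing) out of the
driver hot path entirely, and the same program extends naturally
to the recv side.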
>
> Signed-off-by: Hannes Reinecke <hare at kernel.org>
> ---
> drivers/nvme/host/tcp.c | 253 +++++++++++++++++++++++++++++++++++++++-
> 1 file changed, 249 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
> index 9caee99955c2..a758fbb3f9bb 100644
> --- a/drivers/nvme/host/tcp.c
> +++ b/drivers/nvme/host/tcp.c
> @@ -21,6 +21,8 @@
> #include <net/busy_poll.h>
> #include <trace/events/sock.h>
>
> +#include <linux/debugfs.h>
> +
> #include "nvme.h"
> #include "fabrics.h"
>
> @@ -139,6 +141,13 @@ enum nvme_tcp_recv_state {
> NVME_TCP_RECV_DDGST,
> };
>
> +struct nvme_tcp_stat {
> + u64 samples;
> + u64 batch;
> + s64 mean;
> + u64 sqmean;
> +};
> +
> struct nvme_tcp_ctrl;
> struct nvme_tcp_queue {
> struct socket *sock;
> @@ -164,6 +173,14 @@ struct nvme_tcp_queue {
> /* send state */
> struct nvme_tcp_request *request;
>
> + /* statistics */
> + struct dentry *debugfs_dir;
> + u64 data_ready_cnt;
> + u64 write_space_cnt;
> + u64 queue_busy_cnt;
> + struct nvme_tcp_stat send_lat;
> + struct nvme_tcp_stat recv_lat;
> +
> u32 maxh2cdata;
> size_t cmnd_capsule_len;
> struct nvme_tcp_ctrl *ctrl;
> @@ -198,6 +215,7 @@ struct nvme_tcp_ctrl {
> struct sockaddr_storage src_addr;
> struct nvme_ctrl ctrl;
>
> + struct dentry *debugfs_dir;
> struct work_struct err_work;
> struct delayed_work connect_work;
> struct nvme_tcp_request async_req;
> @@ -207,10 +225,163 @@ struct nvme_tcp_ctrl {
> static LIST_HEAD(nvme_tcp_ctrl_list);
> static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
> static struct workqueue_struct *nvme_tcp_wq;
> +static struct dentry *nvme_tcp_debugfs;
> static const struct blk_mq_ops nvme_tcp_mq_ops;
> static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
> static int nvme_tcp_try_send(struct nvme_tcp_queue *queue);
>
> +static void nvme_tcp_stat_add(struct nvme_tcp_stat *stat, u64 start)
> +{
> + u64 usec;
> + s64 delta, delta2;
> +
> + usec = ktime_to_us(ktime_get());
> + if (usec < start)
> + return;
> + usec -= start;
> + /* Count in nanoseconds to improve precision */
> + usec *= 1000;
> + stat->batch += usec;
> + /* Welford's algorithm */
> + stat->samples++;
> + delta = (s64)usec - stat->mean;
> + stat->mean += div_s64(delta, (s64)stat->samples);
> + delta2 = (s64)usec - stat->mean;
> + stat->sqmean += (u64)(delta * delta2);
> +}
> +
> +#define NVME_TCP_DEBUGFS_ATTR(field) \
> + static int field##_open(struct inode *inode, struct file *file) \
> + { return single_open(file, field##_show, inode->i_private); } \
> + \
> + static const struct file_operations field##_fops = { \
> + .open = field##_open, \
> + .read = seq_read, \
> + .release = single_release, \
> + }
> +
> +#define NVME_TCP_DEBUGFS_RW_ATTR(field) \
> + static int field##_open(struct inode *inode, struct file *file) \
> + { return single_open(file, field##_show, inode->i_private); } \
> + \
> + static const struct file_operations field##_fops = { \
> + .open = field##_open, \
> + .read = seq_read, \
> + .write = field##_write, \
> + .release = single_release, \
> + }
> +
> +static int nvme_tcp_queue_data_ready_show(struct seq_file *m, void *p)
> +{
> + struct nvme_tcp_queue *queue = m->private;
> +
> + seq_printf(m, "%llu\n", queue->data_ready_cnt);
> + return 0;
> +}
> +NVME_TCP_DEBUGFS_ATTR(nvme_tcp_queue_data_ready);
> +
> +static int nvme_tcp_queue_write_space_show(struct seq_file *m, void *p)
> +{
> + struct nvme_tcp_queue *queue = m->private;
> +
> + seq_printf(m, "%llu\n", queue->write_space_cnt);
> + return 0;
> +}
> +NVME_TCP_DEBUGFS_ATTR(nvme_tcp_queue_write_space);
> +
> +static int nvme_tcp_queue_queue_busy_show(struct seq_file *m, void *p)
> +{
> + struct nvme_tcp_queue *queue = m->private;
> +
> + seq_printf(m, "%llu\n", queue->queue_busy_cnt);
> + return 0;
> +}
> +NVME_TCP_DEBUGFS_ATTR(nvme_tcp_queue_queue_busy);
> +
> +static int nvme_tcp_queue_recv_lat_show(struct seq_file *m, void *p)
> +{
> + struct nvme_tcp_queue *queue = m->private;
> + struct nvme_tcp_stat *st = &queue->recv_lat;
> + u64 mean = 0, var = 0;
> +
> + if (st->samples) {
> + mean = div_u64(st->batch, st->samples * 1000);
> + var = int_sqrt64(div_u64(st->sqmean, st->samples * 1000));
> + }
> + seq_printf(m, "%llu %llu %llu\n", st->samples, mean, var);
> + return 0;
> +}
> +NVME_TCP_DEBUGFS_ATTR(nvme_tcp_queue_recv_lat);
> +
> +static int nvme_tcp_queue_send_lat_show(struct seq_file *m, void *p)
> +{
> + struct nvme_tcp_queue *queue = m->private;
> + struct nvme_tcp_stat *st = &queue->send_lat;
> + u64 mean = 0, var = 0;
> +
> + if (st->samples) {
> + mean = div_u64(st->batch, st->samples * 1000);
> + var = int_sqrt64(div_u64(st->sqmean, st->samples * 1000));
> + }
> + seq_printf(m, "%llu %llu %llu\n", st->samples, mean, var);
> + return 0;
> +}
> +NVME_TCP_DEBUGFS_ATTR(nvme_tcp_queue_send_lat);
> +
> +static int nvme_tcp_ctrl_recv_lat_show(struct seq_file *m, void *p)
> +{
> + struct nvme_tcp_ctrl *ctrl = m->private;
> + struct nvme_tcp_stat *st, lat;
> + u64 mean = 0, var = 0;
> + int i;
> +
> + lat.samples = 0;
> + lat.mean = 0;
> + lat.sqmean = 0;
> + for (i = 1; i < ctrl->ctrl.queue_count; i++) {
> + st = &ctrl->queues[i].recv_lat;
> + lat.mean += st->mean * (s64)st->samples;
> + lat.sqmean += st->sqmean;
> + lat.samples += st->samples;
> + }
> + if (lat.samples) {
> + mean = div_u64(lat.mean, lat.samples);
> + var = int_sqrt(div_u64(lat.sqmean, lat.samples));
> + }
> +
> + seq_printf(m, "%llu %llu %llu\n",
> + lat.samples, mean, var);
> + return 0;
> +}
> +NVME_TCP_DEBUGFS_ATTR(nvme_tcp_ctrl_recv_lat);
> +
> +static int nvme_tcp_ctrl_send_lat_show(struct seq_file *m, void *p)
> +{
> + struct nvme_tcp_ctrl *ctrl = m->private;
> + struct nvme_tcp_stat *st, lat;
> + u64 mean = 0, var = 0;
> + int i;
> +
> + lat.samples = 0;
> + lat.mean = 0;
> + lat.sqmean = 0;
> + for (i = 1; i < ctrl->ctrl.queue_count; i++) {
> + st = &ctrl->queues[i].send_lat;
> + lat.mean += st->mean * (s64)st->samples;
> + lat.sqmean += st->sqmean;
> + lat.samples += st->samples;
> + }
> + if (lat.samples) {
> + mean = div_u64(lat.mean, lat.samples);
> + var = int_sqrt(div_u64(lat.sqmean, lat.samples));
> + }
> +
> + seq_printf(m, "%llu %llu %llu\n",
> + lat.samples, mean, var);
> + return 0;
> +}
> +NVME_TCP_DEBUGFS_ATTR(nvme_tcp_ctrl_send_lat);
> +
> static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
> {
> return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
> @@ -985,8 +1156,10 @@ static void nvme_tcp_data_ready(struct sock *sk)
> read_lock_bh(&sk->sk_callback_lock);
> queue = sk->sk_user_data;
> if (likely(queue && queue->rd_enabled) &&
> - !test_bit(NVME_TCP_Q_POLLING, &queue->flags))
> + !test_bit(NVME_TCP_Q_POLLING, &queue->flags)) {
> queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
> + queue->data_ready_cnt++;
> + }
> read_unlock_bh(&sk->sk_callback_lock);
> }
>
> @@ -999,6 +1172,7 @@ static void nvme_tcp_write_space(struct sock *sk)
> if (likely(queue && sk_stream_is_writeable(sk))) {
> clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
> queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
> + queue->write_space_cnt++;
> }
> read_unlock_bh(&sk->sk_callback_lock);
> }
> @@ -1067,6 +1241,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
> bool last = nvme_tcp_pdu_last_send(req, len);
> int req_data_sent = req->data_sent;
> int ret;
> + u64 start;
>
> if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
> msg.msg_flags |= MSG_EOR;
> @@ -1079,7 +1254,9 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
> bvec_set_page(&bvec, page, len, offset);
> iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
> queue->nr_send++;
> + start = ktime_to_us(ktime_get());
> ret = sock_sendmsg(queue->sock, &msg);
> + nvme_tcp_stat_add(&queue->send_lat, start);
> if (ret <= 0)
> return ret;
>
> @@ -1124,6 +1301,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
> u8 hdgst = nvme_tcp_hdgst_len(queue);
> int len = sizeof(*pdu) + hdgst - req->offset;
> int ret;
> + u64 start;
>
> if (inline_data || nvme_tcp_queue_more(queue))
> msg.msg_flags |= MSG_MORE;
> @@ -1136,7 +1314,9 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
> bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
> iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
> queue->nr_send++;
> + start = ktime_to_us(ktime_get());
> ret = sock_sendmsg(queue->sock, &msg);
> + nvme_tcp_stat_add(&queue->send_lat, start);
> if (unlikely(ret <= 0))
> return ret;
>
> @@ -1165,6 +1345,7 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
> u8 hdgst = nvme_tcp_hdgst_len(queue);
> int len = sizeof(*pdu) - req->offset + hdgst;
> int ret;
> + u64 start;
>
> if (queue->hdr_digest && !req->offset)
> nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu));
> @@ -1175,7 +1356,9 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
> bvec_set_virt(&bvec, (void *)pdu + req->offset, len);
> iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len);
> queue->nr_send++;
> + start = ktime_to_us(ktime_get());
> ret = sock_sendmsg(queue->sock, &msg);
> + nvme_tcp_stat_add(&queue->send_lat, start);
> if (unlikely(ret <= 0))
> return ret;
>
> @@ -1202,6 +1385,7 @@ static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
> .iov_base = (u8 *)&req->ddgst + req->offset,
> .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
> };
> + u64 start;
>
> if (nvme_tcp_queue_more(queue))
> msg.msg_flags |= MSG_MORE;
> @@ -1209,7 +1393,9 @@ static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
> msg.msg_flags |= MSG_EOR;
>
> queue->nr_send++;
> + start = ktime_to_us(ktime_get());
> ret = kernel_sendmsg(queue->sock, &msg, &iov, 1, iov.iov_len);
> + nvme_tcp_stat_add(&queue->send_lat, start);
> if (unlikely(ret <= 0))
> return ret;
>
> @@ -1263,6 +1449,7 @@ static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
> ret = nvme_tcp_try_send_ddgst(req);
> done:
> if (ret == -EAGAIN) {
> + queue->queue_busy_cnt++;
> ret = 0;
> } else if (ret < 0) {
> dev_err(queue->ctrl->ctrl.device,
> @@ -1281,13 +1468,16 @@ static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
> struct sock *sk = sock->sk;
> read_descriptor_t rd_desc;
> int consumed;
> + u64 start;
>
> rd_desc.arg.data = queue;
> rd_desc.count = 1;
> lock_sock(sk);
> queue->nr_cqe = 0;
> queue->nr_recv = 0;
> + start = ktime_to_us(ktime_get());
> consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
> + nvme_tcp_stat_add(&queue->recv_lat, start);
> release_sock(sk);
> return consumed;
> }
> @@ -1331,11 +1521,18 @@ static void nvme_tcp_io_work(struct work_struct *w)
> check:
> overrun = ktime_to_us(ktime_get()) - start;
> if (overrun > 10 * deadline) {
> + u64 slat = 0, clat = 0;
> +
> + if (queue->send_lat.samples)
> + slat = queue->send_lat.mean;
> + if (queue->recv_lat.samples)
> + clat = queue->recv_lat.mean;
> dev_dbg(queue->ctrl->ctrl.device,
> - "queue %d: stall (%llu msecs) send %u sqe %u recv %u cqe %u%s%s\n",
> + "queue %d: stall (%llu msecs), send %u lat %llu %u sqes, recv %u lat %llu %u cqes%s%s\n",
> nvme_tcp_queue_id(queue), div_u64(overrun, 1000),
> - queue->nr_send, queue->nr_sqe, queue->nr_recv, queue->nr_cqe,
> - list_empty(&queue->send_list) ? " empty" : "", queue->request ? " pending" : "");
> + queue->nr_send, slat, queue->nr_sqe, queue->nr_recv, clat, queue->nr_cqe,
> + list_empty(&queue->send_list) ? " empty" : "",
> + queue->request ? " pending" : "");
> }
> if (pending)
> queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
> @@ -1408,6 +1605,11 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
> if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
> return;
>
> + if (queue->debugfs_dir) {
> + debugfs_remove_recursive(queue->debugfs_dir);
> + queue->debugfs_dir = NULL;
> + }
> +
> if (queue->hdr_digest || queue->data_digest)
> nvme_tcp_free_crypto(queue);
>
> @@ -1696,6 +1898,7 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
> struct nvme_tcp_queue *queue = &ctrl->queues[qid];
> int ret, rcv_pdu_size;
> struct file *sock_file;
> + char queue_name[32];
>
> mutex_init(&queue->queue_lock);
> queue->ctrl = ctrl;
> @@ -1756,6 +1959,11 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
> queue->ddgst_remaining = 0;
> queue->pdu_remaining = 0;
> queue->pdu_offset = 0;
> + queue->write_space_cnt = 0;
> + queue->data_ready_cnt = 0;
> + queue->queue_busy_cnt = 0;
> + memset(&queue->recv_lat, 0, sizeof(struct nvme_tcp_stat));
> + memset(&queue->send_lat, 0, sizeof(struct nvme_tcp_stat));
> sk_set_memalloc(queue->sock->sk);
>
> if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
> @@ -1802,6 +2010,19 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
> goto err_crypto;
> }
>
> + sprintf(queue_name, "%d", nvme_tcp_queue_id(queue));
> + queue->debugfs_dir = debugfs_create_dir(queue_name, ctrl->debugfs_dir);
> + debugfs_create_file("recv_lat", S_IRUSR, queue->debugfs_dir,
> + queue, &nvme_tcp_queue_recv_lat_fops);
> + debugfs_create_file("send_lat", S_IRUSR, queue->debugfs_dir,
> + queue, &nvme_tcp_queue_send_lat_fops);
> + debugfs_create_file("data_ready", S_IRUSR, queue->debugfs_dir,
> + queue, &nvme_tcp_queue_data_ready_fops);
> + debugfs_create_file("write_space", S_IRUSR, queue->debugfs_dir,
> + queue, &nvme_tcp_queue_write_space_fops);
> + debugfs_create_file("queue_busy", S_IRUSR, queue->debugfs_dir,
> + queue, &nvme_tcp_queue_queue_busy_fops);
> +
> dev_dbg(nctrl->device, "connecting queue %d\n",
> nvme_tcp_queue_id(queue));
>
> @@ -2420,7 +2641,14 @@ static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
>
> static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
> {
> + struct nvme_tcp_ctrl *nctrl = to_tcp_ctrl(ctrl);
> +
> nvme_tcp_teardown_ctrl(ctrl, true);
> +
> + if (nctrl->debugfs_dir) {
> + debugfs_remove_recursive(nctrl->debugfs_dir);
> + nctrl->debugfs_dir = NULL;
> + }
> }
>
> static void nvme_reset_ctrl_work(struct work_struct *work)
> @@ -2890,6 +3118,13 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
> if (ret)
> goto out_put_ctrl;
>
> + ctrl->debugfs_dir = debugfs_create_dir(dev_name(ctrl->ctrl.device),
> + nvme_tcp_debugfs);
> + debugfs_create_file("recv_lat", S_IRUSR, ctrl->debugfs_dir,
> + ctrl, &nvme_tcp_ctrl_recv_lat_fops);
> + debugfs_create_file("send_lat", S_IRUSR, ctrl->debugfs_dir,
> + ctrl, &nvme_tcp_ctrl_send_lat_fops);
> +
> if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
> WARN_ON_ONCE(1);
> ret = -EINTR;
> @@ -2910,6 +3145,10 @@ static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
> return &ctrl->ctrl;
>
> out_uninit_ctrl:
> + if (ctrl->debugfs_dir) {
> + debugfs_remove_recursive(ctrl->debugfs_dir);
> + ctrl->debugfs_dir = NULL;
> + }
> nvme_uninit_ctrl(&ctrl->ctrl);
> out_put_ctrl:
> nvme_put_ctrl(&ctrl->ctrl);
> @@ -2951,6 +3190,10 @@ static int __init nvme_tcp_init_module(void)
> if (!nvme_tcp_wq)
> return -ENOMEM;
>
> + /* debugfs_create_dir() returns ERR_PTR() on failure, which the
> + * debugfs API tolerates, so do not fail module init here */
> + nvme_tcp_debugfs = debugfs_create_dir("nvme_tcp", NULL);
> +
> nvmf_register_transport(&nvme_tcp_transport);
> return 0;
> }
> @@ -2961,6 +3204,8 @@ static void __exit nvme_tcp_cleanup_module(void)
>
> nvmf_unregister_transport(&nvme_tcp_transport);
>
> + debugfs_remove_recursive(nvme_tcp_debugfs);
> +
> mutex_lock(&nvme_tcp_ctrl_mutex);
> list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
> nvme_delete_ctrl(&ctrl->ctrl);