[PATCH 6/7] nvme-tcp: SOCK_NOSPACE handling

Hannes Reinecke <hare@kernel.org>
Wed Jun 26 05:13:46 PDT 2024


When the socket has run out of write space we should not try to
push more data onto it; the attempt will stall anyway and only
drive up CPU utilisation. So check sock_wspace() before queueing
new requests and let the socket's write_space() callback restart
submission once space becomes available.
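
For reference, the ordering in the hunks below matters: the submitter
raises SOCK_NOSPACE _before_ testing the free window, so a
write_space() wakeup racing with the test cannot be lost. A minimal
sketch of that pattern (the helper name example_try_submit() is
illustrative, not part of the patch):

    #include <net/sock.h>

    static bool example_try_submit(struct socket *sock)
    {
            /* Ask for a write_space() notification first ... */
            set_bit(SOCK_NOSPACE, &sock->flags);

            /* ... then re-check the send window; sock_wspace()
             * returns the free space in the socket's send buffer. */
            if (!sock_wspace(sock->sk))
                    return false; /* write_space() will restart us */

            /* There is room after all; withdraw the wakeup request
             * and let the caller push data. */
            clear_bit(SOCK_NOSPACE, &sock->flags);
            return true;
    }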

Signed-off-by: Hannes Reinecke <hare@kernel.org>
---
 drivers/nvme/host/tcp.c | 30 ++++++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 599d4ebf888f..d78cca2f05d4 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -147,6 +147,7 @@ enum nvme_tcp_recv_state {
 struct nvme_tcp_ctrl;
 struct nvme_tcp_queue {
 	struct socket		*sock;
+	struct blk_mq_hw_ctx	*hctx;
 	struct work_struct	io_work;
 	int			io_cpu;
 
@@ -381,6 +382,15 @@ static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
 		nvme_tcp_queue_has_pending(queue);
 }
 
+static inline void nvme_tcp_queue_work(struct nvme_tcp_queue *queue)
+{
+	set_bit(SOCK_NOSPACE, &queue->sock->flags);
+	if (!sock_wspace(queue->sock->sk))
+		return;
+	clear_bit(SOCK_NOSPACE, &queue->sock->flags);
+	queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+}
+
 static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
 		bool sync, bool last)
 {
@@ -402,7 +412,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
 	}
 
 	if (last && nvme_tcp_queue_has_pending(queue))
-		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+		nvme_tcp_queue_work(queue);
 }
 
 static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue)
@@ -550,6 +560,7 @@ static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
 	struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];
 
 	hctx->driver_data = queue;
+	queue->hctx = hctx;
 	return 0;
 }
 
@@ -1004,7 +1015,10 @@ static void nvme_tcp_write_space(struct sock *sk)
 	queue = sk->sk_user_data;
 	if (likely(queue && sk_stream_is_writeable(sk))) {
 		clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
-		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+		if (sock_wspace(sk))
+			queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+		if (queue->hctx)
+			blk_mq_start_hw_queue(queue->hctx);
 	}
 	read_unlock_bh(&sk->sk_callback_lock);
 }
@@ -1317,7 +1331,7 @@ static void nvme_tcp_io_work(struct work_struct *w)
 
 	} while (!time_after(jiffies, deadline)); /* quota is exhausted */
 
-	queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+	nvme_tcp_queue_work(queue);
 }
 
 static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
@@ -1863,6 +1877,7 @@ static void nvme_tcp_restore_sock_ops(struct nvme_tcp_queue *queue)
 
 static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
 {
+	queue->hctx = NULL;
 	kernel_sock_shutdown(queue->sock, SHUT_RDWR);
 	nvme_tcp_restore_sock_ops(queue);
 	cancel_work_sync(&queue->io_work);
@@ -2614,7 +2629,7 @@ static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx)
 	struct nvme_tcp_queue *queue = hctx->driver_data;
 
 	if (!llist_empty(&queue->req_list))
-		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
+		nvme_tcp_queue_work(queue);
 }
 
 static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
@@ -2630,6 +2645,13 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
 	if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
 		return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);
 
+	set_bit(SOCK_NOSPACE, &queue->sock->flags);
+	if (!sock_wspace(queue->sock->sk)) {
+		blk_mq_stop_hw_queue(hctx);
+		return BLK_STS_DEV_RESOURCE;
+	}
+	clear_bit(SOCK_NOSPACE, &queue->sock->flags);
+
 	ret = nvme_tcp_setup_cmd_pdu(ns, rq);
 	if (unlikely(ret))
 		return ret;
-- 
2.35.3
