[PATCH 1/3] nvme-tcp: improve rx/tx fairness
Hannes Reinecke
hare at kernel.org
Mon Jul 8 00:10:11 PDT 2024
We need to restrict both sides, rx and tx, to only run for a certain time
to ensure that we're not blocking the other side and inducing starvation.
So pass in a 'deadline' value to nvme_tcp_send_all() and nvme_tcp_try_recv()
and break out of the loop if the deadline is reached.
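The send side then boils down to a deadline-bounded drain loop; condensed
from the nvme_tcp_send_all() hunk below (nothing beyond the diff is assumed
here):

	do {
		ret = nvme_tcp_try_send(queue);
		if (time_after(jiffies, deadline))
			break;
	} while (ret > 0);
	return ret;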
As we now have a timestamp, we can also use it to print a warning
if the actual time spent exceeds the deadline.
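Concretely, nvme_tcp_io_work() now records how far past the rx deadline it
ran and reports a stall on I/O queues; condensed from the hunk below (the
message is emitted via dev_dbg(), so it only shows up with debugging
enabled):

	overrun = jiffies - rx_deadline;
	if (nvme_tcp_queue_id(queue) > 0 &&
	    overrun > msecs_to_jiffies(10))
		dev_dbg(queue->ctrl->ctrl.device,
			"queue %d: queue stall (%u msecs)\n",
			nvme_tcp_queue_id(queue), jiffies_to_msecs(overrun));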
Performance comparison:
                  baseline    rx/tx fairness
4k seq write:     449MiB/s    480MiB/s
4k rand write:    410MiB/s    481MiB/s
4k seq read:      478MiB/s    481MiB/s
4k rand read:     547MiB/s    480MiB/s
Random read regresses somewhat, but that will be addressed by the later
patches in this series.
Signed-off-by: Hannes Reinecke <hare at kernel.org>
---
drivers/nvme/host/tcp.c | 38 +++++++++++++++++++++++++++++---------
1 file changed, 29 insertions(+), 9 deletions(-)
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 0873b3949355..f621d3ba89b2 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -153,6 +153,7 @@ struct nvme_tcp_queue {
size_t data_remaining;
size_t ddgst_remaining;
unsigned int nr_cqe;
+ unsigned long deadline;
/* send state */
struct nvme_tcp_request *request;
@@ -359,14 +360,18 @@ static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
}
}
-static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue)
+static inline int nvme_tcp_send_all(struct nvme_tcp_queue *queue,
+ unsigned long deadline)
{
int ret;
/* drain the send queue as much as we can... */
do {
ret = nvme_tcp_try_send(queue);
+ if (time_after(jiffies, deadline))
+ break;
} while (ret > 0);
+ return ret;
}
static inline bool nvme_tcp_queue_has_pending(struct nvme_tcp_queue *queue)
@@ -385,6 +390,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
bool sync, bool last)
{
struct nvme_tcp_queue *queue = req->queue;
+ unsigned long deadline = jiffies + msecs_to_jiffies(1);
bool empty;
empty = llist_add(&req->lentry, &queue->req_list) &&
@@ -397,7 +403,7 @@ static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
*/
if (queue->io_cpu == raw_smp_processor_id() &&
sync && empty && mutex_trylock(&queue->send_mutex)) {
- nvme_tcp_send_all(queue);
+ nvme_tcp_send_all(queue, deadline);
mutex_unlock(&queue->send_mutex);
}
@@ -959,9 +965,14 @@ static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
nvme_tcp_error_recovery(&queue->ctrl->ctrl);
return result;
}
+ if (time_after(jiffies, queue->deadline)) {
+ desc->count = 0;
+ break;
+ }
+
}
- return consumed;
+ return consumed - len;
}
static void nvme_tcp_data_ready(struct sock *sk)
@@ -1258,7 +1269,7 @@ static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
return ret;
}
-static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
+static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue, unsigned long deadline)
{
struct socket *sock = queue->sock;
struct sock *sk = sock->sk;
@@ -1269,6 +1280,7 @@ static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
rd_desc.count = 1;
lock_sock(sk);
queue->nr_cqe = 0;
+ queue->deadline = deadline;
consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
release_sock(sk);
return consumed;
@@ -1278,14 +1290,15 @@ static void nvme_tcp_io_work(struct work_struct *w)
{
struct nvme_tcp_queue *queue =
container_of(w, struct nvme_tcp_queue, io_work);
- unsigned long deadline = jiffies + msecs_to_jiffies(1);
+ unsigned long tx_deadline = jiffies + msecs_to_jiffies(1);
+ unsigned long rx_deadline = tx_deadline + msecs_to_jiffies(1), overrun;
do {
bool pending = false;
int result;
if (mutex_trylock(&queue->send_mutex)) {
- result = nvme_tcp_try_send(queue);
+ result = nvme_tcp_send_all(queue, tx_deadline);
mutex_unlock(&queue->send_mutex);
if (result > 0)
pending = true;
@@ -1293,7 +1306,7 @@ static void nvme_tcp_io_work(struct work_struct *w)
break;
}
- result = nvme_tcp_try_recv(queue);
+ result = nvme_tcp_try_recv(queue, rx_deadline);
if (result > 0)
pending = true;
else if (unlikely(result < 0))
@@ -1302,7 +1315,13 @@ static void nvme_tcp_io_work(struct work_struct *w)
if (!pending || !queue->rd_enabled)
return;
- } while (!time_after(jiffies, deadline)); /* quota is exhausted */
+ } while (!time_after(jiffies, rx_deadline)); /* quota is exhausted */
+
+ overrun = jiffies - rx_deadline;
+ if (nvme_tcp_queue_id(queue) > 0 &&
+ overrun > msecs_to_jiffies(10))
+ dev_dbg(queue->ctrl->ctrl.device, "queue %d: queue stall (%u msecs)\n",
+ nvme_tcp_queue_id(queue), jiffies_to_msecs(overrun));
queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}
@@ -2666,6 +2685,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
{
struct nvme_tcp_queue *queue = hctx->driver_data;
struct sock *sk = queue->sock->sk;
+ unsigned long deadline = jiffies + msecs_to_jiffies(1);
if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
return 0;
@@ -2673,7 +2693,7 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
set_bit(NVME_TCP_Q_POLLING, &queue->flags);
if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
sk_busy_loop(sk, true);
- nvme_tcp_try_recv(queue);
+ nvme_tcp_try_recv(queue, deadline);
clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
return queue->nr_cqe;
}
--
2.35.3