[PATCH 3/3] nvme-tcp: fix I/O stalls on congested sockets

Sagi Grimberg sagi at grimberg.me
Fri Apr 18 03:51:14 PDT 2025



On 4/18/25 02:03, Kamaljit Singh wrote:
> Sagi,
> I tried both of these patches, but it looks like #1 causes an infinite loop. dmesg was full of panics.
> I had tried just #1 and later with #1+#2. Both failed the same way.

That's no good :)

Can you share the panics? This version should be identical to what
Hannes introduced in:

@@ -1389,9 +1389,12 @@ static void nvme_tcp_io_work(struct work_struct *w)
  		result = nvme_tcp_try_recv(queue);
  		if (result > 0)
  			pending = true;
-		else if (unlikely(result < 0))
+		else if (unlikely(result < 0) && result != -EAGAIN)
  			return;


>
>> How about these two (untested) patches:
>> [1 based on your recv-side fix]:
>> --
>> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
>> index 72d260201d8c..4eb9a2dec07e 100644
>> --- a/drivers/nvme/host/tcp.c
>> +++ b/drivers/nvme/host/tcp.c
>> @@ -1348,7 +1348,7 @@ static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
>>           queue->nr_cqe = 0;
>>           consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
>>           release_sock(sk);
>> -       return consumed;
>> +       return consumed == -EAGAIN ? 0 : consumed;
>>    }
>>
>>    static void nvme_tcp_io_work(struct work_struct *w)
>> --
>>
>> [2 based on your partial write fix]:
>> diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
>> index 4eb9a2dec07e..daf59e75cf15 100644
>> --- a/drivers/nvme/host/tcp.c
>> +++ b/drivers/nvme/host/tcp.c
>> @@ -129,6 +129,7 @@ enum nvme_tcp_queue_flags {
>>           NVME_TCP_Q_LIVE         = 1,
>>           NVME_TCP_Q_POLLING      = 2,
>>           NVME_TCP_Q_IO_CPU_SET   = 3,
>> +       NVME_TCP_Q_WAKE_SENDER  = 4,
>>    };
>>
>>    enum nvme_tcp_recv_state {
>> @@ -1063,6 +1064,7 @@ static void nvme_tcp_write_space(struct sock *sk)
>>           queue = sk->sk_user_data;
>>           if (likely(queue && sk_stream_is_writeable(sk))) {
>>                   clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
>> +               set_bit(NVME_TCP_Q_WAKE_SENDER, &queue->flags);
>>                   queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
>>           }
>>           read_unlock_bh(&sk->sk_callback_lock);
>> @@ -1357,6 +1359,7 @@ static void nvme_tcp_io_work(struct work_struct *w)
>>                   container_of(w, struct nvme_tcp_queue, io_work);
>>           unsigned long deadline = jiffies + msecs_to_jiffies(1);
>>
>> +       clear_bit(NVME_TCP_Q_WAKE_SENDER, &queue->flags);
>>           do {
>>                   bool pending = false;
>>                   int result;
>> @@ -1376,7 +1379,15 @@ static void nvme_tcp_io_work(struct work_struct *w)
>>                   else if (unlikely(result < 0))
>>                           return;
>>
>> -               if (!pending || !queue->rd_enabled)
>> +               /* did we get some space after spending time in recv ? */
>> +               if (nvme_tcp_queue_has_pending(queue) &&
>> +                   sk_stream_is_writeable(queue->sock->sk))
>> +                       pending = true;
>> +
>> +               if (!queue->rd_enabled)
>> +                       return;
>> +
>> +               if (!pending && !test_bit(NVME_TCP_Q_WAKE_SENDER, &queue->flags))
>>                           return;
>>
>>           } while (!time_after(jiffies, deadline)); /* quota is exhausted */
>> --
>>   
> -Kamaljit




More information about the Linux-nvme mailing list