nvmet_sq_destroy stuck forever when data digest is turned on

Sagi Grimberg sagi at grimberg.me
Tue Sep 19 05:22:40 PDT 2023


> Hi
> 
> I have an issue with nvmet_tcp_release_queue_work hitting hung task after 2 minutes of waiting for nvmet_sq_destroy.
> This issue reproduces only when data digest is on.
> 
> I am inspecting the code of nvmet_tcp_release_queue_work and I see that the code handles 'data in' commands
> This means that it calls nvmet_req_uninit for any command that its data is still in transit.
> 
> There might be commands that the data transfer is already done, but data digest was not received from socket yet (aka rcv_state is NVMET_TCP_RECV_DDGST)
> The data digest will never be read from the socket because the socket is blocked by NVMET_TCP_RECV_ERR
> Hence nvmet_sq_destroy will be stuck forever waiting for nvmet_tcp_try_recv_ddgst to execute.
> 
> Can you suggest a fix for such an issue?

Does this (untested) patch make the issue go away?
--
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 97d07488072d..f5eaf3457ada 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -204,10 +204,16 @@ static inline u16 nvmet_tcp_cmd_tag(struct 
nvmet_tcp_queue *queue,
         return cmd - queue->cmds;
  }

+static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue)
+{
+       return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
+}
+
  static inline bool nvmet_tcp_has_data_in(struct nvmet_tcp_cmd *cmd)
  {
-       return nvme_is_write(cmd->req.cmd) &&
-               cmd->rbytes_done < cmd->req.transfer_len;
+       u32 total_len = cmd->req.transfer_len + 
nvmet_tcp_ddgst_len(cmd->queue);
+
+       return nvme_is_write(cmd->req.cmd) && cmd->rbytes_done < total_len;
  }

  static inline bool nvmet_tcp_need_data_in(struct nvmet_tcp_cmd *cmd)
@@ -265,11 +271,6 @@ static inline u8 nvmet_tcp_hdgst_len(struct 
nvmet_tcp_queue *queue)
         return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
  }

-static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue)
-{
-       return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
-}
-
  static inline void nvmet_tcp_hdgst(struct ahash_request *hash,
                 void *pdu, size_t len)
  {
@@ -1221,8 +1222,8 @@ static int nvmet_tcp_try_recv_ddgst(struct 
nvmet_tcp_queue *queue)
                 goto out;
         }

-       if (cmd->rbytes_done == cmd->req.transfer_len)
-               nvmet_tcp_execute_request(cmd);
+       cmd->rbytes_done += NVME_TCP_DIGEST_LENGTH;
+       nvmet_tcp_execute_request(cmd);

         ret = 0;
  out:
--



More information about the Linux-nvme mailing list