[PATCH] nvme-tcp: print correct opcode on timeout and error handling

Sagi Grimberg sagi at grimberg.me
Thu Mar 9 01:10:21 PST 2023


>> Sagi,
>>
>> On 3/8/23 03:06, Sagi Grimberg wrote:
>>>
>>>> In timeout and error handling, various information is reported about the
>>>> command in error, but the printed opcode may not be correct
>>>>
>>>> The opcode is obtained from the nvme_command structure referenced by the
>>>> 'cmd' member in the nvme_request structure.
>>>> (i.e. nvme_req(req)->cmd->common.opcode)
>>>>
>>>> For the nvme-tcp driver, the 'cmd' member in the nvme_request structure
>>>> points to a structure within the page fragment allocated by
>>>> nvme_tcp_init_request().  This page fragment is used as a command
>>>> capsule
>>>> PDU, and then may be reused as a h2c data PDU, so the nvme_command
>>>> referenced by the 'cmd' member has already been overwritten.
>>>>
>>>> To fix this problem, when setting up the nvme_command, keep the
>>>> opcode in
>>>> a newly added member in the nvme_tcp_request and use that instead.
>>>
>>> I'm wondering if we should just not reuse the cmd pdu for a subsequent
>>> data pdu...
>>>
>>> It will take more space allocated per request (up to a few MBs for lots
>>> of requests and controllers)...
>>>
>>> but will prevent propagating this to the nvme core...
>>>
>>
>> Is there a way we can do this without increasing the memory usage ?
> 
> The data pdu has to go somewhere. There doesn't appear to be another free area
> other than the command pdu, so if that's off limits to preserve the original
> sqe, then more memory is needed.

If I take a somewhat extreme calculation, where a host has 16
controllers, each with 128 queues of depth 128, I end up with an
additional 6MB, which is not insignificant for a driver...
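(That is roughly 16 controllers * 128 queues * 128 requests = 262144
outstanding requests, times the 24 bytes of a separate data pdu for
each, which comes to ~6MB.)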

We could go with the original proposal of storing just the opcode, but
then the fix propagates into the nvme core...

Or...

The cmd pdu has a size of 72 bytes (8-byte header + 64-byte command);
perhaps we can still reuse the same PDU space, but take the last 24
bytes of the cmd pdu for the data pdu?

Something like:
--
+static inline void *nvme_tcp_req_cmd_pdu(struct nvme_tcp_request *req)
+{
+       return req->pdu;
+}
+
+static inline void *nvme_tcp_req_data_pdu(struct nvme_tcp_request *req)
+{
+       /* use the pdu space in the back for the data pdu */
+       return req->pdu + sizeof(struct nvme_tcp_cmd_pdu) -
+               sizeof(struct nvme_tcp_data_pdu);
+}
+
  static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_request *req)
  {
         if (nvme_is_fabrics(req->req.cmd))
@@ -613,7 +625,7 @@ static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,

  static void nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req)
  {
-       struct nvme_tcp_data_pdu *data = req->pdu;
+       struct nvme_tcp_data_pdu *data = nvme_tcp_req_data_pdu(req);
         struct nvme_tcp_queue *queue = req->queue;
         struct request *rq = blk_mq_rq_from_pdu(req);
         u32 h2cdata_sent = req->pdu_len;
@@ -1035,7 +1047,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
  static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
  {
         struct nvme_tcp_queue *queue = req->queue;
-       struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+       struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
         bool inline_data = nvme_tcp_has_inline_data(req);
         u8 hdgst = nvme_tcp_hdgst_len(queue);
         int len = sizeof(*pdu) + hdgst - req->offset;
@@ -1074,7 +1086,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
  static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
  {
         struct nvme_tcp_queue *queue = req->queue;
-       struct nvme_tcp_data_pdu *pdu = req->pdu;
+       struct nvme_tcp_data_pdu *pdu = nvme_tcp_req_data_pdu(req);
         u8 hdgst = nvme_tcp_hdgst_len(queue);
         int len = sizeof(*pdu) - req->offset + hdgst;
         int ret;
@@ -2281,7 +2293,7 @@ static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq)
  {
         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
         struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
-       struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+       struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
         u8 opc = pdu->cmd.common.opcode, fctype = pdu->cmd.fabrics.fctype;
         int qid = nvme_tcp_queue_id(req->queue);

@@ -2320,7 +2332,7 @@ static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
                         struct request *rq)
  {
         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
-       struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+       struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
         struct nvme_command *c = &pdu->cmd;

         c->common.flags |= NVME_CMD_SGL_METABUF;
@@ -2340,7 +2352,7 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
                 struct request *rq)
  {
         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
-       struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+       struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
         struct nvme_tcp_queue *queue = req->queue;
         u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
         blk_status_t ret;
--

This way we can still abuse the pdu space without allocating an
additional 24 bytes per request, and live with it until it bothers
something else?
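
To spell out the overlap this relies on (an illustration only, not part
of the diff above; sizes as mentioned earlier: 8-byte header + 64-byte
sqe = 72-byte cmd pdu, 24-byte data pdu), here is a rough sketch plus a
compile-time check one could add next to the helpers, assuming the
struct definitions from <linux/nvme-tcp.h> and static_assert() from
<linux/build_bug.h> are in scope as they are in the driver:

/*
 * Layout (not to scale):
 *
 *   cmd pdu:  [ 8 byte hdr ][ 64 byte nvme_command (sqe)    ]
 *   data pdu:                           [ last 24 bytes     ]
 *
 * Only the last 24 bytes of the sqe are clobbered by the h2c data pdu;
 * the opcode and fctype that the timeout handler prints sit at the
 * front of the sqe and stay intact.
 */
static_assert(sizeof(struct nvme_tcp_data_pdu) <=
	      sizeof(struct nvme_tcp_cmd_pdu));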

Thoughts?


