[PATCH] nvme-tcp: print correct opcode on timeout and error handling

Sagi Grimberg sagi at grimberg.me
Thu Mar 9 01:10:21 PST 2023


>> Sagi,
>>
>> On 3/8/23 03:06, Sagi Grimberg wrote:
>>>
>>>> In timeout and error handling, various information is reported about the
>>>> command in error, but the printed opcode may not be correct
>>>>
>>>> The opcode is obtained from the nvme_command structure referenced by the
>>>> 'cmd' member in the nvme_request structure.
>>>> (i.e. nvme_req(req)->cmd->common.opcode)
>>>>
>>>> For the nvme-tcp driver, the 'cmd' member in the nvme_request structure
>>>> points to a structure within the page fragment allocated by
>>>> nvme_tcp_init_request().  This page fragment is used as a command
>>>> capsule
>>>> PDU, and then may be reused as a h2c data PDU, so the nvme_command
>>>> referenced by the 'cmd' member has already been overwritten.
>>>>
>>>> To fix this problem, when setting up the nvme_command, keep the
>>>> opcode in
>>>> a newly added member in the nvme_tcp_request and use that instead.
>>>
>>> I'm wondering if we should just not reuse the cmd pdu for a subsequent
>>> data pdu...
>>>
>>> It will take more space allocated per request (up to a few MBs for lots
>>> of requests and controllers)...
>>>
>>> but will prevent propagating this to the nvme core...
>>>
>>
>> Is there a way we can do this without increasing the memory usage ?
> 
> The data pdu has to go somewhere. There doesn't appear to be another free area
> other than the command pdu, so if that's off limits to preserve the original
> sqe, then more memory is needed.

If I take a somewhat extreme calculation, where a host has 16
controllers, each with 128 queues of depth 128, I end up with an
additional 6MB, which is not insignificant for a driver...
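(That is roughly 16 controllers * 128 queues * 128 requests = 262144
outstanding requests, times the 24 bytes of a separate data pdu for
each, which comes to ~6MB.)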

We could go with the original proposal of storing just the opcode, but
then the fix propagates into the nvme core...

Or...

The cmd pdu has a size of 72 bytes (8-byte header + 64-byte command);
perhaps we can still reuse the same PDU space, but take the last 24
bytes of the cmd pdu for the data pdu?

Something like:
--
+static inline void *nvme_tcp_req_cmd_pdu(struct nvme_tcp_request *req)
+{
+       return req->pdu;
+}
+
+static inline void *nvme_tcp_req_data_pdu(struct nvme_tcp_request *req)
+{
+       /* use the pdu space in the back for the data pdu */
+       return req->pdu + sizeof(struct nvme_tcp_cmd_pdu) -
+               sizeof(struct nvme_tcp_data_pdu);
+}
+
  static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_request *req)
  {
         if (nvme_is_fabrics(req->req.cmd))
@@ -613,7 +625,7 @@ static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,

  static void nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req)
  {
-       struct nvme_tcp_data_pdu *data = req->pdu;
+       struct nvme_tcp_data_pdu *data = nvme_tcp_req_data_pdu(req);
         struct nvme_tcp_queue *queue = req->queue;
         struct request *rq = blk_mq_rq_from_pdu(req);
         u32 h2cdata_sent = req->pdu_len;
@@ -1035,7 +1047,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
  static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
  {
         struct nvme_tcp_queue *queue = req->queue;
-       struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+       struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
         bool inline_data = nvme_tcp_has_inline_data(req);
         u8 hdgst = nvme_tcp_hdgst_len(queue);
         int len = sizeof(*pdu) + hdgst - req->offset;
@@ -1074,7 +1086,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
  static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
  {
         struct nvme_tcp_queue *queue = req->queue;
-       struct nvme_tcp_data_pdu *pdu = req->pdu;
+       struct nvme_tcp_data_pdu *pdu = nvme_tcp_req_data_pdu(req);
         u8 hdgst = nvme_tcp_hdgst_len(queue);
         int len = sizeof(*pdu) - req->offset + hdgst;
         int ret;
@@ -2281,7 +2293,7 @@ static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq)
  {
         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
         struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
-       struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+       struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
         u8 opc = pdu->cmd.common.opcode, fctype = pdu->cmd.fabrics.fctype;
         int qid = nvme_tcp_queue_id(req->queue);

@@ -2320,7 +2332,7 @@ static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
                         struct request *rq)
  {
         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
-       struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+       struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
         struct nvme_command *c = &pdu->cmd;

         c->common.flags |= NVME_CMD_SGL_METABUF;
@@ -2340,7 +2352,7 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
                 struct request *rq)
  {
         struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
-       struct nvme_tcp_cmd_pdu *pdu = req->pdu;
+       struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
         struct nvme_tcp_queue *queue = req->queue;
         u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
         blk_status_t ret;
--

This way we can still abuse the pdu space without allocating an
additional 24 bytes per request, and live with it until it bothers
something else?
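
To spell out the overlap this relies on (an illustration only, not part
of the diff above; sizes as mentioned earlier: 8-byte header + 64-byte
sqe = 72-byte cmd pdu, 24-byte data pdu), here is a rough sketch plus a
compile-time check one could add next to the helpers, assuming the
struct definitions from <linux/nvme-tcp.h> and static_assert() from
<linux/build_bug.h> are in scope as they are in the driver:

/*
 * Layout (not to scale):
 *
 *   cmd pdu:  [ 8 byte hdr ][ 64 byte nvme_command (sqe)    ]
 *   data pdu:                           [ last 24 bytes     ]
 *
 * Only the last 24 bytes of the sqe are clobbered by the h2c data pdu;
 * the opcode and fctype that the timeout handler prints sit at the
 * front of the sqe and stay intact.
 */
static_assert(sizeof(struct nvme_tcp_data_pdu) <=
	      sizeof(struct nvme_tcp_cmd_pdu));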

Thoughts?


