nvme-tcp: fix a possible UAF when failing to send request
zhang.guanghui at cestc.cn
Sun Feb 9 23:41:38 PST 2025
Hello
When using the nvme-tcp driver in a storage cluster, the driver may trigger a NULL pointer dereference that crashes the host; we have hit this several times.
By analyzing the vmcore, we found that the direct cause is that request->mq_hctx was used after free.
CPU1                                   CPU2

nvme_tcp_poll                          nvme_tcp_try_send          -- failed to send request, error -13
nvme_tcp_try_recv                      nvme_tcp_fail_request
nvme_tcp_recv_skb                      nvme_tcp_end_request
nvme_tcp_recv_pdu                      nvme_complete_rq
nvme_tcp_handle_comp                   nvme_retry_req             -- request->mq_hctx has been freed, is NULL
nvme_tcp_process_nvme_cqe
nvme_complete_rq
nvme_end_req
blk_mq_end_request
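
Both paths end up completing the same request, and nothing serializes them. Roughly, simplified and paraphrased from the 5.15 driver (details may differ by version):

/* CPU2: the send failure path completes the request with a path error */
static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
{
	nvme_tcp_end_request(blk_mq_rq_from_pdu(req), NVME_SC_HOST_PATH_ERROR);
}

/* CPU1: the recv path looks up the same command id from the CQE and
 * completes the request a second time
 */
static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
				     struct nvme_completion *cqe)
{
	struct request *rq;

	rq = nvme_find_rq(nvme_tcp_tagset(queue), cqe->command_id);
	...
	if (!nvme_try_complete_req(rq, cqe->status, cqe->result))
		nvme_complete_rq(rq);
	...
}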
When nvme_tcp_try_send() fails to send a request with -13 (-EACCES, which may be caused by SELinux or other reasons), nvme_tcp_fail_request() runs and completes the request.
But nvme_tcp_recv_pdu() may already have received the response PDU for the same command, so nvme_tcp_process_nvme_cqe() completes the request a second time, and request->mq_hctx is then used after free.
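
For reference, this is why mq_hctx ends up NULL: when the first completion path frees the request, __blk_mq_free_request() clears the field while putting the tag back (paraphrased from block/blk-mq.c in 5.15):

static void __blk_mq_free_request(struct request *rq)
{
	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
	...
	rq->mq_hctx = NULL;
	if (rq->tag != BLK_MQ_NO_TAG)
		blk_mq_put_tag(hctx->tags, ctx, rq->tag);
	...
}

So when the requeue work later runs blk_mq_request_bypass_insert() and takes rq->mq_hctx->lock, it dereferences NULL.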
The following patch attempts to solve it. Can you give some suggestions? Thanks!
diff --git a/linux/drivers/nvme/host/core.c b/linux/drivers/nvme/host/core.c
index a65b1dce8..417466674 100644
--- a/linux/drivers/nvme/host/core.c
+++ b/linux/drivers/nvme/host/core.c
@@ -288,6 +288,9 @@ static void nvme_retry_req(struct request *req)
 	unsigned long delay = 0;
 	u16 crd;
 
+	if (!req->mq_hctx && req->state == MQ_RQ_IDLE)
+		return;
+
 	/* The mask and shift result must be <= 3 */
 	crd = (nvme_req(req)->status & NVME_SC_CRD) >> 11;
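
One doubt about this check: the completion path can update the request state concurrently, so the read should probably use READ_ONCE(), something like this untested variant:

	if (!req->mq_hctx && READ_ONCE(req->state) == MQ_RQ_IDLE)
		return;

And it only avoids the crash in nvme_retry_req() instead of preventing the double completion itself, which is why I am asking for advice.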
The details are as follows:
[35665.692310] nvme nvme2: failed to send request -13
[35665.692683] nvme nvme2: queue 1 failed to send request 00000000b42f4e2b state 2 pdu 00000000d7fb8da3 type 4 rq_state 1 nrq_status 0
[35665.693323] nvme nvme2: failed to send rq 00000000f86a68c3 state 2 nrq_status 370
[35665.702265] nvme nvme2: unsupported pdu type (3)
[35665.702272] BUG: kernel NULL pointer dereference, address: 0000000000000000
[35665.702542] nvme nvme2: queue 1 receive failed: -22
[35665.703209] #PF: supervisor write access in kernel mode
[35665.703213] #PF: error_code(0x0002) - not-present page
[35665.703214] PGD 8000003801cce067 P4D 8000003801cce067 PUD 37e6f79067 PMD 0
[35665.703220] Oops: 0002 [#1] SMP PTI
[35665.703658] nvme nvme2: starting error recovery
[35665.704442] CPU: 20 PID: 815 Comm: kworker/20:1H Kdump: loaded Not tainted 5.15.131-17.cl9.x86_64 #1
[35665.705168] nvme nvme2: queue 1 receive again after receive failed
[35665.705809] Hardware name: Inspur aaabbb/YZMB-00882-104, BIOS 4.1.26 09/22/2022
[35665.705812] Workqueue: kblockd blk_mq_requeue_work
[35665.709172] RIP: 0010:_raw_spin_lock+0xc/0x30
[35665.709606] Code: 05 c3 cc cc cc cc 89 c6 e8 31 05 68 ff 66 90 c3 cc cc cc cc 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 31 c0 ba 01 00 00 00 <f0> 0f b1 17 75 05 c3 cc cc cc cc 89 c6 e8 02 05 68 ff 66 90 c3 cc
[35665.710470] RSP: 0000:ffffa67bcd797e08 EFLAGS: 00010246
[35665.710925] RAX: 0000000000000000 RBX: ffff92f6bbcc9840 RCX: ffff92f6bbcc9888
[35665.711393] RDX: 0000000000000001 RSI: 0000000000000000 RDI: 0000000000000000
[35665.711849] RBP: 0000000000000000 R08: ffffa67bcd797e48 R09: ffff932346d576f4
[35665.712275] R10: 0000000000000008 R11: 0000000000000008 R12: 0000000000000000
[35665.712725] R13: ffff92f6bbcc9888 R14: 0000000000000008 R15: 0000000000000000
[35665.713158] FS: 0000000000000000(0000) GS:ffff93527d400000(0000) knlGS:0000000000000000
[35665.713603] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[35665.714063] CR2: 0000000000000000 CR3: 000000371aa02006 CR4: 00000000007706e0
[35665.714534] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[35665.714961] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[35665.715359] PKRU: 55555554
[35665.715788] Call Trace:
[35665.716201] <TASK>
[35665.716613] ? show_trace_log_lvl+0x1c1/0x2d9
[35665.717049] ? show_trace_log_lvl+0x1c1/0x2d9
[35665.717457] ? blk_mq_request_bypass_insert+0x2c/0xb0
[35665.717950] ? __die_body.cold+0x8/0xd
[35665.718361] ? page_fault_oops+0xac/0x140
[35665.718749] ? blk_mq_start_request+0x30/0xf0
[35665.719144] ? nvme_tcp_queue_rq+0xc7/0x170 [nvme_tcp]
[35665.719547] ? exc_page_fault+0x62/0x130
[35665.719938] ? asm_exc_page_fault+0x22/0x30
[35665.720333] ? _raw_spin_lock+0xc/0x30
[35665.720723] blk_mq_request_bypass_insert+0x2c/0xb0
[35665.721101] blk_mq_requeue_work+0xa5/0x180
[35665.721451] process_one_work+0x1e8/0x390
[35665.721809] worker_thread+0x53/0x3d0
[35665.722159] ? process_one_work+0x390/0x390
[35665.722501] kthread+0x124/0x150
[35665.722849] ? set_kthread_struct+0x50/0x50
[35665.723182] ret_from_fork+0x1f/0x30
[35665.723508] </TASK>
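
The oops matches the race above: blk_mq_requeue_work() calls blk_mq_request_bypass_insert(), which takes rq->mq_hctx->lock, but mq_hctx is already NULL, hence the write fault at address 0000000000000000.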
crash> struct nvme_tcp_request ffff92f6bbcc9950
struct nvme_tcp_request {
  req = {
    cmd = 0xffff92f5b83f6748,
    result = {
      u16 = 0,
      u32 = 0,
      u64 = 0
    },
    genctr = 169 '\251',
    retries = 1 '\001',
    flags = 0 '\000',
    status = 6,
    ctrl = 0xffff92f5e5df7348
  },
  pdu = 0xffff92f5b83f6740,
  queue = 0xffff92f407cc9128,
  data_len = 4096,
  pdu_len = 4096,
  pdu_sent = 0,
  h2cdata_left = 0,
  h2cdata_offset = 0,
  ttag = 62,
  status = 12,
  entry = {
    next = 0xdead000000000100,
    prev = 0xdead000000000122
  },
  lentry = {
    next = 0x0
  },
  ddgst = 0,
  curr_bio = 0xffff9324d639e240,
  iter = {
    iter_type = 2 '\002',
    nofault = false,
    data_source = true,
    iov_offset = 0,
    count = 4096,
    {
      iov = 0xffff92f6bbcc98a8,
      kvec = 0xffff92f6bbcc98a8,
      bvec = 0xffff92f6bbcc98a8,
      xarray = 0xffff92f6bbcc98a8,
      pipe = 0xffff92f6bbcc98a8
    },
    {
      nr_segs = 1,
      {
        head = 1,
        start_head = 0
      },
      xarray_start = 1
    }
  },
  offset = 0,
  data_sent = 0,
  state = NVME_TCP_SEND_DATA
}
crash> nvme_tcp_hdr.type 0xffff92f5b83f6740
type = 4 '\004',
crash>
crash> struct request ffff92f6bbcc9840
struct request {
  q = 0xffff92f59d55c240,
  mq_ctx = 0xffffc67bb9a1f040,
  mq_hctx = 0x0,
  cmd_flags = 33556483,
  rq_flags = 139456,
  tag = 87,
  internal_tag = -1,
  __data_len = 0,
  __sector = 66846720,
  bio = 0x0,
  biotail = 0x0,
  queuelist = {
    next = 0xffff92f6bbcc9888,
    prev = 0xffff92f6bbcc9888
  },
  {
    hash = {
      next = 0x0,
      pprev = 0x0
    },
    ipi_list = {
      next = 0x0
    }
  },
  {
    rb_node = {
      __rb_parent_color = 18446685131795018112,
      rb_right = 0x1000,
      rb_left = 0x0
    },
    special_vec = {
      bv_page = 0xffffca64841f3180,
      bv_len = 4096,
      bv_offset = 0
    },
    completion_data = 0xffffca64841f3180,
    error_count = -2078330496
  },
  {
    elv = {
      icq = 0x0,
      priv = {0xffff92f6bbcc98c8, 0xffff92f6bbcc98c8}
    },
    flush = {
      seq = 0,
      list = {
        next = 0xffff92f6bbcc98c8,
        prev = 0xffff92f6bbcc98c8
      },
      saved_end_io = 0x0
    }
  },
  rq_disk = 0xffff92f6bdbff600,
  part = 0xffff92f59f557800,
  start_time_ns = 35665692557229,
  io_start_time_ns = 35665692566268,
  wbt_flags = 0,
  stats_sectors = 0,
  nr_phys_segments = 1,
  nr_integrity_segments = 0,
  write_hint = 0,
  ioprio = 0,
  state = MQ_RQ_IDLE,
  ref = {
    refs = {
      counter = 0
    }
  },
  timeout = 180000,
  deadline = 4330512774,
  {
    csd = {
      node = {
        llist = {
          next = 0x0
        },
        {
          u_flags = 0,
          a_flags = {
            counter = 0
          }
        },
        src = 0,
        dst = 0
      },
      func = 0xffffffff8eed62d0 <__blk_mq_complete_request_remote>,
      info = 0xffff92f6bbcc9840
    },
    fifo_time = 0
  },
  end_io = 0x0,
  end_io_data = 0x0
}
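
The dumped request confirms that it had already been completed and freed when the requeue work picked it up: mq_hctx = 0x0, state = MQ_RQ_IDLE and ref.refs.counter = 0.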
Best regards
zhang.guanghui at cestc.cn