nvmet: race condition while CQEs are getting processed concurrently with the DISCONNECTED event

Yi Zhang yizhan at redhat.com
Sun Mar 12 19:22:33 PDT 2017


Hi Sagi

With this patch, issue [1] can no longer be reproduced, but I can still
reproduce issue [2]. Thanks.

[1]

kernel NULL pointer on nvmet with stress 
rescan_controller/reset_controller test during I/O

[2]
mlx4_core 0000:07:00.0: swiotlb buffer is full and OOM observed during 
stress test on reset_controller


On 03/09/2017 07:45 PM, Sagi Grimberg wrote:
>> Hi Sagi,
>
> Hi Raju (CC'ing Yi)
>
>>
>> I had tried each of the below patches individually; the issue is still
>> seen with both of the patches.
>>
>> With patch #1, from the dmesg I see that the NULL pointer dereference
>> is hit before steps 3, 4 and 5 (see below) had finished successfully
>> for that queue.
>>
>> Where:
>> 1. rdma_disconnect
>> 2. nvmet_sq_destroy
>> 3. ib_drain_qp
>> 4. rdma_destroy_qp
>> 5. ib_free_cq (which flushes the cq worker)
>
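
For readers who don't have the target code in front of them, steps 1-5 above map
roughly onto the functions below. This is only an abbreviated sketch from my reading
of the upstream driver, so the function names (__nvmet_rdma_queue_disconnect,
nvmet_rdma_free_queue, nvmet_rdma_destroy_queue_ib) and the release_work indirection
may not match the exact tree being tested:

static void __nvmet_rdma_queue_disconnect(struct nvmet_rdma_queue *queue)
{
        /* ... queue state handling elided ... */
        rdma_disconnect(queue->cm_id);            /* step 1 */
        schedule_work(&queue->release_work);      /* defers the release path below */
}

static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
{
        nvmet_sq_destroy(&queue->nvme_sq);        /* step 2 */
        nvmet_rdma_destroy_queue_ib(queue);       /* steps 3-5 */
        /* ... rsp/cmd buffers freed afterwards ... */
}

static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
{
        ib_drain_qp(queue->cm_id->qp);            /* step 3 */
        rdma_destroy_qp(queue->cm_id);            /* step 4 */
        ib_free_cq(queue->cq);                    /* step 5: flushes the CQ worker */
}
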
> I took a deeper look here, and I think that the root cause has nothing
> to do with the 2 (still useful) patches I sent. Actually, the fact that
> patch (1) caused you to get the NULL deref even before 3, 4 and 5 tells me
> that the qp and cq are not freed at all, and for some reason we see them
> as NULL.
>
> So in nvmet_rdma_recv_done(), if the queue is not in state
> NVMET_RDMA_Q_LIVE, we simply restore the rsp back to the queue's free
> list:
>
> static inline void
> nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
> {
>         unsigned long flags;
>
>         spin_lock_irqsave(&rsp->queue->rsps_lock, flags);
>         list_add_tail(&rsp->free_list, &rsp->queue->free_rsps);
>         spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
> }
>
> However, we only set rsp->queue in nvmet_rdma_handle_command(), which
> does not take place because, as I mentioned, we are in the disconnected
> state...
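
For readers following along, the error path being described is roughly the one
below. This is only a sketch from my reading of the upstream driver, so the
helper nvmet_rdma_get_rsp and the exact handling of the non-LIVE states may
differ slightly from the tree under test:

static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
        struct nvmet_rdma_cmd *cmd =
                container_of(wc->wr_cqe, struct nvmet_rdma_cmd, cqe);
        struct nvmet_rdma_queue *queue = cq->cq_context;
        struct nvmet_rdma_rsp *rsp;

        /* ... completion status checks elided ... */

        rsp = nvmet_rdma_get_rsp(queue);
        rsp->cmd = cmd;
        rsp->flags = 0;
        rsp->req.cmd = cmd->nvme_cmd;

        if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) {
                /*
                 * Nothing on this path has assigned rsp->queue; only
                 * nvmet_rdma_handle_command() below does that.  So
                 * nvmet_rdma_put_rsp() ends up dereferencing a NULL
                 * (or stale) rsp->queue.
                 */
                nvmet_rdma_put_rsp(rsp);
                return;
        }

        nvmet_rdma_handle_command(queue, rsp);
}
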
>
> I think this patch should make this issue go away:
> -- 
> diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
> index 973b674ab55b..06a8c6114098 100644
> --- a/drivers/nvme/target/rdma.c
> +++ b/drivers/nvme/target/rdma.c
> @@ -180,9 +180,9 @@ nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
>  {
>         unsigned long flags;
>
> -       spin_lock_irqsave(&rsp->queue->rsps_lock, flags);
> -       list_add_tail(&rsp->free_list, &rsp->queue->free_rsps);
> -       spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
> +       spin_lock_irqsave(&rsp->cmd->queue->rsps_lock, flags);
> +       list_add_tail(&rsp->free_list, &rsp->cmd->queue->free_rsps);
> +       spin_unlock_irqrestore(&rsp->cmd->queue->rsps_lock, flags);
>  }
>
>  static void nvmet_rdma_free_sgl(struct scatterlist *sgl, unsigned int nents)
> @@ -473,7 +473,7 @@ static void nvmet_rdma_process_wr_wait_list(struct nvmet_rdma_queue *queue)
>
>  static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
>  {
> -       struct nvmet_rdma_queue *queue = rsp->queue;
> +       struct nvmet_rdma_queue *queue = rsp->cmd->queue;
>
>         atomic_add(1 + rsp->n_rdma, &queue->sq_wr_avail);
>
> @@ -517,7 +517,7 @@ static void nvmet_rdma_send_done(struct ib_cq *cq, struct ib_wc *wc)
>                      wc->status != IB_WC_WR_FLUSH_ERR)) {
>                 pr_err("SEND for CQE 0x%p failed with status %s (%d).\n",
>                         wc->wr_cqe, ib_wc_status_msg(wc->status), wc->status);
> -               nvmet_rdma_error_comp(rsp->queue);
> +               nvmet_rdma_error_comp(rsp->cmd->queue);
>         }
>  }
>
> @@ -525,7 +525,7 @@ static void nvmet_rdma_queue_response(struct nvmet_req *req)
>  {
>         struct nvmet_rdma_rsp *rsp =
>                 container_of(req, struct nvmet_rdma_rsp, req);
> -       struct rdma_cm_id *cm_id = rsp->queue->cm_id;
> +       struct rdma_cm_id *cm_id = rsp->cmd->queue->cm_id;
>         struct ib_send_wr *first_wr, *bad_wr;
>
>         if (rsp->flags & NVMET_RDMA_REQ_INVALIDATE_RKEY) {
> @@ -541,9 +541,9 @@ static void nvmet_rdma_queue_response(struct nvmet_req *req)
>         else
>                 first_wr = &rsp->send_wr;
>
> -       nvmet_rdma_post_recv(rsp->queue->dev, rsp->cmd);
> +       nvmet_rdma_post_recv(rsp->cmd->queue->dev, rsp->cmd);
>
> -       ib_dma_sync_single_for_device(rsp->queue->dev->device,
> +       ib_dma_sync_single_for_device(rsp->cmd->queue->dev->device,
>                 rsp->send_sge.addr, rsp->send_sge.length,
>                 DMA_TO_DEVICE);
>
> @@ -614,7 +614,7 @@ static u16 nvmet_rdma_map_sgl_inline(struct nvmet_rdma_rsp *rsp)
>  static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
>                 struct nvme_keyed_sgl_desc *sgl, bool invalidate)
>  {
> -       struct rdma_cm_id *cm_id = rsp->queue->cm_id;
> +       struct rdma_cm_id *cm_id = rsp->cmd->queue->cm_id;
>         u64 addr = le64_to_cpu(sgl->addr);
>         u32 len = get_unaligned_le24(sgl->length);
>         u32 key = get_unaligned_le32(sgl->key);
> @@ -676,7 +676,7 @@ static u16 nvmet_rdma_map_sgl(struct nvmet_rdma_rsp *rsp)
>
>  static bool nvmet_rdma_execute_command(struct nvmet_rdma_rsp *rsp)
>  {
> -       struct nvmet_rdma_queue *queue = rsp->queue;
> +       struct nvmet_rdma_queue *queue = rsp->cmd->queue;
>
>         if (unlikely(atomic_sub_return(1 + rsp->n_rdma,
>                         &queue->sq_wr_avail) < 0)) {
> @@ -703,11 +703,6 @@ static void nvmet_rdma_handle_command(struct nvmet_rdma_queue *queue,
>  {
>         u16 status;
>
> -       cmd->queue = queue;
> -       cmd->n_rdma = 0;
> -       cmd->req.port = queue->port;
> -
> -
>         ib_dma_sync_single_for_cpu(queue->dev->device,
>                 cmd->cmd->sge[0].addr, cmd->cmd->sge[0].length,
>                 DMA_FROM_DEVICE);
> @@ -763,6 +758,8 @@ static void nvmet_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc)
>         rsp->cmd = cmd;
>         rsp->flags = 0;
>         rsp->req.cmd = cmd->nvme_cmd;
> +       rsp->n_rdma = 0;
> +       rsp->req.port = queue->port;
>
>         if (unlikely(queue->state != NVMET_RDMA_Q_LIVE)) {
>                 unsigned long flags;
> -- 
>



