[PATCH] nvme-rdma: Use mr pool

Max Gurtovoy maxg at mellanox.com
Wed Nov 8 00:36:14 PST 2017


Hi Sagi,

On 11/8/2017 9:34 AM, Sagi Grimberg wrote:
> 
>> @@ -490,8 +456,27 @@ static int nvme_rdma_create_queue_ib(struct 
>> nvme_rdma_queue *queue)
>>           goto out_destroy_qp;
>>       }
>> +    if (idx == 0)
>> +        queue->ctrl->max_fr_pages = min_t(u32, NVME_RDMA_MAX_SEGMENTS,
>> +                ibdev->attrs.max_fast_reg_page_list_len);
> 
> Please keep that where it was.

We did it for the admin queue (otherwise we would initialize the MR pool with 
max_fr_pages = 0). How would you prefer to solve this?

> 
>> +
>> +    ret = ib_mr_pool_init(queue->qp, &queue->qp->rdma_mrs,
>> +                  queue->queue_size,
>> +                  IB_MR_TYPE_MEM_REG,
>> +                  queue->ctrl->max_fr_pages);
>> +
> 
> Why the extra newline?
> 
>> +    if (ret) {
>> +        dev_err(queue->ctrl->ctrl.device,
>> +            "failed to initialize MR pool sized %d for QID %d\n",
>> +            queue->queue_size, idx);
>> +        goto out_destroy_ring;
>> +    }
>> +
>>       return 0;
>> +out_destroy_ring:
>> +    nvme_rdma_free_ring(ibdev, queue->rsp_ring, queue->queue_size,
>> +                sizeof(struct nvme_completion), DMA_FROM_DEVICE);
>>   out_destroy_qp:
>>       ib_destroy_qp(queue->qp);
>>   out_destroy_ib_cq:
>> @@ -764,9 +749,6 @@ static int nvme_rdma_configure_admin_queue(struct 
>> nvme_rdma_ctrl *ctrl,
>>       ctrl->device = ctrl->queues[0].device;
>> -    ctrl->max_fr_pages = min_t(u32, NVME_RDMA_MAX_SEGMENTS,
>> -        ctrl->device->dev->attrs.max_fast_reg_page_list_len);
>> -
>>       if (new) {
>>           ctrl->ctrl.admin_tagset = 
>> nvme_rdma_alloc_tagset(&ctrl->ctrl, true);
>>           if (IS_ERR(ctrl->ctrl.admin_tagset)) {
>> @@ -779,11 +761,6 @@ static int nvme_rdma_configure_admin_queue(struct 
>> nvme_rdma_ctrl *ctrl,
>>               error = PTR_ERR(ctrl->ctrl.admin_q);
>>               goto out_free_tagset;
>>           }
>> -    } else {
>> -        error = blk_mq_reinit_tagset(&ctrl->admin_tag_set,
>> -                         nvme_rdma_reinit_request);
>> -        if (error)
>> -            goto out_free_queue;
>>       }
>>       error = nvme_rdma_start_queue(ctrl, 0);
>> @@ -863,11 +840,6 @@ static int nvme_rdma_configure_io_queues(struct 
>> nvme_rdma_ctrl *ctrl, bool new)
>>               goto out_free_tag_set;
>>           }
>>       } else {
>> -        ret = blk_mq_reinit_tagset(&ctrl->tag_set,
>> -                       nvme_rdma_reinit_request);
>> -        if (ret)
>> -            goto out_free_io_queues;
>> -
>>           blk_mq_update_nr_hw_queues(&ctrl->tag_set,
>>               ctrl->ctrl.queue_count - 1);
>>       }
>> @@ -1065,14 +1037,19 @@ static void nvme_rdma_unmap_data(struct 
>> nvme_rdma_queue *queue,
>>       if (!blk_rq_bytes(rq))
>>           return;
>> -    if (req->mr->need_inval) {
>> -        res = nvme_rdma_inv_rkey(queue, req);
>> -        if (unlikely(res < 0)) {
>> -            dev_err(ctrl->ctrl.device,
>> -                "Queueing INV WR for rkey %#x failed (%d)\n",
>> -                req->mr->rkey, res);
>> -            nvme_rdma_error_recovery(queue->ctrl);
>> +    if (req->mr) {
>> +        if (req->mr->need_inval) {
>> +            res = nvme_rdma_inv_rkey(queue, req);
>> +            if (unlikely(res < 0)) {
>> +                dev_err(ctrl->ctrl.device,
>> +                    "Queueing INV WR for rkey %#x failed (%d)\n",
>> +                    req->mr->rkey, res);
>> +                nvme_rdma_error_recovery(queue->ctrl);
>> +            }
>>           }
>> +
>> +        ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr);
>> +        req->mr = NULL;
> 
> restoring the MR before the local invalidation completed? that's wrong.
> You should rebase on top of my completion signaling fixes. I have
> v2 in the pipe.

Yup, we did it locally, but since your code isn't upstream yet, we 
decided to send this version for review (see Israel's comment above noting 
that the code is based on 4.14-rc8).

> 
>>       }
>>       ib_dma_unmap_sg(ibdev, req->sg_table.sgl,
>> @@ -1131,12 +1108,18 @@ static int nvme_rdma_map_sg_fr(struct 
>> nvme_rdma_queue *queue,
>>       struct nvme_keyed_sgl_desc *sg = &c->common.dptr.ksgl;
>>       int nr;
>> +    req->mr = ib_mr_pool_get(queue->qp, &queue->qp->rdma_mrs);
>> +    if (!req->mr)
>> +        return -EAGAIN;
> 
> unlikely >
>> +
>>       /*
>>        * Align the MR to a 4K page size to match the ctrl page size and
>>        * the block virtual boundary.
>>        */
>>       nr = ib_map_mr_sg(req->mr, req->sg_table.sgl, count, NULL, SZ_4K);
>>       if (unlikely(nr < count)) {
>> +        ib_mr_pool_put(queue->qp, &queue->qp->rdma_mrs, req->mr);
>> +        req->mr = NULL;
>>           if (nr < 0)
>>               return nr;
>>           return -EINVAL;
>> @@ -1176,7 +1159,6 @@ static int nvme_rdma_map_data(struct 
>> nvme_rdma_queue *queue,
>>       req->num_sge = 1;
>>       req->inline_data = false;
>> -    req->mr->need_inval = false;
>>       c->common.flags |= NVME_CMD_SGL_METABUF;
>> @@ -1362,7 +1344,7 @@ static int nvme_rdma_process_nvme_rsp(struct 
>> nvme_rdma_queue *queue,
>>       if (rq->tag == tag)
>>           ret = 1;
>> -    if ((wc->wc_flags & IB_WC_WITH_INVALIDATE) &&
>> +    if (req->mr && (wc->wc_flags & IB_WC_WITH_INVALIDATE) &&
>>           wc->ex.invalidate_rkey == req->mr->rkey)
>>           req->mr->need_inval = false;
> 
> If you got a remote invalidate without an MR its a protocol error.

Yes this was extra safe approach. We'll remove this check.
> 
>> @@ -1674,7 +1656,7 @@ static blk_status_t nvme_rdma_queue_rq(struct 
>> blk_mq_hw_ctx *hctx,
>>       if (req_op(rq) == REQ_OP_FLUSH)
>>           flush = true;
>>       err = nvme_rdma_post_send(queue, sqe, req->sge, req->num_sge,
>> -            req->mr->need_inval ? &req->reg_wr.wr : NULL, flush);
>> +            req->mr ? &req->reg_wr.wr : NULL, flush);
> 
> Please keep the need_inval condition as well.

In the new code, if (req->mr != NULL) then req->mr->need_inval is always 
true. We don't assign an MR if we don't need to register the buffers...



More information about the Linux-nvme mailing list