[PATCH] nvme-rdma: avoid repeated request completion for concurrent nvme_rdma_timeout
Sagi Grimberg
sagi at grimberg.me
Wed Jan 13 20:11:10 EST 2021
On 1/5/21 10:36 PM, Chao Leng wrote:
> A crash happens when fault injection delays request completion for a long
> time (nearly 30s). Each namespace has a request queue; when completions are
> delayed that long, multiple request queues may have timed-out requests at
> the same time, so nvme_rdma_timeout executes concurrently. Requests from
> different request queues may be queued on the same rdma queue, so multiple
> nvme_rdma_timeout calls may invoke nvme_rdma_stop_queue at the same time.
> The first nvme_rdma_timeout clears NVME_RDMA_Q_LIVE and continues stopping
> the rdma queue (draining the qp), but the others see NVME_RDMA_Q_LIVE
> already cleared and complete the requests directly. However, the rdma queue
> may not be stopped yet, and a request may already have been completed in the
> qp and still be waiting to be processed; that request then gets completed
> twice.
> Add a mutex to serialize nvme_rdma_stop_queue.
This looks reasonable to me.
Mind sending one for nvme-tcp as well?
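A rough sketch of what the analogous nvme-tcp change could look like (untested,
just to illustrate the idea, assuming the same per-queue mutex approach; the
identifiers below mirror drivers/nvme/host/tcp.c):

	/*
	 * Hypothetical sketch, not the actual patch: add a per-queue lock
	 * (struct mutex queue_lock) to struct nvme_tcp_queue, init it in
	 * nvme_tcp_alloc_queue() and destroy it on queue free, then use it
	 * to serialize queue stop so that concurrent timeout handlers
	 * cannot complete the same request twice.
	 */
	static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
	{
		struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);
		struct nvme_tcp_queue *queue = &ctrl->queues[qid];

		mutex_lock(&queue->queue_lock);
		/* only the first caller actually tears the queue down */
		if (test_and_clear_bit(NVME_TCP_Q_LIVE, &queue->flags))
			__nvme_tcp_stop_queue(queue);
		mutex_unlock(&queue->queue_lock);
	}

The same mutex_init()/mutex_destroy() pairing as in the rdma patch below would
apply to the tcp queue alloc/free paths.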
>
> Signed-off-by: Chao Leng <lengchao at huawei.com>
> ---
> drivers/nvme/host/rdma.c | 15 +++++++++++----
> 1 file changed, 11 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
> index df9f6f4549f1..d5c66e3eeb4a 100644
> --- a/drivers/nvme/host/rdma.c
> +++ b/drivers/nvme/host/rdma.c
> @@ -97,6 +97,7 @@ struct nvme_rdma_queue {
>  	struct completion	cm_done;
>  	bool			pi_support;
>  	int			cq_size;
> +	struct mutex		queue_lock;
>  };
>  
>  struct nvme_rdma_ctrl {
> @@ -579,6 +580,7 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
>  	int ret;
>  
>  	queue = &ctrl->queues[idx];
> +	mutex_init(&queue->queue_lock);
>  	queue->ctrl = ctrl;
>  	if (idx && ctrl->ctrl.max_integrity_segments)
>  		queue->pi_support = true;
> @@ -598,7 +600,8 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
>  	if (IS_ERR(queue->cm_id)) {
>  		dev_info(ctrl->ctrl.device,
>  			"failed to create CM ID: %ld\n", PTR_ERR(queue->cm_id));
> -		return PTR_ERR(queue->cm_id);
> +		ret = PTR_ERR(queue->cm_id);
> +		goto out_destroy_mutex;
>  	}
>  
>  	if (ctrl->ctrl.opts->mask & NVMF_OPT_HOST_TRADDR)
> @@ -628,6 +631,8 @@ static int nvme_rdma_alloc_queue(struct nvme_rdma_ctrl *ctrl,
>  out_destroy_cm_id:
>  	rdma_destroy_id(queue->cm_id);
>  	nvme_rdma_destroy_queue_ib(queue);
> +out_destroy_mutex:
> +	mutex_destroy(&queue->queue_lock);
>  	return ret;
>  }
>  
> @@ -639,9 +644,10 @@ static void __nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
>  
>  static void nvme_rdma_stop_queue(struct nvme_rdma_queue *queue)
>  {
> -	if (!test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags))
> -		return;
> -	__nvme_rdma_stop_queue(queue);
> +	mutex_lock(&queue->queue_lock);
> +	if (test_and_clear_bit(NVME_RDMA_Q_LIVE, &queue->flags))
> +		__nvme_rdma_stop_queue(queue);
> +	mutex_unlock(&queue->queue_lock);
>  }
>  
>  static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
> @@ -649,6 +655,7 @@ static void nvme_rdma_free_queue(struct nvme_rdma_queue *queue)
>  	if (!test_and_clear_bit(NVME_RDMA_Q_ALLOCATED, &queue->flags))
>  		return;
>  
> +	mutex_destroy(&queue->queue_lock);
>  	nvme_rdma_destroy_queue_ib(queue);
>  	rdma_destroy_id(queue->cm_id);
>  }
>