nvme-rdma corrupts memory upon timeout
Alon Horev
alon at vastdata.com
Sun Feb 25 23:53:50 PST 2018
This patch still returns to userspace after queuing the work, before
the work has necessarily run, so it may still result in corruption.
Maybe we can flush the work queue after a timeout?
Just to put things in perspective, we have around 100 subsystems
connected to a single host. This means the delay between queuing
and execution may be longer than usual.
Thanks, Alon
On Sun, Feb 25, 2018 at 8:14 PM, Sagi Grimberg <sagi at grimberg.me> wrote:
>
>> Does this patch help?
>> --
>> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
>> index 2ef761b5a26e..856ae9a7615a 100644
>> --- a/drivers/nvme/host/rdma.c
>> +++ b/drivers/nvme/host/rdma.c
>> @@ -956,15 +956,15 @@ static void nvme_rdma_error_recovery_work(struct
>> work_struct *work)
>>
>> if (ctrl->ctrl.queue_count > 1) {
>> nvme_stop_queues(&ctrl->ctrl);
>> + nvme_rdma_destroy_io_queues(ctrl, false);
>> blk_mq_tagset_busy_iter(&ctrl->tag_set,
>> nvme_cancel_request,
>> &ctrl->ctrl);
>> - nvme_rdma_destroy_io_queues(ctrl, false);
>> }
>>
>> blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
>> + nvme_rdma_destroy_admin_queue(ctrl, false);
>> blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
>> nvme_cancel_request, &ctrl->ctrl);
>> - nvme_rdma_destroy_admin_queue(ctrl, false);
>>
>> /*
>> * queues are not a live anymore, so restart the queues to fail
>> fast
>> @@ -1724,9 +1724,9 @@ static void nvme_rdma_shutdown_ctrl(struct
>> nvme_rdma_ctrl *ctrl, bool shutdown)
>>
>> if (ctrl->ctrl.queue_count > 1) {
>> nvme_stop_queues(&ctrl->ctrl);
>> + nvme_rdma_destroy_io_queues(ctrl, shutdown);
>> blk_mq_tagset_busy_iter(&ctrl->tag_set,
>> nvme_cancel_request,
>> &ctrl->ctrl);
>> - nvme_rdma_destroy_io_queues(ctrl, shutdown);
>> }
>>
>> if (shutdown)
>> @@ -1735,10 +1735,10 @@ static void nvme_rdma_shutdown_ctrl(struct
>> nvme_rdma_ctrl *ctrl, bool shutdown)
>> nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
>>
>> blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
>> + nvme_rdma_destroy_admin_queue(ctrl, shutdown);
>> blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
>> nvme_cancel_request, &ctrl->ctrl);
>> blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
>> - nvme_rdma_destroy_admin_queue(ctrl, shutdown);
>> }
>>
>> static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl)
>> --
>
>
> Or maybe this should do a better job:
> --
> diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
> index 4c32518a6c81..e45801fe78c1 100644
> --- a/drivers/nvme/host/rdma.c
> +++ b/drivers/nvme/host/rdma.c
> @@ -956,12 +956,14 @@ static void nvme_rdma_error_recovery_work(struct
> work_struct *work)
>
> if (ctrl->ctrl.queue_count > 1) {
> nvme_stop_queues(&ctrl->ctrl);
> + nvme_rdma_stop_io_queues(ctrl);
> blk_mq_tagset_busy_iter(&ctrl->tag_set,
> nvme_cancel_request, &ctrl->ctrl);
> nvme_rdma_destroy_io_queues(ctrl, false);
> }
>
> blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
> + nvme_rdma_stop_queue(&ctrl->queues[0]);
> blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
> nvme_cancel_request, &ctrl->ctrl);
> nvme_rdma_destroy_admin_queue(ctrl, false);
> @@ -1729,9 +1731,12 @@ static void nvme_rdma_shutdown_ctrl(struct
> nvme_rdma_ctrl *ctrl, bool shutdown)
>
> if (ctrl->ctrl.queue_count > 1) {
> nvme_stop_queues(&ctrl->ctrl);
> + nvme_rdma_stop_io_queues(ctrl);
> blk_mq_tagset_busy_iter(&ctrl->tag_set,
> nvme_cancel_request, &ctrl->ctrl);
> nvme_rdma_destroy_io_queues(ctrl, shutdown);
> + if (shutdown)
> + nvme_start_queues(&ctrl->ctrl);
> }
>
> if (shutdown)
> @@ -1740,10 +1745,11 @@ static void nvme_rdma_shutdown_ctrl(struct
> nvme_rdma_ctrl *ctrl, bool shutdown)
> nvme_disable_ctrl(&ctrl->ctrl, ctrl->ctrl.cap);
>
> blk_mq_quiesce_queue(ctrl->ctrl.admin_q);
> + nvme_rdma_stop_queue(&ctrl->queues[0]);
> blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
> nvme_cancel_request, &ctrl->ctrl);
> - blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
> nvme_rdma_destroy_admin_queue(ctrl, shutdown);
> + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
>
> }
>
> static void nvme_rdma_delete_ctrl(struct nvme_ctrl *ctrl)
> --
--
Alon Horev
+972-524-517-627
More information about the Linux-nvme
mailing list