[PATCH] nvme-rdma: correctly stop keep alive on error path

Ming Lin mlin at kernel.org
Thu Jun 9 14:37:07 PDT 2016


From: Ming Lin <ming.l at samsung.com>

We didn't stop keep-alive when blk_mq_alloc_tag_set() failed in
nvme_rdma_create_io_queues(): that error path jumps past the
nvme_stop_keep_alive() call at out_cleanup_connect_q, leaving the
keep-alive work armed on a controller that is about to be torn down.

This caused the hang below when unloading the nvme-rdma driver.

[  141.253064] blk-mq: failed to allocate request map
[  146.197258] nvme \xffffffc0\xfffffff7\xffffff95\x18\x01\xffffff88\xffffffff\xffffffff1023: keep-alive failed
[  361.235076] INFO: task kworker/0:0:4 blocked for more than 120 seconds.
[  361.241753]       Tainted: G            E   4.7.0-rc2+ #252
[  361.247359] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  361.255239] kworker/0:0     D ffff8801192dbc40     0     4      2 0x00000000
[  361.262363] Workqueue: events nvme_keep_alive_work [nvme_core]
[  361.268242]  ffff8801192dbc40 4554535953425553 ffff8801192dc000 ffff8801192dbd58
[  361.275741]  ffff8801192dbd50 ffff8801192aa7c0 ffff88011e216300 ffff8801192dbc58
[  361.283232]  ffffffff816ef117 7fffffffffffffff ffff8801192dbcc8 ffffffff816f1b8f
[  361.291252] Call Trace:
[  361.294274]  [<ffffffff816ef117>] schedule+0x37/0x90
[  361.299813]  [<ffffffff816f1b8f>] schedule_timeout+0x13f/0x1a0
[  361.306240]  [<ffffffff816f0271>] wait_for_completion+0x91/0xf0
[  361.312709]  [<ffffffff8108fe00>] ? wake_up_q+0x70/0x70
[  361.318517]  [<ffffffff81081f8b>] flush_work+0xeb/0x160
[  361.324286]  [<ffffffff8107fe10>] ? destroy_worker+0x90/0x90
[  361.330516]  [<ffffffffc07f4331>] nvme_rdma_reset_ctrl+0x41/0x50 [nvme_rdma]
[  361.338097]  [<ffffffffc07e64c0>] nvme_keep_alive_work+0xc0/0xd0 [nvme_core]
[  361.345712]  [<ffffffff8108273c>] process_one_work+0x13c/0x360
[  361.352066]  [<ffffffff810832d6>] worker_thread+0x126/0x490
[  361.358193]  [<ffffffff810831b0>] ? cancel_delayed_work_sync+0x10/0x10
[  361.365240]  [<ffffffff810886d4>] kthread+0xc4/0xe0
[  361.370659]  [<ffffffff816f2bdf>] ret_from_fork+0x1f/0x40
[  361.376559]  [<ffffffff81088610>] ? kthread_create_on_node+0x170/0x170

[  361.383668] INFO: task kworker/0:2:147 blocked for more than 120 seconds.
[  361.390979]       Tainted: G            E   4.7.0-rc2+ #252
[  361.397064] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[  361.405443] kworker/0:2     D ffff880035f0bbe0     0   147      2 0x00000000
[  361.413064] Workqueue: nvme_rdma_wq nvme_rdma_reset_ctrl_work [nvme_rdma]
[  361.420474]  ffff880035f0bbe0 ffffffff811ac3a1 ffff880035f0c000 ffff880035f0bd00
[  361.428493]  ffff880035f0bcf8 ffff880117f4dcc0 ffff880117f4dcc0 ffff880035f0bbf8
[  361.436502]  ffffffff816ef117 7fffffffffffffff ffff880035f0bc70 ffffffff816f1b8f
[  361.444521] Call Trace:
[  361.447503]  [<ffffffff811ac3a1>] ? pollwake+0x61/0x70
[  361.453216]  [<ffffffff816ef117>] schedule+0x37/0x90
[  361.458713]  [<ffffffff816f1b8f>] schedule_timeout+0x13f/0x1a0
[  361.465113]  [<ffffffff816f0271>] wait_for_completion+0x91/0xf0
[  361.471560]  [<ffffffff8108fe00>] ? wake_up_q+0x70/0x70
[  361.477339]  [<ffffffff81081f8b>] flush_work+0xeb/0x160
[  361.483078]  [<ffffffff8107fe10>] ? destroy_worker+0x90/0x90
[  361.489285]  [<ffffffff8108307e>] __cancel_work_timer+0x8e/0x1a0
[  361.495811]  [<ffffffff81095ec0>] ? pick_next_entity+0xa0/0x150
[  361.502292]  [<ffffffff810831ae>] cancel_delayed_work_sync+0xe/0x10
[  361.509100]  [<ffffffffc07e65da>] nvme_stop_keep_alive+0x1a/0x20 [nvme_core]
[  361.516734]  [<ffffffffc07f5afb>] nvme_rdma_shutdown_ctrl+0x1b/0xe0 [nvme_rdma]
[  361.524599]  [<ffffffffc07f6119>] nvme_rdma_reset_ctrl_work+0x19/0x120 [nvme_rdma]
[  361.532771]  [<ffffffff8108273c>] process_one_work+0x13c/0x360
[  361.539164]  [<ffffffff8108340b>] worker_thread+0x25b/0x490
[  361.545329]  [<ffffffff816eed2e>] ? __schedule+0x1de/0x590
[  361.551365]  [<ffffffff810831b0>] ? cancel_delayed_work_sync+0x10/0x10
[  361.558481]  [<ffffffff810886d4>] kthread+0xc4/0xe0
[  361.563897]  [<ffffffff816f2bdf>] ret_from_fork+0x1f/0x40
[  361.569863]  [<ffffffff81088610>] ? kthread_create_on_node+0x170/0x170

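The two traces form an A-B/B-A wait: the keep-alive work flushes the
reset work (nvme_keep_alive_work -> nvme_rdma_reset_ctrl -> flush_work),
while the reset work synchronously cancels the keep-alive work
(nvme_rdma_reset_ctrl_work -> nvme_rdma_shutdown_ctrl ->
nvme_stop_keep_alive -> cancel_delayed_work_sync). A minimal sketch of
that deadlock, with hypothetical work items standing in for the real
handlers (illustrative only, not the driver code):

	#include <linux/workqueue.h>

	static void ka_fn(struct work_struct *w);
	static void reset_fn(struct work_struct *w);

	static DECLARE_DELAYED_WORK(ka_work, ka_fn);
	static DECLARE_WORK(reset_work, reset_fn);

	/* keep-alive handler: on failure, kick a reset and wait for it */
	static void ka_fn(struct work_struct *w)
	{
		queue_work(system_wq, &reset_work);
		flush_work(&reset_work);		/* waits for reset_fn() */
	}

	/* reset handler: shutdown stops the keep-alive work synchronously */
	static void reset_fn(struct work_struct *w)
	{
		cancel_delayed_work_sync(&ka_work);	/* waits for ka_fn() */
	}

Each handler can only finish after the other one does, so both kworkers
block past the 120 second hung-task threshold.
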
Signed-off-by: Ming Lin <ming.l at samsung.com>
---
 drivers/nvme/host/rdma.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 11246b8..8263f2f 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -1604,14 +1604,14 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl)
 	if (error)
 		goto out_cleanup_queue;
 
-	nvme_start_keep_alive(&ctrl->ctrl);
-
 	error = nvme_rdma_alloc_qe(ctrl->queues[0].device->dev,
 			&ctrl->async_event_sqe, sizeof(struct nvme_command),
 			DMA_TO_DEVICE);
 	if (error)
 		goto out_cleanup_queue;
 
+	nvme_start_keep_alive(&ctrl->ctrl);
+
 	return 0;
 
 out_cleanup_queue:
@@ -1838,7 +1838,6 @@ static int nvme_rdma_create_io_queues(struct nvme_rdma_ctrl *ctrl)
 	return 0;
 
 out_cleanup_connect_q:
-	nvme_stop_keep_alive(&ctrl->ctrl);
 	blk_cleanup_queue(ctrl->ctrl.connect_q);
 out_free_tag_set:
 	blk_mq_free_tag_set(&ctrl->tag_set);
@@ -1969,6 +1968,7 @@ static struct nvme_ctrl *nvme_rdma_create_ctrl(struct device *dev,
 	return &ctrl->ctrl;
 
 out_remove_admin_queue:
+	nvme_stop_keep_alive(&ctrl->ctrl);
 	nvme_rdma_destroy_admin_queue(ctrl);
 out_kfree_queues:
 	kfree(ctrl->queues);
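
The ordering rule the patch enforces, as a sketch with hypothetical
helpers (setup_admin_queue(), alloc_async_qe(), cleanup_admin_queue()
are illustrative, not the actual driver functions): start keep-alive
only after everything on the setup path that can fail.

	static int setup_ctrl(struct nvme_rdma_ctrl *ctrl)
	{
		int error;

		error = setup_admin_queue(ctrl);
		if (error)
			return error;

		error = alloc_async_qe(ctrl);
		if (error)
			goto out_cleanup_queue;

		/* started last: nothing below can fail, so no error
		 * label in this function needs to undo it */
		nvme_start_keep_alive(&ctrl->ctrl);
		return 0;

	out_cleanup_queue:
		cleanup_admin_queue(ctrl);
		return error;
	}

The matching nvme_stop_keep_alive() then lives only at the one unwind
label (out_remove_admin_queue) that can be reached after this function
has succeeded.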
-- 
1.9.1
