[PATCH] nvme: avoid deadlock warning in sync_io_queues() path

Shin'ichiro Kawasaki shinichiro.kawasaki at wdc.com
Wed Oct 12 01:03:18 PDT 2022


The commit c0feea594e05 ("workqueue: don't skip lockdep work dependency
in cancel_work_sync()") restored a lockdep check and it triggered
deadlock warning in sync_io_queues() path. This function locks
namespaces_rwsem to traverse namespace list and calls blk_sync_queue()
for each namespace. blk_sync_queue() calls cancel_work_sync() to wait
for nvme_timeout() completion which has paths to lock the
namespaces_rwsem again. Hence, the deadlock warning was reported.

This deadlock is not expected to happen in real use cases since the
queues to call blk_sync_queue() are already quiesced at that point. Left
issue is confusion by the deadlock warning. It is not ideal to suppress
the warning with lockdep_off/on() since it will hide unseen deadlock
scenarios. It is desired to suppress the warning without losing deadlock
detection ability.

Instead of lockdep_off/on(), this patch suppresses the warning by adding
another lock named 'namespaces_sync_io_queues_mutex'. It works together
with namespaces_rwsem. Lock only namespaces_rwsem for fast read access
to the namespaces in hot paths. Lock both the locks to modify the list.
Lock only namespaces_sync_io_queues_mutex to call blk_sync_queue() so
that namespaces_rwsem can be locked in nvme_timeout() without the
deadlock warning.

Link: https://lore.kernel.org/linux-block/20220930001943.zdbvolc3gkekfmcv@shindev/
Suggested-by: Tetsuo Handa <penguin-kernel at I-love.SAKURA.ne.jp>
Signed-off-by: Shin'ichiro Kawasaki <shinichiro.kawasaki at wdc.com>
---
 drivers/nvme/host/core.c | 21 +++++++++++++++++++--
 drivers/nvme/host/nvme.h |  1 +
 2 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 8d5a7ae19844..4c535deaf269 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -4234,9 +4234,11 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
 	if (nvme_update_ns_info(ns, info))
 		goto out_unlink_ns;
 
+	mutex_lock(&ctrl->namespaces_sync_io_queues_mutex);
 	down_write(&ctrl->namespaces_rwsem);
 	nvme_ns_add_to_ctrl_list(ns);
 	up_write(&ctrl->namespaces_rwsem);
+	mutex_unlock(&ctrl->namespaces_sync_io_queues_mutex);
 	nvme_get_ctrl(ctrl);
 
 	if (device_add_disk(ctrl->device, ns->disk, nvme_ns_id_attr_groups))
@@ -4252,9 +4254,11 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
 
  out_cleanup_ns_from_list:
 	nvme_put_ctrl(ctrl);
+	mutex_lock(&ctrl->namespaces_sync_io_queues_mutex);
 	down_write(&ctrl->namespaces_rwsem);
 	list_del_init(&ns->list);
 	up_write(&ctrl->namespaces_rwsem);
+	mutex_unlock(&ctrl->namespaces_sync_io_queues_mutex);
  out_unlink_ns:
 	mutex_lock(&ctrl->subsys->lock);
 	list_del_rcu(&ns->siblings);
@@ -4304,9 +4308,11 @@ static void nvme_ns_remove(struct nvme_ns *ns)
 		nvme_cdev_del(&ns->cdev, &ns->cdev_device);
 	del_gendisk(ns->disk);
 
+	mutex_lock(&ns->ctrl->namespaces_sync_io_queues_mutex);
 	down_write(&ns->ctrl->namespaces_rwsem);
 	list_del_init(&ns->list);
 	up_write(&ns->ctrl->namespaces_rwsem);
+	mutex_unlock(&ns->ctrl->namespaces_sync_io_queues_mutex);
 
 	if (last_path)
 		nvme_mpath_shutdown_disk(ns->head);
@@ -4399,12 +4405,14 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
 	struct nvme_ns *ns, *next;
 	LIST_HEAD(rm_list);
 
+	mutex_lock(&ctrl->namespaces_sync_io_queues_mutex);
 	down_write(&ctrl->namespaces_rwsem);
 	list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
 		if (ns->head->ns_id > nsid || test_bit(NVME_NS_DEAD, &ns->flags))
 			list_move_tail(&ns->list, &rm_list);
 	}
 	up_write(&ctrl->namespaces_rwsem);
+	mutex_unlock(&ctrl->namespaces_sync_io_queues_mutex);
 
 	list_for_each_entry_safe(ns, next, &rm_list, list)
 		nvme_ns_remove(ns);
@@ -4565,9 +4573,11 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
 	/* this is a no-op when called from the controller reset handler */
 	nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
 
+	mutex_lock(&ctrl->namespaces_sync_io_queues_mutex);
 	down_write(&ctrl->namespaces_rwsem);
 	list_splice_init(&ctrl->namespaces, &ns_list);
 	up_write(&ctrl->namespaces_rwsem);
+	mutex_unlock(&ctrl->namespaces_sync_io_queues_mutex);
 
 	list_for_each_entry_safe(ns, next, &ns_list, list)
 		nvme_ns_remove(ns);
@@ -4901,6 +4911,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	INIT_LIST_HEAD(&ctrl->namespaces);
 	xa_init(&ctrl->cels);
 	init_rwsem(&ctrl->namespaces_rwsem);
+	mutex_init(&ctrl->namespaces_sync_io_queues_mutex);
 	ctrl->dev = dev;
 	ctrl->ops = ops;
 	ctrl->quirks = quirks;
@@ -5121,10 +5132,16 @@ void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
 {
 	struct nvme_ns *ns;
 
-	down_read(&ctrl->namespaces_rwsem);
+	/*
+	 * Guard ctrl->namespaces with namespaces_sync_io_queues_mutex instead
+	 * of namespaces_rwsem to avoid deadlock warning. namespace_rwsem lock
+	 * here for cancel_work_sync() via blk_sync_queue() would conflict with
+	 * nvme_timeout() which locks namespaces_rwsem.
+	 */
+	mutex_lock(&ctrl->namespaces_sync_io_queues_mutex);
 	list_for_each_entry(ns, &ctrl->namespaces, list)
 		blk_sync_queue(ns->queue);
-	up_read(&ctrl->namespaces_rwsem);
+	mutex_unlock(&ctrl->namespaces_sync_io_queues_mutex);
 }
 EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
 
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 1bdf714dcd9e..3f484b6c21d2 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -250,6 +250,7 @@ struct nvme_ctrl {
 	struct blk_mq_tag_set *admin_tagset;
 	struct list_head namespaces;
 	struct rw_semaphore namespaces_rwsem;
+	struct mutex namespaces_sync_io_queues_mutex;
 	struct device ctrl_device;
 	struct device *device;	/* char device */
 #ifdef CONFIG_NVME_HWMON
-- 
2.37.1




More information about the Linux-nvme mailing list