[PATCHv2] nvme: use srcu for iterating namespace list
Sagi Grimberg
sagi at grimberg.me
Fri May 24 08:28:31 PDT 2024
On 23/05/2024 20:20, Keith Busch wrote:
> From: Keith Busch <kbusch at kernel.org>
>
> The nvme pci driver synchronizes with all the namespace queues during a
> reset to ensure that there's no pending timeout work.
>
> Meanwhile the timeout work potentially iterates those same namespaces to
> freeze their queues.
>
> Each of those namespace iterations use the same read lock. If a write
> lock should somehow get between the synchronize and freeze steps, then
> forward progress is deadlocked.
>
> We had been relying on the nvme controller state machine to ensure the
> reset work wouldn't conflict with timeout work. That guarantee may be a
> bit fragile to rely on, so iterate the namespace lists without taking a
> lock to fix potential circular locks, as reported by lockdep.
>
> Link: https://lore.kernel.org/all/20220930001943.zdbvolc3gkekfmcv@shindev/
> Reported-by: Shinichiro Kawasaki <shinichiro.kawasaki at wdc.com>
> Signed-off-by: Keith Busch <kbusch at kernel.org>
> ---
> v2:
> Improved changelog
>
> drivers/nvme/host/core.c | 97 +++++++++++++++++++++--------------
> drivers/nvme/host/ioctl.c | 12 ++---
> drivers/nvme/host/multipath.c | 21 ++++----
> drivers/nvme/host/nvme.h | 3 +-
> 4 files changed, 78 insertions(+), 55 deletions(-)
>
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 7706df2373494..b7cd46f3cef69 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -3684,9 +3684,10 @@ static int nvme_init_ns_head(struct nvme_ns *ns, struct nvme_ns_info *info)
> struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
> {
> struct nvme_ns *ns, *ret = NULL;
> + int srcu_idx;
>
> - down_read(&ctrl->namespaces_rwsem);
> - list_for_each_entry(ns, &ctrl->namespaces, list) {
> + srcu_idx = srcu_read_lock(&ctrl->srcu);
> + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
> if (ns->head->ns_id == nsid) {
> if (!nvme_get_ns(ns))
> continue;
> @@ -3696,7 +3697,7 @@ struct nvme_ns *nvme_find_get_ns(struct nvme_ctrl *ctrl, unsigned nsid)
> if (ns->head->ns_id > nsid)
> break;
> }
> - up_read(&ctrl->namespaces_rwsem);
> + srcu_read_unlock(&ctrl->srcu, srcu_idx);
> return ret;
> }
> EXPORT_SYMBOL_NS_GPL(nvme_find_get_ns, NVME_TARGET_PASSTHRU);
> @@ -3710,7 +3711,7 @@ static void nvme_ns_add_to_ctrl_list(struct nvme_ns *ns)
>
> list_for_each_entry_reverse(tmp, &ns->ctrl->namespaces, list) {
> if (tmp->head->ns_id < ns->head->ns_id) {
> - list_add(&ns->list, &tmp->list);
> + list_add_rcu(&ns->list, &tmp->list);
> return;
> }
> }
> @@ -3776,17 +3777,18 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
> if (nvme_update_ns_info(ns, info))
> goto out_unlink_ns;
>
> - down_write(&ctrl->namespaces_rwsem);
> + mutex_lock(&ctrl->namespaces_lock);
> /*
> * Ensure that no namespaces are added to the ctrl list after the queues
> * are frozen, thereby avoiding a deadlock between scan and reset.
> */
> if (test_bit(NVME_CTRL_FROZEN, &ctrl->flags)) {
> - up_write(&ctrl->namespaces_rwsem);
> + mutex_unlock(&ctrl->namespaces_lock);
> goto out_unlink_ns;
> }
> nvme_ns_add_to_ctrl_list(ns);
> - up_write(&ctrl->namespaces_rwsem);
> + mutex_unlock(&ctrl->namespaces_lock);
> + synchronize_srcu(&ctrl->srcu);
> nvme_get_ctrl(ctrl);
>
> if (device_add_disk(ctrl->device, ns->disk, nvme_ns_attr_groups))
> @@ -3809,9 +3811,10 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, struct nvme_ns_info *info)
>
> out_cleanup_ns_from_list:
> nvme_put_ctrl(ctrl);
> - down_write(&ctrl->namespaces_rwsem);
> - list_del_init(&ns->list);
> - up_write(&ctrl->namespaces_rwsem);
> + mutex_lock(&ctrl->namespaces_lock);
> + list_del_rcu(&ns->list);
> + mutex_unlock(&ctrl->namespaces_lock);
> + synchronize_srcu(&ctrl->srcu);
> out_unlink_ns:
> mutex_lock(&ctrl->subsys->lock);
> list_del_rcu(&ns->siblings);
> @@ -3861,9 +3864,10 @@ static void nvme_ns_remove(struct nvme_ns *ns)
> nvme_cdev_del(&ns->cdev, &ns->cdev_device);
> del_gendisk(ns->disk);
>
> - down_write(&ns->ctrl->namespaces_rwsem);
> - list_del_init(&ns->list);
> - up_write(&ns->ctrl->namespaces_rwsem);
> + mutex_lock(&ns->ctrl->namespaces_lock);
> + list_del_rcu(&ns->list);
> + mutex_unlock(&ns->ctrl->namespaces_lock);
> + synchronize_srcu(&ns->ctrl->srcu);
>
> if (last_path)
> nvme_mpath_shutdown_disk(ns->head);
> @@ -3953,16 +3957,17 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
> struct nvme_ns *ns, *next;
> LIST_HEAD(rm_list);
>
> - down_write(&ctrl->namespaces_rwsem);
> + mutex_lock(&ctrl->namespaces_lock);
> list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
> if (ns->head->ns_id > nsid)
> - list_move_tail(&ns->list, &rm_list);
> + list_splice_init_rcu(&ns->list, &rm_list,
> + synchronize_rcu);
> }
> - up_write(&ctrl->namespaces_rwsem);
> + mutex_unlock(&ctrl->namespaces_lock);
> + synchronize_srcu(&ctrl->srcu);
>
> list_for_each_entry_safe(ns, next, &rm_list, list)
> nvme_ns_remove(ns);
> -
> }
>
> static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
> @@ -4132,9 +4137,10 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
> /* this is a no-op when called from the controller reset handler */
> nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
>
> - down_write(&ctrl->namespaces_rwsem);
> - list_splice_init(&ctrl->namespaces, &ns_list);
> - up_write(&ctrl->namespaces_rwsem);
> + mutex_lock(&ctrl->namespaces_lock);
> + list_splice_init_rcu(&ctrl->namespaces, &ns_list, synchronize_rcu);
> + mutex_unlock(&ctrl->namespaces_lock);
> + synchronize_srcu(&ctrl->srcu);
>
> list_for_each_entry_safe(ns, next, &ns_list, list)
> nvme_ns_remove(ns);
> @@ -4582,6 +4588,7 @@ static void nvme_free_ctrl(struct device *dev)
> key_put(ctrl->tls_key);
> nvme_free_cels(ctrl);
> nvme_mpath_uninit(ctrl);
> + cleanup_srcu_struct(&ctrl->srcu);
> nvme_auth_stop(ctrl);
> nvme_auth_free(ctrl);
> __free_page(ctrl->discard_page);
> @@ -4614,10 +4621,15 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
> ctrl->passthru_err_log_enabled = false;
> clear_bit(NVME_CTRL_FAILFAST_EXPIRED, &ctrl->flags);
> spin_lock_init(&ctrl->lock);
> + mutex_init(&ctrl->namespaces_lock);
> +
> + ret = init_srcu_struct(&ctrl->srcu);
> + if (ret)
> + return ret;
> +
> mutex_init(&ctrl->scan_lock);
> INIT_LIST_HEAD(&ctrl->namespaces);
> xa_init(&ctrl->cels);
> - init_rwsem(&ctrl->namespaces_rwsem);
> ctrl->dev = dev;
> ctrl->ops = ops;
> ctrl->quirks = quirks;
> @@ -4697,6 +4709,7 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
> out:
> if (ctrl->discard_page)
> __free_page(ctrl->discard_page);
> + cleanup_srcu_struct(&ctrl->srcu);
> return ret;
> }
> EXPORT_SYMBOL_GPL(nvme_init_ctrl);
> @@ -4705,22 +4718,24 @@ EXPORT_SYMBOL_GPL(nvme_init_ctrl);
> void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
> {
> struct nvme_ns *ns;
> + int srcu_idx;
>
> - down_read(&ctrl->namespaces_rwsem);
> - list_for_each_entry(ns, &ctrl->namespaces, list)
> + srcu_idx = srcu_read_lock(&ctrl->srcu);
> + list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
> blk_mark_disk_dead(ns->disk);
> - up_read(&ctrl->namespaces_rwsem);
> + srcu_read_unlock(&ctrl->srcu, srcu_idx);
> }
> EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead);
>
> void nvme_unfreeze(struct nvme_ctrl *ctrl)
> {
> struct nvme_ns *ns;
> + int srcu_idx;
>
> - down_read(&ctrl->namespaces_rwsem);
> - list_for_each_entry(ns, &ctrl->namespaces, list)
> + srcu_idx = srcu_read_lock(&ctrl->srcu);
> + list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
> blk_mq_unfreeze_queue(ns->queue);
> - up_read(&ctrl->namespaces_rwsem);
> + srcu_read_unlock(&ctrl->srcu, srcu_idx);
> clear_bit(NVME_CTRL_FROZEN, &ctrl->flags);
> }
> EXPORT_SYMBOL_GPL(nvme_unfreeze);
> @@ -4728,14 +4743,15 @@ EXPORT_SYMBOL_GPL(nvme_unfreeze);
> int nvme_wait_freeze_timeout(struct nvme_ctrl *ctrl, long timeout)
> {
> struct nvme_ns *ns;
> + int srcu_idx;
>
> - down_read(&ctrl->namespaces_rwsem);
> - list_for_each_entry(ns, &ctrl->namespaces, list) {
> + srcu_idx = srcu_read_lock(&ctrl->srcu);
> + list_for_each_entry_rcu(ns, &ctrl->namespaces, list) {
> timeout = blk_mq_freeze_queue_wait_timeout(ns->queue, timeout);
> if (timeout <= 0)
> break;
> }
> - up_read(&ctrl->namespaces_rwsem);
> + srcu_read_unlock(&ctrl->srcu, srcu_idx);
> return timeout;
> }
> EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
> @@ -4743,23 +4759,25 @@ EXPORT_SYMBOL_GPL(nvme_wait_freeze_timeout);
> void nvme_wait_freeze(struct nvme_ctrl *ctrl)
> {
> struct nvme_ns *ns;
> + int srcu_idx;
>
> - down_read(&ctrl->namespaces_rwsem);
> - list_for_each_entry(ns, &ctrl->namespaces, list)
> + srcu_idx = srcu_read_lock(&ctrl->srcu);
> + list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
> blk_mq_freeze_queue_wait(ns->queue);
> - up_read(&ctrl->namespaces_rwsem);
> + srcu_read_unlock(&ctrl->srcu, srcu_idx);
> }
> EXPORT_SYMBOL_GPL(nvme_wait_freeze);
>
> void nvme_start_freeze(struct nvme_ctrl *ctrl)
> {
> struct nvme_ns *ns;
> + int srcu_idx;
>
> set_bit(NVME_CTRL_FROZEN, &ctrl->flags);
> - down_read(&ctrl->namespaces_rwsem);
> - list_for_each_entry(ns, &ctrl->namespaces, list)
> + srcu_idx = srcu_read_lock(&ctrl->srcu);
> + list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
> blk_freeze_queue_start(ns->queue);
> - up_read(&ctrl->namespaces_rwsem);
> + srcu_read_unlock(&ctrl->srcu, srcu_idx);
> }
> EXPORT_SYMBOL_GPL(nvme_start_freeze);
>
> @@ -4802,11 +4820,12 @@ EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue);
> void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
> {
> struct nvme_ns *ns;
> + int srcu_idx;
>
> - down_read(&ctrl->namespaces_rwsem);
> - list_for_each_entry(ns, &ctrl->namespaces, list)
> + srcu_idx = srcu_read_lock(&ctrl->srcu);
> + list_for_each_entry_rcu(ns, &ctrl->namespaces, list)
> blk_sync_queue(ns->queue);
> - up_read(&ctrl->namespaces_rwsem);
> + srcu_read_unlock(&ctrl->srcu, srcu_idx);
> }
> EXPORT_SYMBOL_GPL(nvme_sync_io_queues);
>
> diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
> index 499a8bb7cac7d..0f05058692b55 100644
> --- a/drivers/nvme/host/ioctl.c
> +++ b/drivers/nvme/host/ioctl.c
> @@ -789,16 +789,16 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
> bool open_for_write)
> {
> struct nvme_ns *ns;
> - int ret;
> + int ret, srcu_idx;
>
> - down_read(&ctrl->namespaces_rwsem);
> + srcu_idx = srcu_read_lock(&ctrl->srcu);
> if (list_empty(&ctrl->namespaces)) {
> ret = -ENOTTY;
> goto out_unlock;
> }
>
> - ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list);
> - if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) {
> + ns = list_first_or_null_rcu(&ctrl->namespaces, struct nvme_ns, list);
> + if (!ns) {
Hmm, this seems like a functional change. The original code's error path suggests that
the command was only allowed to execute when the list contains exactly a single
member (the old check rejected the case where the first entry differs from the last).
Not sure why that restriction exists, but the rewrite only checks for an empty list,
which disagrees with that assumption.
More information about the Linux-nvme
mailing list