[PATCH 3/3] nvme: freeze both the ns and multipath queues whe updating queue limits

Thu Aug 29 12:41:34 PDT 2024

On Thu, 2024-08-29 at 09:31 +0300, Christoph Hellwig wrote:
> We really should not have any I/O outstanding while updating the
> queue
> limits, as that could introduce inconsistencies between the multipath
> and underlying nodes.  So always freeze the multipath node first and
> then
> the underlying one, matching the order in nvme_passthru_start.
> 
> Signed-off-by: Christoph Hellwig <hch at lst.de>
> ---
>  drivers/nvme/host/core.c | 41 ++++++++++++++++++++++++++++----------
> --
>  1 file changed, 29 insertions(+), 12 deletions(-)
> 
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index 81d2a09dd00f30..0f01a0bf976fd3 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -2095,6 +2095,24 @@ static void nvme_set_chunk_sectors(struct
> nvme_ns *ns, struct nvme_id_ns *id,
>  	lim->chunk_sectors = iob;
>  }
>  
> +/*
> + * Freeze all I/O for a namespace, including clearing out anything
> on the
> + * multipath node.
> + */
> +static void nvme_ns_freeze(struct nvme_ns *ns)
> +{
> +	if (nvme_ns_head_multipath(ns->head))
> +		blk_mq_freeze_queue(ns->head->disk->queue);
> +	blk_mq_freeze_queue(ns->disk->queue);
> +}
> +
> +static void nvme_ns_unfreeze(struct nvme_ns *ns)
> +{
> +	blk_mq_unfreeze_queue(ns->disk->queue);
> +	if (nvme_ns_head_multipath(ns->head))
> +		blk_mq_unfreeze_queue(ns->head->disk->queue);
> +}
> +
>  /*
>   * The queue_limits structure mixes values that are the hardware
> limitations for
>   * bio splitting with what is the device configuration.
> @@ -2116,7 +2134,8 @@ static int nvme_update_ns_info_mpath(struct
> nvme_ns *ns,
>  	struct queue_limits lim;
>  	int ret;
>  
> -	blk_mq_freeze_queue(disk->queue);
> +	WARN_ON_ONCE(disk->queue->mq_freeze_depth < 1);
> +
>  	lim = queue_limits_start_update(disk->queue);
>  	lim.logical_block_size = ns_lim->logical_block_size;
>  	lim.physical_block_size = ns_lim->physical_block_size;
> @@ -2132,7 +2151,6 @@ static int nvme_update_ns_info_mpath(struct
> nvme_ns *ns,
>  	set_capacity_and_notify(disk, get_capacity(ns->disk));
>  	set_disk_ro(disk, nvme_ns_is_readonly(ns, info));
>  	nvme_mpath_revalidate_paths(ns);
> -	blk_mq_unfreeze_queue(disk->queue);
>  
>  	return ret;
>  }
> @@ -2143,7 +2161,7 @@ static int nvme_update_ns_info_generic(struct
> nvme_ns *ns,
>  	struct queue_limits lim;
>  	int ret;
>  
> -	blk_mq_freeze_queue(ns->disk->queue);
> +	nvme_ns_freeze(ns);
>  	lim = queue_limits_start_update(ns->disk->queue);
>  	nvme_set_ctrl_limits(ns->ctrl, &lim);
>  	ret = queue_limits_commit_update(ns->disk->queue, &lim);
> @@ -2154,11 +2172,11 @@ static int nvme_update_ns_info_generic(struct
> nvme_ns *ns,
>  	ns->disk->flags |= GENHD_FL_HIDDEN;
>  	set_bit(NVME_NS_READY, &ns->flags);
>  	set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
> +	if (nvme_ns_head_multipath(ns->head))
> +		ret = nvme_update_ns_info_mpath(ns, info, true);
>  
>  out_unfreeze:
> -	blk_mq_unfreeze_queue(ns->disk->queue);
> -	if (nvme_ns_head_multipath(ns->head) && !ret)
> -		ret = nvme_update_ns_info_mpath(ns, info, true);
> +	nvme_ns_unfreeze(ns);
>  	return ret;
>  }
>  
> @@ -2204,7 +2222,7 @@ static int nvme_update_ns_info_block(struct
> nvme_ns *ns,
>  			goto out;
>  	}
>  
> -	blk_mq_freeze_queue(ns->disk->queue);
> +	nvme_ns_freeze(ns);
>  	ns->head->lba_shift = id->lbaf[lbaf].ds;
>  	ns->head->nuse = le64_to_cpu(id->nuse);
>  	capacity = nvme_lba_to_sect(ns->head, le64_to_cpu(id-
> >nsze));
> @@ -2236,7 +2254,7 @@ static int nvme_update_ns_info_block(struct
> nvme_ns *ns,
>  
>  	ret = queue_limits_commit_update(ns->disk->queue, &lim);
>  	if (ret) {
> -		blk_mq_unfreeze_queue(ns->disk->queue);
> +		nvme_ns_unfreeze(ns);
>  		goto out;
>  	}
>  __
> @@ -2254,7 +2272,9 @@ static int nvme_update_ns_info_block(struct
> nvme_ns *ns,
>  done:
>  	set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
>  	set_bit(NVME_NS_READY, &ns->flags);
> -	blk_mq_unfreeze_queue(ns->disk->queue);
> +	if (nvme_ns_head_multipath(ns->head))
> +		ret = nvme_update_ns_info_mpath(ns, info,
> unsupported);
> +	nvme_ns_unfreeze(ns);
>  
>  	if (blk_queue_is_zoned(ns->queue)) {
>  		ret = blk_revalidate_disk_zones(ns->disk);
> @@ -2266,9 +2286,6 @@ static int nvme_update_ns_info_block(struct
> nvme_ns *ns,
>  out:
>  	kfree(nvm);
>  	kfree(id);
> -
> -	if (nvme_ns_head_multipath(ns->head) && !ret)
> -		ret = nvme_update_ns_info_mpath(ns, info,
> unsupported);
>  	return ret;
>  }
> 

Moving the call to nvme_update_ns_info_mpath() up before the "out"
label has the effect that the prior error conditions that jump to "out"
(for example, failure in queue_limits_commit_update()) will not run the
multipath code any more, while they previously did. I am not saying
this is wrong, but I wanted to make sure we make this change
conciously. I think it should be mentioned in the commit message.

While I was staring at this code yesterday, I made no progress because
my brain went in circles about this change. Looking at it now, it seems
to me that when we jump to "out", nothing has been commited to the
namespace queue, and thus we don't need to apply changes to the
multipath queue either, and the change is thus ok.

Regards
Martin