[PATCH 2/3] nvme-multipath: cannot disconnect controller on stuck partition scan

Hannes Reinecke hare at suse.de
Thu Oct 10 01:57:23 PDT 2024


On 10/9/24 19:32, Keith Busch wrote:
> On Wed, Oct 09, 2024 at 08:23:45AM +0200, Hannes Reinecke wrote:
>> With my testcase _all_ paths return NS_NOT_READY during partition scan, so
>> I/O is constantly bounced between paths, and the partition scan never
>> returns. It doesn't matter where you call it; it's stuck.
> 
> Assuming you mean a different status, like NVME_SC_INTERNAL_PATH_ERROR, I
> can recreate a stuck scan. Let me get one more shot at fixing this by
> suppressing the initial scan and deferring it to another context. This one
> is successful in my tests:
> 
> ---
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index 48e7a8906d012..e1fd53ff2c0ca 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -580,6 +580,20 @@ static int nvme_add_ns_head_cdev(struct nvme_ns_head *head)
>   	return ret;
>   }
>   
> +static void nvme_scan_work(struct work_struct *work)
> +{
> +	struct nvme_ns_head *head =
> +		container_of(work, struct nvme_ns_head, scan_work);
> +
> +	if (WARN_ON_ONCE(!test_and_clear_bit(GD_SUPPRESS_PART_SCAN,
> +					     &head->disk->state)))
> +		return;
> +
> +	mutex_lock(&head->disk->open_mutex);
> +	bdev_disk_changed(head->disk, false);
> +	mutex_unlock(&head->disk->open_mutex);
> +}
> +
>   static void nvme_requeue_work(struct work_struct *work)
>   {
>   	struct nvme_ns_head *head =
> @@ -606,6 +620,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
>   	bio_list_init(&head->requeue_list);
>   	spin_lock_init(&head->requeue_lock);
>   	INIT_WORK(&head->requeue_work, nvme_requeue_work);
> +	INIT_WORK(&head->scan_work, nvme_scan_work);
>   
>   	/*
>   	 * Add a multipath node if the subsystems supports multiple controllers.
> @@ -629,6 +644,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
>   		return PTR_ERR(head->disk);
>   	head->disk->fops = &nvme_ns_head_ops;
>   	head->disk->private_data = head;
> +	set_bit(GD_SUPPRESS_PART_SCAN, &head->disk->state);
>   	sprintf(head->disk->disk_name, "nvme%dn%d",
>   			ctrl->subsys->instance, head->instance);
>   	return 0;
> @@ -655,6 +671,7 @@ static void nvme_mpath_set_live(struct nvme_ns *ns)
>   			return;
>   		}
>   		nvme_add_ns_head_cdev(head);
> +		kblockd_schedule_work(&head->scan_work);
>   	}
>   
>   	mutex_lock(&head->lock);
> @@ -974,14 +991,14 @@ void nvme_mpath_shutdown_disk(struct nvme_ns_head *head)
>   		return;
>   	if (test_and_clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
>   		nvme_cdev_del(&head->cdev, &head->cdev_device);
> +		/*
> +		 * requeue I/O after NVME_NSHEAD_DISK_LIVE has been cleared
> +		 * to allow multipath to fail all I/O.
> +		 */
> +		synchronize_srcu(&head->srcu);
> +		kblockd_schedule_work(&head->requeue_work);
>   		del_gendisk(head->disk);
>   	}

I guess we need to split 'test_and_clear_bit()' into a 'test_bit()' when 
testing for the condition and a 'clear_bit()' after del_gendisk().

Otherwise we have a race condition with nvme_mpath_set_live():

         if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
                 rc = device_add_disk(&head->subsys->dev, head->disk,
                                      nvme_ns_attr_groups);

which could sneak in from another controller just after we've cleared the
NVME_NSHEAD_DISK_LIVE flag, causing device_add_disk() to fail because a disk
with the same name is still registered, or nvme_cdev_del() to emit kernel
warnings because the cdev was never registered.
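
I.e., something like this (an untested sketch of just that hunk, with the
rest of nvme_mpath_shutdown_disk() left as it is above):

	if (test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
		nvme_cdev_del(&head->cdev, &head->cdev_device);
		/* fail over / requeue pending I/O before removing the disk */
		synchronize_srcu(&head->srcu);
		kblockd_schedule_work(&head->requeue_work);
		del_gendisk(head->disk);
		/*
		 * Drop the flag only once del_gendisk() has returned, so a
		 * concurrent nvme_mpath_set_live() cannot re-register the
		 * same disk name or add the cdev while teardown is running.
		 */
		clear_bit(NVME_NSHEAD_DISK_LIVE, &head->flags);
	}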

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare at suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich


