[PATCH 0/3 rfc] Fix nvme-tcp and nvme-rdma controller reset hangs

Sagi Grimberg sagi at grimberg.me
Thu Mar 18 22:45:50 GMT 2021


>>>> Placing the request on the requeue_list is fine, but the question is
>>>> when to kick the requeue_work; nothing guarantees that an alternate path
>>>> exists or will within a sane period. So constantly requeue+kick sounds like
>>>> a really bad practice to me.
>>>
>>> nvme_mpath_set_live(), where you reported the deadlock, kicks the
>>> requeue_list. The difference that NOWAIT provides is that
>>> nvme_mpath_set_live's synchronize_srcu() is no longer blocked forever
>>> because the .submit_bio() isn't waiting for entry on a frozen queue, so
>>> now it's free to schedule the dispatch.
>>>
>>> There's probably an optimization to kick it sooner if there's a viable
>>> alternate path, but that could be a follow-on.
>>
>> That would be mandatory I think, otherwise this would introduce
>> a regression...
>>
>>> If there's no immediate viable path, then the requests would remain on
>>> the requeue list. That currently happens as long as there's potentially
>>> a controller in a reset or connecting state.
>>
>> Well, it's also worth keeping in mind that now we'll need to clone the bio
>> because we need to override bi_end_io, which adds some overhead in the
>> data path. Unless we make submit_bio return a status, which is a much
>> bigger change in scope, I would expect...
> 
> Having submit_bio() return the enter status was where I was going with
> this, but the recursive handling makes this more complicated than I
> initially thought.
> 
> If you use the NOWAIT flag today with a freezing queue, the IO will end
> with BLK_STS_AGAIN and punt retry handling to the application. I'm
> guessing you don't want that to happen, so a little more is required for
> this idea.
> 
> Since it's an error path, perhaps a block operations callback is okay?
> Something like this compile-tested patch?

Maybe... I don't see any apparent reason why it would not work...

> ---
> diff --git a/block/blk-core.c b/block/blk-core.c
> index fc60ff208497..423b89005a28 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -475,6 +475,16 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
>   	}
>   }
>   
> +static inline void bio_enter_error(struct bio *bio)
> +{
> +	struct gendisk *disk = bio->bi_bdev->bd_disk;
> +
> +	if (disk->fops->enter_err)
> +		disk->fops->enter_err(bio);
> +	else
> +		bio_wouldblock_error(bio);
> +}
> +
>   static inline int bio_queue_enter(struct bio *bio)
>   {
>   	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
> @@ -484,7 +494,7 @@ static inline int bio_queue_enter(struct bio *bio)
>   	ret = blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0);
>   	if (unlikely(ret)) {
>   		if (nowait && !blk_queue_dying(q))
> -			bio_wouldblock_error(bio);
> +			bio_enter_error(bio);
>   		else
>   			bio_io_error(bio);
>   	}
> diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
> index edf19bbb904f..2c27eeaa83b0 100644
> --- a/drivers/nvme/host/core.c
> +++ b/drivers/nvme/host/core.c
> @@ -2366,9 +2366,24 @@ static void nvme_ns_head_release(struct gendisk *disk, fmode_t mode)
>   	nvme_put_ns_head(disk->private_data);
>   }
>   
> +void nvme_ns_head_enter_err(struct bio *bio)
> +{
> +	struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
> +
> +	if (nvme_available_path(head)) {
> +		spin_lock_irq(&head->requeue_lock);
> +		bio_list_add(&head->requeue_list, bio);
> +		spin_unlock_irq(&head->requeue_lock);
> +	} else {
> +		bio->bi_status = BLK_STS_IOERR;
> +		bio_endio(bio);
> +	}
> +}

Nice, you can take the error path in nvme_ns_head_submit_bio
and use it there too:
--
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 5c67a5e96738..8d0ef83f545c 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -318,17 +318,8 @@ blk_qc_t nvme_ns_head_submit_bio(struct bio *bio)
                 trace_block_bio_remap(bio, disk_devt(ns->head->disk),
                                       bio->bi_iter.bi_sector);
                 ret = submit_bio_noacct(bio);
-       } else if (nvme_available_path(head)) {
-               dev_warn_ratelimited(dev, "no usable path - requeuing 
I/O\n");
-
-               spin_lock_irq(&head->requeue_lock);
-               bio_list_add(&head->requeue_list, bio);
-               spin_unlock_irq(&head->requeue_lock);
         } else {
-               dev_warn_ratelimited(dev, "no available path - failing 
I/O\n");
-
-               bio->bi_status = BLK_STS_IOERR;
-               bio_endio(bio);
+               nvme_ns_head_enter_err(bio);
         }

         srcu_read_unlock(&head->srcu, srcu_idx);
--

And move the prints in there as well so we keep some logging.
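
Something like this, perhaps (untested sketch, not part of the posted
patch; assumes disk_to_dev(head->disk) is still the right device for the
ratelimited prints, as in nvme_ns_head_submit_bio today):
--
void nvme_ns_head_enter_err(struct bio *bio)
{
	struct nvme_ns_head *head = bio->bi_bdev->bd_disk->private_data;
	struct device *dev = disk_to_dev(head->disk);

	if (nvme_available_path(head)) {
		/* prints moved here from nvme_ns_head_submit_bio */
		dev_warn_ratelimited(dev, "no usable path - requeuing I/O\n");

		/* park the bio until a path shows up and kicks requeue_work */
		spin_lock_irq(&head->requeue_lock);
		bio_list_add(&head->requeue_list, bio);
		spin_unlock_irq(&head->requeue_lock);
	} else {
		dev_warn_ratelimited(dev, "no available path - failing I/O\n");

		bio->bi_status = BLK_STS_IOERR;
		bio_endio(bio);
	}
}
--
That way both the submit_bio error path and the queue-enter error path
share one place for the requeue-vs-fail decision and the logging.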

> +
>   const struct block_device_operations nvme_ns_head_ops = {
>   	.owner		= THIS_MODULE,
>   	.submit_bio	= nvme_ns_head_submit_bio,
> +	.enter_err	= nvme_ns_head_enter_err,
>   	.open		= nvme_ns_head_open,
>   	.release	= nvme_ns_head_release,
>   	.ioctl		= nvme_ioctl,
> diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
> index a1d476e1ac02..47595bb09032 100644
> --- a/drivers/nvme/host/multipath.c
> +++ b/drivers/nvme/host/multipath.c
> @@ -274,7 +274,7 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
>   	return ns;
>   }
>   
> -static bool nvme_available_path(struct nvme_ns_head *head)
> +bool nvme_available_path(struct nvme_ns_head *head)
>   {
>   	struct nvme_ns *ns;
>   
> @@ -313,7 +313,7 @@ blk_qc_t nvme_ns_head_submit_bio(struct bio *bio)
>   	ns = nvme_find_path(head);
>   	if (likely(ns)) {
>   		bio_set_dev(bio, ns->disk->part0);
> -		bio->bi_opf |= REQ_NVME_MPATH;
> +		bio->bi_opf |= REQ_NVME_MPATH | REQ_NOWAIT;
>   		trace_block_bio_remap(bio, disk_devt(ns->head->disk),
>   				      bio->bi_iter.bi_sector);
>   		ret = submit_bio_noacct(bio);
> diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
> index 815c032a190e..5dbd6baebd70 100644
> --- a/drivers/nvme/host/nvme.h
> +++ b/drivers/nvme/host/nvme.h
> @@ -677,6 +677,7 @@ bool nvme_mpath_clear_current_path(struct nvme_ns *ns);
>   void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl);
>   struct nvme_ns *nvme_find_path(struct nvme_ns_head *head);
>   blk_qc_t nvme_ns_head_submit_bio(struct bio *bio);
> +bool nvme_available_path(struct nvme_ns_head *head);
>   
>   static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
>   {
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index bc6bc8383b43..b5ae1aa292c1 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -1862,6 +1862,7 @@ static inline void blk_ksm_unregister(struct request_queue *q) { }
>   
>   struct block_device_operations {
>   	blk_qc_t (*submit_bio) (struct bio *bio);
> +	void (*enter_err) (struct bio *bio);
>   	int (*open) (struct block_device *, fmode_t);
>   	void (*release) (struct gendisk *, fmode_t);
>   	int (*rw_page)(struct block_device *, sector_t, struct page *, unsigned int);
> --
> 


