[PATCHv2 1/3] block: introduce rq_list_for_each_safe macro

Max Gurtovoy mgurtovoy at nvidia.com
Tue Jan 4 04:15:58 PST 2022


On 1/3/2022 8:15 PM, Keith Busch wrote:
> On Mon, Jan 03, 2022 at 05:23:08PM +0200, Max Gurtovoy wrote:
>> On 12/30/2021 5:30 PM, Keith Busch wrote:
>>> I think it just may work if we export blk_mq_get_driver_tag().
>> do you have a suggestion for the NVMe/PCI driver ?
> The following tests fine with my multi-namespace setups. I have real
> hardware with namespace management capabilities, but qemu can also
> easily emulate it too for anyone who doesn't have one.
>
> ---
> diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
> index 5668e28be0b7..84f2e73d0c7c 100644
> --- a/block/blk-mq-tag.h
> +++ b/block/blk-mq-tag.h
> @@ -41,12 +41,6 @@ static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
>   	return sbq_wait_ptr(bt, &hctx->wait_index);
>   }
>   
> -enum {
> -	BLK_MQ_NO_TAG		= -1U,
> -	BLK_MQ_TAG_MIN		= 1,
> -	BLK_MQ_TAG_MAX		= BLK_MQ_NO_TAG - 1,
> -};
> -
>   extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
>   extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
>   
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 0d7c9d3e0329..b4540723077a 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -1589,6 +1589,7 @@ bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq)
>   	hctx->tags->rqs[rq->tag] = rq;
>   	return true;
>   }
> +EXPORT_SYMBOL_GPL(__blk_mq_get_driver_tag);
>   
>   static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
>   				int flags, void *key)
> @@ -2582,11 +2583,10 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
>   		 * same queue, caller must ensure that's the case.
>   		 *
>   		 * Since we pass off the full list to the driver at this point,
> -		 * we do not increment the active request count for the queue.
> -		 * Bypass shared tags for now because of that.
> +		 * we are counting on the driver to increment the active
> +		 * request count for the queue.
>   		 */
> -		if (q->mq_ops->queue_rqs &&
> -		    !(rq->mq_hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
> +		if (q->mq_ops->queue_rqs) {
>   			blk_mq_run_dispatch_ops(q,
>   				__blk_mq_flush_plug_list(q, plug));
>   			if (rq_list_empty(plug->mq_list))
> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index 948791ea2a3e..0f37ae906901 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -268,21 +268,6 @@ static inline void blk_mq_put_driver_tag(struct request *rq)
>   	__blk_mq_put_driver_tag(rq->mq_hctx, rq);
>   }
>   
> -bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq);
> -
> -static inline bool blk_mq_get_driver_tag(struct request *rq)
> -{
> -	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
> -
> -	if (rq->tag != BLK_MQ_NO_TAG &&
> -	    !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
> -		hctx->tags->rqs[rq->tag] = rq;
> -		return true;
> -	}
> -
> -	return __blk_mq_get_driver_tag(hctx, rq);
> -}
> -
>   static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
>   {
>   	int cpu;
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 50deb8b69c40..f50483475c12 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -992,8 +992,9 @@ static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req)
>   		return false;
>   	if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
>   		return false;
> +	if (!blk_mq_get_driver_tag(req))
> +		return false;
>   
> -	req->mq_hctx->tags->rqs[req->tag] = req;
>   	return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK;
>   }
>   
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 550996cf419c..8fb544a35330 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -1072,6 +1072,27 @@ static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
>   }
>   void blk_dump_rq_flags(struct request *, char *);
>   
> +enum {
> +	BLK_MQ_NO_TAG		= -1U,
> +	BLK_MQ_TAG_MIN		= 1,
> +	BLK_MQ_TAG_MAX		= BLK_MQ_NO_TAG - 1,
> +};
> +
> +bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq);
> +
> +static inline bool blk_mq_get_driver_tag(struct request *rq)
> +{
> +	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
> +
> +	if (rq->tag != BLK_MQ_NO_TAG &&
> +	    !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
> +		hctx->tags->rqs[rq->tag] = rq;
> +		return true;
> +	}
> +
> +	return __blk_mq_get_driver_tag(hctx, rq);
> +}
> +
>   #ifdef CONFIG_BLK_DEV_ZONED
>   static inline unsigned int blk_rq_zone_no(struct request *rq)
>   {
> --

This patch worked for me with 2 namespaces for NVMe PCI.

I'll check it later on with my RDMA queue_rqs patches as well. There we 
also have tagset sharing with the connect_q (and not only with 
multiple namespaces).

But the connect_q is using reserved tags only (for the connect commands).

I saw some strange things that I couldn't understand:

1. running randread fio with libaio ioengine didn't call nvme_queue_rqs 
- expected

2. running randwrite fio with libaio ioengine did call nvme_queue_rqs - 
Not expected !!

3. running randread fio with io_uring ioengine (and --iodepth_batch=32) 
didn't call nvme_queue_rqs - Not expected !!

4. running randwrite fio with io_uring ioengine (and --iodepth_batch=32) 
did call nvme_queue_rqs - expected

5. running randread fio with io_uring ioengine (and --iodepth_batch=32 
--runtime=30) didn't finish after 30 seconds and stuck for 300 seconds 
(fio jobs required "kill -9 fio" to remove refcounts from nvme_core)   - 
Not expected !!

debug print: fio: job 'task_nvme0n1' (state=5) hasn't exited in 300 
seconds, it appears to be stuck. Doing forceful exit of this job.

6. running randwrite fio with io_uring ioengine (and  
--iodepth_batch=32 --runtime=30) didn't finish after 30 seconds and 
stuck for 300 seconds (fio jobs required "kill -9 fio" to remove 
refcounts from nvme_core)   - Not expected !!

debug print: fio: job 'task_nvme0n1' (state=5) hasn't exited in 300 
seconds, it appears to be stuck. Doing forceful exit of this job.


any idea what could cause these unexpected scenarios ? at least 
unexpected for me :)


******




More information about the Linux-nvme mailing list