[PATCHv2 1/3] block: introduce rq_list_for_each_safe macro
Max Gurtovoy
mgurtovoy at nvidia.com
Tue Jan 4 04:15:58 PST 2022
On 1/3/2022 8:15 PM, Keith Busch wrote:
> On Mon, Jan 03, 2022 at 05:23:08PM +0200, Max Gurtovoy wrote:
>> On 12/30/2021 5:30 PM, Keith Busch wrote:
>>> I think it just may work if we export blk_mq_get_driver_tag().
>> do you have a suggestion for the NVMe/PCI driver ?
> The following tests fine with my multi-namespace setups. I have real
> hardware with namespace management capabilities, but qemu can also
> easily emulate it for anyone who doesn't have one.
>
> ---
> diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
> index 5668e28be0b7..84f2e73d0c7c 100644
> --- a/block/blk-mq-tag.h
> +++ b/block/blk-mq-tag.h
> @@ -41,12 +41,6 @@ static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
> return sbq_wait_ptr(bt, &hctx->wait_index);
> }
>
> -enum {
> - BLK_MQ_NO_TAG = -1U,
> - BLK_MQ_TAG_MIN = 1,
> - BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1,
> -};
> -
> extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
> extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 0d7c9d3e0329..b4540723077a 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -1589,6 +1589,7 @@ bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq)
> hctx->tags->rqs[rq->tag] = rq;
> return true;
> }
> +EXPORT_SYMBOL_GPL(__blk_mq_get_driver_tag);
>
> static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
> int flags, void *key)
> @@ -2582,11 +2583,10 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
> * same queue, caller must ensure that's the case.
> *
> * Since we pass off the full list to the driver at this point,
> - * we do not increment the active request count for the queue.
> - * Bypass shared tags for now because of that.
> + * we are counting on the driver to increment the active
> + * request count for the queue.
> */
> - if (q->mq_ops->queue_rqs &&
> - !(rq->mq_hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
> + if (q->mq_ops->queue_rqs) {
> blk_mq_run_dispatch_ops(q,
> __blk_mq_flush_plug_list(q, plug));
> if (rq_list_empty(plug->mq_list))
> diff --git a/block/blk-mq.h b/block/blk-mq.h
> index 948791ea2a3e..0f37ae906901 100644
> --- a/block/blk-mq.h
> +++ b/block/blk-mq.h
> @@ -268,21 +268,6 @@ static inline void blk_mq_put_driver_tag(struct request *rq)
> __blk_mq_put_driver_tag(rq->mq_hctx, rq);
> }
>
> -bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq);
> -
> -static inline bool blk_mq_get_driver_tag(struct request *rq)
> -{
> - struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
> -
> - if (rq->tag != BLK_MQ_NO_TAG &&
> - !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
> - hctx->tags->rqs[rq->tag] = rq;
> - return true;
> - }
> -
> - return __blk_mq_get_driver_tag(hctx, rq);
> -}
> -
> static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
> {
> int cpu;
> diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
> index 50deb8b69c40..f50483475c12 100644
> --- a/drivers/nvme/host/pci.c
> +++ b/drivers/nvme/host/pci.c
> @@ -992,8 +992,9 @@ static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req)
> return false;
> if (unlikely(!nvme_check_ready(&nvmeq->dev->ctrl, req, true)))
> return false;
> + if (!blk_mq_get_driver_tag(req))
> + return false;
>
> - req->mq_hctx->tags->rqs[req->tag] = req;
> return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK;
> }
>
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 550996cf419c..8fb544a35330 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -1072,6 +1072,27 @@ static inline int blk_rq_map_sg(struct request_queue *q, struct request *rq,
> }
> void blk_dump_rq_flags(struct request *, char *);
>
> +enum {
> + BLK_MQ_NO_TAG = -1U,
> + BLK_MQ_TAG_MIN = 1,
> + BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1,
> +};
> +
> +bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq);
> +
> +static inline bool blk_mq_get_driver_tag(struct request *rq)
> +{
> + struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
> +
> + if (rq->tag != BLK_MQ_NO_TAG &&
> + !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
> + hctx->tags->rqs[rq->tag] = rq;
> + return true;
> + }
> +
> + return __blk_mq_get_driver_tag(hctx, rq);
> +}
> +
> #ifdef CONFIG_BLK_DEV_ZONED
> static inline unsigned int blk_rq_zone_no(struct request *rq)
> {
> --
This patch worked for me with 2 namespaces for NVMe PCI.

I'll check it later on with my RDMA queue_rqs patches as well. There we
also have tag set sharing with the connect_q (and not only with
multiple namespaces), but the connect_q uses reserved tags only (for
the connect commands).
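For reference, that sharing looks roughly like the sketch below (from my
reading of the fabrics drivers around this series; the exact names and
call sites may differ): the I/O tag set reserves a few tags, connect_q
is a second request_queue created on the same tag set, and the connect
commands are allocated as reserved requests only.

	/*
	 * Sketch only - not the exact driver code: connect_q shares the
	 * I/O tag set but only ever consumes its reserved tags.
	 */
	ctrl->tag_set.queue_depth = nctrl->sqsize + 1;
	ctrl->tag_set.reserved_tags = NVMF_RESERVED_TAGS; /* connect cmds */

	ctrl->ctrl.connect_q = blk_mq_init_queue(&ctrl->tag_set);
	if (IS_ERR(ctrl->ctrl.connect_q))
		return PTR_ERR(ctrl->ctrl.connect_q);

	/* connect never competes for regular tags with the namespaces */
	req = blk_mq_alloc_request(ctrl->ctrl.connect_q, REQ_OP_DRV_OUT,
				   BLK_MQ_REQ_RESERVED);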
I saw some strange things that I couldn't understand:

1. Running randread fio with the libaio ioengine didn't call
   nvme_queue_rqs - expected.
2. Running randwrite fio with the libaio ioengine did call
   nvme_queue_rqs - not expected!
3. Running randread fio with the io_uring ioengine (and
   --iodepth_batch=32) didn't call nvme_queue_rqs - not expected!
4. Running randwrite fio with the io_uring ioengine (and
   --iodepth_batch=32) did call nvme_queue_rqs - expected.
5. Running randread fio with the io_uring ioengine (and
   --iodepth_batch=32 --runtime=30) didn't finish after 30 seconds and
   stayed stuck for 300 seconds (the fio jobs required "kill -9 fio" to
   remove the refcounts from nvme_core) - not expected!
   Debug print: fio: job 'task_nvme0n1' (state=5) hasn't exited in 300
   seconds, it appears to be stuck. Doing forceful exit of this job.
6. Running randwrite fio with the io_uring ioengine (and
   --iodepth_batch=32 --runtime=30) didn't finish after 30 seconds and
   stayed stuck for 300 seconds (the fio jobs required "kill -9 fio" to
   remove the refcounts from nvme_core) - not expected!
   Debug print: fio: job 'task_nvme0n1' (state=5) hasn't exited in 300
   seconds, it appears to be stuck. Doing forceful exit of this job.
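For reference, my expectations above come from the fact that
->queue_rqs is only reached from the plug flush path, so it should only
fire when the submitter actually builds a plugged batch for a single
queue. Roughly (a sketch of the path as I read it with this patch
applied; the exact conditions may differ):

	void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
	{
		struct request *rq = rq_list_peek(&plug->mq_list);
		struct request_queue *q = rq->q;

		/* batched path: one queue, no elevator, not from schedule */
		if (!plug->multiple_queues && !plug->has_elevator &&
		    !from_schedule) {
			if (q->mq_ops->queue_rqs) {
				blk_mq_run_dispatch_ops(q,
					__blk_mq_flush_plug_list(q, plug));
				if (rq_list_empty(plug->mq_list))
					return;
			}
			/* whatever the driver left behind falls back to ... */
		}
		/* ... the regular per-hctx dispatch */
	}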
Any idea what could cause these unexpected scenarios? At least they are
unexpected for me :)