[RFC PATCH 2/2] nvmet_tcp: enable polling nvme for completions
Sagi Grimberg
sagi at grimberg.me
Wed Jan 13 21:29:45 EST 2021
> nvmet_tcp: enable polling nvme for completions
>
> NVMe device queue interrupt activity can conflict with CPUs
> processing NVMe/TCP network queues. This patch adds the ability
> to poll an NVMe device for completions, when NVMe device poll
> queues are configured.
>
> A new non-blocking blk_poll_once() function is introduced to
> provide a one-shot option to poll an individual NVMe device
> queue.
>
> Per-queue tracking of outstanding requests to specific NVMe
> devices is established to facilitate directed use of the new
> blk_poll_once() function.
>
> These changes build on the previous patch in this series,
> which added the idle_poll_period option to io_work(). If that
> option is not enabled, NVMe device polling is bypassed.
>
> Signed-off-by: Mark Wunderlich <mark.wunderlich at intel.com>
> ---
>  block/blk-mq.c                    | 22 +++++++++
>  drivers/nvme/target/io-cmd-bdev.c |  6 ++
>  drivers/nvme/target/nvmet.h       |  4 ++
>  drivers/nvme/target/tcp.c         | 90 ++++++++++++++++++++++++++++++++++++-
>  include/linux/blkdev.h            |  1 +
>  5 files changed, 120 insertions(+), 3 deletions(-)
The block, nvmet core and nvmet-tcp changes should be split into
separate patches.
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 55bcee5dc032..ceacf327f65e 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -3832,6 +3832,28 @@ static bool blk_mq_poll_hybrid(struct request_queue *q,
>  	return blk_mq_poll_hybrid_sleep(q, rq);
>  }
>
> +int blk_poll_once(struct request_queue *q)
> +{
> +	struct blk_mq_hw_ctx *hctx;
> +	int ret;
> +
> +	if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
> +		return 0;
> +
> +	hctx = blk_mq_map_queue_type(q, HCTX_TYPE_POLL, smp_processor_id());
This should probably be raw_smp_processor_id(); smp_processor_id()
will trigger a debug splat when called from a preemptible context,
which this is.
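i.e. something like (untested):

	hctx = blk_mq_map_queue_type(q, HCTX_TYPE_POLL,
				     raw_smp_processor_id());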
> +	if (!hctx) {
> +		pr_err("%s: hctx not resolved\n", __func__);
> +		return 0;
> +	}
> +
> +	hctx->poll_invoked++;
> +	ret = q->mq_ops->poll(hctx);
> +	if (ret > 0)
> +		hctx->poll_success++;
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(blk_poll_once);
> +
>  /**
>   * blk_poll - poll for IO completions
>   * @q: the queue
> diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
> index 125dde3f410e..1d9ee8546ae1 100644
> --- a/drivers/nvme/target/io-cmd-bdev.c
> +++ b/drivers/nvme/target/io-cmd-bdev.c
> @@ -269,6 +269,12 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
>  	bio->bi_iter.bi_sector = sector;
>  	bio->bi_private = req;
>  	bio->bi_end_io = nvmet_bio_done;
> +	/* Set hipri flag to trigger use of poll queues */
> +	if ((req->op & REQ_HIPRI) &&
> +	    test_bit(QUEUE_FLAG_POLL, &bio->bi_disk->queue->queue_flags)) {
> +		op |= REQ_HIPRI;
> +		req->queue = bio->bi_disk->queue;
> +	}
>  	bio->bi_opf = op;
>
>  	blk_start_plug(&plug);
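For context: REQ_HIPRI is what steers a bio to the HCTX_TYPE_POLL
map in blk_mq_map_queue(), roughly (paraphrasing the blk-mq code of
this era, not a complete quote):

	enum hctx_type type = HCTX_TYPE_DEFAULT;

	/* callers must ensure poll queues exist if REQ_HIPRI is set */
	if (flags & REQ_HIPRI)
		type = HCTX_TYPE_POLL;
	else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
		type = HCTX_TYPE_READ;

so the QUEUE_FLAG_POLL test above is what gates this on poll queues
actually being configured for the backing device.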
> diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
> index 559a15ccc322..65900df2b45d 100644
> --- a/drivers/nvme/target/nvmet.h
> +++ b/drivers/nvme/target/nvmet.h
> @@ -341,6 +341,10 @@ struct nvmet_req {
>  	size_t transfer_len;
>  	size_t metadata_len;
>
> +	/* details to support polling */
> +	struct request_queue *queue;
> +	int op;
> +
>  	struct nvmet_port *port;
>
>  	void (*execute)(struct nvmet_req *req);
> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
> index 739d67985d93..43a0cc630bc7 100644
> --- a/drivers/nvme/target/tcp.c
> +++ b/drivers/nvme/target/tcp.c
> @@ -62,6 +62,12 @@ enum {
>  	NVMET_TCP_F_INIT_FAILED = (1 << 0),
>  };
>
> +struct nvmet_tcp_devq {
> +	u32 pend_cnt;
> +	struct request_queue *queue;
> +	struct list_head entry;
> +};
> +
>  struct nvmet_tcp_cmd {
>  	struct nvmet_tcp_queue *queue;
>  	struct nvmet_req req;
> @@ -82,6 +88,7 @@ struct nvmet_tcp_cmd {
>  	struct kvec *iov;
>  	u32 flags;
>
> +	struct nvmet_tcp_devq *devq;
>  	struct list_head entry;
>  	struct llist_node lentry;
>
> @@ -142,6 +149,9 @@ struct nvmet_tcp_queue {
>  	int idx;
>  	struct list_head queue_list;
>
> +	/* ref to group subsys devq entry */
> +	struct list_head devq_list;
> +
>  	struct nvmet_tcp_cmd connect;
>
>  	struct page_frag_cache pf_cache;
> @@ -237,6 +247,42 @@ static inline int queue_cpu(struct nvmet_tcp_queue *queue)
>  	return queue->sock->sk->sk_incoming_cpu;
>  }
>
> +static int nvmet_tcp_track_queue(struct nvmet_tcp_queue *queue)
> +{
> +	struct nvmet_tcp_devq *devq;
> +	struct nvmet_req *req = &queue->cmd->req;
> +
> +	/* track only requests issued as poll requests */
> +	if (req->queue == NULL)
> +		return 0;
> +
> +	list_for_each_entry(devq, &queue->devq_list, entry) {
> +		if (devq->queue == req->queue)
> +			goto update;
> +	}
> +
> +	devq = kzalloc(sizeof(*devq), GFP_KERNEL);
> +	if (!devq)
> +		return -ENOMEM;
> +	list_add_tail(&devq->entry, &queue->devq_list);
> +	devq->queue = req->queue;
> +update:
> +	devq->pend_cnt++;
> +	queue->cmd->devq = devq;
> +	return 0;
> +}
> +
> +static void nvmet_tcp_free_dev_tracking(struct nvmet_tcp_queue *queue)
> +{
> +	struct nvmet_tcp_devq *devq;
> +
> +	while ((devq = list_first_entry_or_null(
> +			&queue->devq_list, struct nvmet_tcp_devq, entry))) {
> +		list_del_init(&devq->entry);
> +		kfree(devq);
> +	}
> +}
> +
>  static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue)
>  {
>  	return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
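As an aside, I assume the idea is that io_work() (the call site is
not in this hunk) ends up walking the tracked devqs along these
lines (untested sketch, helper name invented):

	static void nvmet_tcp_poll_devqs(struct nvmet_tcp_queue *queue)
	{
		struct nvmet_tcp_devq *devq;

		/* one-shot poll of each backend with I/O outstanding */
		list_for_each_entry(devq, &queue->devq_list, entry) {
			if (devq->pend_cnt)
				blk_poll_once(devq->queue);
		}
	}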
> @@ -479,6 +525,25 @@ static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
>  	}
>  }
>
> +static inline void nvmet_tcp_poll_init(struct nvmet_req *req)
> +{
> +	req->queue = NULL;
> +	if (idle_poll_period)
> +		req->op |= REQ_HIPRI;
> +	else
> +		req->op = 0;
> +}
All of this really exemplifies why none of it should live in
nvmet-tcp. All of this management, tracking, and decision-making
should live in the nvmet core. But as I said, I'm not convinced
that this is an easy change, because it fundamentally changes the
target architecture (I'm not saying we shouldn't do it, but it is
a rather intrusive change). It would be good to hear what others
think about this. I do think that all transports would benefit
from backend device polling (despite it being demonstrated here
on nvmet-tcp).
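To make the direction concrete, I'd imagine something along these
lines living in the core (hypothetical sketch, none of these
helpers exist today):

	/* nvmet core: one-shot poll of the backend queue that a
	 * polled request was issued on; any transport could call
	 * this from its own completion-processing context.
	 */
	static inline int nvmet_req_poll(struct nvmet_req *req)
	{
		if (!req->queue)	/* not issued as a poll request */
			return 0;
		return blk_poll_once(req->queue);
	}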