[RFC PATCH 2/2] nvmet_tcp: enable polling nvme for completions
Sagi Grimberg
sagi at grimberg.me
Wed Jan 13 21:29:45 EST 2021
> nvmet_tcp: enable polling nvme for completions
>
> NVMe device queue interrupt activity can conflict with CPUs
> processing NVMe/TCP network queues. This patch adds the ability
> to poll an NVMe device for completions, when NVMe device poll
> queues are configured.
>
> A new non-blocking blk_poll_once() function is introduced to
> provide a one-shot option to poll an individual NVMe device
> queue.
>
> Per-queue tracking of outstanding requests to specific NVMe
> devices is established to facilitate directed use of the new
> blk_poll_once() function.
>
> These changes build on the previous patch in this series,
> which added the idle_poll_period option to io_work(). If that
> option is not enabled, NVMe device polling is bypassed.
>
> Signed-off-by: Mark Wunderlich <mark.wunderlich at intel.com>
> ---
>  block/blk-mq.c                    | 22 +++++++++
>  drivers/nvme/target/io-cmd-bdev.c |  6 ++
>  drivers/nvme/target/nvmet.h       |  4 ++
>  drivers/nvme/target/tcp.c         | 90 ++++++++++++++++++++++++++++++++++++-
>  include/linux/blkdev.h            |  1 +
>  5 files changed, 120 insertions(+), 3 deletions(-)
The block, nvmet core and nvmet-tcp changes should be split into
separate patches.
>
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 55bcee5dc032..ceacf327f65e 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -3832,6 +3832,28 @@ static bool blk_mq_poll_hybrid(struct request_queue *q,
>  	return blk_mq_poll_hybrid_sleep(q, rq);
>  }
>
> +int blk_poll_once(struct request_queue *q)
> +{
> +	struct blk_mq_hw_ctx *hctx;
> +	int ret;
> +
> +	if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
> +		return 0;
> +
> +	hctx = blk_mq_map_queue_type(q, HCTX_TYPE_POLL, smp_processor_id());
This should probably be raw_smp_processor_id(); smp_processor_id()
will trigger a debug splat when called from a preemptible context,
which this is.
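i.e. something like (untested):

	hctx = blk_mq_map_queue_type(q, HCTX_TYPE_POLL,
				     raw_smp_processor_id());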
> +	if (!hctx) {
> +		pr_err("%s: hctx not resolved\n", __func__);
> +		return 0;
> +	}
> +
> +	hctx->poll_invoked++;
> +	ret = q->mq_ops->poll(hctx);
> +	if (ret > 0)
> +		hctx->poll_success++;
> +	return ret;
> +}
> +EXPORT_SYMBOL_GPL(blk_poll_once);
> +
>  /**
>   * blk_poll - poll for IO completions
>   * @q: the queue
> diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
> index 125dde3f410e..1d9ee8546ae1 100644
> --- a/drivers/nvme/target/io-cmd-bdev.c
> +++ b/drivers/nvme/target/io-cmd-bdev.c
> @@ -269,6 +269,12 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
>  	bio->bi_iter.bi_sector = sector;
>  	bio->bi_private = req;
>  	bio->bi_end_io = nvmet_bio_done;
> +	/* Set hipri flag to trigger use of poll queues */
> +	if ((req->op & REQ_HIPRI) &&
> +	    test_bit(QUEUE_FLAG_POLL, &bio->bi_disk->queue->queue_flags)) {
> +		op |= REQ_HIPRI;
> +		req->queue = bio->bi_disk->queue;
> +	}
>  	bio->bi_opf = op;
>
>  	blk_start_plug(&plug);
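For context: REQ_HIPRI is what steers a bio to the HCTX_TYPE_POLL
map in blk_mq_map_queue(), roughly (paraphrasing the blk-mq code of
this era, not a complete quote):

	enum hctx_type type = HCTX_TYPE_DEFAULT;

	/* callers must ensure poll queues exist if REQ_HIPRI is set */
	if (flags & REQ_HIPRI)
		type = HCTX_TYPE_POLL;
	else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
		type = HCTX_TYPE_READ;

so the QUEUE_FLAG_POLL test above is what gates this on poll queues
actually being configured for the backing device.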
> diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
> index 559a15ccc322..65900df2b45d 100644
> --- a/drivers/nvme/target/nvmet.h
> +++ b/drivers/nvme/target/nvmet.h
> @@ -341,6 +341,10 @@ struct nvmet_req {
>  	size_t transfer_len;
>  	size_t metadata_len;
>
> +	/* details to support polling */
> +	struct request_queue *queue;
> +	int op;
> +
>  	struct nvmet_port *port;
>
>  	void (*execute)(struct nvmet_req *req);
> diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
> index 739d67985d93..43a0cc630bc7 100644
> --- a/drivers/nvme/target/tcp.c
> +++ b/drivers/nvme/target/tcp.c
> @@ -62,6 +62,12 @@ enum {
>  	NVMET_TCP_F_INIT_FAILED = (1 << 0),
>  };
>
> +struct nvmet_tcp_devq {
> +	u32 pend_cnt;
> +	struct request_queue *queue;
> +	struct list_head entry;
> +};
> +
>  struct nvmet_tcp_cmd {
>  	struct nvmet_tcp_queue *queue;
>  	struct nvmet_req req;
> @@ -82,6 +88,7 @@ struct nvmet_tcp_cmd {
>  	struct kvec *iov;
>  	u32 flags;
>
> +	struct nvmet_tcp_devq *devq;
>  	struct list_head entry;
>  	struct llist_node lentry;
>
> @@ -142,6 +149,9 @@ struct nvmet_tcp_queue {
>  	int idx;
>  	struct list_head queue_list;
>
> +	/* ref to group subsys devq entry */
> +	struct list_head devq_list;
> +
>  	struct nvmet_tcp_cmd connect;
>
>  	struct page_frag_cache pf_cache;
> @@ -237,6 +247,42 @@ static inline int queue_cpu(struct nvmet_tcp_queue *queue)
>  	return queue->sock->sk->sk_incoming_cpu;
>  }
>
> +static int nvmet_tcp_track_queue(struct nvmet_tcp_queue *queue)
> +{
> +	struct nvmet_tcp_devq *devq;
> +	struct nvmet_req *req = &queue->cmd->req;
> +
> +	/* track only requests issued as poll requests */
> +	if (req->queue == NULL)
> +		return 0;
> +
> +	list_for_each_entry(devq, &queue->devq_list, entry) {
> +		if (devq->queue == req->queue)
> +			goto update;
> +	}
> +
> +	devq = kzalloc(sizeof(*devq), GFP_KERNEL);
> +	if (!devq)
> +		return -ENOMEM;
> +	list_add_tail(&devq->entry, &queue->devq_list);
> +	devq->queue = req->queue;
> +update:
> +	devq->pend_cnt++;
> +	queue->cmd->devq = devq;
> +	return 0;
> +}
> +
> +static void nvmet_tcp_free_dev_tracking(struct nvmet_tcp_queue *queue)
> +{
> +	struct nvmet_tcp_devq *devq;
> +
> +	while ((devq = list_first_entry_or_null(
> +			&queue->devq_list, struct nvmet_tcp_devq, entry))) {
> +		list_del_init(&devq->entry);
> +		kfree(devq);
> +	}
> +}
> +
>  static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue)
>  {
>  	return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
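As an aside, I assume the idea is that io_work() (the call site is
not in this hunk) ends up walking the tracked devqs along these
lines (untested sketch, helper name invented):

	static void nvmet_tcp_poll_devqs(struct nvmet_tcp_queue *queue)
	{
		struct nvmet_tcp_devq *devq;

		/* one-shot poll of each backend with I/O outstanding */
		list_for_each_entry(devq, &queue->devq_list, entry) {
			if (devq->pend_cnt)
				blk_poll_once(devq->queue);
		}
	}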
> @@ -479,6 +525,25 @@ static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
>  	}
>  }
>
> +static inline void nvmet_tcp_poll_init(struct nvmet_req *req)
> +{
> +	req->queue = NULL;
> +	if (idle_poll_period)
> +		req->op |= REQ_HIPRI;
> +	else
> +		req->op = 0;
> +}
All of this really exemplifies why none of it should live in
nvmet-tcp. All of this management, tracking, and decision-making
should live in the nvmet core. But as I said, I'm not convinced
that this is an easy change, because it fundamentally changes the
target architecture (I'm not saying we shouldn't do it, but it is
a rather intrusive change). It would be good to hear what others
think about this. I do think that all transports would benefit
from backend device polling (despite it being demonstrated here
on nvmet-tcp).
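To make the direction concrete, I'd imagine something along these
lines living in the core (hypothetical sketch, none of these
helpers exist today):

	/* nvmet core: one-shot poll of the backend queue that a
	 * polled request was issued on; any transport could call
	 * this from its own completion-processing context.
	 */
	static inline int nvmet_req_poll(struct nvmet_req *req)
	{
		if (!req->queue)	/* not issued as a poll request */
			return 0;
		return blk_poll_once(req->queue);
	}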