[RFC PATCH 2/2] nvmet_tcp: enable polling nvme for completions
Wunderlich, Mark
mark.wunderlich at intel.com
Mon Dec 14 20:11:16 EST 2020
nvmet_tcp: enable polling nvme for completions
NVMe device queue interrupt activity can conflict with the CPUs
processing NVMe/TCP network queues. This patch adds the ability
to poll an NVMe device for completions when NVMe device poll
queues are configured.
A new non-blocking blk_poll_once() function is introduced to
provide a one-shot way to poll an individual NVMe device
queue.
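
As a rough illustration (not part of the patch; the helper and its
pr_debug() message are invented for this sketch), a transport that
knows a device queue has polled requests outstanding could reap
completions like this:

#include <linux/blkdev.h>

/*
 * Illustrative sketch only: run a single non-blocking poll pass on a
 * backing device request_queue.  blk_poll_once() returns the number
 * of completions reaped, or 0 if the queue has no poll queues
 * configured (QUEUE_FLAG_POLL not set).
 */
static void example_reap_completions(struct request_queue *q, u32 pend_cnt)
{
	int found;

	if (!pend_cnt)
		return;

	found = blk_poll_once(q);
	if (found > 0)
		pr_debug("reaped %d polled completions\n", found);
}
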
Per-queue tracking of outstanding requests to specific NVMe
devices is established to facilitate directed use of the new
blk_poll_once() function.
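
Concretely, each NVMe/TCP queue carries a small list pairing a
backing device request_queue with a count of its outstanding polled
requests; a simplified sketch of the entry introduced below as
struct nvmet_tcp_devq:

#include <linux/blkdev.h>
#include <linux/list.h>
#include <linux/types.h>

/* Sketch of the per-device tracking entry (struct nvmet_tcp_devq in
 * the patch body): one entry per backing request_queue that currently
 * has polled requests outstanding on this NVMe/TCP queue. */
struct example_devq {
	u32 pend_cnt;			/* outstanding polled requests */
	struct request_queue *queue;	/* backing device queue to poll */
	struct list_head entry;		/* linked off the TCP queue */
};
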
These changes depend on the previous patch, which added the
idle_poll_period option to io_work(). If that option is not
enabled, NVMe device polling is bypassed.
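
For reference, the gating reduces to a small helper mirrored here
(sketch only; idle_poll_period is the module option from patch 1/2,
and the queue/op fields stand in for the ones this patch adds to
struct nvmet_req):

#include <linux/blk_types.h>	/* REQ_HIPRI */
#include <linux/blkdev.h>

/* Assumed for this sketch: the option from patch 1/2 and a minimal
 * stand-in for the nvmet_req fields added by this patch. */
extern int idle_poll_period;

struct example_req {
	struct request_queue *queue;	/* device queue to poll, if any */
	int op;				/* REQ_* flags handed to the backend */
};

/* Mirrors nvmet_tcp_poll_init() below: tag a request with REQ_HIPRI
 * (making it eligible for a device poll queue) only when the
 * idle_poll_period option is enabled; otherwise leave it on the
 * normal interrupt-driven completion path. */
static inline void example_poll_init(struct example_req *req)
{
	req->queue = NULL;
	if (idle_poll_period)
		req->op |= REQ_HIPRI;
	else
		req->op = 0;
}
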
Signed-off-by: Mark Wunderlich <mark.wunderlich at intel.com>
---
block/blk-mq.c | 22 +++++++++
drivers/nvme/target/io-cmd-bdev.c | 6 ++
drivers/nvme/target/nvmet.h | 4 ++
drivers/nvme/target/tcp.c | 90 ++++++++++++++++++++++++++++++++++++-
include/linux/blkdev.h | 1
5 files changed, 120 insertions(+), 3 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 55bcee5dc032..ceacf327f65e 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -3832,6 +3832,28 @@ static bool blk_mq_poll_hybrid(struct request_queue *q,
return blk_mq_poll_hybrid_sleep(q, rq);
}
+int blk_poll_once(struct request_queue *q)
+{
+	struct blk_mq_hw_ctx *hctx;
+	int ret;
+
+	if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+		return 0;
+
+	hctx = blk_mq_map_queue_type(q, HCTX_TYPE_POLL, smp_processor_id());
+	if (!hctx) {
+		pr_err("%s: hctx not resolved\n", __func__);
+		return 0;
+	}
+
+	hctx->poll_invoked++;
+	ret = q->mq_ops->poll(hctx);
+	if (ret > 0)
+		hctx->poll_success++;
+	return ret;
+}
+EXPORT_SYMBOL_GPL(blk_poll_once);
+
/**
* blk_poll - poll for IO completions
* @q: the queue
diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c
index 125dde3f410e..1d9ee8546ae1 100644
--- a/drivers/nvme/target/io-cmd-bdev.c
+++ b/drivers/nvme/target/io-cmd-bdev.c
@@ -269,6 +269,12 @@ static void nvmet_bdev_execute_rw(struct nvmet_req *req)
bio->bi_iter.bi_sector = sector;
bio->bi_private = req;
bio->bi_end_io = nvmet_bio_done;
+	/* Set hipri flag to trigger use of poll queues */
+	if ((req->op & REQ_HIPRI) &&
+	    test_bit(QUEUE_FLAG_POLL, &bio->bi_disk->queue->queue_flags)) {
+		op |= REQ_HIPRI;
+		req->queue = bio->bi_disk->queue;
+	}
bio->bi_opf = op;
blk_start_plug(&plug);
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index 559a15ccc322..65900df2b45d 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -341,6 +341,10 @@ struct nvmet_req {
size_t transfer_len;
size_t metadata_len;
+	/* details to support polling */
+	struct request_queue *queue;
+	int op;
+
struct nvmet_port *port;
void (*execute)(struct nvmet_req *req);
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 739d67985d93..43a0cc630bc7 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -62,6 +62,12 @@ enum {
NVMET_TCP_F_INIT_FAILED = (1 << 0),
};
+struct nvmet_tcp_devq {
+	u32 pend_cnt;
+	struct request_queue *queue;
+	struct list_head entry;
+};
+
struct nvmet_tcp_cmd {
struct nvmet_tcp_queue *queue;
struct nvmet_req req;
@@ -82,6 +88,7 @@ struct nvmet_tcp_cmd {
struct kvec *iov;
u32 flags;
+ struct nvmet_tcp_devq *devq;
struct list_head entry;
struct llist_node lentry;
@@ -142,6 +149,9 @@ struct nvmet_tcp_queue {
int idx;
struct list_head queue_list;
+ /* ref to group subsys devq entry */
+ struct list_head devq_list;
+
struct nvmet_tcp_cmd connect;
struct page_frag_cache pf_cache;
@@ -237,6 +247,42 @@ static inline int queue_cpu(struct nvmet_tcp_queue *queue)
return queue->sock->sk->sk_incoming_cpu;
}
+static int nvmet_tcp_track_queue(struct nvmet_tcp_queue *queue)
+{
+	struct nvmet_tcp_devq *devq;
+	struct nvmet_req *req = &queue->cmd->req;
+
+	/* track only requests issued as poll requests */
+	if (req->queue == NULL)
+		return 0;
+
+	list_for_each_entry(devq, &queue->devq_list, entry) {
+		if (devq->queue == req->queue)
+			goto update;
+	}
+
+	devq = kzalloc(sizeof(*devq), GFP_KERNEL);
+	if (!devq)
+		return -ENOMEM;
+	list_add_tail(&devq->entry, &queue->devq_list);
+	devq->queue = req->queue;
+update:
+	devq->pend_cnt++;
+	queue->cmd->devq = devq;
+	return 0;
+}
+
+static void nvmet_tcp_free_dev_tracking(struct nvmet_tcp_queue *queue)
+{
+	struct nvmet_tcp_devq *devq;
+
+	while ((devq = list_first_entry_or_null(
+			&queue->devq_list, struct nvmet_tcp_devq, entry))) {
+		list_del_init(&devq->entry);
+		kfree(devq);
+	}
+}
+
static inline u8 nvmet_tcp_hdgst_len(struct nvmet_tcp_queue *queue)
{
return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
@@ -479,6 +525,25 @@ static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd)
}
}
+static inline void nvmet_tcp_poll_init(struct nvmet_req *req)
+{
+	req->queue = NULL;
+	if (idle_poll_period)
+		req->op |= REQ_HIPRI;
+	else
+		req->op = 0;
+}
+
+static inline void nvmet_tcp_poll_devq_list(struct nvmet_tcp_queue *queue)
+{
+	struct nvmet_tcp_devq *devq;
+
+	list_for_each_entry(devq, &queue->devq_list, entry) {
+		if (devq->pend_cnt > 0)
+			blk_poll_once(devq->queue);
+	}
+}
+
static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue)
{
struct llist_node *node;
@@ -486,6 +551,9 @@ static void nvmet_tcp_process_resp_list(struct nvmet_tcp_queue *queue)
for (node = llist_del_all(&queue->resp_list); node; node = node->next) {
cmd = llist_entry(node, struct nvmet_tcp_cmd, lentry);
+		/* remove cmd from device queue pending poll count */
+		if ((cmd->req.op & REQ_HIPRI) && (cmd->req.queue != NULL))
+			cmd->devq->pend_cnt--;
list_add(&cmd->entry, &queue->resp_send_list);
queue->send_list_len++;
}
@@ -524,7 +592,8 @@ static void nvmet_tcp_queue_response(struct nvmet_req *req)
struct nvmet_tcp_queue *queue = cmd->queue;
llist_add(&cmd->lentry, &queue->resp_list);
-	queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &cmd->queue->io_work);
+	if (req->queue == NULL)
+		queue_work_on(queue_cpu(queue), nvmet_tcp_wq, &cmd->queue->io_work);
}
static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
@@ -745,6 +814,9 @@ static int nvmet_tcp_try_send(struct nvmet_tcp_queue *queue,
if (!queue->pend_cmds)
goto done;
+ /* poll device queues for pending completions */
+ nvmet_tcp_poll_devq_list(queue);
+
for (i = 0; i < budget; i++) {
ret = nvmet_tcp_try_send_one(queue, i == budget - 1);
if (unlikely(ret < 0)) {
@@ -990,7 +1062,9 @@ static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue)
goto out;
}
+ nvmet_tcp_poll_init(req);
queue->cmd->req.execute(&queue->cmd->req);
+ ret = nvmet_tcp_track_queue(queue);
out:
nvmet_prepare_receive_pdu(queue);
return ret;
@@ -1110,7 +1184,11 @@ static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue)
nvmet_tcp_prep_recv_ddgst(cmd);
return 0;
}
+ nvmet_tcp_poll_init(&cmd->req);
cmd->req.execute(&cmd->req);
+ ret = nvmet_tcp_track_queue(queue);
+ if (ret < 0)
+ return ret;
}
nvmet_prepare_receive_pdu(queue);
@@ -1149,8 +1227,13 @@ static int nvmet_tcp_try_recv_ddgst(struct nvmet_tcp_queue *queue)
}
if (!(cmd->flags & NVMET_TCP_F_INIT_FAILED) &&
- cmd->rbytes_done == cmd->req.transfer_len)
+ cmd->rbytes_done == cmd->req.transfer_len) {
+ nvmet_tcp_poll_init(&cmd->req);
cmd->req.execute(&cmd->req);
+ ret = nvmet_tcp_track_queue(queue);
+ if (ret < 0)
+ goto out;
+ }
ret = 0;
out:
nvmet_prepare_receive_pdu(queue);
@@ -1459,6 +1542,7 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w)
cancel_work_sync(&queue->io_work);
sock_release(queue->sock);
nvmet_tcp_free_cmds(queue);
+ nvmet_tcp_free_dev_tracking(queue);
if (queue->hdr_digest || queue->data_digest)
nvmet_tcp_free_crypto(queue);
ida_simple_remove(&nvmet_tcp_queue_ida, queue->idx);
@@ -1581,9 +1665,9 @@ static int nvmet_tcp_alloc_queue(struct nvmet_tcp_port *port,
INIT_WORK(&queue->io_work, nvmet_tcp_io_work);
queue->sock = newsock;
queue->port = port;
- queue->nr_cmds = 0;
spin_lock_init(&queue->state_lock);
queue->state = NVMET_TCP_Q_CONNECTING;
+ INIT_LIST_HEAD(&queue->devq_list);
INIT_LIST_HEAD(&queue->free_list);
init_llist_head(&queue->resp_list);
INIT_LIST_HEAD(&queue->resp_send_list);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 639cae2c158b..6a3e62e46c3a 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -952,6 +952,7 @@ int blk_status_to_errno(blk_status_t status);
blk_status_t errno_to_blk_status(int errno);
int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin);
+int blk_poll_once(struct request_queue *q);
static inline struct request_queue *bdev_get_queue(struct block_device *bdev)
{