[PATCH] blk-mq: aggregate ktime_get per request batch

Keith Busch kbusch at meta.com
Tue Oct 4 08:30:04 PDT 2022


From: Keith Busch <kbusch at kernel.org>

The request statistics fields don't require the precision of taking a
timestamp at the exact point each one is set. We can get the time just
once per batch to reduce the overhead of this repeated call. This only
really helps when requests are batched, but there is no harm in doing it
when sending one at a time.
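
To sketch the pattern outside the kernel: read the clock once per batch,
gate that read on whether anything downstream actually needs a timestamp,
and hand the same value to every request initialized in that batch. Below
is a minimal userspace illustration in plain C; clock_gettime() stands in
for ktime_get_ns(), and the now_ns()/init_one()/batch_init() helpers are
made up for the example, not kernel interfaces.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

struct request {
	uint64_t start_time_ns;
};

/* Stand-in for ktime_get_ns() in this userspace sketch. */
static uint64_t now_ns(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/* Per-request init takes the timestamp from the caller... */
static void init_one(struct request *rq, uint64_t now)
{
	rq->start_time_ns = now;
}

/* ...so a batch of nr requests costs one clock read, not nr of them. */
static void batch_init(struct request *rqs, size_t nr, int need_ts)
{
	uint64_t now = need_ts ? now_ns() : 0;

	for (size_t i = 0; i < nr; i++)
		init_one(&rqs[i], now);
}

int main(void)
{
	struct request rqs[32];

	batch_init(rqs, 32, 1);
	printf("batch stamped at %llu ns\n",
	       (unsigned long long)rqs[0].start_time_ns);
	return 0;
}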

With fio's t/io_uring benchmark at default settings, time spent in
ktime_get() dropped from ~6% to <1%, which is good for a ~3% IOPS boost
when stats are enabled.

Signed-off-by: Keith Busch <kbusch at kernel.org>
---
 block/blk-mq.c             | 88 ++++++++++++++++++++++++--------------
 drivers/block/virtio_blk.c |  3 +-
 drivers/nvme/host/pci.c    |  3 +-
 include/linux/blk-mq.h     |  2 +-
 include/linux/blkdev.h     |  7 +++
 5 files changed, 69 insertions(+), 34 deletions(-)

diff --git a/block/blk-mq.c b/block/blk-mq.c
index 8070b6c10e8d..c759fb36b684 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -339,7 +339,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 EXPORT_SYMBOL(blk_rq_init);
 
 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
-		struct blk_mq_tags *tags, unsigned int tag, u64 alloc_time_ns)
+		struct blk_mq_tags *tags, unsigned int tag, u64 now)
 {
 	struct blk_mq_ctx *ctx = data->ctx;
 	struct blk_mq_hw_ctx *hctx = data->hctx;
@@ -366,15 +366,11 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 	}
 	rq->timeout = 0;
 
-	if (blk_mq_need_time_stamp(rq))
-		rq->start_time_ns = ktime_get_ns();
-	else
-		rq->start_time_ns = 0;
+	rq->start_time_ns = now;
 	rq->part = NULL;
 #ifdef CONFIG_BLK_RQ_ALLOC_TIME
-	rq->alloc_time_ns = alloc_time_ns;
+	rq->alloc_time_ns = now;
 #endif
-	rq->io_start_time_ns = 0;
 	rq->stats_sectors = 0;
 	rq->nr_phys_segments = 0;
 #if defined(CONFIG_BLK_DEV_INTEGRITY)
@@ -407,7 +403,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
 
 static inline struct request *
 __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data,
-		u64 alloc_time_ns)
+		u64 now)
 {
 	unsigned int tag, tag_offset;
 	struct blk_mq_tags *tags;
@@ -426,7 +422,7 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data,
 		tag = tag_offset + i;
 		prefetch(tags->static_rqs[tag]);
 		tag_mask &= ~(1UL << i);
-		rq = blk_mq_rq_ctx_init(data, tags, tag, alloc_time_ns);
+		rq = blk_mq_rq_ctx_init(data, tags, tag, now);
 		rq_list_add(data->cached_rq, rq);
 		nr++;
 	}
@@ -440,13 +436,13 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data,
 static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 {
 	struct request_queue *q = data->q;
-	u64 alloc_time_ns = 0;
 	struct request *rq;
 	unsigned int tag;
+	u64 now = 0;
 
 	/* alloc_time includes depth and tag waits */
-	if (blk_queue_rq_alloc_time(q))
-		alloc_time_ns = ktime_get_ns();
+	if (blk_queue_need_ts(q))
+		now = ktime_get_ns();
 
 	if (data->cmd_flags & REQ_NOWAIT)
 		data->flags |= BLK_MQ_REQ_NOWAIT;
@@ -481,7 +477,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 	 * Try batched alloc if we want more than 1 tag.
 	 */
 	if (data->nr_tags > 1) {
-		rq = __blk_mq_alloc_requests_batch(data, alloc_time_ns);
+		rq = __blk_mq_alloc_requests_batch(data, now);
 		if (rq)
 			return rq;
 		data->nr_tags = 1;
@@ -507,7 +503,7 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
 	}
 
 	return blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag,
-					alloc_time_ns);
+					now);
 }
 
 static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
@@ -610,14 +606,14 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 		.cmd_flags	= opf,
 		.nr_tags	= 1,
 	};
-	u64 alloc_time_ns = 0;
 	unsigned int cpu;
 	unsigned int tag;
+	u64 now = 0;
 	int ret;
 
 	/* alloc_time includes depth and tag waits */
-	if (blk_queue_rq_alloc_time(q))
-		alloc_time_ns = ktime_get_ns();
+	if (blk_queue_need_ts(q))
+		now = ktime_get_ns();
 
 	/*
 	 * If the tag allocator sleeps we could get an allocation for a
@@ -661,7 +657,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
 	if (tag == BLK_MQ_NO_TAG)
 		goto out_queue_exit;
 	return blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag,
-					alloc_time_ns);
+					now);
 
 out_queue_exit:
 	blk_queue_exit(q);
@@ -1214,8 +1210,13 @@ void blk_mq_start_request(struct request *rq)
 
 	trace_block_rq_issue(rq);
 
-	if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
-		rq->io_start_time_ns = ktime_get_ns();
+	if (blk_queue_stat(q)) {
+		/*
+		 * io_start_time_ns may not have been set earlier if the stat
+		 * attribute was being changed.
+		 */
+		if (!rq->io_start_time_ns)
+			rq->io_start_time_ns = ktime_get_ns();
 		rq->stats_sectors = blk_rq_sectors(rq);
 		rq->rq_flags |= RQF_STATS;
 		rq_qos_issue(q, rq);
@@ -1944,10 +1945,14 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
 	blk_status_t ret = BLK_STS_OK;
 	LIST_HEAD(zone_list);
 	bool needs_resource = false;
+	u64 now = 0;
 
 	if (list_empty(list))
 		return false;
 
+	if (blk_queue_stat(q))
+		now = ktime_get_ns();
+
 	/*
 	 * Now process all the entries, sending them to the driver.
 	 */
@@ -1983,6 +1988,8 @@ bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
 		 */
 		if (nr_budgets)
 			nr_budgets--;
+
+		rq->io_start_time_ns = now;
 		ret = q->mq_ops->queue_rq(hctx, &bd);
 		switch (ret) {
 		case BLK_STS_OK:
@@ -2507,7 +2514,7 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
 }
 
 static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
-					    struct request *rq, bool last)
+					    struct request *rq, bool last, u64 now)
 {
 	struct request_queue *q = rq->q;
 	struct blk_mq_queue_data bd = {
@@ -2521,6 +2528,7 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
 	 * Any other error (busy), just add it to our list as we
 	 * previously would have done.
 	 */
+	rq->io_start_time_ns = now;
 	ret = q->mq_ops->queue_rq(hctx, &bd);
 	switch (ret) {
 	case BLK_STS_OK:
@@ -2541,7 +2549,7 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
 
 static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 						struct request *rq,
-						bool bypass_insert, bool last)
+						bool bypass_insert, bool last, u64 now)
 {
 	struct request_queue *q = rq->q;
 	bool run_queue = true;
@@ -2574,7 +2582,7 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 		goto insert;
 	}
 
-	return __blk_mq_issue_directly(hctx, rq, last);
+	return __blk_mq_issue_directly(hctx, rq, last, now);
 insert:
 	if (bypass_insert)
 		return BLK_STS_RESOURCE;
@@ -2597,8 +2605,13 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 		struct request *rq)
 {
-	blk_status_t ret =
-		__blk_mq_try_issue_directly(hctx, rq, false, true);
+	blk_status_t ret;
+	u64 now = 0;
+
+	if (blk_queue_stat(rq->q))
+		now = ktime_get_ns();
+
+	ret = __blk_mq_try_issue_directly(hctx, rq, false, true, now);
 
 	if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
 		blk_mq_request_bypass_insert(rq, false, true);
@@ -2606,14 +2619,15 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
 		blk_mq_end_request(rq, ret);
 }
 
-static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
+static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last, u64 now)
 {
-	return __blk_mq_try_issue_directly(rq->mq_hctx, rq, true, last);
+	return __blk_mq_try_issue_directly(rq->mq_hctx, rq, true, last, now);
 }
 
 static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule)
 {
 	struct blk_mq_hw_ctx *hctx = NULL;
+	u64 now = ktime_get_ns();
 	struct request *rq;
 	int queued = 0;
 	int errors = 0;
@@ -2628,7 +2642,7 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule)
 			hctx = rq->mq_hctx;
 		}
 
-		ret = blk_mq_request_issue_directly(rq, last);
+		ret = blk_mq_request_issue_directly(rq, last, now);
 		switch (ret) {
 		case BLK_STS_OK:
 			queued++;
@@ -2656,9 +2670,14 @@ static void blk_mq_plug_issue_direct(struct blk_plug *plug, bool from_schedule)
 static void __blk_mq_flush_plug_list(struct request_queue *q,
 				     struct blk_plug *plug)
 {
+	u64 now = 0;
+
 	if (blk_queue_quiesced(q))
 		return;
-	q->mq_ops->queue_rqs(&plug->mq_list);
+	if (blk_queue_stat(q))
+		now = ktime_get_ns();
+
+	q->mq_ops->queue_rqs(&plug->mq_list, now);
 }
 
 static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
@@ -2736,6 +2755,10 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
 {
 	int queued = 0;
 	int errors = 0;
+	u64 now = 0;
+
+	if (blk_queue_stat(hctx->queue))
+		now = ktime_get_ns();
 
 	while (!list_empty(list)) {
 		blk_status_t ret;
@@ -2743,7 +2766,7 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
 				queuelist);
 
 		list_del_init(&rq->queuelist);
-		ret = blk_mq_request_issue_directly(rq, list_empty(list));
+		ret = blk_mq_request_issue_directly(rq, list_empty(list), now);
 		if (ret != BLK_STS_OK) {
 			errors++;
 			if (ret == BLK_STS_RESOURCE ||
@@ -2938,6 +2961,7 @@ blk_status_t blk_insert_cloned_request(struct request *rq)
 	struct request_queue *q = rq->q;
 	unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
 	blk_status_t ret;
+	u64 now = 0;
 
 	if (blk_rq_sectors(rq) > max_sectors) {
 		/*
@@ -2975,6 +2999,8 @@ blk_status_t blk_insert_cloned_request(struct request *rq)
 	if (blk_crypto_insert_cloned_request(rq))
 		return BLK_STS_IOERR;
 
+	if (blk_do_io_stat(rq))
+		now = ktime_get_ns();
 	blk_account_io_start(rq);
 
 	/*
@@ -2983,7 +3009,7 @@ blk_status_t blk_insert_cloned_request(struct request *rq)
 	 * insert.
 	 */
 	blk_mq_run_dispatch_ops(q,
-			ret = blk_mq_request_issue_directly(rq, true));
+			ret = blk_mq_request_issue_directly(rq, true, now));
 	if (ret)
 		blk_account_io_done(rq, ktime_get_ns());
 	return ret;
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 3f4739d52268..b3b391c56d8b 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -417,7 +417,7 @@ static bool virtblk_add_req_batch(struct virtio_blk_vq *vq,
 	return kick;
 }
 
-static void virtio_queue_rqs(struct request **rqlist)
+static void virtio_queue_rqs(struct request **rqlist, u64 now)
 {
 	struct request *req, *next, *prev = NULL;
 	struct request *requeue_list = NULL;
@@ -426,6 +426,7 @@ static void virtio_queue_rqs(struct request **rqlist)
 		struct virtio_blk_vq *vq = get_virtio_blk_vq(req->mq_hctx);
 		bool kick;
 
+		req->io_start_time_ns = now;
 		if (!virtblk_prep_rq_batch(req)) {
 			rq_list_move(rqlist, &requeue_list, req, prev);
 			req = prev;
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 5b796efa325b..6fc14a3f2980 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -981,7 +981,7 @@ static bool nvme_prep_rq_batch(struct nvme_queue *nvmeq, struct request *req)
 	return nvme_prep_rq(nvmeq->dev, req) == BLK_STS_OK;
 }
 
-static void nvme_queue_rqs(struct request **rqlist)
+static void nvme_queue_rqs(struct request **rqlist, u64 now)
 {
 	struct request *req, *next, *prev = NULL;
 	struct request *requeue_list = NULL;
@@ -989,6 +989,7 @@ static void nvme_queue_rqs(struct request **rqlist)
 	rq_list_for_each_safe(rqlist, req, next) {
 		struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
 
+		req->io_start_time_ns = now;
 		if (!nvme_prep_rq_batch(nvmeq, req)) {
 			/* detach 'req' and add to remainder list */
 			rq_list_move(rqlist, &requeue_list, req, prev);
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index ba18e9bdb799..071ebd7fd6c9 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -562,7 +562,7 @@ struct blk_mq_ops {
 	 * empty the @rqlist completely, then the rest will be queued
 	 * individually by the block layer upon return.
 	 */
-	void (*queue_rqs)(struct request **rqlist);
+	void (*queue_rqs)(struct request **rqlist, u64 now);
 
 	/**
 	 * @get_budget: Reserve budget before queue request, once .queue_rq is
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 49373d002631..6bda1414cca0 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -599,6 +599,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
 #define blk_queue_stable_writes(q) \
 	test_bit(QUEUE_FLAG_STABLE_WRITES, &(q)->queue_flags)
 #define blk_queue_io_stat(q)	test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
+#define blk_queue_stat(q)	test_bit(QUEUE_FLAG_STATS, &(q)->queue_flags)
 #define blk_queue_add_random(q)	test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
 #define blk_queue_zone_resetall(q)	\
 	test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags)
@@ -612,6 +613,12 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
 #define blk_queue_rq_alloc_time(q)	false
 #endif
 
+#define blk_queue_need_ts(q)		\
+	(blk_queue_rq_alloc_time(q) ||	\
+	 blk_queue_stat(q) ||		\
+	 blk_queue_io_stat(q) ||	\
+	 (q)->elevator)
+
 #define blk_noretry_request(rq) \
 	((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
 			     REQ_FAILFAST_DRIVER))
-- 
2.30.2