[PATCHv3 1/2] block: accumulate segment page gaps per bio
Keith Busch
kbusch at meta.com
Thu Aug 21 13:44:19 PDT 2025
From: Keith Busch <kbusch at kernel.org>
The blk-mq DMA iteration has an optimization for requests whose
segments align to the device's IOMMU merge boundary. That boundary may
be larger than the device's virtual boundary (if any), but the code had
been depending on that queue limit to know ahead of time whether a
request is guaranteed to qualify for the optimization.

Rather than rely on that queue limit, which many devices may not
report, record the lowest set bit of any page boundary gap between
adjacent segments in the bio while its segments are being checked. The
request turns this into a mask used both for merging and for quickly
checking per IO whether the request can use the IOVA optimization.
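
As a reviewer aid, here is a standalone userspace sketch of the bit
trick described above. The struct, segment values and merge boundary
are made up for illustration; only the gap computation and the final
mask test mirror bvec_seg_gap() and blk_can_dma_map_iova() from the
patch.

/* build with: cc -o gap_sketch gap_sketch.c */
#include <stdio.h>
#include <strings.h>	/* ffs() */

#define PAGE_SIZE 4096u

struct vec { unsigned int offset, len; };

/*
 * OR of the trailing in-page offset of the previous vec and the
 * leading offset of the next one: non-zero means the two segments do
 * not meet on a page boundary.
 */
static unsigned int seg_gap(const struct vec *prv, const struct vec *cur)
{
	return ((prv->offset + prv->len) & (PAGE_SIZE - 1)) | cur->offset;
}

int main(void)
{
	/* hypothetical layout: the second segment starts mid-page */
	struct vec v[] = {
		{ .offset = 0,   .len = PAGE_SIZE },
		{ .offset = 512, .len = 1024 },
	};
	unsigned int page_gaps = 0, pg_bit, mask;
	unsigned long merge_boundary = 0xfff;	/* e.g. 4k IOMMU granule - 1 */

	/* per-bio accumulation, as in bio_split_io_at() */
	for (unsigned int i = 1; i < sizeof(v) / sizeof(v[0]); i++)
		page_gaps |= seg_gap(&v[i - 1], &v[i]);

	/* store only the lowest set bit; 0 means no gap at all */
	pg_bit = ffs(page_gaps);

	/*
	 * per-request: rebuild the mask and test it the way
	 * blk_can_dma_map_iova() does
	 */
	mask = pg_bit ? 1u << (pg_bit - 1) : 0;
	printf("gap mask %#x -> IOVA merge %s\n", mask,
	       (mask & merge_boundary) ? "not possible" : "possible");
	return 0;
}

With the 512-byte offset above, the lowest gap bit is bit 9, so the
mask intersects a 4k merge boundary and the request falls back to
per-segment mapping; change the second offset to 0 and the optimization
applies.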
Signed-off-by: Keith Busch <kbusch at kernel.org>
---
block/bio.c | 1 +
block/blk-merge.c | 39 ++++++++++++++++++++++++++++++++++++---
block/blk-mq-dma.c | 3 +--
block/blk-mq.c | 10 ++++++++++
include/linux/blk-mq.h | 2 ++
include/linux/blk_types.h | 8 ++++++++
6 files changed, 58 insertions(+), 5 deletions(-)
diff --git a/block/bio.c b/block/bio.c
index f185119b8b40b..8bbb9236c2567 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -253,6 +253,7 @@ void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
bio->bi_write_hint = 0;
bio->bi_write_stream = 0;
bio->bi_status = 0;
+ bio->bi_pg_bit = 0;
bio->bi_iter.bi_sector = 0;
bio->bi_iter.bi_size = 0;
bio->bi_iter.bi_idx = 0;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 35e99dd1c5fd8..7e81a729d5067 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -278,6 +278,12 @@ static unsigned int bio_split_alignment(struct bio *bio,
return lim->logical_block_size;
}
+static inline unsigned int bvec_seg_gap(struct bio_vec *bv, struct bio_vec *bvprv)
+{
+ return ((bvprv->bv_offset + bvprv->bv_len) & (PAGE_SIZE - 1)) |
+ bv->bv_offset;
+}
+
/**
* bio_split_io_at - check if and where to split a bio
* @bio: [in] bio to be split
@@ -294,9 +300,9 @@ static unsigned int bio_split_alignment(struct bio *bio,
int bio_split_io_at(struct bio *bio, const struct queue_limits *lim,
unsigned *segs, unsigned max_bytes, unsigned len_align_mask)
{
+ unsigned nsegs = 0, bytes = 0, page_gaps = 0;
struct bio_vec bv, bvprv, *bvprvp = NULL;
struct bvec_iter iter;
- unsigned nsegs = 0, bytes = 0;
bio_for_each_bvec(bv, bio, iter) {
if (bv.bv_offset & lim->dma_alignment ||
@@ -307,8 +313,11 @@ int bio_split_io_at(struct bio *bio, const struct queue_limits *lim,
* If the queue doesn't support SG gaps and adding this
* offset would create a gap, disallow it.
*/
- if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv.bv_offset))
- goto split;
+ if (bvprvp) {
+ if (bvec_gap_to_prev(lim, bvprvp, bv.bv_offset))
+ goto split;
+ page_gaps |= bvec_seg_gap(&bv, &bvprv);
+ }
if (nsegs < lim->max_segments &&
bytes + bv.bv_len <= max_bytes &&
@@ -326,6 +335,7 @@ int bio_split_io_at(struct bio *bio, const struct queue_limits *lim,
}
*segs = nsegs;
+ bio->bi_pg_bit = ffs(page_gaps);
return 0;
split:
if (bio->bi_opf & REQ_ATOMIC)
@@ -361,6 +371,7 @@ int bio_split_io_at(struct bio *bio, const struct queue_limits *lim,
* big IO can be trival, disable iopoll when split needed.
*/
bio_clear_polled(bio);
+ bio->bi_pg_bit = ffs(page_gaps);
return bytes >> SECTOR_SHIFT;
}
EXPORT_SYMBOL_GPL(bio_split_io_at);
@@ -697,6 +708,24 @@ static bool blk_atomic_write_mergeable_rqs(struct request *rq,
return (rq->cmd_flags & REQ_ATOMIC) == (next->cmd_flags & REQ_ATOMIC);
}
+static inline unsigned int bio_seg_gap(struct request_queue *q,
+ struct bio *next, struct bio *prev)
+{
+ unsigned int page_gaps = 0;
+ struct bio_vec pb, nb;
+
+ if (prev->bi_pg_bit)
+ page_gaps |= 1 << (prev->bi_pg_bit - 1);
+ if (next->bi_pg_bit)
+ page_gaps |= 1 << (next->bi_pg_bit - 1);
+
+ bio_get_last_bvec(prev, &pb);
+ bio_get_first_bvec(next, &nb);
+ if (!biovec_phys_mergeable(q, &pb, &nb))
+ page_gaps |= bvec_seg_gap(&nb, &pb);
+ return page_gaps;
+}
+
/*
* For non-mq, this has to be called with the request spinlock acquired.
* For mq with scheduling, the appropriate queue wide lock should be held.
@@ -761,6 +790,8 @@ static struct request *attempt_merge(struct request_queue *q,
if (next->start_time_ns < req->start_time_ns)
req->start_time_ns = next->start_time_ns;
+ req->page_gap |= bio_seg_gap(req->q, next->bio, req->biotail) |
+ next->page_gap;
req->biotail->bi_next = next->bio;
req->biotail = next->biotail;
@@ -884,6 +915,7 @@ enum bio_merge_status bio_attempt_back_merge(struct request *req,
if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING)
blk_zone_write_plug_bio_merged(bio);
+ req->page_gap |= bio_seg_gap(req->q, bio, req->biotail);
req->biotail->bi_next = bio;
req->biotail = bio;
req->__data_len += bio->bi_iter.bi_size;
@@ -918,6 +950,7 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req,
blk_update_mixed_merge(req, bio, true);
+ req->page_gap |= bio_seg_gap(req->q, req->bio, bio);
bio->bi_next = req->bio;
req->bio = bio;
diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
index 660b5e200ccf6..ee542ef28de0c 100644
--- a/block/blk-mq-dma.c
+++ b/block/blk-mq-dma.c
@@ -79,8 +79,7 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter,
static inline bool blk_can_dma_map_iova(struct request *req,
struct device *dma_dev)
{
- return !((queue_virt_boundary(req->q) + 1) &
- dma_get_merge_boundary(dma_dev));
+ return !(req->page_gap & dma_get_merge_boundary(dma_dev));
}
static bool blk_dma_map_bus(struct blk_dma_iter *iter, struct phys_vec *vec)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index b67d6c02ecebd..8d1cde13742d4 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -376,6 +376,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
INIT_LIST_HEAD(&rq->queuelist);
rq->q = q;
rq->__sector = (sector_t) -1;
+ rq->page_gap = 0;
INIT_HLIST_NODE(&rq->hash);
RB_CLEAR_NODE(&rq->rb_node);
rq->tag = BLK_MQ_NO_TAG;
@@ -659,6 +660,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
goto out_queue_exit;
}
rq->__data_len = 0;
+ rq->page_gap = 0;
rq->__sector = (sector_t) -1;
rq->bio = rq->biotail = NULL;
return rq;
@@ -739,6 +741,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
blk_mq_rq_time_init(rq, alloc_time_ns);
rq->__data_len = 0;
+ rq->page_gap = 0;
rq->__sector = (sector_t) -1;
rq->bio = rq->biotail = NULL;
return rq;
@@ -2665,6 +2668,12 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
rq->bio = rq->biotail = bio;
rq->__sector = bio->bi_iter.bi_sector;
rq->__data_len = bio->bi_iter.bi_size;
+
+ if (bio->bi_pg_bit)
+ rq->page_gap = 1 << (bio->bi_pg_bit - 1);
+ else
+ rq->page_gap = 0;
+
rq->nr_phys_segments = nr_segs;
if (bio_integrity(bio))
rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q,
@@ -3370,6 +3379,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
}
rq->nr_phys_segments = rq_src->nr_phys_segments;
rq->nr_integrity_segments = rq_src->nr_integrity_segments;
+ rq->page_gap = rq_src->page_gap;
if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
goto free_and_out;
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 2a5a828f19a0b..5ef0cef8823be 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -152,6 +152,8 @@ struct request {
unsigned short nr_phys_segments;
unsigned short nr_integrity_segments;
+ unsigned int page_gap;
+
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
struct bio_crypt_ctx *crypt_ctx;
struct blk_crypto_keyslot *crypt_keyslot;
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 930daff207df2..0bfea2d4cd03a 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -222,6 +222,14 @@ struct bio {
enum rw_hint bi_write_hint;
u8 bi_write_stream;
blk_status_t bi_status;
+
+ /*
+ * bi_pg_bit records the lowest set bit, as returned by ffs(), of any
+ * page address offset between adjacent bi_io_vecs; 0 means no gaps.
+ * The field is only valid after splitting to the hardware limits.
+ */
+ u8 bi_pg_bit;
+
atomic_t __bi_remaining;
struct bvec_iter bi_iter;
--
2.47.3