[PATCH 1/2] block: accumulate segment page gaps per bio
Caleb Sander Mateos
csander at purestorage.com
Tue Aug 5 13:32:22 PDT 2025
On Tue, Aug 5, 2025 at 3:59 PM Keith Busch <kbusch at meta.com> wrote:
>
> From: Keith Busch <kbusch at kernel.org>
>
> The blk-mq dma iteration has an optimization for requests that align to
> the device's iommu merge boundary. This boundary may be larger than the
> device's virtual boundary, but the code had been depending on that queue
> limit to know ahead of time if the request aligns to the optimization.
>
> Rather than rely on that queue limit, which many devices may not even
> have, store the virtual boundary gaps of each segment into the bio as a
> mask while checking the segments and merging. We can then quickly check
> per io if the request can use the optimization or not.
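If I'm following the arithmetic: for each adjacent pair of segments this
ORs in both the offset at which the later segment starts within a page
and the offset at which the earlier one ends, so the accumulated mask
intersects the IOMMU merge boundary exactly when some interior segment
boundary is misaligned. E.g. (numbers mine, just to sanity check):

        /* bvprv ends page-aligned, bv starts 512 bytes into a page */
        bvprv = (struct bio_vec){ .bv_offset = 0, .bv_len = 4096 };
        bv = (struct bio_vec){ .bv_offset = 512, .bv_len = 512 };
        /*
         * gap mask = 512 | 0 = 0x200; with a 4k IOMMU granule the
         * merge boundary mask is 0xfff, so 0x200 & 0xfff != 0 and
         * blk_can_dma_map_iova() falls back to per-segment mapping.
         */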
>
> Signed-off-by: Keith Busch <kbusch at kernel.org>
> ---
>  block/blk-merge.c         | 30 +++++++++++++++++++++++++++---
>  block/blk-mq-dma.c        |  3 +--
>  block/blk-mq.c            |  5 +++++
>  include/linux/blk-mq.h    |  6 ++++++
>  include/linux/blk_types.h |  2 ++
>  5 files changed, 41 insertions(+), 5 deletions(-)
>
> diff --git a/block/blk-merge.c b/block/blk-merge.c
> index 81bdad915699a..d63389c063006 100644
> --- a/block/blk-merge.c
> +++ b/block/blk-merge.c
> @@ -278,6 +278,9 @@ static unsigned int bio_split_alignment(struct bio *bio,
>          return lim->logical_block_size;
>  }
>
> +#define bv_seg_gap(bv, bvprv) \
> +        bv.bv_offset | ((bvprv.bv_offset + bvprv.bv_len) & (PAGE_SIZE - 1));
There's a stray trailing semicolon here, and neither the macro arguments
nor the result expression are parenthesized, so the expansion is at the
mercy of operator precedence at the call sites. Is there a reason not to
make this a static inline function rather than a macro?
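
Something like this (untested, and keeping the existing by-value call
sites unchanged) would fix both and let the compiler type-check the
arguments:

static inline unsigned int bv_seg_gap(struct bio_vec bv,
                                      struct bio_vec bvprv)
{
        /* OR together where bv starts and where bvprv ends within a page */
        return bv.bv_offset |
               ((bvprv.bv_offset + bvprv.bv_len) & (PAGE_SIZE - 1));
}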
Best,
Caleb
> +
>  /**
>   * bio_split_rw_at - check if and where to split a read/write bio
>   * @bio: [in] bio to be split
> @@ -293,9 +296,9 @@ static unsigned int bio_split_alignment(struct bio *bio,
>  int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim,
>                  unsigned *segs, unsigned max_bytes)
>  {
> +        unsigned nsegs = 0, bytes = 0, page_gaps = 0;
>          struct bio_vec bv, bvprv, *bvprvp = NULL;
>          struct bvec_iter iter;
> -        unsigned nsegs = 0, bytes = 0;
>
>          bio_for_each_bvec(bv, bio, iter) {
>                  if (bv.bv_offset & lim->dma_alignment)
> @@ -305,8 +308,11 @@ int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim,
>                   * If the queue doesn't support SG gaps and adding this
>                   * offset would create a gap, disallow it.
>                   */
> -                if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv.bv_offset))
> -                        goto split;
> +                if (bvprvp) {
> +                        if (bvec_gap_to_prev(lim, bvprvp, bv.bv_offset))
> +                                goto split;
> +                        page_gaps |= bv_seg_gap(bv, bvprv);
> +                }
>
>                  if (nsegs < lim->max_segments &&
>                      bytes + bv.bv_len <= max_bytes &&
> @@ -324,6 +330,7 @@ int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim,
>          }
>
>          *segs = nsegs;
> +        bio->page_gaps = page_gaps;
>          return 0;
>  split:
>          if (bio->bi_opf & REQ_ATOMIC)
> @@ -353,6 +360,7 @@ int bio_split_rw_at(struct bio *bio, const struct queue_limits *lim,
>           * big IO can be trival, disable iopoll when split needed.
>           */
>          bio_clear_polled(bio);
> +        bio->page_gaps = page_gaps;
>          return bytes >> SECTOR_SHIFT;
>  }
>  EXPORT_SYMBOL_GPL(bio_split_rw_at);
> @@ -696,6 +704,8 @@ static bool blk_atomic_write_mergeable_rqs(struct request *rq,
>  static struct request *attempt_merge(struct request_queue *q,
>                  struct request *req, struct request *next)
>  {
> +        struct bio_vec bv, bvprv;
> +
>          if (!rq_mergeable(req) || !rq_mergeable(next))
>                  return NULL;
>
> @@ -753,6 +763,10 @@ static struct request *attempt_merge(struct request_queue *q,
>          if (next->start_time_ns < req->start_time_ns)
>                  req->start_time_ns = next->start_time_ns;
>
> +        bv = next->bio->bi_io_vec[0];
> +        bvprv = req->biotail->bi_io_vec[req->biotail->bi_vcnt - 1];
> +        req->__page_gaps |= blk_rq_page_gaps(next) | bv_seg_gap(bv, bvprv);
> +
>          req->biotail->bi_next = next->bio;
>          req->biotail = next->biotail;
>
> @@ -861,6 +875,7 @@ enum bio_merge_status bio_attempt_back_merge(struct request *req,
>                  struct bio *bio, unsigned int nr_segs)
>  {
>          const blk_opf_t ff = bio_failfast(bio);
> +        struct bio_vec bv, bvprv;
>
>          if (!ll_back_merge_fn(req, bio, nr_segs))
>                  return BIO_MERGE_FAILED;
> @@ -876,6 +891,10 @@ enum bio_merge_status bio_attempt_back_merge(struct request *req,
>          if (req->rq_flags & RQF_ZONE_WRITE_PLUGGING)
>                  blk_zone_write_plug_bio_merged(bio);
>
> +        bv = bio->bi_io_vec[0];
> +        bvprv = req->biotail->bi_io_vec[req->biotail->bi_vcnt - 1];
> +        req->__page_gaps |= bio->page_gaps | bv_seg_gap(bv, bvprv);
> +
>          req->biotail->bi_next = bio;
>          req->biotail = bio;
>          req->__data_len += bio->bi_iter.bi_size;
> @@ -890,6 +909,7 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req,
>                  struct bio *bio, unsigned int nr_segs)
>  {
>          const blk_opf_t ff = bio_failfast(bio);
> +        struct bio_vec bv, bvprv;
>
>          /*
>           * A front merge for writes to sequential zones of a zoned block device
> @@ -910,6 +930,10 @@ static enum bio_merge_status bio_attempt_front_merge(struct request *req,
>
>          blk_update_mixed_merge(req, bio, true);
>
> +        bv = req->bio->bi_io_vec[0];
> +        bvprv = bio->bi_io_vec[bio->bi_vcnt - 1];
> +        req->__page_gaps |= bio->page_gaps | bv_seg_gap(bv, bvprv);
> +
>          bio->bi_next = req->bio;
>          req->bio = bio;
>
> diff --git a/block/blk-mq-dma.c b/block/blk-mq-dma.c
> index faa36ff6465ee..a03067c4a268f 100644
> --- a/block/blk-mq-dma.c
> +++ b/block/blk-mq-dma.c
> @@ -73,8 +73,7 @@ static bool blk_map_iter_next(struct request *req, struct blk_map_iter *iter)
>  static inline bool blk_can_dma_map_iova(struct request *req,
>                  struct device *dma_dev)
>  {
> -        return !((queue_virt_boundary(req->q) + 1) &
> -                        dma_get_merge_boundary(dma_dev));
> +        return !(blk_rq_page_gaps(req) & dma_get_merge_boundary(dma_dev));
>  }
>
>  static bool blk_dma_map_bus(struct blk_dma_iter *iter)
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index b67d6c02ecebd..09134a66c5666 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -376,6 +376,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
>          INIT_LIST_HEAD(&rq->queuelist);
>          rq->q = q;
>          rq->__sector = (sector_t) -1;
> +        rq->__page_gaps = 0;
>          INIT_HLIST_NODE(&rq->hash);
>          RB_CLEAR_NODE(&rq->rb_node);
>          rq->tag = BLK_MQ_NO_TAG;
> @@ -659,6 +660,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
>                  goto out_queue_exit;
>          }
>          rq->__data_len = 0;
> +        rq->__page_gaps = 0;
>          rq->__sector = (sector_t) -1;
>          rq->bio = rq->biotail = NULL;
>          return rq;
> @@ -739,6 +741,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
>          rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
>          blk_mq_rq_time_init(rq, alloc_time_ns);
>          rq->__data_len = 0;
> +        rq->__page_gaps = 0;
>          rq->__sector = (sector_t) -1;
>          rq->bio = rq->biotail = NULL;
>          return rq;
> @@ -2665,6 +2668,7 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
>          rq->bio = rq->biotail = bio;
>          rq->__sector = bio->bi_iter.bi_sector;
>          rq->__data_len = bio->bi_iter.bi_size;
> +        rq->__page_gaps = bio->page_gaps;
>          rq->nr_phys_segments = nr_segs;
>          if (bio_integrity(bio))
>                  rq->nr_integrity_segments = blk_rq_count_integrity_sg(rq->q,
> @@ -3363,6 +3367,7 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
>
>          /* Copy attributes of the original request to the clone request. */
>          rq->__sector = blk_rq_pos(rq_src);
> +        rq->__page_gaps = blk_rq_page_gaps(rq_src);
>          rq->__data_len = blk_rq_bytes(rq_src);
>          if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) {
>                  rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
> diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
> index 2a5a828f19a0b..d8f491867adc0 100644
> --- a/include/linux/blk-mq.h
> +++ b/include/linux/blk-mq.h
> @@ -115,6 +115,7 @@ struct request {
>
>          /* the following two fields are internal, NEVER access directly */
>          unsigned int __data_len;        /* total data len */
> +        unsigned int __page_gaps;       /* a mask of all the segment page gaps */
>          sector_t __sector;              /* sector cursor */
>
>          struct bio *bio;
> @@ -1080,6 +1081,11 @@ static inline sector_t blk_rq_pos(const struct request *rq)
>          return rq->__sector;
>  }
>
> +static inline unsigned int blk_rq_page_gaps(const struct request *rq)
> +{
> +        return rq->__page_gaps;
> +}
> +
>  static inline unsigned int blk_rq_bytes(const struct request *rq)
>  {
>          return rq->__data_len;
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 0a29b20939d17..d0ed28d40fe02 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -264,6 +264,8 @@ struct bio {
>
>          unsigned short          bi_max_vecs;    /* max bvl_vecs we can hold */
>
> +        unsigned int            page_gaps;      /* a mask of all the vector gaps */
> +
>          atomic_t                __bi_cnt;       /* pin count */
>
>          struct bio_vec          *bi_io_vec;     /* the actual vec list */
> --
> 2.47.3