[PATCH 2/5] block: add support for copy offload
Keith Busch
kbusch at meta.com
Wed May 21 15:31:04 PDT 2025
From: Keith Busch <kbusch at kernel.org>
Various storage protocols can support offloading block data copies.
Enhance the block layer to know about the device's copying capabilities,
introduce the new REQ_OP_COPY operation, and provide the infrastructure
to iterate, split, and merge these kinds of requests.
A copy command must provide the device with a list of source LBAs and
their lengths, plus a destination LBA. The 'struct bio' type doesn't
readily have a way to describe such a thing. But a copy request carries
no data in host memory, so the bio's bio_vec array would otherwise go
unused. Repurpose the bio_vec so it can describe a vector of source
sector ranges instead of memory pages.
Signed-off-by: Keith Busch <kbusch at kernel.org>
---
block/bio.c | 25 ++++++++++++++
block/blk-core.c | 4 +++
block/blk-lib.c | 47 ++++++++++++++++++++++-----
block/blk-merge.c | 28 +++++++++++++++-
block/blk-sysfs.c | 9 ++++++
block/blk.h | 17 +++++++++-
include/linux/bio.h | 20 ++++++++++++
include/linux/blk-mq.h | 5 +++
include/linux/blk_types.h | 2 ++
include/linux/blkdev.h | 14 ++++++++
include/linux/bvec.h | 68 +++++++++++++++++++++++++++++++++++++--
11 files changed, 226 insertions(+), 13 deletions(-)
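Not for the commit log: a rough, illustrative sketch of how a low-level
driver might consume a REQ_OP_COPY request with the new iterator. The
"exdev" naming and its command structure are hypothetical; only
rq_for_each_copy_bvec(), blk_rq_pos(), BIO_MAX_VECS, and the
bv_sector/bv_sectors fields come from this series or existing block
layer code.

struct exdev_range {
	sector_t lba;
	sector_t nr_sectors;
};

struct exdev_copy_cmd {
	sector_t dst_lba;
	unsigned int nr_src;
	struct exdev_range src[BIO_MAX_VECS];
};

static blk_status_t exdev_setup_copy(struct exdev_copy_cmd *cmd,
				     struct request *rq)
{
	struct req_iterator iter;
	struct bio_vec bv;
	unsigned int n = 0;

	/* The destination LBA rides in the usual request position. */
	cmd->dst_lba = blk_rq_pos(rq);

	/*
	 * Each bvec of a copy bio describes a source range in sectors
	 * (bv_sector/bv_sectors) rather than a page/offset/len triple,
	 * and the iterator caps every range it returns at
	 * max_copy_segment_sectors.
	 */
	rq_for_each_copy_bvec(bv, rq, iter) {
		if (WARN_ON_ONCE(n >= BIO_MAX_VECS))
			return BLK_STS_IOERR;
		cmd->src[n].lba = bv.bv_sector;
		cmd->src[n].nr_sectors = bv.bv_sectors;
		n++;
	}
	cmd->nr_src = n;

	return BLK_STS_OK;
}

A real driver would presumably size its command from
queue_max_copy_segments() rather than BIO_MAX_VECS, since the splitting
added below bounds the per-request segment count to that limit.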
diff --git a/block/bio.c b/block/bio.c
index 3c0a558c90f52..9c73a895c987b 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1156,6 +1156,31 @@ void bio_iov_bvec_set(struct bio *bio, const struct iov_iter *iter)
bio_set_flag(bio, BIO_CLONED);
}
+static bool bvec_try_merge_copy_src(struct bio *bio, struct bio_vec *src)
+{
+ struct bio_vec *bv;
+
+ if (!bio->bi_vcnt)
+ return false;
+
+ bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+ if (bv->bv_sector + bv->bv_sectors != src->bv_sector)
+ return false;
+
+ bv->bv_sectors += src->bv_sectors;
+ return true;
+}
+
+int bio_add_copy_src(struct bio *bio, struct bio_vec *src)
+{
+ if (bvec_try_merge_copy_src(bio, src))
+ return 0;
+ if (bio->bi_vcnt >= bio->bi_max_vecs)
+ return -EINVAL;
+ bio->bi_io_vec[bio->bi_vcnt++] = *src;
+ return 0;
+}
+
static unsigned int get_contig_folio_len(unsigned int *num_pages,
struct page **pages, unsigned int i,
struct folio *folio, size_t left,
diff --git a/block/blk-core.c b/block/blk-core.c
index b862c66018f25..cb3d9879e2d65 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -837,6 +837,10 @@ void submit_bio_noacct(struct bio *bio)
if (!bdev_max_discard_sectors(bdev))
goto not_supported;
break;
+ case REQ_OP_COPY:
+ if (!bdev_copy_sectors(bdev))
+ goto not_supported;
+ break;
case REQ_OP_SECURE_ERASE:
if (!bdev_max_secure_erase_sectors(bdev))
goto not_supported;
diff --git a/block/blk-lib.c b/block/blk-lib.c
index a819ded0ed3a9..a538acbaa2cd7 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -369,14 +369,7 @@ int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL(blkdev_issue_secure_erase);
-/**
- * blkdev_copy - copy source sectors to a destination on the same block device
- * @dst_sector: start sector of the destination to copy to
- * @src_sector: start sector of the source to copy from
- * @nr_sects: number of sectors to copy
- * @gfp: allocation flags to use
- */
-int blkdev_copy(struct block_device *bdev, sector_t dst_sector,
+static int __blkdev_copy(struct block_device *bdev, sector_t dst_sector,
sector_t src_sector, sector_t nr_sects, gfp_t gfp)
{
unsigned int nr_vecs = __blkdev_sectors_to_bio_pages(nr_sects);
@@ -429,4 +422,42 @@ int blkdev_copy(struct block_device *bdev, sector_t dst_sector,
kvfree(buf);
return ret;
}
+
+static int blkdev_copy_offload(struct block_device *bdev, sector_t dst_sector,
+ sector_t src_sector, sector_t nr_sects, gfp_t gfp)
+{
+ struct bio *bio;
+ int ret;
+
+ struct bio_vec bv = {
+ .bv_sector = src_sector,
+ .bv_sectors = nr_sects,
+ };
+
+ bio = bio_alloc(bdev, 1, REQ_OP_COPY, gfp);
+ bio_add_copy_src(bio, &bv);
+ bio->bi_iter.bi_sector = dst_sector;
+ bio->bi_iter.bi_size = nr_sects << SECTOR_SHIFT;
+
+ ret = submit_bio_wait(bio);
+ bio_put(bio);
+ return ret;
+
+}
+
+/**
+ * blkdev_copy - copy source sectors to a destination on the same block device
+ * @dst_sector: start sector of the destination to copy to
+ * @src_sector: start sector of the source to copy from
+ * @nr_sects: number of sectors to copy
+ * @gfp: allocation flags to use
+ */
+int blkdev_copy(struct block_device *bdev, sector_t dst_sector,
+ sector_t src_sector, sector_t nr_sects, gfp_t gfp)
+{
+ if (bdev_copy_sectors(bdev))
+ return blkdev_copy_offload(bdev, dst_sector, src_sector,
+ nr_sects, gfp);
+ return __blkdev_copy(bdev, dst_sector, src_sector, nr_sects, gfp);
+}
EXPORT_SYMBOL_GPL(blkdev_copy);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 3af1d284add50..8085fc0a27c2f 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -399,6 +399,31 @@ struct bio *bio_split_write_zeroes(struct bio *bio,
return bio_submit_split(bio, max_sectors);
}
+struct bio *bio_split_copy(struct bio *bio, const struct queue_limits *lim,
+ unsigned *nr_segs)
+{
+ unsigned nsegs = 0, sectors = 0, mcss = lim->max_copy_segment_sectors;
+ struct bvec_iter iter;
+ struct bio_vec bv;
+
+ bio_for_each_copy_bvec(bv, bio, iter, mcss) {
+ unsigned s;
+
+ s = min(lim->max_copy_sectors - sectors, bv.bv_sectors);
+ nsegs += 1;
+ sectors += s;
+
+ if (nsegs >= lim->max_copy_segments || sectors >= lim->max_copy_sectors)
+ break;
+ }
+
+ if (sectors == bio_sectors(bio))
+ sectors = 0;
+
+ *nr_segs = nsegs;
+ return bio_submit_split(bio, sectors);
+}
+
/**
* bio_split_to_limits - split a bio to fit the queue limits
* @bio: bio to be split
@@ -467,6 +492,7 @@ static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
if (!boundary_sectors ||
req_op(rq) == REQ_OP_DISCARD ||
+ req_op(rq) == REQ_OP_COPY ||
req_op(rq) == REQ_OP_SECURE_ERASE)
return max_sectors;
return min(max_sectors,
@@ -753,7 +779,7 @@ static struct request *attempt_merge(struct request_queue *q,
req->__data_len += blk_rq_bytes(next);
- if (!blk_discard_mergable(req))
+ if (!blk_discard_mergable(req) && !blk_copy_mergable(req))
elv_merge_requests(q, req, next);
blk_crypto_rq_put_keyslot(next);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b2b9b89d6967c..93ce41f399363 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -132,6 +132,7 @@ static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \
QUEUE_SYSFS_LIMIT_SHOW(max_segments)
QUEUE_SYSFS_LIMIT_SHOW(max_discard_segments)
+QUEUE_SYSFS_LIMIT_SHOW(max_copy_segments)
QUEUE_SYSFS_LIMIT_SHOW(max_integrity_segments)
QUEUE_SYSFS_LIMIT_SHOW(max_segment_size)
QUEUE_SYSFS_LIMIT_SHOW(max_write_streams)
@@ -160,6 +161,8 @@ static ssize_t queue_##_field##_show(struct gendisk *disk, char *page) \
QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_discard_sectors)
QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_hw_discard_sectors)
+QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_copy_sectors)
+QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_copy_segment_sectors)
QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(max_write_zeroes_sectors)
QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_max_sectors)
QUEUE_SYSFS_LIMIT_SHOW_SECTORS_TO_BYTES(atomic_write_boundary_sectors)
@@ -501,10 +504,13 @@ QUEUE_LIM_RO_ENTRY(queue_io_min, "minimum_io_size");
QUEUE_LIM_RO_ENTRY(queue_io_opt, "optimal_io_size");
QUEUE_LIM_RO_ENTRY(queue_max_discard_segments, "max_discard_segments");
+QUEUE_LIM_RO_ENTRY(queue_max_copy_segments, "max_copy_segments");
QUEUE_LIM_RO_ENTRY(queue_discard_granularity, "discard_granularity");
QUEUE_LIM_RO_ENTRY(queue_max_hw_discard_sectors, "discard_max_hw_bytes");
QUEUE_LIM_RW_ENTRY(queue_max_discard_sectors, "discard_max_bytes");
QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data");
+QUEUE_RO_ENTRY(queue_max_copy_sectors, "copy_max_bytes");
+QUEUE_RO_ENTRY(queue_max_copy_segment_sectors, "copy_segment_max_bytes");
QUEUE_LIM_RO_ENTRY(queue_atomic_write_max_sectors, "atomic_write_max_bytes");
QUEUE_LIM_RO_ENTRY(queue_atomic_write_boundary_sectors,
@@ -644,6 +650,7 @@ static struct attribute *queue_attrs[] = {
&queue_max_sectors_entry.attr,
&queue_max_segments_entry.attr,
&queue_max_discard_segments_entry.attr,
+ &queue_max_copy_segments_entry.attr,
&queue_max_integrity_segments_entry.attr,
&queue_max_segment_size_entry.attr,
&queue_max_write_streams_entry.attr,
@@ -657,6 +664,8 @@ static struct attribute *queue_attrs[] = {
&queue_discard_granularity_entry.attr,
&queue_max_discard_sectors_entry.attr,
&queue_max_hw_discard_sectors_entry.attr,
+ &queue_max_copy_sectors_entry.attr,
+ &queue_max_copy_segment_sectors_entry.attr,
&queue_atomic_write_max_sectors_entry.attr,
&queue_atomic_write_boundary_sectors_entry.attr,
&queue_atomic_write_unit_min_entry.attr,
diff --git a/block/blk.h b/block/blk.h
index 37ec459fe6562..685f3eeca46e0 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -185,10 +185,20 @@ static inline bool blk_discard_mergable(struct request *req)
return false;
}
+static inline bool blk_copy_mergable(struct request *req)
+{
+ if (req_op(req) == REQ_OP_COPY &&
+ queue_max_copy_segments(req->q) > 1)
+ return true;
+ return false;
+}
+
static inline unsigned int blk_rq_get_max_segments(struct request *rq)
{
if (req_op(rq) == REQ_OP_DISCARD)
return queue_max_discard_segments(rq->q);
+ if (req_op(rq) == REQ_OP_COPY)
+ return queue_max_copy_segments(rq->q);
return queue_max_segments(rq->q);
}
@@ -200,7 +210,8 @@ static inline unsigned int blk_queue_get_max_sectors(struct request *rq)
if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE))
return min(q->limits.max_discard_sectors,
UINT_MAX >> SECTOR_SHIFT);
-
+ if (unlikely(op == REQ_OP_COPY))
+ return q->limits.max_copy_sectors;
if (unlikely(op == REQ_OP_WRITE_ZEROES))
return q->limits.max_write_zeroes_sectors;
@@ -347,6 +358,8 @@ struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
unsigned *nr_segs);
struct bio *bio_split_zone_append(struct bio *bio,
const struct queue_limits *lim, unsigned *nr_segs);
+struct bio *bio_split_copy(struct bio *bio, const struct queue_limits *lim,
+ unsigned *nsegs);
/*
* All drivers must accept single-segments bios that are smaller than PAGE_SIZE.
@@ -397,6 +410,8 @@ static inline struct bio *__bio_split_to_limits(struct bio *bio,
return bio_split_discard(bio, lim, nr_segs);
case REQ_OP_WRITE_ZEROES:
return bio_split_write_zeroes(bio, lim, nr_segs);
+ case REQ_OP_COPY:
+ return bio_split_copy(bio, lim, nr_segs);
default:
/* other operations can't be split */
*nr_segs = 0;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 9c37c66ef9ca3..e25bcde9ec59d 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -54,6 +54,7 @@ static inline bool bio_has_data(struct bio *bio)
if (bio &&
bio->bi_iter.bi_size &&
bio_op(bio) != REQ_OP_DISCARD &&
+ bio_op(bio) != REQ_OP_COPY &&
bio_op(bio) != REQ_OP_SECURE_ERASE &&
bio_op(bio) != REQ_OP_WRITE_ZEROES)
return true;
@@ -68,6 +69,11 @@ static inline bool bio_no_advance_iter(const struct bio *bio)
bio_op(bio) == REQ_OP_WRITE_ZEROES;
}
+static inline bool bio_sector_advance_iter(const struct bio *bio)
+{
+ return bio_op(bio) == REQ_OP_COPY;
+}
+
static inline void *bio_data(struct bio *bio)
{
if (bio_has_data(bio))
@@ -100,6 +106,8 @@ static inline void bio_advance_iter(const struct bio *bio,
if (bio_no_advance_iter(bio))
iter->bi_size -= bytes;
+ else if (bio_sector_advance_iter(bio))
+ bvec_iter_sector_advance(bio->bi_io_vec, iter, bytes);
else
bvec_iter_advance(bio->bi_io_vec, iter, bytes);
/* TODO: It is reasonable to complete bio with error here. */
@@ -114,6 +122,8 @@ static inline void bio_advance_iter_single(const struct bio *bio,
if (bio_no_advance_iter(bio))
iter->bi_size -= bytes;
+ else if (bio_sector_advance_iter(bio))
+ bvec_iter_sector_advance_single(bio->bi_io_vec, iter, bytes);
else
bvec_iter_advance_single(bio->bi_io_vec, iter, bytes);
}
@@ -155,6 +165,15 @@ static inline void bio_advance(struct bio *bio, unsigned int nbytes)
((bvl = mp_bvec_iter_bvec((bio)->bi_io_vec, (iter))), 1); \
bio_advance_iter_single((bio), &(iter), (bvl).bv_len))
+#define __bio_for_each_copy_bvec(bvl, bio, iter, start, max) \
+ for (iter = (start); \
+ (iter).bi_size && \
+ ((bvl = copy_bvec_iter_bvec((bio)->bi_io_vec, (iter), max)), 1); \
+ bio_advance_iter_single((bio), &(iter), (bvl).bv_sectors << SECTOR_SHIFT))
+
+#define bio_for_each_copy_bvec(bvl, bio, iter, max) \
+ __bio_for_each_copy_bvec(bvl, bio, iter, (bio)->bi_iter, max)
+
/* iterate over multi-page bvec */
#define bio_for_each_bvec(bvl, bio, iter) \
__bio_for_each_bvec(bvl, bio, iter, (bio)->bi_iter)
@@ -409,6 +428,7 @@ extern void bio_uninit(struct bio *);
void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf);
void bio_chain(struct bio *, struct bio *);
+int bio_add_copy_src(struct bio *bio, struct bio_vec *src);
int __must_check bio_add_page(struct bio *bio, struct page *page, unsigned len,
unsigned off);
bool __must_check bio_add_folio(struct bio *bio, struct folio *folio,
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index de8c85a03bb7f..49816e7f7df7d 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -1049,6 +1049,11 @@ struct req_iterator {
struct bio *bio;
};
+#define rq_for_each_copy_bvec(bvl, _rq, _iter) \
+ __rq_for_each_bio(_iter.bio, _rq) \
+ bio_for_each_copy_bvec(bvl, _iter.bio, _iter.iter, \
+ _rq->q->limits.max_copy_segment_sectors)
+
#define __rq_for_each_bio(_bio, rq) \
if ((rq->bio)) \
for (_bio = (rq)->bio; _bio; _bio = _bio->bi_next)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 3d1577f07c1c8..361d44c0d1317 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -355,6 +355,8 @@ enum req_op {
REQ_OP_ZONE_RESET = (__force blk_opf_t)13,
/* reset all the zone present on the device */
REQ_OP_ZONE_RESET_ALL = (__force blk_opf_t)15,
+ /* Copy offload sectors to the device */
+ REQ_OP_COPY = (__force blk_opf_t)17,
/* Driver private requests */
REQ_OP_DRV_IN = (__force blk_opf_t)34,
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index b7d71b126ec9b..e39ba0e91d43e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -399,9 +399,13 @@ struct queue_limits {
unsigned int atomic_write_hw_unit_max;
unsigned int atomic_write_unit_max;
+ unsigned int max_copy_sectors;
+ unsigned int max_copy_segment_sectors;
+
unsigned short max_segments;
unsigned short max_integrity_segments;
unsigned short max_discard_segments;
+ unsigned short max_copy_segments;
unsigned short max_write_streams;
unsigned int write_stream_granularity;
@@ -1271,6 +1275,11 @@ static inline unsigned short queue_max_discard_segments(const struct request_que
return q->limits.max_discard_segments;
}
+static inline unsigned short queue_max_copy_segments(const struct request_queue *q)
+{
+ return q->limits.max_copy_segments;
+}
+
static inline unsigned int queue_max_segment_size(const struct request_queue *q)
{
return q->limits.max_segment_size;
@@ -1380,6 +1389,11 @@ static inline unsigned int bdev_write_zeroes_sectors(struct block_device *bdev)
return bdev_limits(bdev)->max_write_zeroes_sectors;
}
+static inline unsigned int bdev_copy_sectors(struct block_device *bdev)
+{
+ return bdev_limits(bdev)->max_copy_sectors;
+}
+
static inline bool bdev_nonrot(struct block_device *bdev)
{
return blk_queue_nonrot(bdev_get_queue(bdev));
diff --git a/include/linux/bvec.h b/include/linux/bvec.h
index 204b22a99c4ba..7cc82738ede8a 100644
--- a/include/linux/bvec.h
+++ b/include/linux/bvec.h
@@ -21,6 +21,8 @@ struct page;
* @bv_page: First page associated with the address range.
* @bv_len: Number of bytes in the address range.
* @bv_offset: Start of the address range relative to the start of @bv_page.
+ * @bv_sector: Start sector associated with the source block range
+ * @bv_sectors: Number of sectors in the block range
*
* The following holds for a bvec if n * PAGE_SIZE < bv_offset + bv_len:
*
@@ -29,9 +31,17 @@ struct page;
* This holds because page_is_mergeable() checks the above property.
*/
struct bio_vec {
- struct page *bv_page;
- unsigned int bv_len;
- unsigned int bv_offset;
+ union {
+ struct {
+ struct page *bv_page;
+ unsigned int bv_len;
+ unsigned int bv_offset;
+ };
+ struct {
+ sector_t bv_sector;
+ sector_t bv_sectors;
+ };
+ };
};
/**
@@ -118,6 +128,21 @@ struct bvec_iter_all {
.bv_offset = mp_bvec_iter_offset((bvec), (iter)), \
})
+/* sector based bvec helpers */
+#define copy_bvec_iter_sector(bvec, iter) \
+ ((__bvec_iter_bvec((bvec), (iter))->bv_sector) + \
+ ((iter).bi_bvec_done >> 9))
+
+#define copy_bvec_iter_sectors(bvec, iter) \
+ ((__bvec_iter_bvec((bvec), (iter))->bv_sectors) - \
+ ((iter).bi_bvec_done >> 9))
+
+#define copy_bvec_iter_bvec(bvec, iter, max) \
+((struct bio_vec) { \
+ .bv_sector = copy_bvec_iter_sector((bvec), (iter)), \
+ .bv_sectors = min(max, copy_bvec_iter_sectors((bvec), (iter))), \
+})
+
/* For building single-page bvec in flight */
#define bvec_iter_offset(bvec, iter) \
(mp_bvec_iter_offset((bvec), (iter)) % PAGE_SIZE)
@@ -161,6 +186,30 @@ static inline bool bvec_iter_advance(const struct bio_vec *bv,
return true;
}
+static inline bool bvec_iter_sector_advance(const struct bio_vec *bv,
+ struct bvec_iter *iter, unsigned bytes)
+{
+ unsigned int idx = iter->bi_idx;
+
+ if (WARN_ONCE(bytes > iter->bi_size,
+ "Attempted to advance past end of bvec iter\n")) {
+ iter->bi_size = 0;
+ return false;
+ }
+
+ iter->bi_size -= bytes;
+ bytes += iter->bi_bvec_done;
+
+ while (bytes && bytes >> 9 >= bv[idx].bv_sectors) {
+ bytes -= bv[idx].bv_sectors << 9;
+ idx++;
+ }
+
+ iter->bi_idx = idx;
+ iter->bi_bvec_done = bytes;
+ return true;
+}
+
/*
* A simpler version of bvec_iter_advance(), @bytes should not span
* across multiple bvec entries, i.e. bytes <= bv[i->bi_idx].bv_len
@@ -178,6 +227,19 @@ static inline void bvec_iter_advance_single(const struct bio_vec *bv,
iter->bi_size -= bytes;
}
+static inline void bvec_iter_sector_advance_single(const struct bio_vec *bv,
+ struct bvec_iter *iter, unsigned bytes)
+{
+ unsigned int done = iter->bi_bvec_done + bytes;
+
+ if (done == bv[iter->bi_idx].bv_sectors << 9) {
+ done = 0;
+ iter->bi_idx++;
+ }
+ iter->bi_bvec_done = done;
+ iter->bi_size -= bytes;
+}
+
#define for_each_bvec(bvl, bio_vec, iter, start) \
for (iter = (start); \
(iter).bi_size && \
--
2.47.1