[PATCH 4/5] block: add support for vectored copies

Hannes Reinecke hare at suse.de
Thu May 22 06:58:18 PDT 2025


On 5/22/25 00:31, Keith Busch wrote:
> From: Keith Busch <kbusch at kernel.org>
> 
> Copy offload can be used to defrad or garbage collect data spread across

Defrag?

> the disk. Most storage protocols provide a way to specifiy multiple
> sources in a single copy commnd, so introduce kernel and user space
> interfaces to accomplish that.
> 
> Signed-off-by: Keith Busch <kbusch at kernel.org>
> ---
>   block/blk-lib.c         | 50 ++++++++++++++++++++++++----------
>   block/ioctl.c           | 59 +++++++++++++++++++++++++++++++++++++++++
>   include/linux/blkdev.h  |  2 ++
>   include/uapi/linux/fs.h | 14 ++++++++++
>   4 files changed, 111 insertions(+), 14 deletions(-)
> 
Any specific reason why this is a different patch, and not folded into
patch 2? It really feels odd to continuously updating interfaces which
have been added with the same patchset...

> diff --git a/block/blk-lib.c b/block/blk-lib.c
> index a538acbaa2cd7..7513b876a5399 100644
> --- a/block/blk-lib.c
> +++ b/block/blk-lib.c
> @@ -424,26 +424,46 @@ static int __blkdev_copy(struct block_device *bdev, sector_t dst_sector,
>   }
>   
>   static int blkdev_copy_offload(struct block_device *bdev, sector_t dst_sector,
> -		sector_t src_sector, sector_t nr_sects, gfp_t gfp)
> +		struct bio_vec *bv, int nr_vecs, gfp_t gfp)
>   {
> +	unsigned size = 0;
>   	struct bio *bio;
> -	int ret;
> -
> -	struct bio_vec bv = {
> -		.bv_sector = src_sector,
> -		.bv_sectors = nr_sects,
> -	};
> +	int ret, i;
>   
> -	bio = bio_alloc(bdev, 1, REQ_OP_COPY, gfp);
> -	bio_add_copy_src(bio, &bv);
> +	bio = bio_alloc(bdev, nr_vecs, REQ_OP_COPY, gfp);
> +	for (i = 0; i < nr_vecs; i++) {
> +		size += bv[i].bv_sectors << SECTOR_SHIFT;
> +		bio_add_copy_src(bio, &bv[i]);
> +	}
>   	bio->bi_iter.bi_sector = dst_sector;
> -	bio->bi_iter.bi_size = nr_sects << SECTOR_SHIFT;
> +	bio->bi_iter.bi_size = size;
>   
>   	ret = submit_bio_wait(bio);
>   	bio_put(bio);
>   	return ret;
> +}
> +
> +/**
> + * blkdev_copy_range - copy range of sectors to a destination
> + * @dst_sector:	start sector of the destination to copy to
> + * @bv:		vector of source sectors
> + * @nr_vecs:	number of source sector vectors
> + * @gfp:	allocation flags to use
> + */
> +int blkdev_copy_range(struct block_device *bdev, sector_t dst_sector,
> +		struct bio_vec *bv, int nr_vecs, gfp_t gfp)
> +{
> +	int ret, i;
>   
> +	if (bdev_copy_sectors(bdev))
> +		return blkdev_copy_offload(bdev, dst_sector, bv, nr_vecs, gfp);
> +
> +	for (i = 0, ret = 0; i < nr_vecs && !ret; i++)
> +		ret = __blkdev_copy(bdev, dst_sector, bv[i].bv_sector,
> +				bv[i].bv_sectors, gfp);
> +	return ret;
>   }
> +EXPORT_SYMBOL_GPL(blkdev_copy_range);
>   
>   /**
>    * blkdev_copy - copy source sectors to a destination on the same block device
> @@ -455,9 +475,11 @@ static int blkdev_copy_offload(struct block_device *bdev, sector_t dst_sector,
>   int blkdev_copy(struct block_device *bdev, sector_t dst_sector,
>   		sector_t src_sector, sector_t nr_sects, gfp_t gfp)
>   {
> -	if (bdev_copy_sectors(bdev))
> -		return blkdev_copy_offload(bdev, dst_sector, src_sector,
> -					nr_sects, gfp);
> -	return __blkdev_copy(bdev, dst_sector, src_sector, nr_sects, gfp);
> +	struct bio_vec bv = {
> +		.bv_sector = src_sector,
> +		.bv_sectors = nr_sects,
> +	};
> +
> +	return blkdev_copy_range(bdev, dst_sector, &bv, 1, gfp);
>   }
>   EXPORT_SYMBOL_GPL(blkdev_copy);
> diff --git a/block/ioctl.c b/block/ioctl.c
> index 6f03c65867348..4b5095be19e1a 100644
> --- a/block/ioctl.c
> +++ b/block/ioctl.c
> @@ -241,6 +241,63 @@ static int blk_ioctl_copy(struct block_device *bdev, blk_mode_t mode,
>   	return blkdev_copy(bdev, dst, src, nr, GFP_KERNEL);
>   }
>   
> +static int blk_ioctl_copy_vec(struct block_device *bdev, blk_mode_t mode,
> +		void __user *argp)
> +{
> +	sector_t align = bdev_logical_block_size(bdev) >> SECTOR_SHIFT;
> +	struct bio_vec *bv, fast_bv[UIO_FASTIOV];
> +	struct copy_range cr;
> +	int i, nr, ret;
> +	__u64 dst;
> +
> +	if (!(mode & BLK_OPEN_WRITE))
> +		return -EBADF;
> +	if (copy_from_user(&cr, argp, sizeof(cr)))
> +		return -EFAULT;
> +	if (!(IS_ALIGNED(cr.dst_sector, align)))
> +		return -EINVAL;
> +
> +	nr = cr.nr_ranges;
> +	if (nr <= UIO_FASTIOV) {
> +		bv = fast_bv;
> +	} else {
> +		bv = kmalloc_array(nr, sizeof(*bv), GFP_KERNEL);
> +		if (!bv)
> +			return -ENOMEM;
> +	}
> +
> +	dst = cr.dst_sector;
> +	for (i = 0; i < nr; i++) {
> +		struct copy_source csrc;
> +		__u64 nr_sects, src;
> +
> +		if (copy_from_user(&csrc,
> +				(void __user *)(cr.sources + i * sizeof(csrc)),
> +				sizeof(csrc))) {
> +			ret = -EFAULT;
> +			goto out;
> +		}
> +
> +		nr_sects = csrc.nr_sectors;
> +		src = csrc.src_sector;
> +		if (!(IS_ALIGNED(src | nr_sects, align)) ||
> +		    (src < dst && src + nr_sects > dst) ||
> +		    (dst < src && dst + nr_sects > src)) {
> +			ret = -EINVAL;
> +			goto out;
> +		}
> +
> +		bv[i].bv_sectors = nr_sects;
> +		bv[i].bv_sector = src;
> +	}
> +
> +	ret = blkdev_copy_range(bdev, dst, bv, nr, GFP_KERNEL);
> +out:
> +	if (bv != fast_bv)
> +		kfree(bv);
> +	return ret;
> +}
> +
>   static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode,
>   		unsigned long arg)
>   {
> @@ -605,6 +662,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode,
>   		return blk_ioctl_secure_erase(bdev, mode, argp);
>   	case BLKCPY:
>   		return blk_ioctl_copy(bdev, mode, argp);
> +	case BLKCPY_VEC:
> +		return blk_ioctl_copy_vec(bdev, mode, argp);
>   	case BLKZEROOUT:
>   		return blk_ioctl_zeroout(bdev, mode, arg);
>   	case BLKGETDISKSEQ:

And that makes it even worse; introducing two ioctls which basically do
the same thing (or where one is actually a special case of the other)
is probably not what we should be doing.

Cheers,

Hannes
-- 
Dr. Hannes Reinecke                  Kernel Storage Architect
hare at suse.de                                +49 911 74053 688
SUSE Software Solutions GmbH, Frankenstr. 146, 90461 Nürnberg
HRB 36809 (AG Nürnberg), GF: I. Totev, A. McDonald, W. Knoblich



More information about the Linux-nvme mailing list