[RFC PATCH v5 2/4] block: add simple copy support

Damien Le Moal Damien.LeMoal at wdc.com
Fri Feb 19 23:59:05 EST 2021


On 2021/02/20 11:01, SelvaKumar S wrote:
> Add new BLKCOPY ioctl that offloads copying of one or more sources
> ranges to a destination in the device. Accepts a 'copy_range' structure
> that contains destination (in sectors), no of sources and pointer to the
> array of source ranges. Each source range is represented by 'range_entry'
> that contains start and length of source ranges (in sectors).
> 
> Introduce REQ_OP_COPY, a no-merge copy offload operation. Create
> bio with control information as payload and submit to the device.
> REQ_OP_COPY(19) is a write op and takes zone_write_lock when submitted
> to zoned device.
> 
> If the device doesn't support copy or copy offload is disabled, then
> copy operation is emulated by default. However, the copy-emulation is an
> opt-in feature. Caller can choose not to use the copy-emulation by
> specifying a flag 'BLKDEV_COPY_NOEMULATION'.
> 
> Copy-emulation is implemented by allocating memory of total copy size.
> The source ranges are read into memory by chaining bio for each source
> ranges and submitting them async and the last bio waits for completion.
> After data is read, it is written to the destination.
> 
> bio_map_kern() is used to allocate bio and add pages of copy buffer to
> bio. As bio->bi_private and bio->bi_end_io are needed for chaining the
> bio and gets over-written, invalidate_kernel_vmap_range() for read is
> called in the caller.
> 
> Introduce queue limits for simple copy and other helper functions.
> Add device limits as sysfs entries.
> 	- copy_offload
> 	- max_copy_sectors
> 	- max_copy_ranges_sectors
> 	- max_copy_nr_ranges
> 
> copy_offload(= 0) is disabled by default. This needs to be enabled if
> copy-offload needs to be used.
> max_copy_sectors = 0 indicates the device doesn't support native copy.
> 
> Native copy offload is not supported for stacked devices and is done via
> copy emulation.
> 
> Signed-off-by: SelvaKumar S <selvakuma.s1 at samsung.com>
> Signed-off-by: Kanchan Joshi <joshi.k at samsung.com>
> Signed-off-by: Nitesh Shetty <nj.shetty at samsung.com>
> Signed-off-by: Javier González <javier.gonz at samsung.com>
> Signed-off-by: Chaitanya Kulkarni <kch at kernel.org>
> ---
>  block/blk-core.c          | 102 ++++++++++++++++--
>  block/blk-lib.c           | 222 ++++++++++++++++++++++++++++++++++++++
>  block/blk-merge.c         |   2 +
>  block/blk-settings.c      |  10 ++
>  block/blk-sysfs.c         |  47 ++++++++
>  block/blk-zoned.c         |   1 +
>  block/bounce.c            |   1 +
>  block/ioctl.c             |  33 ++++++
>  include/linux/bio.h       |   1 +
>  include/linux/blk_types.h |  14 +++
>  include/linux/blkdev.h    |  15 +++
>  include/uapi/linux/fs.h   |  13 +++
>  12 files changed, 453 insertions(+), 8 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 7663a9b94b80..23e646e5ae43 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -720,6 +720,17 @@ static noinline int should_fail_bio(struct bio *bio)
>  }
>  ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
>  
> +static inline int bio_check_copy_eod(struct bio *bio, sector_t start,
> +		sector_t nr_sectors, sector_t max_sect)
> +{
> +	if (nr_sectors && max_sect &&
> +	    (nr_sectors > max_sect || start > max_sect - nr_sectors)) {
> +		handle_bad_sector(bio, max_sect);
> +		return -EIO;
> +	}
> +	return 0;
> +}
> +
>  /*
>   * Check whether this bio extends beyond the end of the device or partition.
>   * This may well happen - the kernel calls bread() without checking the size of
> @@ -738,6 +749,75 @@ static inline int bio_check_eod(struct bio *bio, sector_t maxsector)
>  	return 0;
>  }
>  
> +/*
> + * Check for copy limits and remap source ranges if needed.
> + */
> +static int blk_check_copy(struct bio *bio)
> +{
> +	struct blk_copy_payload *payload = bio_data(bio);
> +	struct request_queue *q = bio->bi_disk->queue;
> +	sector_t max_sect, start_sect, copy_size = 0;
> +	sector_t src_max_sect, src_start_sect;
> +	struct block_device *bd_part;
> +	int i, ret = -EIO;
> +
> +	rcu_read_lock();
> +
> +	bd_part = __disk_get_part(bio->bi_disk, bio->bi_partno);
> +	if (unlikely(!bd_part)) {
> +		rcu_read_unlock();
> +		goto out;
> +	}
> +
> +	max_sect =  bdev_nr_sectors(bd_part);
> +	start_sect = bd_part->bd_start_sect;
> +
> +	src_max_sect = bdev_nr_sectors(payload->src_bdev);
> +	src_start_sect = payload->src_bdev->bd_start_sect;
> +
> +	if (unlikely(should_fail_request(bd_part, bio->bi_iter.bi_size)))
> +		goto out;
> +
> +	if (unlikely(bio_check_ro(bio, bd_part)))
> +		goto out;

There is no rcu_unlock() in that out label. Did you test ?

> +
> +	rcu_read_unlock();
> +
> +	/* cannot handle copy crossing nr_ranges limit */
> +	if (payload->copy_nr_ranges > q->limits.max_copy_nr_ranges)
> +		goto out;
> +
> +	for (i = 0; i < payload->copy_nr_ranges; i++) {
> +		ret = bio_check_copy_eod(bio, payload->range[i].src,
> +				payload->range[i].len, src_max_sect);
> +		if (unlikely(ret))
> +			goto out;
> +
> +		/* single source range length limit */
> +		if (payload->range[i].len > q->limits.max_copy_range_sectors)
> +			goto out;

ret is not set. You will return success with this.

> +
> +		payload->range[i].src += src_start_sect;
> +		copy_size += payload->range[i].len;
> +	}
> +
> +	/* check if copy length crosses eod */
> +	ret = bio_check_copy_eod(bio, bio->bi_iter.bi_sector,
> +				copy_size, max_sect);
> +	if (unlikely(ret))
> +		goto out;
> +
> +	/* cannot handle copy more than copy limits */
> +	if (copy_size > q->limits.max_copy_sectors)
> +		goto out;

Again ret is not set... No error return ?

> +
> +	bio->bi_iter.bi_sector += start_sect;
> +	bio->bi_partno = 0;
> +	ret = 0;
> +out:
> +	return ret;
> +}
> +
>  /*
>   * Remap block n of partition p to block n+start(p) of the disk.
>   */
> @@ -827,14 +907,16 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
>  	if (should_fail_bio(bio))
>  		goto end_io;
>  
> -	if (bio->bi_partno) {
> -		if (unlikely(blk_partition_remap(bio)))
> -			goto end_io;
> -	} else {
> -		if (unlikely(bio_check_ro(bio, bio->bi_disk->part0)))
> -			goto end_io;
> -		if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk))))
> -			goto end_io;
> +	if (likely(!op_is_copy(bio->bi_opf))) {
> +		if (bio->bi_partno) {
> +			if (unlikely(blk_partition_remap(bio)))
> +				goto end_io;
> +		} else {
> +			if (unlikely(bio_check_ro(bio, bio->bi_disk->part0)))
> +				goto end_io;
> +			if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk))))
> +				goto end_io;
> +		}
>  	}
>  
>  	/*
> @@ -858,6 +940,10 @@ static noinline_for_stack bool submit_bio_checks(struct bio *bio)
>  		if (!blk_queue_discard(q))
>  			goto not_supported;
>  		break;
> +	case REQ_OP_COPY:
> +		if (unlikely(blk_check_copy(bio)))
> +			goto end_io;
> +		break;
>  	case REQ_OP_SECURE_ERASE:
>  		if (!blk_queue_secure_erase(q))
>  			goto not_supported;
> diff --git a/block/blk-lib.c b/block/blk-lib.c
> index 752f9c722062..97ba58d8d9a1 100644
> --- a/block/blk-lib.c
> +++ b/block/blk-lib.c
> @@ -150,6 +150,228 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
>  }
>  EXPORT_SYMBOL(blkdev_issue_discard);
>  
> +int blk_copy_offload(struct block_device *dest_bdev, struct blk_copy_payload *payload,
> +		sector_t dest, gfp_t gfp_mask)

Simple copy is only over the same device, right ? So the name "dest_bdev" is a
little strange.

> +{
> +	struct request_queue *q = bdev_get_queue(dest_bdev);
> +	struct bio *bio;
> +	int ret, payload_size;
> +
> +	payload_size = struct_size(payload, range, payload->copy_nr_ranges);
> +	bio = bio_map_kern(q, payload, payload_size, gfp_mask);
> +	if (IS_ERR(bio)) {
> +		ret = PTR_ERR(bio);
> +		goto err;

This will do a bio_put() on a non existent bio...

> +	}
> +
> +	bio->bi_iter.bi_sector = dest;
> +	bio->bi_opf = REQ_OP_COPY | REQ_NOMERGE;
> +	bio_set_dev(bio, dest_bdev);
> +
> +	ret = submit_bio_wait(bio);
> +err:
> +	bio_put(bio);
> +	return ret;
> +}
> +
> +int blk_read_to_buf(struct block_device *src_bdev, struct blk_copy_payload *payload,
> +		gfp_t gfp_mask, sector_t copy_size, void **buf_p)
> +{
> +	struct request_queue *q = bdev_get_queue(src_bdev);
> +	struct bio *bio, *parent = NULL;
> +	void *buf = NULL;
> +	int copy_len = copy_size << SECTOR_SHIFT;
> +	int i, nr_srcs, ret, cur_size, t_len = 0;
> +	bool is_vmalloc;
> +
> +	nr_srcs = payload->copy_nr_ranges;
> +
> +	buf = kvmalloc(copy_len, gfp_mask);
> +	if (!buf)
> +		return -ENOMEM;
> +	is_vmalloc = is_vmalloc_addr(buf);
> +
> +	for (i = 0; i < nr_srcs; i++) {
> +		cur_size = payload->range[i].len << SECTOR_SHIFT;
> +
> +		bio = bio_map_kern(q, buf + t_len, cur_size, gfp_mask);
> +		if (IS_ERR(bio)) {
> +			ret = PTR_ERR(bio);
> +			goto out;
> +		}
> +
> +		bio->bi_iter.bi_sector = payload->range[i].src;
> +		bio->bi_opf = REQ_OP_READ;
> +		bio_set_dev(bio, src_bdev);
> +		bio->bi_end_io = NULL;
> +		bio->bi_private = NULL;
> +
> +		if (parent) {
> +			bio_chain(parent, bio);
> +			submit_bio(parent);
> +		}
> +
> +		parent = bio;
> +		t_len += cur_size;
> +	}
> +
> +	ret = submit_bio_wait(bio);
> +	bio_put(bio);
> +	if (is_vmalloc)
> +		invalidate_kernel_vmap_range(buf, copy_len);

But blk_write_from_buf() will use the buffer right after this.. Is this really OK ?

> +	if (ret)
> +		goto out;
> +
> +	*buf_p = buf;
> +	return 0;
> +out:
> +	kvfree(buf);
> +	return ret;
> +}
> +
> +int blk_write_from_buf(struct block_device *dest_bdev, void *buf, sector_t dest,
> +		sector_t copy_size, gfp_t gfp_mask)
> +{
> +	struct request_queue *q = bdev_get_queue(dest_bdev);
> +	struct bio *bio;
> +	int ret, copy_len = copy_size << SECTOR_SHIFT;
> +
> +	bio = bio_map_kern(q, buf, copy_len, gfp_mask);
> +	if (IS_ERR(bio)) {
> +		ret = PTR_ERR(bio);
> +		goto out;
> +	}
> +	bio_set_dev(bio, dest_bdev);
> +	bio->bi_opf = REQ_OP_WRITE;
> +	bio->bi_iter.bi_sector = dest;
> +
> +	bio->bi_end_io = NULL;
> +	ret = submit_bio_wait(bio);
> +	bio_put(bio);
> +out:
> +	return ret;
> +}
> +
> +int blk_prepare_payload(struct block_device *src_bdev, int nr_srcs, struct range_entry *rlist,
> +		gfp_t gfp_mask, struct blk_copy_payload **payload_p, sector_t *copy_size)
> +{
> +
> +	struct request_queue *q = bdev_get_queue(src_bdev);
> +	struct blk_copy_payload *payload;
> +	sector_t bs_mask, total_len = 0;
> +	int i, ret, payload_size;
> +
> +	if (!q)
> +		return -ENXIO;
> +
> +	if (!nr_srcs)
> +		return -EINVAL;
> +
> +	if (bdev_read_only(src_bdev))
> +		return -EPERM;
> +
> +	bs_mask = (bdev_logical_block_size(src_bdev) >> 9) - 1;
> +
> +	payload_size = struct_size(payload, range, nr_srcs);
> +	payload = kmalloc(payload_size, gfp_mask);
> +	if (!payload)
> +		return -ENOMEM;
> +
> +	for (i = 0; i < nr_srcs; i++) {
> +		if (rlist[i].src & bs_mask || rlist[i].len & bs_mask) {
> +			ret = -EINVAL;
> +			goto err;
> +		}
> +
> +		payload->range[i].src = rlist[i].src;
> +		payload->range[i].len = rlist[i].len;
> +
> +		total_len += rlist[i].len;
> +	}
> +
> +	payload->copy_nr_ranges = i;
> +	payload->src_bdev = src_bdev;
> +	*copy_size = total_len;
> +
> +	*payload_p = payload;
> +	return 0;
> +err:
> +	kfree(payload);
> +	return ret;
> +}
> +
> +int blk_copy_emulate(struct block_device *src_bdev, struct blk_copy_payload *payload,
> +			struct block_device *dest_bdev, sector_t dest,
> +			sector_t copy_size, gfp_t gfp_mask)
> +{
> +	void *buf = NULL;
> +	int ret;
> +
> +	ret = blk_read_to_buf(src_bdev, payload, gfp_mask, copy_size, &buf);
> +	if (ret)
> +		goto out;
> +
> +	ret = blk_write_from_buf(dest_bdev, buf, dest, copy_size, gfp_mask);
> +	if (buf)
> +		kvfree(buf);
> +out:
> +	return ret;
> +}

I already commented that this should better use the dm-kcopyd design, which
would be far more efficient than this. This will be slow...

Your function blkdev_issue_copy() below should deal only with issuing simple
copy (amd later scsi xcopy) for devices that support it. Bring the dm-kcopyd
interface in the block layer as a generic interface for hadling emulation.
Otherwise you are repeating what dm does, but not as efficiently.

> +
> +/**
> + * blkdev_issue_copy - queue a copy
> + * @src_bdev:	source block device
> + * @nr_srcs:	number of source ranges to copy
> + * @rlist:	array of source ranges in sector
> + * @dest_bdev:	destination block device
> + * @dest:	destination in sector
> + * @gfp_mask:   memory allocation flags (for bio_alloc)
> + * @flags:	BLKDEV_COPY_* flags to control behaviour
> + *
> + * Description:
> + *	Copy array of source ranges from source block device to
> + *	destination block devcie. All source must belong to same bdev and
> + *	length of a source range cannot be zero.
> + */
> +
> +int blkdev_issue_copy(struct block_device *src_bdev, int nr_srcs,
> +		struct range_entry *src_rlist, struct block_device *dest_bdev,
> +		sector_t dest, gfp_t gfp_mask, int flags)
> +{
> +	struct request_queue *q = bdev_get_queue(src_bdev);
> +	struct request_queue *dest_q = bdev_get_queue(dest_bdev);
> +	struct blk_copy_payload *payload;
> +	sector_t bs_mask, copy_size;
> +	int ret;
> +
> +	ret = blk_prepare_payload(src_bdev, nr_srcs, src_rlist, gfp_mask,
> +			&payload, &copy_size);
> +	if (ret)
> +		return ret;
> +
> +	bs_mask = (bdev_logical_block_size(dest_bdev) >> 9) - 1;
> +	if (dest & bs_mask) {
> +		return -EINVAL;
> +		goto out;
> +	}
> +
> +	if (q == dest_q && q->limits.copy_offload) {
> +		ret = blk_copy_offload(src_bdev, payload, dest, gfp_mask);
> +		if (ret)
> +			goto out;
> +	} else if (flags & BLKDEV_COPY_NOEMULATION) {

Why ? whoever calls blkdev_issue_copy() wants a copy to be done. Why would that
user say "Fail on me if the device does not support copy" ??? This is a weird
interface in my opinion.

> +		ret = -EIO;
> +		goto out;
> +	} else

Missing braces. By you do not need all these else after the gotos anyway.

> +		ret = blk_copy_emulate(src_bdev, payload, dest_bdev, dest,
> +				copy_size, gfp_mask);
> +
> +out:
> +	kvfree(payload);
> +	return ret;
> +}
> +EXPORT_SYMBOL(blkdev_issue_copy);
> +
>  /**
>   * __blkdev_issue_write_same - generate number of bios with same page
>   * @bdev:	target blockdev
> diff --git a/block/blk-merge.c b/block/blk-merge.c
> index 808768f6b174..4e04f24e13c1 100644
> --- a/block/blk-merge.c
> +++ b/block/blk-merge.c
> @@ -309,6 +309,8 @@ void __blk_queue_split(struct bio **bio, unsigned int *nr_segs)
>  	struct bio *split = NULL;
>  
>  	switch (bio_op(*bio)) {
> +	case REQ_OP_COPY:
> +			break;

Why would this even be called ? Copy BIOs cannot be split, right ?

>  	case REQ_OP_DISCARD:
>  	case REQ_OP_SECURE_ERASE:
>  		split = blk_bio_discard_split(q, *bio, &q->bio_split, nr_segs);
> diff --git a/block/blk-settings.c b/block/blk-settings.c
> index 43990b1d148b..93c15ba45a69 100644
> --- a/block/blk-settings.c
> +++ b/block/blk-settings.c
> @@ -60,6 +60,10 @@ void blk_set_default_limits(struct queue_limits *lim)
>  	lim->io_opt = 0;
>  	lim->misaligned = 0;
>  	lim->zoned = BLK_ZONED_NONE;
> +	lim->copy_offload = 0;
> +	lim->max_copy_sectors = 0;
> +	lim->max_copy_nr_ranges = 0;
> +	lim->max_copy_range_sectors = 0;
>  }
>  EXPORT_SYMBOL(blk_set_default_limits);
>  
> @@ -565,6 +569,12 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
>  	if (b->chunk_sectors)
>  		t->chunk_sectors = gcd(t->chunk_sectors, b->chunk_sectors);
>  
> +	/* simple copy not supported in stacked devices */
> +	t->copy_offload = 0;
> +	t->max_copy_sectors = 0;
> +	t->max_copy_range_sectors = 0;
> +	t->max_copy_nr_ranges = 0;

You do not need this. Limits not explicitely initialized are 0 already.
But I do not see why you can't support copy on stacked devices. That should be
feasible taking the min() for each of the above limit.

> +
>  	/* Physical block size a multiple of the logical block size? */
>  	if (t->physical_block_size & (t->logical_block_size - 1)) {
>  		t->physical_block_size = t->logical_block_size;
> diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
> index b513f1683af0..625a72541263 100644
> --- a/block/blk-sysfs.c
> +++ b/block/blk-sysfs.c
> @@ -166,6 +166,44 @@ static ssize_t queue_discard_granularity_show(struct request_queue *q, char *pag
>  	return queue_var_show(q->limits.discard_granularity, page);
>  }
>  
> +static ssize_t queue_copy_offload_show(struct request_queue *q, char *page)
> +{
> +	return queue_var_show(q->limits.copy_offload, page);
> +}
> +
> +static ssize_t queue_copy_offload_store(struct request_queue *q,
> +				       const char *page, size_t count)
> +{
> +	unsigned long copy_offload;
> +	ssize_t ret = queue_var_store(&copy_offload, page, count);
> +
> +	if (ret < 0)
> +		return ret;
> +
> +	if (copy_offload && q->limits.max_copy_sectors == 0)
> +		return -EINVAL;
> +
> +	q->limits.copy_offload = copy_offload;
> +	return ret;
> +}

This is weird. If you want to allow a user to disable copy offload, then use
max_copy_sectors. This one should be read-only and only indicate if the device
supports it or not. I also would actually change this one into
max_copy_hw_sectors, immutable, indicating the max copy sectors that the device
supports, and 0 for no support. That would allow an easy implementation of
max_copy_sectors being red/write for controlling enable/disable.

> +
> +static ssize_t queue_max_copy_sectors_show(struct request_queue *q, char *page)
> +{
> +	return queue_var_show(q->limits.max_copy_sectors, page);
> +}
> +
> +static ssize_t queue_max_copy_range_sectors_show(struct request_queue *q,
> +		char *page)
> +{
> +	return queue_var_show(q->limits.max_copy_range_sectors, page);
> +}
> +
> +static ssize_t queue_max_copy_nr_ranges_show(struct request_queue *q,
> +		char *page)
> +{
> +	return queue_var_show(q->limits.max_copy_nr_ranges, page);
> +}
> +
>  static ssize_t queue_discard_max_hw_show(struct request_queue *q, char *page)
>  {
>  
> @@ -591,6 +629,11 @@ QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones");
>  QUEUE_RO_ENTRY(queue_max_open_zones, "max_open_zones");
>  QUEUE_RO_ENTRY(queue_max_active_zones, "max_active_zones");
>  
> +QUEUE_RW_ENTRY(queue_copy_offload, "copy_offload");
> +QUEUE_RO_ENTRY(queue_max_copy_sectors, "max_copy_sectors");
> +QUEUE_RO_ENTRY(queue_max_copy_range_sectors, "max_copy_range_sectors");
> +QUEUE_RO_ENTRY(queue_max_copy_nr_ranges, "max_copy_nr_ranges");
> +
>  QUEUE_RW_ENTRY(queue_nomerges, "nomerges");
>  QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity");
>  QUEUE_RW_ENTRY(queue_poll, "io_poll");
> @@ -636,6 +679,10 @@ static struct attribute *queue_attrs[] = {
>  	&queue_discard_max_entry.attr,
>  	&queue_discard_max_hw_entry.attr,
>  	&queue_discard_zeroes_data_entry.attr,
> +	&queue_copy_offload_entry.attr,
> +	&queue_max_copy_sectors_entry.attr,
> +	&queue_max_copy_range_sectors_entry.attr,
> +	&queue_max_copy_nr_ranges_entry.attr,
>  	&queue_write_same_max_entry.attr,
>  	&queue_write_zeroes_max_entry.attr,
>  	&queue_zone_append_max_entry.attr,
> diff --git a/block/blk-zoned.c b/block/blk-zoned.c
> index 7a68b6e4300c..02069178d51e 100644
> --- a/block/blk-zoned.c
> +++ b/block/blk-zoned.c
> @@ -75,6 +75,7 @@ bool blk_req_needs_zone_write_lock(struct request *rq)
>  	case REQ_OP_WRITE_ZEROES:
>  	case REQ_OP_WRITE_SAME:
>  	case REQ_OP_WRITE:
> +	case REQ_OP_COPY:
>  		return blk_rq_zone_is_seq(rq);
>  	default:
>  		return false;
> diff --git a/block/bounce.c b/block/bounce.c
> index d3f51acd6e3b..5e052afe8691 100644
> --- a/block/bounce.c
> +++ b/block/bounce.c
> @@ -254,6 +254,7 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
>  	bio->bi_iter.bi_size	= bio_src->bi_iter.bi_size;
>  
>  	switch (bio_op(bio)) {
> +	case REQ_OP_COPY:
>  	case REQ_OP_DISCARD:
>  	case REQ_OP_SECURE_ERASE:
>  	case REQ_OP_WRITE_ZEROES:
> diff --git a/block/ioctl.c b/block/ioctl.c
> index d61d652078f4..0e52181657a4 100644
> --- a/block/ioctl.c
> +++ b/block/ioctl.c
> @@ -133,6 +133,37 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
>  				    GFP_KERNEL, flags);
>  }
>  
> +static int blk_ioctl_copy(struct block_device *bdev, fmode_t mode,
> +		unsigned long arg, unsigned long flags)
> +{
> +	struct copy_range crange;
> +	struct range_entry *rlist;
> +	int ret;
> +
> +	if (!(mode & FMODE_WRITE))
> +		return -EBADF;
> +
> +	if (copy_from_user(&crange, (void __user *)arg, sizeof(crange)))
> +		return -EFAULT;
> +
> +	rlist = kmalloc_array(crange.nr_range, sizeof(*rlist),
> +			GFP_KERNEL);
> +	if (!rlist)
> +		return -ENOMEM;
> +
> +	if (copy_from_user(rlist, (void __user *)crange.range_list,
> +				sizeof(*rlist) * crange.nr_range)) {
> +		ret = -EFAULT;
> +		goto out;
> +	}
> +
> +	ret = blkdev_issue_copy(bdev, crange.nr_range, rlist, bdev, crange.dest,
> +			GFP_KERNEL, flags);
> +out:
> +	kfree(rlist);
> +	return ret;
> +}
> +
>  static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
>  		unsigned long arg)
>  {
> @@ -458,6 +489,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
>  	case BLKSECDISCARD:
>  		return blk_ioctl_discard(bdev, mode, arg,
>  				BLKDEV_DISCARD_SECURE);
> +	case BLKCOPY:
> +		return blk_ioctl_copy(bdev, mode, arg, 0);
>  	case BLKZEROOUT:
>  		return blk_ioctl_zeroout(bdev, mode, arg);
>  	case BLKREPORTZONE:
> diff --git a/include/linux/bio.h b/include/linux/bio.h
> index 1edda614f7ce..164313bdfb35 100644
> --- a/include/linux/bio.h
> +++ b/include/linux/bio.h
> @@ -71,6 +71,7 @@ static inline bool bio_has_data(struct bio *bio)
>  static inline bool bio_no_advance_iter(const struct bio *bio)
>  {
>  	return bio_op(bio) == REQ_OP_DISCARD ||
> +	       bio_op(bio) == REQ_OP_COPY ||
>  	       bio_op(bio) == REQ_OP_SECURE_ERASE ||
>  	       bio_op(bio) == REQ_OP_WRITE_SAME ||
>  	       bio_op(bio) == REQ_OP_WRITE_ZEROES;
> diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
> index 866f74261b3b..5a35c02ac0a8 100644
> --- a/include/linux/blk_types.h
> +++ b/include/linux/blk_types.h
> @@ -380,6 +380,8 @@ enum req_opf {
>  	REQ_OP_ZONE_RESET	= 15,
>  	/* reset all the zone present on the device */
>  	REQ_OP_ZONE_RESET_ALL	= 17,
> +	/* copy ranges within device */
> +	REQ_OP_COPY		= 19,
>  
>  	/* SCSI passthrough using struct scsi_request */
>  	REQ_OP_SCSI_IN		= 32,
> @@ -506,6 +508,11 @@ static inline bool op_is_discard(unsigned int op)
>  	return (op & REQ_OP_MASK) == REQ_OP_DISCARD;
>  }
>  
> +static inline bool op_is_copy(unsigned int op)
> +{
> +	return (op & REQ_OP_MASK) == REQ_OP_COPY;
> +}
> +
>  /*
>   * Check if a bio or request operation is a zone management operation, with
>   * the exception of REQ_OP_ZONE_RESET_ALL which is treated as a special case
> @@ -565,4 +572,11 @@ struct blk_rq_stat {
>  	u64 batch;
>  };
>  
> +struct blk_copy_payload {
> +	sector_t	dest;
> +	int		copy_nr_ranges;
> +	struct block_device *src_bdev;
> +	struct	range_entry	range[];
> +};
> +
>  #endif /* __LINUX_BLK_TYPES_H */
> diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
> index 699ace6b25ff..2bb4513d4bb8 100644
> --- a/include/linux/blkdev.h
> +++ b/include/linux/blkdev.h
> @@ -337,10 +337,14 @@ struct queue_limits {
>  	unsigned int		max_zone_append_sectors;
>  	unsigned int		discard_granularity;
>  	unsigned int		discard_alignment;
> +	unsigned int		copy_offload;
> +	unsigned int		max_copy_sectors;
>  
>  	unsigned short		max_segments;
>  	unsigned short		max_integrity_segments;
>  	unsigned short		max_discard_segments;
> +	unsigned short		max_copy_range_sectors;
> +	unsigned short		max_copy_nr_ranges;
>  
>  	unsigned char		misaligned;
>  	unsigned char		discard_misaligned;
> @@ -621,6 +625,7 @@ struct request_queue {
>  #define QUEUE_FLAG_RQ_ALLOC_TIME 27	/* record rq->alloc_time_ns */
>  #define QUEUE_FLAG_HCTX_ACTIVE	28	/* at least one blk-mq hctx is active */
>  #define QUEUE_FLAG_NOWAIT       29	/* device supports NOWAIT */
> +#define QUEUE_FLAG_SIMPLE_COPY	30	/* supports simple copy */
>  
>  #define QUEUE_FLAG_MQ_DEFAULT	((1 << QUEUE_FLAG_IO_STAT) |		\
>  				 (1 << QUEUE_FLAG_SAME_COMP) |		\
> @@ -643,6 +648,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
>  #define blk_queue_io_stat(q)	test_bit(QUEUE_FLAG_IO_STAT, &(q)->queue_flags)
>  #define blk_queue_add_random(q)	test_bit(QUEUE_FLAG_ADD_RANDOM, &(q)->queue_flags)
>  #define blk_queue_discard(q)	test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
> +#define blk_queue_copy(q)	test_bit(QUEUE_FLAG_SIMPLE_COPY, &(q)->queue_flags)
>  #define blk_queue_zone_resetall(q)	\
>  	test_bit(QUEUE_FLAG_ZONE_RESETALL, &(q)->queue_flags)
>  #define blk_queue_secure_erase(q) \
> @@ -1069,6 +1075,9 @@ static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
>  		return min(q->limits.max_discard_sectors,
>  			   UINT_MAX >> SECTOR_SHIFT);
>  
> +	if (unlikely(op == REQ_OP_COPY))
> +		return q->limits.max_copy_sectors;
> +

I would agreee with this if a copy BIO was always a single range, but that is
not the case. So I am not sure this makes sense at all.

>  	if (unlikely(op == REQ_OP_WRITE_SAME))
>  		return q->limits.max_write_same_sectors;
>  
> @@ -1343,6 +1352,12 @@ extern int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
>  		sector_t nr_sects, gfp_t gfp_mask, int flags,
>  		struct bio **biop);
>  
> +#define BLKDEV_COPY_NOEMULATION	(1 << 0)	/* do not emulate if copy offload not supported */
> +
> +extern int blkdev_issue_copy(struct block_device *src_bdev, int nr_srcs,
> +		struct range_entry *src_rlist, struct block_device *dest_bdev,
> +		sector_t dest, gfp_t gfp_mask, int flags);

No need for extern.

> +
>  #define BLKDEV_ZERO_NOUNMAP	(1 << 0)  /* do not free blocks */
>  #define BLKDEV_ZERO_NOFALLBACK	(1 << 1)  /* don't write explicit zeroes */
>  
> diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
> index f44eb0a04afd..5cadb176317a 100644
> --- a/include/uapi/linux/fs.h
> +++ b/include/uapi/linux/fs.h
> @@ -64,6 +64,18 @@ struct fstrim_range {
>  	__u64 minlen;
>  };
>  
> +struct range_entry {
> +	__u64 src;
> +	__u64 len;
> +};
> +
> +struct copy_range {
> +	__u64 dest;
> +	__u64 nr_range;
> +	__u64 range_list;
> +	__u64 rsvd;
> +};
> +
>  /* extent-same (dedupe) ioctls; these MUST match the btrfs ioctl definitions */
>  #define FILE_DEDUPE_RANGE_SAME		0
>  #define FILE_DEDUPE_RANGE_DIFFERS	1
> @@ -184,6 +196,7 @@ struct fsxattr {
>  #define BLKSECDISCARD _IO(0x12,125)
>  #define BLKROTATIONAL _IO(0x12,126)
>  #define BLKZEROOUT _IO(0x12,127)
> +#define BLKCOPY _IOWR(0x12, 128, struct copy_range)
>  /*
>   * A jump here: 130-131 are reserved for zoned block devices
>   * (see uapi/linux/blkzoned.h)
> 

Please test your code more thoroughly. It is full of problems that you should
have detected with better testing including RO devices, partitions and error
path coverage.

-- 
Damien Le Moal
Western Digital Research



More information about the Linux-nvme mailing list