[RFC PATCH 1/3] block: add copy offload support

Mikulas Patocka mpatocka at redhat.com
Tue Feb 1 10:32:29 PST 2022


Add generic copy offload support to the block layer.

We add two new bio types: REQ_OP_COPY_READ_TOKEN and
REQ_OP_COPY_WRITE_TOKEN. Their bio vector has one entry - a page
containing the token.

When we need to copy data, we send REQ_OP_COPY_READ_TOKEN to the source
device and then we send REQ_OP_COPY_WRITE_TOKEN to the destination device.

This patch introduces a new ioctl BLKCOPY that submits the copy operation.
The BLKCOPY argument is an array of four 64-bit numbers: source offset,
destination offset, length, and a result field. The first three are inputs
in bytes; the last one is written by the ioctl and holds the number of
bytes that were actually copied.

For in-kernel users, we introduce a function blkdev_issue_copy.

Copying may fail at any time; the caller is required to fall back to an
explicit copy.

Signed-off-by: Mikulas Patocka <mpatocka at redhat.com>

---
 block/blk-core.c          |    7 +++
 block/blk-lib.c           |   89 ++++++++++++++++++++++++++++++++++++++++++++++
 block/blk-settings.c      |   12 ++++++
 block/blk-sysfs.c         |    7 +++
 block/blk.h               |    3 +
 block/ioctl.c             |   56 ++++++++++++++++++++++++++++
 include/linux/blk_types.h |    4 ++
 include/linux/blkdev.h    |   18 +++++++++
 include/uapi/linux/fs.h   |    1 
 9 files changed, 197 insertions(+)

Index: linux-2.6/block/blk-settings.c
===================================================================
--- linux-2.6.orig/block/blk-settings.c	2022-01-26 19:12:30.000000000 +0100
+++ linux-2.6/block/blk-settings.c	2022-01-27 20:43:27.000000000 +0100
@@ -57,6 +57,7 @@ void blk_set_default_limits(struct queue
 	lim->misaligned = 0;
 	lim->zoned = BLK_ZONED_NONE;
 	lim->zone_write_granularity = 0;
+	lim->max_copy_sectors = 0;
 }
 EXPORT_SYMBOL(blk_set_default_limits);
 
@@ -365,6 +366,17 @@ void blk_queue_zone_write_granularity(st
 EXPORT_SYMBOL_GPL(blk_queue_zone_write_granularity);
 
 /**
+ * blk_queue_max_copy_sectors - set maximum copy offload sectors for the queue
+ * @q:  the request queue for the device
+ * @size:  the maximum copy offload sectors
+ */
+void blk_queue_max_copy_sectors(struct request_queue *q, unsigned int size)
+{
+	q->limits.max_copy_sectors = size;
+}
+EXPORT_SYMBOL_GPL(blk_queue_max_copy_sectors);
+
+/**
  * blk_queue_alignment_offset - set physical block alignment offset
  * @q:	the request queue for the device
  * @offset: alignment offset in bytes
Index: linux-2.6/include/linux/blkdev.h
===================================================================
--- linux-2.6.orig/include/linux/blkdev.h	2022-01-26 19:12:30.000000000 +0100
+++ linux-2.6/include/linux/blkdev.h	2022-01-29 17:46:03.000000000 +0100
@@ -103,6 +103,7 @@ struct queue_limits {
 	unsigned int		discard_granularity;
 	unsigned int		discard_alignment;
 	unsigned int		zone_write_granularity;
+	unsigned int		max_copy_sectors;
 
 	unsigned short		max_segments;
 	unsigned short		max_integrity_segments;
@@ -706,6 +707,7 @@ extern void blk_queue_max_zone_append_se
 extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
 void blk_queue_zone_write_granularity(struct request_queue *q,
 				      unsigned int size);
+void blk_queue_max_copy_sectors(struct request_queue *q, unsigned int size);
 extern void blk_queue_alignment_offset(struct request_queue *q,
 				       unsigned int alignment);
 void disk_update_readahead(struct gendisk *disk);
@@ -862,6 +864,10 @@ extern int __blkdev_issue_zeroout(struct
 extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
 		sector_t nr_sects, gfp_t gfp_mask, unsigned flags);
 
+extern int blkdev_issue_copy(struct block_device *bdev1, sector_t sector1,
+		      struct block_device *bdev2, sector_t sector2,
+		      sector_t nr_sects, sector_t *copied, gfp_t gfp_mask);
+
 static inline int sb_issue_discard(struct super_block *sb, sector_t block,
 		sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
 {
@@ -1001,6 +1007,18 @@ bdev_zone_write_granularity(struct block
 	return queue_zone_write_granularity(bdev_get_queue(bdev));
 }
 
+static inline unsigned int
+queue_max_copy_sectors(const struct request_queue *q)
+{
+	return q->limits.max_copy_sectors;
+}
+
+static inline unsigned int
+bdev_max_copy_sectors(struct block_device *bdev)
+{
+	return queue_max_copy_sectors(bdev_get_queue(bdev));
+}
+
 static inline int queue_alignment_offset(const struct request_queue *q)
 {
 	if (q->limits.misaligned)
Index: linux-2.6/block/blk-sysfs.c
===================================================================
--- linux-2.6.orig/block/blk-sysfs.c	2022-01-26 19:12:30.000000000 +0100
+++ linux-2.6/block/blk-sysfs.c	2022-01-26 19:12:30.000000000 +0100
@@ -230,6 +230,11 @@ static ssize_t queue_zone_write_granular
 	return queue_var_show(queue_zone_write_granularity(q), page);
 }
 
+static ssize_t queue_max_copy_sectors_show(struct request_queue *q, char *page)
+{
+	return queue_var_show(queue_max_copy_sectors(q), page);
+}
+
 static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page)
 {
 	unsigned long long max_sectors = q->limits.max_zone_append_sectors;
@@ -591,6 +596,7 @@ QUEUE_RO_ENTRY(queue_write_same_max, "wr
 QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes");
 QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes");
 QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
+QUEUE_RO_ENTRY(queue_max_copy_sectors, "max_copy_sectors");
 
 QUEUE_RO_ENTRY(queue_zoned, "zoned");
 QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones");
@@ -647,6 +653,7 @@ static struct attribute *queue_attrs[] =
 	&queue_write_zeroes_max_entry.attr,
 	&queue_zone_append_max_entry.attr,
 	&queue_zone_write_granularity_entry.attr,
+	&queue_max_copy_sectors_entry.attr,
 	&queue_nonrot_entry.attr,
 	&queue_zoned_entry.attr,
 	&queue_nr_zones_entry.attr,
Index: linux-2.6/include/linux/blk_types.h
===================================================================
--- linux-2.6.orig/include/linux/blk_types.h	2022-01-06 18:55:01.000000000 +0100
+++ linux-2.6/include/linux/blk_types.h	2022-01-29 17:47:44.000000000 +0100
@@ -371,6 +371,10 @@ enum req_opf {
 	/* reset all the zone present on the device */
 	REQ_OP_ZONE_RESET_ALL	= 17,
 
+	/* copy offload bios */
+	REQ_OP_COPY_READ_TOKEN	= 18,
+	REQ_OP_COPY_WRITE_TOKEN	= 19,
+
 	/* Driver private requests */
 	REQ_OP_DRV_IN		= 34,
 	REQ_OP_DRV_OUT		= 35,
Index: linux-2.6/block/blk-lib.c
===================================================================
--- linux-2.6.orig/block/blk-lib.c	2021-08-18 13:59:55.000000000 +0200
+++ linux-2.6/block/blk-lib.c	2022-01-30 17:33:04.000000000 +0100
@@ -440,3 +440,92 @@ retry:
 	return ret;
 }
 EXPORT_SYMBOL(blkdev_issue_zeroout);
+
+static void bio_wake_completion(struct bio *bio)
+{
+	struct completion *comp = bio->bi_private;
+	complete(comp);
+}
+
+int blkdev_issue_copy(struct block_device *bdev1, sector_t sector1,
+		      struct block_device *bdev2, sector_t sector2,
+		      sector_t nr_sects, sector_t *copied, gfp_t gfp_mask)
+{
+	struct page *token;
+	sector_t m;
+	int r = 0;
+	struct completion comp;
+
+	*copied = 0;
+
+	m = min(bdev_max_copy_sectors(bdev1), bdev_max_copy_sectors(bdev2));
+	if (!m)
+		return -EOPNOTSUPP;
+	m = min(m, (sector_t)round_down(UINT_MAX, PAGE_SIZE) >> 9);
+
+	if (unlikely(bdev_read_only(bdev2)))
+		return -EPERM;
+
+	token = alloc_page(gfp_mask);
+	if (unlikely(!token))
+		return -ENOMEM;
+
+	while (nr_sects) {
+		struct bio *read_bio, *write_bio;
+		sector_t this_step = min(nr_sects, m);
+
+		read_bio = bio_alloc(gfp_mask, 1);
+		if (unlikely(!read_bio)) {
+			r = -ENOMEM;
+			break;
+		}
+		bio_set_op_attrs(read_bio, REQ_OP_COPY_READ_TOKEN, REQ_NOMERGE);
+		bio_set_dev(read_bio, bdev1);
+		__bio_add_page(read_bio, token, PAGE_SIZE, 0);
+		read_bio->bi_iter.bi_sector = sector1;
+		read_bio->bi_iter.bi_size = this_step << 9;
+		read_bio->bi_private = &comp;
+		read_bio->bi_end_io = bio_wake_completion;
+		init_completion(&comp);
+		submit_bio(read_bio);
+		wait_for_completion(&comp);
+		if (unlikely(read_bio->bi_status != BLK_STS_OK)) {
+			r = blk_status_to_errno(read_bio->bi_status);
+			bio_put(read_bio);
+			break;
+		}
+		bio_put(read_bio);
+
+		write_bio = bio_alloc(gfp_mask, 1);
+		if (unlikely(!write_bio)) {
+			r = -ENOMEM;
+			break;
+		}
+		bio_set_op_attrs(write_bio, REQ_OP_COPY_WRITE_TOKEN, REQ_NOMERGE);
+		bio_set_dev(write_bio, bdev2);
+		__bio_add_page(write_bio, token, PAGE_SIZE, 0);
+		write_bio->bi_iter.bi_sector = sector2;
+		write_bio->bi_iter.bi_size = this_step << 9;
+		write_bio->bi_private = &comp;
+		write_bio->bi_end_io = bio_wake_completion;
+		reinit_completion(&comp);
+		submit_bio(write_bio);
+		wait_for_completion(&comp);
+		if (unlikely(write_bio->bi_status != BLK_STS_OK)) {
+			r = blk_status_to_errno(write_bio->bi_status);
+			bio_put(write_bio);
+			break;
+		}
+		bio_put(write_bio);
+
+		sector1 += this_step;
+		sector2 += this_step;
+		nr_sects -= this_step;
+		*copied += this_step;
+	}
+
+	__free_page(token);
+
+	return r;
+}
+EXPORT_SYMBOL(blkdev_issue_copy);
Index: linux-2.6/block/ioctl.c
===================================================================
--- linux-2.6.orig/block/ioctl.c	2022-01-24 15:10:40.000000000 +0100
+++ linux-2.6/block/ioctl.c	2022-01-30 13:43:35.000000000 +0100
@@ -165,6 +165,60 @@ fail:
 	return err;
 }
 
+static int blk_ioctl_copy(struct block_device *bdev, fmode_t mode,
+		unsigned long arg)
+{
+	uint64_t range[4];
+	uint64_t start1, start2, end1, end2, len;
+	sector_t copied = 0;
+	struct inode *inode = bdev->bd_inode;
+	int err;
+
+	if (!(mode & FMODE_WRITE)) {
+		err = -EBADF;
+		goto fail1;
+	}
+
+	if (copy_from_user(range, (void __user *)arg, 24)) {
+		err = -EFAULT;
+		goto fail1;
+	}
+
+	start1 = range[0];
+	start2 = range[1];
+	len = range[2];
+	end1 = start1 + len - 1;
+	end2 = start2 + len - 1;
+
+	if ((start1 | start2 | len) & 511)
+		return -EINVAL;
+	if (end1 >= (uint64_t)bdev_nr_bytes(bdev))
+		return -EINVAL;
+	if (end2 >= (uint64_t)bdev_nr_bytes(bdev))
+		return -EINVAL;
+	if (end1 < start1)
+		return -EINVAL;
+	if (end2 < start2)
+		return -EINVAL;
+
+	filemap_invalidate_lock(inode->i_mapping);
+	err = truncate_bdev_range(bdev, mode, start2, end2);
+	if (err)
+		goto fail2;
+
+	err = blkdev_issue_copy(bdev, start1 >> 9, bdev, start2 >> 9, len >> 9, &copied, GFP_KERNEL);
+
+fail2:
+	filemap_invalidate_unlock(inode->i_mapping);
+
+fail1:
+	range[3] = (uint64_t)copied << 9;
+	if (copy_to_user((void __user *)(arg + 24), &range[3], 8))
+		err = -EFAULT;
+
+	return err;
+}
+
 static int put_ushort(unsigned short __user *argp, unsigned short val)
 {
 	return put_user(val, argp);
@@ -459,6 +513,8 @@ static int blkdev_common_ioctl(struct bl
 		return blk_ioctl_zeroout(bdev, mode, arg);
 	case BLKGETDISKSEQ:
 		return put_u64(argp, bdev->bd_disk->diskseq);
+	case BLKCOPY:
+		return blk_ioctl_copy(bdev, mode, arg);
 	case BLKREPORTZONE:
 		return blkdev_report_zones_ioctl(bdev, mode, cmd, arg);
 	case BLKRESETZONE:
Index: linux-2.6/include/uapi/linux/fs.h
===================================================================
--- linux-2.6.orig/include/uapi/linux/fs.h	2021-09-23 17:07:02.000000000 +0200
+++ linux-2.6/include/uapi/linux/fs.h	2022-01-27 19:05:46.000000000 +0100
@@ -185,6 +185,7 @@ struct fsxattr {
 #define BLKROTATIONAL _IO(0x12,126)
 #define BLKZEROOUT _IO(0x12,127)
 #define BLKGETDISKSEQ _IOR(0x12,128,__u64)
+#define BLKCOPY _IO(0x12,129)
 /*
  * A jump here: 130-136 are reserved for zoned block devices
  * (see uapi/linux/blkzoned.h)
Index: linux-2.6/block/blk.h
===================================================================
--- linux-2.6.orig/block/blk.h	2022-01-24 15:10:40.000000000 +0100
+++ linux-2.6/block/blk.h	2022-01-29 18:10:28.000000000 +0100
@@ -288,6 +288,9 @@ static inline bool blk_may_split(struct
 	case REQ_OP_WRITE_ZEROES:
 	case REQ_OP_WRITE_SAME:
 		return true; /* non-trivial splitting decisions */
+	case REQ_OP_COPY_READ_TOKEN:
+	case REQ_OP_COPY_WRITE_TOKEN:
+		return false;
 	default:
 		break;
 	}
Index: linux-2.6/block/blk-core.c
===================================================================
--- linux-2.6.orig/block/blk-core.c	2022-01-24 15:10:40.000000000 +0100
+++ linux-2.6/block/blk-core.c	2022-02-01 15:53:39.000000000 +0100
@@ -124,6 +124,8 @@ static const char *const blk_op_name[] =
 	REQ_OP_NAME(ZONE_APPEND),
 	REQ_OP_NAME(WRITE_SAME),
 	REQ_OP_NAME(WRITE_ZEROES),
+	REQ_OP_NAME(COPY_READ_TOKEN),
+	REQ_OP_NAME(COPY_WRITE_TOKEN),
 	REQ_OP_NAME(DRV_IN),
 	REQ_OP_NAME(DRV_OUT),
 };
@@ -758,6 +760,11 @@ noinline_for_stack bool submit_bio_check
 		if (!q->limits.max_write_zeroes_sectors)
 			goto not_supported;
 		break;
+	case REQ_OP_COPY_READ_TOKEN:
+	case REQ_OP_COPY_WRITE_TOKEN:
+		if (!q->limits.max_copy_sectors)
+			goto not_supported;
+		break;
 	default:
 		break;
 	}




More information about the Linux-nvme mailing list