[RFC PATCH 1/3] block: add copy offload support
Mikulas Patocka
mpatocka at redhat.com
Tue Feb 1 10:32:29 PST 2022
Add generic copy offload support to the block layer.
We add two new bio types: REQ_OP_COPY_READ_TOKEN and
REQ_OP_COPY_WRITE_TOKEN. Their bio vector has a single entry: a page
that holds the copy token.
When we need to copy data, we first send REQ_OP_COPY_READ_TOKEN to the
source device and then send REQ_OP_COPY_WRITE_TOKEN, carrying the same
token page, to the destination device.
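In outline, one copy step looks like this (a condensed sketch of
blkdev_issue_copy() below; src_bdev, token_page, src_sector and
nr_sects are placeholder names, and error handling and the completion
wait are omitted):

    struct bio *bio = bio_alloc(GFP_KERNEL, 1);
    bio_set_op_attrs(bio, REQ_OP_COPY_READ_TOKEN, REQ_NOMERGE);
    bio_set_dev(bio, src_bdev);
    __bio_add_page(bio, token_page, PAGE_SIZE, 0);
    bio->bi_iter.bi_sector = src_sector;
    bio->bi_iter.bi_size = nr_sects << 9;  /* the copied extent, not PAGE_SIZE */
    submit_bio(bio);
    /* wait for completion, then issue a bio of the same shape with
     * REQ_OP_COPY_WRITE_TOKEN and the same token page to the
     * destination device */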
This patch introduces a new ioctl, BLKCOPY, that submits the copy
operation. The BLKCOPY argument is an array of four 64-bit numbers:
source offset, destination offset, length, and a fourth value that is
returned by the ioctl - the number of bytes that were actually copied.
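From userspace, the ioctl can be driven with a minimal sketch like the
one below (all three input values must be multiples of 512 and the
device must be opened for writing):

    #include <err.h>
    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/fs.h>

    /* Copy "length" bytes from src_offset to dst_offset on the block
     * device open at fd; returns the number of bytes actually copied. */
    static uint64_t do_blkcopy(int fd, uint64_t src_offset,
                               uint64_t dst_offset, uint64_t length)
    {
            uint64_t range[4] = { src_offset, dst_offset, length, 0 };

            if (ioctl(fd, BLKCOPY, range) < 0)
                    err(1, "BLKCOPY");
            return range[3];  /* filled in by the kernel */
    }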
For in-kernel users, we introduce the function blkdev_issue_copy.
Copying may fail at any time; the caller is required to fall back to an
explicit copy.
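The expected in-kernel calling pattern is therefore something like the
sketch below, where fallback_copy() stands for a hypothetical
caller-provided explicit read/write copy of the remainder:

    int copy_extent(struct block_device *bdev1, sector_t src,
                    struct block_device *bdev2, sector_t dst,
                    sector_t nr_sects)
    {
            sector_t copied;
            int r;

            r = blkdev_issue_copy(bdev1, src, bdev2, dst, nr_sects,
                                  &copied, GFP_KERNEL);
            if (!r)
                    return 0;  /* the whole range was offloaded */

            /* offload failed or stopped early; "copied" sectors were
             * already transferred, copy the rest explicitly */
            return fallback_copy(bdev1, src + copied,
                                 bdev2, dst + copied,
                                 nr_sects - copied);
    }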
Signed-off-by: Mikulas Patocka <mpatocka at redhat.com>
---
block/blk-core.c | 7 +++
block/blk-lib.c | 89 ++++++++++++++++++++++++++++++++++++++++++++++
block/blk-settings.c | 12 ++++++
block/blk-sysfs.c | 7 +++
block/blk.h | 3 +
block/ioctl.c | 56 ++++++++++++++++++++++++++++
include/linux/blk_types.h | 4 ++
include/linux/blkdev.h | 18 +++++++++
include/uapi/linux/fs.h | 1 +
9 files changed, 197 insertions(+)
Index: linux-2.6/block/blk-settings.c
===================================================================
--- linux-2.6.orig/block/blk-settings.c 2022-01-26 19:12:30.000000000 +0100
+++ linux-2.6/block/blk-settings.c 2022-01-27 20:43:27.000000000 +0100
@@ -57,6 +57,7 @@ void blk_set_default_limits(struct queue
lim->misaligned = 0;
lim->zoned = BLK_ZONED_NONE;
lim->zone_write_granularity = 0;
+ lim->max_copy_sectors = 0;
}
EXPORT_SYMBOL(blk_set_default_limits);
@@ -365,6 +366,17 @@ void blk_queue_zone_write_granularity(st
EXPORT_SYMBOL_GPL(blk_queue_zone_write_granularity);
/**
+ * blk_queue_max_copy_sectors - set maximum copy offload sectors for the queue
+ * @q: the request queue for the device
+ * @size: the maximum copy offload sectors
+ */
+void blk_queue_max_copy_sectors(struct request_queue *q, unsigned int size)
+{
+ q->limits.max_copy_sectors = size;
+}
+EXPORT_SYMBOL_GPL(blk_queue_max_copy_sectors);
+
+/**
* blk_queue_alignment_offset - set physical block alignment offset
* @q: the request queue for the device
* @offset: alignment offset in bytes
Index: linux-2.6/include/linux/blkdev.h
===================================================================
--- linux-2.6.orig/include/linux/blkdev.h 2022-01-26 19:12:30.000000000 +0100
+++ linux-2.6/include/linux/blkdev.h 2022-01-29 17:46:03.000000000 +0100
@@ -103,6 +103,7 @@ struct queue_limits {
unsigned int discard_granularity;
unsigned int discard_alignment;
unsigned int zone_write_granularity;
+ unsigned int max_copy_sectors;
unsigned short max_segments;
unsigned short max_integrity_segments;
@@ -706,6 +707,7 @@ extern void blk_queue_max_zone_append_se
extern void blk_queue_physical_block_size(struct request_queue *, unsigned int);
void blk_queue_zone_write_granularity(struct request_queue *q,
unsigned int size);
+void blk_queue_max_copy_sectors(struct request_queue *q, unsigned int size);
extern void blk_queue_alignment_offset(struct request_queue *q,
unsigned int alignment);
void disk_update_readahead(struct gendisk *disk);
@@ -862,6 +864,10 @@ extern int __blkdev_issue_zeroout(struct
extern int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector,
sector_t nr_sects, gfp_t gfp_mask, unsigned flags);
+extern int blkdev_issue_copy(struct block_device *bdev1, sector_t sector1,
+ struct block_device *bdev2, sector_t sector2,
+ sector_t nr_sects, sector_t *copied, gfp_t gfp_mask);
+
static inline int sb_issue_discard(struct super_block *sb, sector_t block,
sector_t nr_blocks, gfp_t gfp_mask, unsigned long flags)
{
@@ -1001,6 +1007,18 @@ bdev_zone_write_granularity(struct block
return queue_zone_write_granularity(bdev_get_queue(bdev));
}
+static inline unsigned int
+queue_max_copy_sectors(const struct request_queue *q)
+{
+ return q->limits.max_copy_sectors;
+}
+
+static inline unsigned int
+bdev_max_copy_sectors(struct block_device *bdev)
+{
+ return queue_max_copy_sectors(bdev_get_queue(bdev));
+}
+
static inline int queue_alignment_offset(const struct request_queue *q)
{
if (q->limits.misaligned)
Index: linux-2.6/block/blk-sysfs.c
===================================================================
--- linux-2.6.orig/block/blk-sysfs.c 2022-01-26 19:12:30.000000000 +0100
+++ linux-2.6/block/blk-sysfs.c 2022-01-26 19:12:30.000000000 +0100
@@ -230,6 +230,11 @@ static ssize_t queue_zone_write_granular
return queue_var_show(queue_zone_write_granularity(q), page);
}
+static ssize_t queue_max_copy_sectors_show(struct request_queue *q, char *page)
+{
+ return queue_var_show(queue_max_copy_sectors(q), page);
+}
+
static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page)
{
unsigned long long max_sectors = q->limits.max_zone_append_sectors;
@@ -591,6 +596,7 @@ QUEUE_RO_ENTRY(queue_write_same_max, "wr
QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes");
QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes");
QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
+QUEUE_RO_ENTRY(queue_max_copy_sectors, "max_copy_sectors");
QUEUE_RO_ENTRY(queue_zoned, "zoned");
QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones");
@@ -647,6 +653,7 @@ static struct attribute *queue_attrs[] =
&queue_write_zeroes_max_entry.attr,
&queue_zone_append_max_entry.attr,
&queue_zone_write_granularity_entry.attr,
+ &queue_max_copy_sectors_entry.attr,
&queue_nonrot_entry.attr,
&queue_zoned_entry.attr,
&queue_nr_zones_entry.attr,
Index: linux-2.6/include/linux/blk_types.h
===================================================================
--- linux-2.6.orig/include/linux/blk_types.h 2022-01-06 18:55:01.000000000 +0100
+++ linux-2.6/include/linux/blk_types.h 2022-01-29 17:47:44.000000000 +0100
@@ -371,6 +371,10 @@ enum req_opf {
/* reset all the zone present on the device */
REQ_OP_ZONE_RESET_ALL = 17,
+ /* copy offload bios */
+ REQ_OP_COPY_READ_TOKEN = 18,
+ REQ_OP_COPY_WRITE_TOKEN = 19,
+
/* Driver private requests */
REQ_OP_DRV_IN = 34,
REQ_OP_DRV_OUT = 35,
Index: linux-2.6/block/blk-lib.c
===================================================================
--- linux-2.6.orig/block/blk-lib.c 2021-08-18 13:59:55.000000000 +0200
+++ linux-2.6/block/blk-lib.c 2022-01-30 17:33:04.000000000 +0100
@@ -440,3 +440,92 @@ retry:
return ret;
}
EXPORT_SYMBOL(blkdev_issue_zeroout);
+
+static void bio_wake_completion(struct bio *bio)
+{
+ struct completion *comp = bio->bi_private;
+ complete(comp);
+}
+
+int blkdev_issue_copy(struct block_device *bdev1, sector_t sector1,
+ struct block_device *bdev2, sector_t sector2,
+ sector_t nr_sects, sector_t *copied, gfp_t gfp_mask)
+{
+ struct page *token;
+ sector_t m;
+ int r = 0;
+ struct completion comp;
+
+ *copied = 0;
+
+ m = min(bdev_max_copy_sectors(bdev1), bdev_max_copy_sectors(bdev2));
+ if (!m)
+ return -EOPNOTSUPP;
+ m = min(m, (sector_t)round_down(UINT_MAX, PAGE_SIZE) >> 9);
+
+ if (unlikely(bdev_read_only(bdev2)))
+ return -EPERM;
+
+ token = alloc_page(gfp_mask);
+ if (unlikely(!token))
+ return -ENOMEM;
+
+ while (nr_sects) {
+ struct bio *read_bio, *write_bio;
+ sector_t this_step = min(nr_sects, m);
+
+ read_bio = bio_alloc(gfp_mask, 1);
+ if (unlikely(!read_bio)) {
+ r = -ENOMEM;
+ break;
+ }
+ bio_set_op_attrs(read_bio, REQ_OP_COPY_READ_TOKEN, REQ_NOMERGE);
+ bio_set_dev(read_bio, bdev1);
+ __bio_add_page(read_bio, token, PAGE_SIZE, 0);
+ read_bio->bi_iter.bi_sector = sector1;
+ read_bio->bi_iter.bi_size = this_step << 9;
+ read_bio->bi_private = &comp;
+ read_bio->bi_end_io = bio_wake_completion;
+ init_completion(&comp);
+ submit_bio(read_bio);
+ wait_for_completion(&comp);
+ if (unlikely(read_bio->bi_status != BLK_STS_OK)) {
+ r = blk_status_to_errno(read_bio->bi_status);
+ bio_put(read_bio);
+ break;
+ }
+ bio_put(read_bio);
+
+ write_bio = bio_alloc(gfp_mask, 1);
+ if (unlikely(!write_bio)) {
+ r = -ENOMEM;
+ break;
+ }
+ bio_set_op_attrs(write_bio, REQ_OP_COPY_WRITE_TOKEN, REQ_NOMERGE);
+ bio_set_dev(write_bio, bdev2);
+ __bio_add_page(write_bio, token, PAGE_SIZE, 0);
+ write_bio->bi_iter.bi_sector = sector2;
+ write_bio->bi_iter.bi_size = this_step << 9;
+ write_bio->bi_private = &comp;
+ write_bio->bi_end_io = bio_wake_completion;
+ reinit_completion(&comp);
+ submit_bio(write_bio);
+ wait_for_completion(&comp);
+ if (unlikely(write_bio->bi_status != BLK_STS_OK)) {
+ r = blk_status_to_errno(write_bio->bi_status);
+ bio_put(write_bio);
+ break;
+ }
+ bio_put(write_bio);
+
+ sector1 += this_step;
+ sector2 += this_step;
+ nr_sects -= this_step;
+ *copied += this_step;
+ }
+
+ __free_page(token);
+
+ return r;
+}
+EXPORT_SYMBOL(blkdev_issue_copy);
Index: linux-2.6/block/ioctl.c
===================================================================
--- linux-2.6.orig/block/ioctl.c 2022-01-24 15:10:40.000000000 +0100
+++ linux-2.6/block/ioctl.c 2022-01-30 13:43:35.000000000 +0100
@@ -165,6 +165,60 @@ fail:
return err;
}
+static int blk_ioctl_copy(struct block_device *bdev, fmode_t mode,
+ unsigned long arg)
+{
+ uint64_t range[4];
+ uint64_t start1, start2, end1, end2, len;
+ sector_t copied = 0;
+ struct inode *inode = bdev->bd_inode;
+ int err;
+
+ if (!(mode & FMODE_WRITE)) {
+ err = -EBADF;
+ goto fail1;
+ }
+
+ if (copy_from_user(range, (void __user *)arg, 24)) {
+ err = -EFAULT;
+ goto fail1;
+ }
+
+ start1 = range[0];
+ start2 = range[1];
+ len = range[2];
+ end1 = start1 + len - 1;
+ end2 = start2 + len - 1;
+
+ if ((start1 | start2 | len) & 511)
+ return -EINVAL;
+ if (end1 >= (uint64_t)bdev_nr_bytes(bdev))
+ return -EINVAL;
+ if (end2 >= (uint64_t)bdev_nr_bytes(bdev))
+ return -EINVAL;
+ if (end1 < start1)
+ return -EINVAL;
+ if (end2 < start2)
+ return -EINVAL;
+
+ filemap_invalidate_lock(inode->i_mapping);
+ err = truncate_bdev_range(bdev, mode, start2, end2);
+ if (err)
+ goto fail2;
+
+ err = blkdev_issue_copy(bdev, start1 >> 9, bdev, start2 >> 9, len >> 9, &copied, GFP_KERNEL);
+
+fail2:
+ filemap_invalidate_unlock(inode->i_mapping);
+
+fail1:
+ range[3] = (uint64_t)copied << 9;
+ if (copy_to_user((void __user *)(arg + 24), &range[3], 8))
+ err = -EFAULT;
+
+ return err;
+}
+
static int put_ushort(unsigned short __user *argp, unsigned short val)
{
return put_user(val, argp);
@@ -459,6 +513,8 @@ static int blkdev_common_ioctl(struct bl
return blk_ioctl_zeroout(bdev, mode, arg);
case BLKGETDISKSEQ:
return put_u64(argp, bdev->bd_disk->diskseq);
+ case BLKCOPY:
+ return blk_ioctl_copy(bdev, mode, arg);
case BLKREPORTZONE:
return blkdev_report_zones_ioctl(bdev, mode, cmd, arg);
case BLKRESETZONE:
Index: linux-2.6/include/uapi/linux/fs.h
===================================================================
--- linux-2.6.orig/include/uapi/linux/fs.h 2021-09-23 17:07:02.000000000 +0200
+++ linux-2.6/include/uapi/linux/fs.h 2022-01-27 19:05:46.000000000 +0100
@@ -185,6 +185,7 @@ struct fsxattr {
#define BLKROTATIONAL _IO(0x12,126)
#define BLKZEROOUT _IO(0x12,127)
#define BLKGETDISKSEQ _IOR(0x12,128,__u64)
+#define BLKCOPY _IO(0x12,129)
/*
* A jump here: 130-136 are reserved for zoned block devices
* (see uapi/linux/blkzoned.h)
Index: linux-2.6/block/blk.h
===================================================================
--- linux-2.6.orig/block/blk.h 2022-01-24 15:10:40.000000000 +0100
+++ linux-2.6/block/blk.h 2022-01-29 18:10:28.000000000 +0100
@@ -288,6 +288,9 @@ static inline bool blk_may_split(struct
case REQ_OP_WRITE_ZEROES:
case REQ_OP_WRITE_SAME:
return true; /* non-trivial splitting decisions */
+ case REQ_OP_COPY_READ_TOKEN:
+ case REQ_OP_COPY_WRITE_TOKEN:
+ return false;
default:
break;
}
Index: linux-2.6/block/blk-core.c
===================================================================
--- linux-2.6.orig/block/blk-core.c 2022-01-24 15:10:40.000000000 +0100
+++ linux-2.6/block/blk-core.c 2022-02-01 15:53:39.000000000 +0100
@@ -124,6 +124,8 @@ static const char *const blk_op_name[] =
REQ_OP_NAME(ZONE_APPEND),
REQ_OP_NAME(WRITE_SAME),
REQ_OP_NAME(WRITE_ZEROES),
+ REQ_OP_NAME(COPY_READ_TOKEN),
+ REQ_OP_NAME(COPY_WRITE_TOKEN),
REQ_OP_NAME(DRV_IN),
REQ_OP_NAME(DRV_OUT),
};
@@ -758,6 +760,11 @@ noinline_for_stack bool submit_bio_check
if (!q->limits.max_write_zeroes_sectors)
goto not_supported;
break;
+ case REQ_OP_COPY_READ_TOKEN:
+ case REQ_OP_COPY_WRITE_TOKEN:
+ if (!q->limits.max_copy_sectors)
+ goto not_supported;
+ break;
default:
break;
}