[PATCH 15/15] RFC: block: allow write streams on partitions
Christoph Hellwig
hch at lst.de
Tue Nov 19 04:16:29 PST 2024
By default, assign all write streams to partition 1, and add a hacky
sysfs file that distributes them equally across all partitions.
This is implemented by storing the number of per-partition write
streams in struct block_device, as well as the offset to the global
ones, and then remapping the write streams in the I/O submission
path.
The sysfs interface is hacky and undocumented; better suggestions are
welcome from actual users of write streams on partitions.
Signed-off-by: Christoph Hellwig <hch at lst.de>
---
block/bdev.c | 9 +++++++
block/blk-core.c | 2 ++
block/genhd.c | 52 +++++++++++++++++++++++++++++++++++++++
block/partitions/core.c | 6 +++--
include/linux/blk_types.h | 7 ++++++
include/linux/blkdev.h | 2 +-
6 files changed, 75 insertions(+), 3 deletions(-)
diff --git a/block/bdev.c b/block/bdev.c
index c23245f1fdfe..f3549a8cdb3f 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -440,6 +440,15 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
return NULL;
}
bdev->bd_disk = disk;
+
+ /*
+ * Assign all write streams to the first partition by default.
+ */
+ if (partno == 1) {
+ bdev->bd_part_write_stream_start = 0;
+ bdev->bd_part_write_streams = bdev_max_write_streams(bdev);
+ }
+
return bdev;
}
diff --git a/block/blk-core.c b/block/blk-core.c
index 666efe8fa202..9654937f9b2d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -574,6 +574,8 @@ static int blk_partition_remap(struct bio *bio)
return -EIO;
if (bio_sectors(bio)) {
bio->bi_iter.bi_sector += p->bd_start_sect;
+ if (bio->bi_write_stream)
+ bio->bi_write_stream += p->bd_part_write_stream_start;
trace_block_bio_remap(bio, p->bd_dev,
bio->bi_iter.bi_sector -
p->bd_start_sect);
diff --git a/block/genhd.c b/block/genhd.c
index 79230c109fca..3156c70522b6 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1070,6 +1070,54 @@ static ssize_t partscan_show(struct device *dev,
return sysfs_emit(buf, "%u\n", disk_has_partscan(dev_to_disk(dev)));
}
+static ssize_t disk_distribute_write_streams_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ /* Always reads "0" for now; anything useful to show, like the ranges? */
+ return sysfs_emit(buf, "0\n");
+}
+
+/*
+ * Split the disk's write streams evenly across its partitions; any remainder
+ * stays unassigned.  Returns -EINVAL without write streams, -EBUSY if open.
+ */
+static ssize_t disk_distribute_write_streams_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+	struct block_device *bdev = disk->part0, *part;
+	unsigned short total = disk->queue->limits.max_write_streams;
+	unsigned short per_part, start = 0;
+	unsigned long nr_partitions = 0, idx;
+	ssize_t ret = count;
+
+	if (!total)
+		return -EINVAL;
+
+	mutex_lock(&disk->open_mutex);
+	if (atomic_read(&bdev->bd_openers)) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+
+	xa_for_each_start(&disk->part_tbl, idx, part, 1)
+		nr_partitions++;
+	if (!nr_partitions)
+		goto out_unlock;
+
+	per_part = total / nr_partitions;
+	xa_for_each_start(&disk->part_tbl, idx, part, 1) {
+		part->bd_part_write_streams = per_part;
+		part->bd_part_write_stream_start = start;
+		dev_info(dev, "assigning %u write streams at %u to partition %lu\n",
+			 per_part, start, idx);
+		start += per_part;
+	}
+out_unlock:
+	mutex_unlock(&disk->open_mutex);
+	return ret;
+}
+
static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
@@ -1084,6 +1132,9 @@ static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL);
static DEVICE_ATTR(partscan, 0444, partscan_show, NULL);
+static DEVICE_ATTR(distribute_write_streams, 0644,
+ disk_distribute_write_streams_show,
+ disk_distribute_write_streams_store);
#ifdef CONFIG_FAIL_MAKE_REQUEST
ssize_t part_fail_show(struct device *dev,
@@ -1135,6 +1186,7 @@ static struct attribute *disk_attrs[] = {
&dev_attr_events_poll_msecs.attr,
&dev_attr_diskseq.attr,
&dev_attr_partscan.attr,
+ &dev_attr_distribute_write_streams.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
&dev_attr_fail.attr,
#endif
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 815ed33caa1b..a27dbb5589ce 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -245,8 +245,10 @@ static const struct attribute_group *part_attr_groups[] = {
static void part_release(struct device *dev)
{
- put_disk(dev_to_bdev(dev)->bd_disk);
- bdev_drop(dev_to_bdev(dev));
+ struct block_device *part = dev_to_bdev(dev);
+
+ put_disk(part->bd_disk);
+ bdev_drop(part);
}
static int part_uevent(const struct device *dev, struct kobj_uevent_env *env)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 4ca3449ce9c9..02a3d58e814f 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -74,6 +74,13 @@ struct block_device {
#ifdef CONFIG_SECURITY
void *bd_security;
#endif
+
+ /*
+ * Allow assigning write streams to partitions.
+ */
+ unsigned short bd_part_write_streams;
+ unsigned short bd_part_write_stream_start;
+
/*
* keep this out-of-line as it's both big and not needed in the fast
* path
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9fda66530d9a..bb0921e642fb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1242,7 +1242,7 @@ static inline unsigned int bdev_max_segments(struct block_device *bdev)
static inline unsigned short bdev_max_write_streams(struct block_device *bdev)
{
if (bdev_is_partition(bdev))
- return 0;
+ return bdev->bd_part_write_streams;
return bdev_limits(bdev)->max_write_streams;
}
--
2.45.2
More information about the Linux-nvme
mailing list