[PATCH 15/15] RFC: block: allow write streams on partitions

Christoph Hellwig hch at lst.de
Tue Nov 19 04:16:29 PST 2024


By default assign all write streams to partition 1, and add a hacky
sysfs file that distributes them equally across all partitions.

This is implemented by storing the number of per-partition write
streams in struct block_device, as well as their offset into the
global ones, and then remapping the write streams in the I/O
submission path.

The sysfs interface is hacky and undocumented; better suggestions are
welcome from actual users of write streams on partitions.

Signed-off-by: Christoph Hellwig <hch at lst.de>
---
 block/bdev.c              |  9 +++++++
 block/blk-core.c          |  2 ++
 block/genhd.c             | 52 +++++++++++++++++++++++++++++++++++++++
 block/partitions/core.c   |  6 +++--
 include/linux/blk_types.h |  7 ++++++
 include/linux/blkdev.h    |  2 +-
 6 files changed, 75 insertions(+), 3 deletions(-)

diff --git a/block/bdev.c b/block/bdev.c
index c23245f1fdfe..f3549a8cdb3f 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -440,6 +440,15 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
 		return NULL;
 	}
 	bdev->bd_disk = disk;
+
+	/*
+	 * Assign all write streams to the first partition by default.  Use the
+	 * queue limit: on a partition bdev_max_write_streams() reads this field.
+	 */
+	if (partno == 1) {
+		bdev->bd_part_write_stream_start = 0;
+		bdev->bd_part_write_streams = disk->queue->limits.max_write_streams;
+	}
 	return bdev;
 }
 
diff --git a/block/blk-core.c b/block/blk-core.c
index 666efe8fa202..9654937f9b2d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -574,6 +574,8 @@ static int blk_partition_remap(struct bio *bio)
 		return -EIO;
 	if (bio_sectors(bio)) {
 		bio->bi_iter.bi_sector += p->bd_start_sect;
+		if (bio->bi_write_stream)
+			bio->bi_write_stream += p->bd_part_write_stream_start;
 		trace_block_bio_remap(bio, p->bd_dev,
 				      bio->bi_iter.bi_sector -
 				      p->bd_start_sect);
diff --git a/block/genhd.c b/block/genhd.c
index 79230c109fca..3156c70522b6 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1070,6 +1070,54 @@ static ssize_t partscan_show(struct device *dev,
 	return sysfs_emit(buf, "%u\n", disk_has_partscan(dev_to_disk(dev)));
 }
 
+static ssize_t disk_distribute_write_streams_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	/* Placeholder, always "0"; anything useful to show like the ranges? */
+	return sysfs_emit(buf, "0\n");
+}
+
+/*
+ * Any write splits the disk's write streams evenly over its partitions; the
+ * remainder is left unassigned.  -EINVAL without streams, -EBUSY while open.
+ */
+static ssize_t disk_distribute_write_streams_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+	unsigned short total = disk->queue->limits.max_write_streams;
+	unsigned short per_part, start = 0;
+	unsigned long nr_parts = 0, idx;
+	struct block_device *part;
+	ssize_t ret = count;
+
+	if (!total)
+		return -EINVAL;
+
+	mutex_lock(&disk->open_mutex);
+	if (atomic_read(&disk->part0->bd_openers)) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+	xa_for_each_start(&disk->part_tbl, idx, part, 1)
+		nr_parts++;
+	if (!nr_parts)
+		goto out_unlock;
+
+	per_part = total / nr_parts;
+	xa_for_each_start(&disk->part_tbl, idx, part, 1) {
+		part->bd_part_write_streams = per_part;
+		part->bd_part_write_stream_start = start;
+		/* Log the range start, so advance only after printing. */
+		dev_info(dev, "assigning %u write streams at %u to partition %lu\n",
+			 per_part, start, idx);
+		start += per_part;
+	}
+out_unlock:
+	mutex_unlock(&disk->open_mutex);
+	return ret;
+}
+
 static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
 static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
 static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
@@ -1084,6 +1132,9 @@ static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
 static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
 static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL);
 static DEVICE_ATTR(partscan, 0444, partscan_show, NULL);
+static DEVICE_ATTR(distribute_write_streams, 0644,
+	disk_distribute_write_streams_show,
+	disk_distribute_write_streams_store);
 
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 ssize_t part_fail_show(struct device *dev,
@@ -1135,6 +1186,7 @@ static struct attribute *disk_attrs[] = {
 	&dev_attr_events_poll_msecs.attr,
 	&dev_attr_diskseq.attr,
 	&dev_attr_partscan.attr,
+	&dev_attr_distribute_write_streams.attr,
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 	&dev_attr_fail.attr,
 #endif
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 815ed33caa1b..a27dbb5589ce 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -245,8 +245,10 @@ static const struct attribute_group *part_attr_groups[] = {
 
 static void part_release(struct device *dev)
 {
-	put_disk(dev_to_bdev(dev)->bd_disk);
-	bdev_drop(dev_to_bdev(dev));
+	struct block_device *bdev = dev_to_bdev(dev);	/* resolve once */
+
+	put_disk(bdev->bd_disk);
+	bdev_drop(bdev);
 }
 
 static int part_uevent(const struct device *dev, struct kobj_uevent_env *env)
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 4ca3449ce9c9..02a3d58e814f 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -74,6 +74,13 @@ struct block_device {
 #ifdef CONFIG_SECURITY
 	void			*bd_security;
 #endif
+
+	/*
+	 * Write streams owned by a partition: count and first-stream offset.
+	 */
+	unsigned short		bd_part_write_streams;
+	unsigned short		bd_part_write_stream_start;
+
 	/*
 	 * keep this out-of-line as it's both big and not needed in the fast
 	 * path
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 9fda66530d9a..bb0921e642fb 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -1242,7 +1242,7 @@ static inline unsigned int bdev_max_segments(struct block_device *bdev)
 static inline unsigned short bdev_max_write_streams(struct block_device *bdev)
 {
 	if (bdev_is_partition(bdev))
-		return 0;
+		return bdev->bd_part_write_streams; /* streams assigned to it */
 	return bdev_limits(bdev)->max_write_streams;
 }
 
-- 
2.45.2




More information about the Linux-nvme mailing list