[RFC 3/4] block: add bit bucket support
Keith Busch
kbusch at fb.com
Wed May 4 09:32:06 PDT 2022
From: Keith Busch <kbusch at kernel.org>
Bit buckets allow applications to read partial sectors. Add block
layer support for partial reads when the request_queue supports it.
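
For illustration, a partial sector read from userspace could look like the
sketch below. The device path, offset, and length are examples only; the
buffer must still satisfy the queue's DMA alignment, since only the logical
block alignment of the file offset and length is relaxed.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdlib.h>
#include <unistd.h>

int main(void)
{
	void *buf;
	ssize_t ret;
	int fd = open("/dev/nvme0n1", O_RDONLY | O_DIRECT);

	if (fd < 0)
		return 1;
	if (posix_memalign(&buf, 4096, 4096))
		return 1;

	/*
	 * 512 bytes starting at byte offset 100: the offset is not a
	 * multiple of the 512-byte logical block size, so this read spans
	 * two sectors and would previously have failed with -EINVAL.
	 */
	ret = pread(fd, buf, 512, 100);

	free(buf);
	close(fd);
	return ret == 512 ? 0 : 1;
}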
This implementation designates a special page for bit buckets. Filling
the holes with this special page lets merging operations continue as
normal. Read data should never be copied into this page; the driver
should instead recognize it and set up its scatter-gather list
accordingly.
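
A rough sketch of the driver-side handling (the two descriptor helpers
below are hypothetical, for illustration only):

/*
 * Sketch: while building the data descriptors for a request, a driver
 * can detect the shared bit bucket page and emit a "discard this data"
 * entry instead of mapping the page for DMA.
 */
static void fill_data_descriptors(struct request *rq)
{
	struct req_iterator iter;
	struct bio_vec bv;

	rq_for_each_segment(bv, rq, iter) {
		if (blk_is_bit_bucket(bv.bv_page))
			add_bit_bucket_desc(bv.bv_len);		/* hypothetical */
		else
			add_data_desc(bv.bv_page, bv.bv_offset,
				      bv.bv_len);		/* hypothetical */
	}
}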
The bit_bucket queue attribute is exported through sysfs so applications
can tell whether partial sector reads are possible.
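
For example, an application could test the attribute before attempting an
unaligned read (the disk name is just an example):

#include <stdio.h>

/* Returns 1 if /sys/block/<disk>/queue/bit_bucket reads back '1'. */
static int supports_bit_bucket(const char *disk)
{
	char path[128];
	int val;
	FILE *f;

	snprintf(path, sizeof(path), "/sys/block/%s/queue/bit_bucket", disk);
	f = fopen(path, "r");
	if (!f)
		return 0;
	val = fgetc(f);
	fclose(f);
	return val == '1';
}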
This implementation only works for direct I/O on raw block devices, and
does not work with pre-registered buffers since those already arrive as
a bvec.
Requests containing bit buckets are flagged specially since NVMe needs
to know, before walking the segments, whether it should construct a bit
bucket SGL instead of a PRP list.
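
An illustrative sketch of how a driver could key off the request flag
before walking any segments (the helper names are made up):

static blk_status_t map_request_data(struct request *rq)
{
	/*
	 * RQF_BIT_BUCKET is set when the bio is turned into a request,
	 * so the transfer format can be chosen up front.
	 */
	if (rq->rq_flags & RQF_BIT_BUCKET)
		return setup_bit_bucket_sgl(rq);	/* hypothetical */
	return setup_prp_list(rq);			/* hypothetical */
}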
Signed-off-by: Keith Busch <kbusch at kernel.org>
---
 block/blk-core.c          |  5 ++++
 block/blk-merge.c         |  3 +-
 block/blk-mq.c            |  2 ++
 block/blk-sysfs.c         |  3 ++
 block/fops.c              | 58 +++++++++++++++++++++++++++++++++------
 include/linux/blk-mq.h    |  2 ++
 include/linux/blk_types.h |  1 +
 include/linux/blkdev.h    | 13 +++++++++
 8 files changed, 77 insertions(+), 10 deletions(-)
diff --git a/block/blk-core.c b/block/blk-core.c
index 937bb6b86331..a11931857dd9 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -74,6 +74,9 @@ struct kmem_cache *blk_requestq_srcu_cachep;
*/
static struct workqueue_struct *kblockd_workqueue;
+struct page *blk_bb_page;
+EXPORT_SYMBOL_GPL(blk_bb_page);
+
/**
* blk_queue_flag_set - atomically set a queue flag
* @flag: flag to be set
@@ -1309,5 +1312,7 @@ int __init blk_dev_init(void)
blk_debugfs_root = debugfs_create_dir("block", NULL);
+ blk_bb_page = ZERO_PAGE(0);
+
return 0;
}
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 7771dacc99cb..3fde24bf97f3 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -278,7 +278,8 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
* If the queue doesn't support SG gaps and adding this
* offset would create a gap, disallow it.
*/
- if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
+ if (!bio_flagged(bio, BIO_BIT_BUCKET) && bvprvp &&
+ bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
goto split;
if (nsegs < max_segs &&
diff --git a/block/blk-mq.c b/block/blk-mq.c
index c4370d276170..80309d243a09 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -2411,6 +2411,8 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
if (bio->bi_opf & REQ_RAHEAD)
rq->cmd_flags |= REQ_FAILFAST_MASK;
+ if (bio_flagged(bio, BIO_BIT_BUCKET))
+ rq->rq_flags |= RQF_BIT_BUCKET;
rq->__sector = bio->bi_iter.bi_sector;
blk_rq_bio_prep(rq, bio, nr_segs);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 14607565d781..19c385084aea 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -309,6 +309,7 @@ QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1);
QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0);
QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0);
QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0);
+QUEUE_SYSFS_BIT_FNS(bit_bucket, BIT_BUCKET, 0);
#undef QUEUE_SYSFS_BIT_FNS
static ssize_t queue_zoned_show(struct request_queue *q, char *page)
@@ -627,6 +628,7 @@ QUEUE_RW_ENTRY(queue_nonrot, "rotational");
QUEUE_RW_ENTRY(queue_iostats, "iostats");
QUEUE_RW_ENTRY(queue_random, "add_random");
QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes");
+QUEUE_RW_ENTRY(queue_bit_bucket, "bit_bucket");
static struct attribute *queue_attrs[] = {
&queue_requests_entry.attr,
@@ -653,6 +655,7 @@ static struct attribute *queue_attrs[] = {
&queue_zone_append_max_entry.attr,
&queue_zone_write_granularity_entry.attr,
&queue_nonrot_entry.attr,
+ &queue_bit_bucket_entry.attr,
&queue_zoned_entry.attr,
&queue_nr_zones_entry.attr,
&queue_max_open_zones_entry.attr,
diff --git a/block/fops.c b/block/fops.c
index a6583bce1e7d..36ccd52ece03 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -57,13 +57,21 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
{
struct block_device *bdev = iocb->ki_filp->private_data;
struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
+ unsigned int blksz = bdev_logical_block_size(bdev);
loff_t pos = iocb->ki_pos;
bool should_dirty = false;
+ u16 skip = 0, trunc = 0;
struct bio bio;
ssize_t ret;
- if ((pos | iov_iter_count(iter)) & (bdev_logical_block_size(bdev) - 1))
- return -EINVAL;
+ if ((pos | iov_iter_count(iter)) & (blksz - 1)) {
+ if (iov_iter_rw(iter) != READ || iov_iter_is_bvec(iter) ||
+ !blk_queue_bb(bdev_get_queue(bdev)))
+ return -EINVAL;
+ skip = pos & (blksz - 1);
+ trunc = blksz - ((pos + iov_iter_count(iter)) & (blksz - 1));
+ nr_pages += !!skip + !!trunc;
+ }
if (iov_iter_alignment(iter) & bdev_dma_alignment(bdev))
return -EINVAL;
@@ -80,6 +88,8 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
bio_init(&bio, bdev, vecs, nr_pages, REQ_OP_READ);
if (iter_is_iovec(iter))
should_dirty = true;
+ if (skip)
+ blk_add_bb_page(&bio, skip);
} else {
bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb));
}
@@ -91,7 +101,10 @@ static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
ret = bio_iov_iter_get_pages(&bio, iter);
if (unlikely(ret))
goto out;
- ret = bio.bi_iter.bi_size;
+
+ if (trunc)
+ blk_add_bb_page(&bio, trunc);
+ ret = bio.bi_iter.bi_size - trunc - skip;
if (iov_iter_rw(iter) == WRITE)
task_io_account_write(ret);
@@ -186,16 +199,25 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
unsigned int nr_pages)
{
struct block_device *bdev = iocb->ki_filp->private_data;
+ unsigned int blksz = bdev_logical_block_size(bdev);
struct blk_plug plug;
struct blkdev_dio *dio;
struct bio *bio;
bool is_read = (iov_iter_rw(iter) == READ), is_sync;
unsigned int opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
loff_t pos = iocb->ki_pos;
+ u16 skip = 0, trunc = 0, bucket_bytes = 0;
int ret = 0;
- if ((pos | iov_iter_count(iter)) & (bdev_logical_block_size(bdev) - 1))
- return -EINVAL;
+ if ((pos | iov_iter_count(iter)) & (blksz - 1)) {
+ if (iov_iter_rw(iter) != READ || iov_iter_is_bvec(iter) ||
+ !blk_queue_bb(bdev_get_queue(bdev)))
+ return -EINVAL;
+ skip = pos & (blksz - 1);
+ trunc = blksz - ((pos + iov_iter_count(iter)) & (blksz - 1));
+ bucket_bytes = skip + trunc;
+ nr_pages += !!skip + !!trunc;
+ }
if (iov_iter_alignment(iter) & bdev_dma_alignment(bdev))
return -EINVAL;
@@ -240,6 +262,10 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
if (is_read) {
if (dio->flags & DIO_SHOULD_DIRTY)
bio_set_pages_dirty(bio);
+ if (skip) {
+ blk_add_bb_page(bio, skip);
+ skip = 0;
+ }
} else {
task_io_account_write(bio->bi_iter.bi_size);
}
@@ -251,6 +277,8 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);
if (!nr_pages) {
+ if (trunc)
+ blk_add_bb_page(bio, trunc);
submit_bio(bio);
break;
}
@@ -275,7 +303,7 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
if (!ret)
ret = blk_status_to_errno(dio->bio.bi_status);
if (likely(!ret))
- ret = dio->size;
+ ret = dio->size - bucket_bytes;
bio_put(&dio->bio);
return ret;
@@ -311,15 +339,23 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
unsigned int nr_pages)
{
struct block_device *bdev = iocb->ki_filp->private_data;
+ unsigned int blksz = bdev_logical_block_size(bdev);
bool is_read = iov_iter_rw(iter) == READ;
unsigned int opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
struct blkdev_dio *dio;
struct bio *bio;
loff_t pos = iocb->ki_pos;
+ u16 skip = 0, trunc = 0;
int ret = 0;
- if ((pos | iov_iter_count(iter)) & (bdev_logical_block_size(bdev) - 1))
- return -EINVAL;
+ if ((pos | iov_iter_count(iter)) & (blksz - 1)) {
+ if (iov_iter_rw(iter) != READ || iov_iter_is_bvec(iter) ||
+ !blk_queue_bb(bdev_get_queue(bdev)))
+ return -EINVAL;
+ skip = pos & (blksz - 1);
+ trunc = blksz - ((pos + iov_iter_count(iter)) & (blksz - 1));
+ nr_pages += !!skip + !!trunc;
+ }
if (iov_iter_alignment(iter) & bdev_dma_alignment(bdev))
return -EINVAL;
@@ -340,13 +376,17 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
*/
bio_iov_bvec_set(bio, iter);
} else {
+ if (skip)
+ blk_add_bb_page(bio, skip);
ret = bio_iov_iter_get_pages(bio, iter);
if (unlikely(ret)) {
bio_put(bio);
return ret;
}
+ if (trunc)
+ blk_add_bb_page(bio, trunc);
}
- dio->size = bio->bi_iter.bi_size;
+ dio->size = bio->bi_iter.bi_size - trunc - skip;
if (is_read) {
if (iter_is_iovec(iter)) {
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index 7aa5c54901a9..1a3902c2440f 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -22,6 +22,8 @@ typedef __u32 __bitwise req_flags_t;
/* drive already may have started this one */
#define RQF_STARTED ((__force req_flags_t)(1 << 1))
+/* request has bit bucket payload */
+#define RQF_BIT_BUCKET ((__force req_flags_t)(1 << 2))
/* may not be passed by ioscheduler */
#define RQF_SOFTBARRIER ((__force req_flags_t)(1 << 3))
/* request for flush sequence */
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index 1973ef9bd40f..f55e194b72a0 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -330,6 +330,7 @@ enum {
BIO_REMAPPED,
BIO_ZONE_WRITE_LOCKED, /* Owns a zoned device zone write lock */
BIO_PERCPU_CACHE, /* can participate in per-cpu alloc cache */
+ BIO_BIT_BUCKET, /* contains one or more bit bucket pages */
BIO_FLAG_LAST
};
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index dba6d411fc1e..5feaa5e7810e 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -44,6 +44,7 @@ struct blk_crypto_profile;
extern const struct device_type disk_type;
extern struct device_type part_type;
extern struct class block_class;
+extern struct page *blk_bb_page;
/* Must be consistent with blk_mq_poll_stats_bkt() */
#define BLK_MQ_POLL_STATS_BKTS 16
@@ -560,6 +561,7 @@ struct request_queue {
#define QUEUE_FLAG_RQ_ALLOC_TIME 27 /* record rq->alloc_time_ns */
#define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */
#define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */
+#define QUEUE_FLAG_BIT_BUCKET 30 /* device supports read bit buckets */
#define QUEUE_FLAG_MQ_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \
(1 << QUEUE_FLAG_SAME_COMP) | \
@@ -605,6 +607,7 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
#define blk_queue_fua(q) test_bit(QUEUE_FLAG_FUA, &(q)->queue_flags)
#define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags)
#define blk_queue_nowait(q) test_bit(QUEUE_FLAG_NOWAIT, &(q)->queue_flags)
+#define blk_queue_bb(q) test_bit(QUEUE_FLAG_BIT_BUCKET, &(q)->queue_flags)
extern void blk_set_pm_only(struct request_queue *q);
extern void blk_clear_pm_only(struct request_queue *q);
@@ -1588,4 +1591,14 @@ struct io_comp_batch {
#define DEFINE_IO_COMP_BATCH(name) struct io_comp_batch name = { }
+static inline void blk_add_bb_page(struct bio *bio, int len)
+{
+ bio_set_flag(bio, BIO_BIT_BUCKET);
+ get_page(blk_bb_page);
+ bio_add_page(bio, blk_bb_page, len, 0);
+}
+static inline bool blk_is_bit_bucket(struct page *page)
+{
+ return page == blk_bb_page;
+}
#endif /* _LINUX_BLKDEV_H */
--
2.30.2