[PATCH 1/9] block: Make generic_make_request handle arbitrary sized bios
Keith Busch
keith.busch at intel.com
Thu Mar 13 19:33:04 EDT 2014
On Fri, 28 Feb 2014, Kent Overstreet wrote:
> On Thu, Feb 27, 2014 at 12:22:54PM -0500, Matthew Wilcox wrote:
>> On Wed, Feb 26, 2014 at 03:39:49PM -0800, Kent Overstreet wrote:
>>> We do this by adding calls to blk_queue_split() to the various
>>> make_request functions that need it - a few can already handle arbitrary
>>> size bios. Note that we add the call _after_ any call to blk_queue_bounce();
>>> this means that blk_queue_split() and blk_recalc_rq_segments() don't need to
>>> be concerned with bouncing affecting segment merging.
>>
>>> diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
>>> index 51824d1f23..e4376b9613 100644
>>> --- a/drivers/block/nvme-core.c
>>> +++ b/drivers/block/nvme-core.c
>>> @@ -737,6 +737,8 @@ static void nvme_make_request(struct request_queue *q, struct bio *bio)
>>> struct nvme_queue *nvmeq = get_nvmeq(ns->dev);
>>> int result = -EBUSY;
>>>
>>> + blk_queue_split(q, &bio, q->bio_split);
>>> +
>>> if (!nvmeq) {
>>> put_nvmeq(NULL);
>>> bio_endio(bio, -EIO);
>>
>> I'd suggest that we do:
>>
>> - struct nvme_queue *nvmeq = get_nvmeq(ns->dev);
>> + struct nvme_queue *nvmeq;
>> int result = -EBUSY;
>>
>> + blk_queue_split(q, &bio, q->bio_split);
>> +
>> + nvmeq = get_nvmeq(ns->dev);
>> if (!nvmeq) {
>>
>> so that we're running the blk_queue_split() code outside the get_cpu()
>> call.
>>
>> Now, the NVMe driver has its own rules about when BIOs have to be split.
>> Right now, that's way down inside the nvme_map_bio() call when we're
>> walking the bio to compose the scatterlist. Should we instead have an
>> nvme_bio_split() routine that is called instead of blk_queue_split(),
>> so that we can simplify nvme_map_bio(), since it'll know that it's working
>> with bios that don't have to be split?
>>
>> In fact, I think it would have little NVMe-specific in it at that point,
>> so we could name __blk_bios_map_sg() better, export it to drivers and
>> call it from nvme_map_bio(), which I think would make everybody happier.
>
> Actually, reading nvme_map_bio() (it's different since last I looked at
> it) it looks like nvme should already be able to handle arbitrary size
> bios?
>
> I do intend to rework the blk_bio_map_sg() (or add a new one?) to
> incrementally map as much of a bio as will fit in the provided
> scatterlist, but it looks like nvme has some odd restrictions where it's
> using BIOVEC_PHYS_MERGEABLE()/BIOVEC_NOT_VIRT_MERGEABLE() so I dunno if
> it's worth bothering to try and have it use generic code.
>
> However we don't need an explicit split here: if the sg fills up (i.e.
> the places nvme_split_and_submit() is called), we can just mark the bio
> as partially completed (set bio->bi_iter = iter, i.e. use the iterator
> you passed to bio_for_each_segment), then increment bi_remaining (which
> just counts completions, i.e. bio_endio() calls before the bio is really
> completed) and resubmit the original bio. No need to allocate a split
> bio, or loop over the bio again in bio_split().
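For illustration, the partial-completion approach described above might look
roughly like the following against the 3.14 immutable-biovec API (untested
sketch; map_bio_partial, max_sg and resubmit() are made-up names, and the
physical-merge checks are omitted for brevity):

#include <linux/bio.h>
#include <linux/scatterlist.h>

/*
 * Rough sketch of the "no explicit split" idea: map bvecs until the
 * scatterlist is full, leave the unmapped remainder in bio->bi_iter,
 * take an extra completion reference via bi_remaining so the first
 * chunk's bio_endio() doesn't complete the whole bio, and hand the
 * original bio back for resubmission.  No bio_split() clone needed.
 */
static int map_bio_partial(struct bio *bio, struct scatterlist *sgl,
			   int max_sg, void (*resubmit)(struct bio *))
{
	struct scatterlist *sg = NULL;
	struct bio_vec bvec;
	struct bvec_iter iter;
	int nsegs = 0;

	bio_for_each_segment(bvec, bio, iter) {
		if (nsegs == max_sg) {
			/* 'iter' points at the first unmapped bvec. */
			bio->bi_iter = iter;
			/* One more bio_endio() before real completion. */
			atomic_inc(&bio->bi_remaining);
			resubmit(bio);
			break;
		}
		sg = sg ? sg + 1 : sgl;
		sg_set_page(sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset);
		nsegs++;
	}
	if (sg)
		sg_mark_end(sg);
	/* Caller submits the mapped segments; any remainder was resubmitted. */
	return nsegs;
}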
Hi Kent,

I wanted to see about getting rid of nvme_map_bio and doing all the
splits generically. I built this on top of your original patch and
added additional split criteria for NVMe, which include the "segment"
address offset and the vendor-specific "stripe_size" (I think "chunk_size"
may have been a more appropriate name, but I used someone else's term
for this). This is what I came up with; I hope the additions are useful,
but maybe the additional split criteria apply only to nvme.
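To make the stripe criterion below concrete (the numbers are just an
example, not from the patch): with stripe_sectors = 256, i.e. 128K stripes,
a bio starting at sector 200 gets

	split_len = (256 - (200 & 255)) << 9
	          = 56 << 9
	          = 28672 bytes

so a bio larger than 56 sectors is split at the next stripe boundary,
while one that fits within the remainder of the stripe is left alone.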
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 0afbe3f..2848181 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -66,11 +66,9 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
struct bio *bio,
struct bio_set *bs)
{
- struct bio *split;
struct bio_vec bv, bvprv;
struct bvec_iter iter;
unsigned seg_size = 0, nsegs = 0;
- int prev = 0;
struct bvec_merge_data bvm = {
.bi_bdev = bio->bi_bdev,
@@ -79,17 +77,26 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
.bi_rw = bio->bi_rw,
};
+ if (bio->bi_iter.bi_size >> 9 > queue_max_sectors(q)) {
+ bvm.bi_size = queue_max_sectors(q) << 9;
+ goto split;
+ }
+ if (queue_stripe_sectors(q)) {
+ unsigned split_len =
+ (queue_stripe_sectors(q) -
+ (bio->bi_iter.bi_sector &
+ (queue_stripe_sectors(q) - 1))) << 9;
+ if (split_len < bio->bi_iter.bi_size) {
+ bvm.bi_size = split_len;
+ goto split;
+ }
+ }
+
bio_for_each_segment(bv, bio, iter) {
if (q->merge_bvec_fn &&
q->merge_bvec_fn(q, &bvm, &bv) < (int) bv.bv_len)
goto split;
-
- bvm.bi_size += bv.bv_len;
-
- if (bvm.bi_size >> 9 > queue_max_sectors(q))
- goto split;
-
- if (prev && blk_queue_cluster(q)) {
+ if (nsegs && blk_queue_cluster(q)) {
if (seg_size + bv.bv_len > queue_max_segment_size(q))
goto new_segment;
if (!BIOVEC_PHYS_MERGEABLE(&bvprv, &bv))
@@ -97,34 +104,26 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
if (!BIOVEC_SEG_BOUNDARY(q, &bvprv, &bv))
goto new_segment;
+ bvm.bi_size += bv.bv_len;
seg_size += bv.bv_len;
bvprv = bv;
- prev = 1;
continue;
}
new_segment:
+ if (nsegs && BIOVEC_SEG_OFFSET(q, &bvprv, &bv))
+ goto split;
if (nsegs == queue_max_segments(q))
goto split;
+ bvm.bi_size += bv.bv_len;
nsegs++;
bvprv = bv;
- prev = 1;
seg_size = bv.bv_len;
}
return NULL;
split:
- split = bio_clone_bioset(bio, GFP_NOIO, bs);
-
- split->bi_iter.bi_size -= iter.bi_size;
- bio->bi_iter = iter;
-
- if (bio_integrity(bio)) {
- bio_integrity_advance(bio, split->bi_iter.bi_size);
- bio_integrity_trim(split, 0, bio_sectors(split));
- }
-
- return split;
+ return bio_split(bio, bvm.bi_size >> 9, GFP_ATOMIC, bs);
}
void blk_queue_split(struct request_queue *q, struct bio **bio,
@@ -303,7 +302,7 @@ new_segment:
*bvprv = *bvec;
}
-static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
+int blk_bios_map_sg(struct request_queue *q, struct bio *bio,
struct scatterlist *sglist,
struct scatterlist **sg)
{
@@ -344,6 +343,7 @@ single_segment:
return nsegs;
}
+EXPORT_SYMBOL(blk_bios_map_sg);
/*
* map a request to scatterlist, return number of sg entries setup. Caller
@@ -356,7 +356,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
int nsegs = 0;
if (rq->bio)
- nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg);
+ nsegs = blk_bios_map_sg(q, rq->bio, sglist, &sg);
if (unlikely(rq->cmd_flags & REQ_COPY_USER) &&
(blk_rq_bytes(rq) & q->dma_pad_mask)) {
@@ -407,7 +407,7 @@ int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
struct bio *next = bio->bi_next;
bio->bi_next = NULL;
- nsegs = __blk_bios_map_sg(q, bio, sglist, &sg);
+ nsegs = blk_bios_map_sg(q, bio, sglist, &sg);
bio->bi_next = next;
if (sg)
sg_mark_end(sg);
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 5d21239..6590703 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -111,8 +111,10 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->max_segments = BLK_MAX_SEGMENTS;
lim->max_integrity_segments = 0;
lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK;
+ lim->seg_offset_mask = 0;
lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
+ lim->stripe_sectors = 0;
lim->max_write_same_sectors = 0;
lim->max_discard_sectors = 0;
lim->discard_granularity = 0;
@@ -146,6 +148,7 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->max_hw_sectors = UINT_MAX;
lim->max_segment_size = UINT_MAX;
lim->max_sectors = UINT_MAX;
+ lim->stripe_sectors = 0;
lim->max_write_same_sectors = UINT_MAX;
}
EXPORT_SYMBOL(blk_set_stacking_limits);
@@ -262,6 +265,18 @@ void blk_limits_max_hw_sectors(struct queue_limits *limits, unsigned int max_hw_
}
EXPORT_SYMBOL(blk_limits_max_hw_sectors);
+void blk_queue_stripe_sectors(struct request_queue *q, unsigned int hw_stripe_sectors)
+{
+ if (hw_stripe_sectors && ((hw_stripe_sectors << 9) < PAGE_CACHE_SIZE ||
+ !is_power_of_2(hw_stripe_sectors))) {
+ hw_stripe_sectors = 1 << (PAGE_CACHE_SHIFT - 9);
+ printk(KERN_INFO "%s: set to minimum %d\n",
+ __func__, hw_stripe_sectors);
+ }
+ q->limits.stripe_sectors = hw_stripe_sectors;
+}
+EXPORT_SYMBOL(blk_queue_stripe_sectors);
+
/**
* blk_queue_max_hw_sectors - set max sectors for a request for this queue
* @q: the request queue for the device
@@ -527,10 +542,13 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
t->max_write_same_sectors = min(t->max_write_same_sectors,
b->max_write_same_sectors);
+ t->stripe_sectors = min_not_zero(t->stripe_sectors, b->stripe_sectors);
t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
b->seg_boundary_mask);
+ t->seg_offset_mask = min_not_zero(t->seg_offset_mask,
+ b->seg_offset_mask);
t->max_segments = min_not_zero(t->max_segments, b->max_segments);
t->max_integrity_segments = min_not_zero(t->max_integrity_segments,
@@ -758,7 +776,7 @@ EXPORT_SYMBOL_GPL(blk_queue_dma_drain);
**/
void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask)
{
- if (mask < PAGE_CACHE_SIZE - 1) {
+ if (mask < (PAGE_CACHE_SIZE - 1)) {
mask = PAGE_CACHE_SIZE - 1;
printk(KERN_INFO "%s: set to minimum %lx\n",
__func__, mask);
@@ -769,6 +787,23 @@ void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask)
EXPORT_SYMBOL(blk_queue_segment_boundary);
/**
+ * blk_queue_segment_offset - set boundary rules for segment start address
+ * @q: the request queue for the device
+ * @mask: the memory offset mask at which the request must be split
+ **/
+void blk_queue_segment_offset(struct request_queue *q, unsigned long mask)
+{
+ if (mask && mask < (PAGE_CACHE_SIZE - 1)) {
+ printk(KERN_INFO "%s: mask:%lx set to minimum %lx\n",
+ __func__, mask, PAGE_CACHE_SIZE - 1);
+ mask = PAGE_CACHE_SIZE - 1;
+ }
+
+ q->limits.seg_offset_mask = mask;
+}
+EXPORT_SYMBOL(blk_queue_segment_offset);
+
+/**
* blk_queue_dma_alignment - set dma length and memory alignment
* @q: the request queue for the device
* @mask: alignment mask
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c
index e4376b9..7980697 100644
--- a/drivers/block/nvme-core.c
+++ b/drivers/block/nvme-core.c
@@ -471,71 +471,6 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_common_command *cmd,
return total_len;
}
-static int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq,
- int len)
-{
- struct bio *split = bio_split(bio, len >> 9, GFP_ATOMIC, NULL);
- if (!split)
- return -ENOMEM;
-
- bio_chain(split, bio);
-
- if (bio_list_empty(&nvmeq->sq_cong))
- add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait);
- bio_list_add(&nvmeq->sq_cong, split);
- bio_list_add(&nvmeq->sq_cong, bio);
-
- return 0;
-}
-
-/* NVMe scatterlists require no holes in the virtual address */
-#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \
- (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE))
-
-static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod,
- struct bio *bio, enum dma_data_direction dma_dir, int psegs)
-{
- struct bio_vec bvec, bvprv;
- struct bvec_iter iter;
- struct scatterlist *sg = NULL;
- int length = 0, nsegs = 0, split_len = bio->bi_iter.bi_size;
- int first = 1;
-
- if (nvmeq->dev->stripe_size)
- split_len = nvmeq->dev->stripe_size -
- ((bio->bi_iter.bi_sector << 9) &
- (nvmeq->dev->stripe_size - 1));
-
- sg_init_table(iod->sg, psegs);
- bio_for_each_segment(bvec, bio, iter) {
- if (!first && BIOVEC_PHYS_MERGEABLE(&bvprv, &bvec)) {
- sg->length += bvec.bv_len;
- } else {
- if (!first && BIOVEC_NOT_VIRT_MERGEABLE(&bvprv, &bvec))
- return nvme_split_and_submit(bio, nvmeq,
- length);
-
- sg = sg ? sg + 1 : iod->sg;
- sg_set_page(sg, bvec.bv_page,
- bvec.bv_len, bvec.bv_offset);
- nsegs++;
- }
-
- if (split_len - length < bvec.bv_len)
- return nvme_split_and_submit(bio, nvmeq, split_len);
- length += bvec.bv_len;
- bvprv = bvec;
- first = 0;
- }
- iod->nents = nsegs;
- sg_mark_end(sg);
- if (dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir) == 0)
- return -ENOMEM;
-
- BUG_ON(length != bio->bi_iter.bi_size);
- return length;
-}
-
/*
* We reuse the small pool to allocate the 16-byte range here as it is not
* worth having a special pool for these or additional cases to handle freeing
@@ -607,6 +542,7 @@ int nvme_submit_flush_data(struct nvme_queue *nvmeq, struct nvme_ns *ns)
static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
struct bio *bio)
{
+ struct scatterlist *sg = NULL;
struct nvme_command *cmnd;
struct nvme_iod *iod;
enum dma_data_direction dma_dir;
@@ -662,14 +598,17 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns,
dma_dir = DMA_FROM_DEVICE;
}
- result = nvme_map_bio(nvmeq, iod, bio, dma_dir, psegs);
- if (result <= 0)
+ sg_init_table(iod->sg, psegs);
+ iod->nents = blk_bios_map_sg(ns->queue, bio, iod->sg, &sg);
+ sg_mark_end(sg);
+ if (dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir) == 0) {
+ result = -ENOMEM;
goto free_cmdid;
- length = result;
+ }
cmnd->rw.command_id = cmdid;
cmnd->rw.nsid = cpu_to_le32(ns->ns_id);
- length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, length,
+ length = nvme_setup_prps(nvmeq->dev, &cmnd->common, iod, bio->bi_iter.bi_size,
GFP_ATOMIC);
cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector));
cmnd->rw.length = cpu_to_le16((length >> ns->lba_shift) - 1);
@@ -734,11 +673,12 @@ static int nvme_process_cq(struct nvme_queue *nvmeq)
static void nvme_make_request(struct request_queue *q, struct bio *bio)
{
struct nvme_ns *ns = q->queuedata;
- struct nvme_queue *nvmeq = get_nvmeq(ns->dev);
+ struct nvme_queue *nvmeq;
int result = -EBUSY;
blk_queue_split(q, &bio, q->bio_split);
+ nvmeq = get_nvmeq(ns->dev);
if (!nvmeq) {
put_nvmeq(NULL);
bio_endio(bio, -EIO);
@@ -812,6 +752,9 @@ int nvme_submit_sync_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd,
int cmdid;
struct sync_cmd_info cmdinfo;
+ if (!nvmeq || nvmeq->q_suspended)
+ return -EBUSY;
+
cmdinfo.task = current;
cmdinfo.status = -EINTR;
@@ -1767,6 +1710,9 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid,
blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
if (dev->max_hw_sectors)
blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors);
+ if (dev->stripe_size)
+ blk_queue_stripe_sectors(ns->queue, dev->stripe_size >> 9);
+ blk_queue_segment_offset(ns->queue, PAGE_SIZE - 1);
disk->major = nvme_major;
disk->first_minor = 0;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 5a4d39b..7dbbd03 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -186,6 +186,11 @@ static inline void *bio_data(struct bio *bio)
#define BIOVEC_SEG_BOUNDARY(q, b1, b2) \
__BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, queue_segment_boundary((q)))
+#define __BIO_SEG_OFFSET(addr1, addr2, mask) \
+ (((addr1) & (mask)) || (((addr2) & (mask))))
+#define BIOVEC_SEG_OFFSET(q, b1, b2) \
+ __BIO_SEG_OFFSET(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, queue_segment_offset((q)))
+
#define bio_io_error(bio) bio_endio((bio), -EIO)
/*
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 95e789d..08cad22 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -267,9 +267,11 @@ struct blk_queue_tag {
struct queue_limits {
unsigned long bounce_pfn;
unsigned long seg_boundary_mask;
+ unsigned long seg_offset_mask;
unsigned int max_hw_sectors;
unsigned int max_sectors;
+ unsigned int stripe_sectors;
unsigned int max_segment_size;
unsigned int physical_block_size;
unsigned int alignment_offset;
@@ -987,6 +989,8 @@ extern int blk_queue_dma_drain(struct request_queue *q,
void *buf, unsigned int size);
extern void blk_queue_lld_busy(struct request_queue *q, lld_busy_fn *fn);
extern void blk_queue_segment_boundary(struct request_queue *, unsigned long);
+extern void blk_queue_segment_offset(struct request_queue *, unsigned long);
+extern void blk_queue_stripe_sectors(struct request_queue *, unsigned int);
extern void blk_queue_prep_rq(struct request_queue *, prep_rq_fn *pfn);
extern void blk_queue_unprep_rq(struct request_queue *, unprep_rq_fn *ufn);
extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *);
@@ -999,6 +1003,8 @@ extern void blk_queue_flush(struct request_queue *q, unsigned int flush);
extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable);
extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev);
+extern int blk_bios_map_sg(struct request_queue *q, struct bio *bio, struct scatterlist *sglist,
+ struct scatterlist **sg);
extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *);
extern int blk_bio_map_sg(struct request_queue *q, struct bio *bio,
struct scatterlist *sglist);
@@ -1159,11 +1165,21 @@ static inline unsigned long queue_segment_boundary(struct request_queue *q)
return q->limits.seg_boundary_mask;
}
+static inline unsigned long queue_segment_offset(struct request_queue *q)
+{
+ return q->limits.seg_offset_mask;
+}
+
static inline unsigned int queue_max_sectors(struct request_queue *q)
{
return q->limits.max_sectors;
}
+static inline unsigned int queue_stripe_sectors(struct request_queue *q)
+{
+ return q->limits.stripe_sectors;
+}
+
static inline unsigned int queue_max_hw_sectors(struct request_queue *q)
{
return q->limits.max_hw_sectors;