[PATCHv2 1/4] block: bio-integrity: directly map user buffers

Kanchan Joshi joshi.k at samsung.com
Mon Oct 30 14:02:48 PDT 2023


On 10/27/2023 11:49 PM, Keith Busch wrote:
> From: Keith Busch <kbusch at kernel.org>
> 
> Passthrough commands that utilize metadata currently need to bounce the
> user space buffer through the kernel. Add support for mapping user space
> directly so that we can avoid this costly overhead. This is similiar to
> how the normal bio data payload utilizes user addresses with
> bio_map_user_iov().
> 
> If the user address can't directly be used for reasons like too many
> segments or address misalignment, fall back to a copy of the user vec
> while keeping the user address pinned for the IO duration so that it
> can safely be copied on completion in any process context.
> 
> Signed-off-by: Keith Busch <kbusch at kernel.org>
> ---
>   block/bio-integrity.c | 195 ++++++++++++++++++++++++++++++++++++++++++
>   include/linux/bio.h   |   9 ++
>   2 files changed, 204 insertions(+)
> 
> diff --git a/block/bio-integrity.c b/block/bio-integrity.c
> index ec8ac8cf6e1b9..7f9d242ad79df 100644
> --- a/block/bio-integrity.c
> +++ b/block/bio-integrity.c
> @@ -91,6 +91,37 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
>   }
>   EXPORT_SYMBOL(bio_integrity_alloc);
>   
> +static void bio_integrity_unmap_user(struct bio_integrity_payload *bip)
> +{
> +	bool dirty = bio_data_dir(bip->bip_bio) == READ;
> +	struct bio_vec *copy = bip->copy_vec;
> +	struct bvec_iter iter;
> +	struct bio_vec bv;
> +
> +	if (copy) {
> +		unsigned short nr_vecs = bip->bip_max_vcnt;
> +		size_t bytes = bip->bip_iter.bi_size;
> +		void *buf = bvec_virt(bip->bip_vec);
> +
> +		if (dirty) {
> +			struct iov_iter iter;
> +
> +			iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes);
> +			WARN_ON(copy_to_iter(buf, bytes, &iter) != bytes);
> +		}
> +
> +		memcpy(bip->bip_vec, copy, nr_vecs * sizeof(*copy));
> +		kfree(copy);
> +		kfree(buf);
> +	}
> +
> +	bip_for_each_vec(bv, bip, iter) {
> +		if (dirty && !PageCompound(bv.bv_page))
> +			set_page_dirty_lock(bv.bv_page);
> +		unpin_user_page(bv.bv_page);
> +	}
> +}

There is a leak here: the page-unpinning loop will not execute for the 
common (i.e., no-copy) case...


> +int bio_integrity_map_user(struct bio *bio, void __user *ubuf, unsigned int len,
> +			   u32 seed)
> +{
> +	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
> +	unsigned long offs, align = q->dma_pad_mask | queue_dma_alignment(q);
> +	int ret, direction, nr_vecs, i, j, folios = 0;
> +	struct bio_vec stack_vec[UIO_FASTIOV];
> +	struct bio_vec bv, *bvec = stack_vec;
> +	struct page *stack_pages[UIO_FASTIOV];
> +	struct page **pages = stack_pages;
> +	struct bio_integrity_payload *bip;
> +	struct iov_iter iter;
> +	struct bvec_iter bi;
> +	u32 bytes;
> +
> +	if (bio_integrity(bio))
> +		return -EINVAL;
> +	if (len >> SECTOR_SHIFT > queue_max_hw_sectors(q))
> +		return -E2BIG;
> +
> +	if (bio_data_dir(bio) == READ)
> +		direction = ITER_DEST;
> +	else
> +		direction = ITER_SOURCE;
> +
> +	iov_iter_ubuf(&iter, direction, ubuf, len);
> +	nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1);
> +	if (nr_vecs > BIO_MAX_VECS)
> +		return -E2BIG;
> +	if (nr_vecs > UIO_FASTIOV) {
> +		bvec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL);
> +		if (!bvec)
> +			return -ENOMEM;
> +		pages = NULL;
> +	}
> +
> +	bytes = iov_iter_extract_pages(&iter, &pages, len, nr_vecs, 0, &offs);
> +	if (unlikely(bytes < 0)) {
> +		ret =  bytes;
> +		goto free_bvec;
> +	}
> +
> +	for (i = 0; i < nr_vecs; i = j) {
> +		size_t size = min_t(size_t, bytes, PAGE_SIZE - offs);
> +		struct folio *folio = page_folio(pages[i]);
> +
> +		bytes -= size;
> +		for (j = i + 1; j < nr_vecs; j++) {
> +			size_t next = min_t(size_t, PAGE_SIZE, bytes);
> +
> +			if (page_folio(pages[j]) != folio ||
> +			    pages[j] != pages[j - 1] + 1)
> +				break;
> +			unpin_user_page(pages[j]);
> +			size += next;
> +			bytes -= next;
> +		}
> +
> +		bvec_set_page(&bvec[folios], pages[i], size, offs);
> +		offs = 0;
> +		folios++;
> +	}
> +
> +	if (pages != stack_pages)
> +		kvfree(pages);
> +
> +	if (folios > queue_max_integrity_segments(q) ||
> +	    !iov_iter_is_aligned(&iter, align, align)) {
> +		ret = bio_integrity_copy_user(bio, bvec, folios, len,
> +					      direction, seed);
> +		if (ret)
> +			goto release_pages;
> +		return 0;
> +	}
> +
> +	bip = bio_integrity_alloc(bio, GFP_KERNEL, folios);
> +	if (IS_ERR(bip)) {
> +		ret = PTR_ERR(bip);
> +		goto release_pages;
> +	}
> +
> +	memcpy(bip->bip_vec, bvec, folios * sizeof(*bvec));

Because with this way of copying, bip->bip_iter.bi_size will remain zero.
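
One possible fix (untested sketch) is to account for the mapped length 
explicitly after copying the vectors, e.g.:

	memcpy(bip->bip_vec, bvec, folios * sizeof(*bvec));
	bip->bip_iter.bi_size = len;

so that bip_for_each_vec() actually covers the user pages at unmap time.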

Second, is it fine to skip the virt-boundary (SG gap) checks that 
bvec_gap_to_prev() performs when pages are added via 
bio_integrity_add_page()?
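
For comparison, bio_integrity_add_page() rejects vectors that would 
create an SG gap, along the lines of (paraphrased, not the exact 
upstream code):

	if (bip->bip_vcnt &&
	    bvec_gap_to_prev(&bdev_get_queue(bio->bi_bdev)->limits,
			     &bip->bip_vec[bip->bip_vcnt - 1], offset))
		return 0;

The memcpy() of the whole bvec array above bypasses any such 
per-vector check.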
