[PATCHv2 1/4] block: bio-integrity: directly map user buffers
Kanchan Joshi
joshi.k at samsung.com
Mon Oct 30 14:02:48 PDT 2023
On 10/27/2023 11:49 PM, Keith Busch wrote:
> From: Keith Busch <kbusch at kernel.org>
>
> Passthrough commands that utilize metadata currently need to bounce the
> user space buffer through the kernel. Add support for mapping user space
> directly so that we can avoid this costly overhead. This is similar to
> how the normal bio data payload utilizes user addresses with
> bio_map_user_iov().
>
> If the user address can't directly be used for reasons like too many
> segments or address misalignment, fallback to a copy of the user vec
> while keeping the user address pinned for the IO duration so that it
> can safely be copied on completion in any process context.
>
> Signed-off-by: Keith Busch <kbusch at kernel.org>
> ---
> block/bio-integrity.c | 195 ++++++++++++++++++++++++++++++++++++++++++
> include/linux/bio.h | 9 ++
> 2 files changed, 204 insertions(+)
>
> diff --git a/block/bio-integrity.c b/block/bio-integrity.c
> index ec8ac8cf6e1b9..7f9d242ad79df 100644
> --- a/block/bio-integrity.c
> +++ b/block/bio-integrity.c
> @@ -91,6 +91,37 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
> }
> EXPORT_SYMBOL(bio_integrity_alloc);
>
> +static void bio_integrity_unmap_user(struct bio_integrity_payload *bip)
> +{
> + bool dirty = bio_data_dir(bip->bip_bio) == READ;
> + struct bio_vec *copy = bip->copy_vec;
> + struct bvec_iter iter;
> + struct bio_vec bv;
> +
> + if (copy) {
> + unsigned short nr_vecs = bip->bip_max_vcnt;
> + size_t bytes = bip->bip_iter.bi_size;
> + void *buf = bvec_virt(bip->bip_vec);
> +
> + if (dirty) {
> + struct iov_iter iter;
> +
> + iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes);
> + WARN_ON(copy_to_iter(buf, bytes, &iter) != bytes);
> + }
> +
> + memcpy(bip->bip_vec, copy, nr_vecs * sizeof(*copy));
> + kfree(copy);
> + kfree(buf);
> + }
> +
> + bip_for_each_vec(bv, bip, iter) {
> + if (dirty && !PageCompound(bv.bv_page))
> + set_page_dirty_lock(bv.bv_page);
> + unpin_user_page(bv.bv_page);
> + }
> +}
There is a leak here: the page-unpinning loop will not execute for the
common (i.e., no-copy) case...
> +int bio_integrity_map_user(struct bio *bio, void __user *ubuf, unsigned int len,
> + u32 seed)
> +{
> + struct request_queue *q = bdev_get_queue(bio->bi_bdev);
> + unsigned long offs, align = q->dma_pad_mask | queue_dma_alignment(q);
> + int ret, direction, nr_vecs, i, j, folios = 0;
> + struct bio_vec stack_vec[UIO_FASTIOV];
> + struct bio_vec bv, *bvec = stack_vec;
> + struct page *stack_pages[UIO_FASTIOV];
> + struct page **pages = stack_pages;
> + struct bio_integrity_payload *bip;
> + struct iov_iter iter;
> + struct bvec_iter bi;
> + u32 bytes;
> +
> + if (bio_integrity(bio))
> + return -EINVAL;
> + if (len >> SECTOR_SHIFT > queue_max_hw_sectors(q))
> + return -E2BIG;
> +
> + if (bio_data_dir(bio) == READ)
> + direction = ITER_DEST;
> + else
> + direction = ITER_SOURCE;
> +
> + iov_iter_ubuf(&iter, direction, ubuf, len);
> + nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1);
> + if (nr_vecs > BIO_MAX_VECS)
> + return -E2BIG;
> + if (nr_vecs > UIO_FASTIOV) {
> + bvec = kcalloc(sizeof(*bvec), nr_vecs, GFP_KERNEL);
> + if (!bvec)
> + return -ENOMEM;
> + pages = NULL;
> + }
> +
> + bytes = iov_iter_extract_pages(&iter, &pages, len, nr_vecs, 0, &offs);
> + if (unlikely(bytes < 0)) {
> + ret = bytes;
> + goto free_bvec;
> + }
> +
> + for (i = 0; i < nr_vecs; i = j) {
> + size_t size = min_t(size_t, bytes, PAGE_SIZE - offs);
> + struct folio *folio = page_folio(pages[i]);
> +
> + bytes -= size;
> + for (j = i + 1; j < nr_vecs; j++) {
> + size_t next = min_t(size_t, PAGE_SIZE, bytes);
> +
> + if (page_folio(pages[j]) != folio ||
> + pages[j] != pages[j - 1] + 1)
> + break;
> + unpin_user_page(pages[j]);
> + size += next;
> + bytes -= next;
> + }
> +
> + bvec_set_page(&bvec[folios], pages[i], size, offs);
> + offs = 0;
> + folios++;
> + }
> +
> + if (pages != stack_pages)
> + kvfree(pages);
> +
> + if (folios > queue_max_integrity_segments(q) ||
> + !iov_iter_is_aligned(&iter, align, align)) {
> + ret = bio_integrity_copy_user(bio, bvec, folios, len,
> + direction, seed);
> + if (ret)
> + goto release_pages;
> + return 0;
> + }
> +
> + bip = bio_integrity_alloc(bio, GFP_KERNEL, folios);
> + if (IS_ERR(bip)) {
> + ret = PTR_ERR(bip);
> + goto release_pages;
> + }
> +
> + memcpy(bip->bip_vec, bvec, folios * sizeof(*bvec));
Because with this way of copying, bip->bip_iter.bi_size will remain zero.
Second, is it fine not to have those virt-alignment checks that are done
by bvec_gap_to_prev() when the pages are added using
bio_integrity_add_page()?
More information about the Linux-nvme
mailing list