[PATCHv2 1/4] block: bio-integrity: directly map user buffers
Kanchan Joshi
joshi.k at samsung.com
Wed Nov 8 04:15:19 PST 2023
On 11/7/2023 8:38 PM, Keith Busch wrote:
> On Tue, Nov 07, 2023 at 03:55:14PM +0530, Kanchan Joshi wrote:
>> On 11/6/2023 8:32 PM, Keith Busch wrote:
>>> On Mon, Nov 06, 2023 at 11:18:03AM +0530, Kanchan Joshi wrote:
>>>> On 10/27/2023 11:49 PM, Keith Busch wrote:
>>>>> +	for (i = 0; i < nr_vecs; i = j) {
>>>>> +		size_t size = min_t(size_t, bytes, PAGE_SIZE - offs);
>>>>> +		struct folio *folio = page_folio(pages[i]);
>>>>> +
>>>>> +		bytes -= size;
>>>>> +		for (j = i + 1; j < nr_vecs; j++) {
>>>>> +			size_t next = min_t(size_t, PAGE_SIZE, bytes);
>>>>> +
>>>>> +			if (page_folio(pages[j]) != folio ||
>>>>> +			    pages[j] != pages[j - 1] + 1)
>>>>> +				break;
>>>>> +			unpin_user_page(pages[j]);
>>>>
>>>> Is this unpin correct here?
>>>
>>> Should be. The pages are bound to the folio, so this doesn't really
>>> unpin the user page. It just drops a reference, and the folio holds the
>>> final reference to the contiguous pages, which is released on
>>> completion.
>>
>> But the completion is still going to see multiple pages and not one
>> (folio). The bip_for_each_vec loop is going to drop the reference again.
>> I suspect it is not folio-aware.
>
> The completion unpins once per bvec, not individual pages. The setup
> creates multipage bvecs with only one pin remaining per bvec for all of
> the bvec's pages. If a page can't be merged into the current bvec, then
> that page is not unpinned and becomes the first page of the next
> bvec.
>
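For reference, here is a minimal sketch of the completion-side scheme you
describe: one unpin per bvec, covering all of that bvec's merged pages.
The helper name is assumed for illustration, not necessarily what the
patch actually uses:

static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs)
{
	int i;

	/* setup left exactly one pin per bvec, held via its first page;
	 * the other pages merged into the bvec were unpinned at setup */
	for (i = 0; i < nr_vecs; i++)
		unpin_user_page(bv[i].bv_page);
}
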
Here is a test program [2] that creates this scenario: a single 8KB+16b
read on a 4KB+8b formatted namespace. It carves the 16-byte meta-buffer
out of a huge page such that it starts 8 bytes before a 4K boundary and
therefore spans two regular 4K pages. With this, I see more unpins than
expected.
I had also added the fix below [1] on top of your patch.
[1]
@@ -339,7 +367,22 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, unsigned int len,
 	memcpy(bip->bip_vec, bvec, folios * sizeof(*bvec));
 	if (bvec != stack_vec)
 		kfree(bvec);
+	// quick fix for completion
+	bip->bip_vcnt = folios;
+	bip->bip_iter.bi_size = len;
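(The two added assignments make bip_vcnt and bip_iter.bi_size reflect the
folio-merged vector, so that the completion path iterates over what the
setup actually built.)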
[2]
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <liburing.h>
#include <libnvme.h>
#define DEV "/dev/ng0n1"
#define NSID 1
#define DBCNT 2
#define DATA_BUFLEN (4096 * DBCNT)
#define OFFSET 0
#define LBA_SHIFT 12

/* This assumes 4K + 8b lba format */
#define MD_BUFLEN (8 * DBCNT)
/* place the meta-buffer at the tail of a 4K page, so that it
 * straddles into the next page of the huge page */
#define MD_OFFSET (4096 - 8)

#define HP_SIZE (2 * 2 * 1024 * 1024) /* Two 2M pages */

#define APPTAG_MASK (0xFFFF)
#define APPTAG (0x8888)
void *alloc_meta_buf_hp()
{
	void *ptr;

	ptr = mmap(NULL, HP_SIZE, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
			-1, 0);
	if (ptr == MAP_FAILED)
		return NULL;

	return ptr;
}

void free_meta_buf(void *ptr)
{
	munmap(ptr, HP_SIZE);
}
int main()
{
	struct io_uring ring;
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	struct io_uring_params p = { };
	int fd = -1, ret;
	struct nvme_uring_cmd *cmd;
	void *buffer, *md_buf;
	__u64 slba;
	__u16 nlb;

	ret = posix_memalign(&buffer, DATA_BUFLEN, DATA_BUFLEN);
	if (ret) {
		fprintf(stderr, "data buffer allocation failed: %d\n", ret);
		return 1;
	}
	memset(buffer, 'x', DATA_BUFLEN);

	md_buf = alloc_meta_buf_hp();
	if (!md_buf) {
		fprintf(stderr, "meta buffer allocation failed\n");
		return 1;
	}

	p.flags = IORING_SETUP_CQE32 | IORING_SETUP_SQE128;
	ret = io_uring_queue_init_params(4, &ring, &p);
	if (ret) {
		fprintf(stderr, "ring create failed: %d\n", ret);
		return 1;
	}

	fd = open(DEV, O_RDWR);
	if (fd < 0) {
		perror("file open");
		exit(1);
	}

	/* reuse the read prep for fd/addr/len setup, then turn the sqe
	 * into an nvme passthrough command */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fd, buffer, DATA_BUFLEN, OFFSET);
	sqe->cmd_op = NVME_URING_CMD_IO;
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->user_data = 1234;

	cmd = (struct nvme_uring_cmd *)sqe->cmd;
	memset(cmd, 0, sizeof(struct nvme_uring_cmd));
	cmd->opcode = nvme_cmd_read;
	cmd->addr = (__u64)(uintptr_t)buffer;
	cmd->data_len = DATA_BUFLEN;
	cmd->nsid = NSID;

	slba = OFFSET >> LBA_SHIFT;
	nlb = (DATA_BUFLEN >> LBA_SHIFT) - 1;
	cmd->cdw10 = slba & 0xffffffff;
	cmd->cdw11 = slba >> 32;
	cmd->cdw12 = nlb;
	/* set the prchk (Guard, App, RefTag) bits in cdw12;
	 * 15 << 26 would set pract too */
	//cmd->cdw12 |= 15 << 26;
	cmd->cdw12 |= 7 << 26;

	cmd->metadata = ((__u64)(uintptr_t)md_buf) + MD_OFFSET;
	cmd->metadata_len = MD_BUFLEN;
	/* reftag */
	cmd->cdw14 = (__u32)slba;
	/* apptag mask and apptag */
	cmd->cdw15 = APPTAG_MASK << 16 | APPTAG;

	ret = io_uring_submit(&ring);
	if (ret != 1) {
		fprintf(stderr, "submit got %d, wanted %d\n", ret, 1);
		goto err;
	}

	ret = io_uring_wait_cqe(&ring, &cqe);
	if (ret) {
		fprintf(stderr, "wait_cqe=%d\n", ret);
		goto err;
	}
	if (cqe->res != 0) {
		fprintf(stderr, "cqe res %d, wanted success\n", cqe->res);
		goto err;
	}
	io_uring_cqe_seen(&ring, cqe);

	free_meta_buf(md_buf);
	close(fd);
	io_uring_queue_exit(&ring);
	return 0;
err:
	if (fd != -1)
		close(fd);
	io_uring_queue_exit(&ring);
	return 1;
}
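This should build with something like "gcc test.c -o test -luring",
assuming the liburing and libnvme development headers are installed, and
needs a namespace formatted with 4K data + 8b metadata.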