[PATCHv2 1/4] block: bio-integrity: directly map user buffers
Kanchan Joshi
joshi.k at samsung.com
Wed Nov 8 04:15:19 PST 2023
On 11/7/2023 8:38 PM, Keith Busch wrote:
> On Tue, Nov 07, 2023 at 03:55:14PM +0530, Kanchan Joshi wrote:
>> On 11/6/2023 8:32 PM, Keith Busch wrote:
>>> On Mon, Nov 06, 2023 at 11:18:03AM +0530, Kanchan Joshi wrote:
>>>> On 10/27/2023 11:49 PM, Keith Busch wrote:
>>>>> +	for (i = 0; i < nr_vecs; i = j) {
>>>>> +		size_t size = min_t(size_t, bytes, PAGE_SIZE - offs);
>>>>> +		struct folio *folio = page_folio(pages[i]);
>>>>> +
>>>>> +		bytes -= size;
>>>>> +		for (j = i + 1; j < nr_vecs; j++) {
>>>>> +			size_t next = min_t(size_t, PAGE_SIZE, bytes);
>>>>> +
>>>>> +			if (page_folio(pages[j]) != folio ||
>>>>> +			    pages[j] != pages[j - 1] + 1)
>>>>> +				break;
>>>>> +			unpin_user_page(pages[j]);
>>>>
>>>> Is this unpin correct here?
>>>
>>> Should be. The pages are bound to the folio, so this doesn't really
>>> unpin the user page. It just drops a reference, and the folio holds the
>>> final reference to the contiguous pages, which is released on
>>> completion.
>>
>> But the completion is still going to see multiple pages and not one
>> (folio). The bip_for_each_vec loop is going to drop the reference again.
>> I suspect it is not folio-aware.
>
> The completion unpins once per bvec, not individual pages. The setup
> creates multipage bvecs with only one pin remaining per bvec for all of
> the bvec's pages. If a page can't be merged into the current bvec, then
> that page is not unpinned and becomes the first page of the next
> bvec.
>
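For reference, here is a minimal sketch of the completion-side scheme you
describe: one unpin per bvec, covering all of that bvec's merged pages.
The helper name is assumed for illustration, not necessarily what the
patch actually uses:

static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs)
{
	int i;

	/* setup left exactly one pin per bvec, held via its first page;
	 * the other pages merged into the bvec were unpinned at setup */
	for (i = 0; i < nr_vecs; i++)
		unpin_user_page(bv[i].bv_page);
}
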
Here is a test program [2] that creates this scenario: a single 8KB+16b
read on a 4KB+8b formatted namespace. It carves the 16-byte meta-buffer
out of a huge page such that it starts 8 bytes before a 4K boundary and
therefore spans two regular 4K pages. With this, I see more unpins than
expected.
I had also added the fix below [1] on top of your patch.
[1]
@@ -339,7 +367,22 @@ int bio_integrity_map_user(struct bio *bio, void __user *ubuf, unsigned int len,
 	memcpy(bip->bip_vec, bvec, folios * sizeof(*bvec));
 	if (bvec != stack_vec)
 		kfree(bvec);
+	// quick fix for completion
+	bip->bip_vcnt = folios;
+	bip->bip_iter.bi_size = len;
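(The two added assignments make bip_vcnt and bip_iter.bi_size reflect the
folio-merged vector, so that the completion path iterates over what the
setup actually built.)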
[2]
#define _GNU_SOURCE
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <liburing.h>
#include <libnvme.h>
#define DEV "/dev/ng0n1"
#define NSID 1
#define DBCNT 2
#define DATA_BUFLEN (4096 * DBCNT)
#define OFFSET 0
#define LBA_SHIFT 12

/* This assumes 4K + 8b lba format */
#define MD_BUFLEN (8 * DBCNT)
/* place the meta-buffer at the tail of a 4K page, so that it
 * straddles into the next page of the huge page */
#define MD_OFFSET (4096 - 8)

#define HP_SIZE (2 * 2 * 1024 * 1024) /* Two 2M pages */

#define APPTAG_MASK (0xFFFF)
#define APPTAG (0x8888)
void *alloc_meta_buf_hp()
{
	void *ptr;

	ptr = mmap(NULL, HP_SIZE, PROT_READ | PROT_WRITE,
			MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB,
			-1, 0);
	if (ptr == MAP_FAILED)
		return NULL;

	return ptr;
}

void free_meta_buf(void *ptr)
{
	munmap(ptr, HP_SIZE);
}
int main()
{
	struct io_uring ring;
	struct io_uring_cqe *cqe;
	struct io_uring_sqe *sqe;
	struct io_uring_params p = { };
	int fd = -1, ret;
	struct nvme_uring_cmd *cmd;
	void *buffer, *md_buf;
	__u64 slba;
	__u16 nlb;

	ret = posix_memalign(&buffer, DATA_BUFLEN, DATA_BUFLEN);
	if (ret) {
		fprintf(stderr, "data buffer allocation failed: %d\n", ret);
		return 1;
	}
	memset(buffer, 'x', DATA_BUFLEN);

	md_buf = alloc_meta_buf_hp();
	if (!md_buf) {
		fprintf(stderr, "meta buffer allocation failed\n");
		return 1;
	}

	p.flags = IORING_SETUP_CQE32 | IORING_SETUP_SQE128;
	ret = io_uring_queue_init_params(4, &ring, &p);
	if (ret) {
		fprintf(stderr, "ring create failed: %d\n", ret);
		return 1;
	}

	fd = open(DEV, O_RDWR);
	if (fd < 0) {
		perror("file open");
		exit(1);
	}

	/* reuse the read prep for fd/addr/len setup, then turn the sqe
	 * into an nvme passthrough command */
	sqe = io_uring_get_sqe(&ring);
	io_uring_prep_read(sqe, fd, buffer, DATA_BUFLEN, OFFSET);
	sqe->cmd_op = NVME_URING_CMD_IO;
	sqe->opcode = IORING_OP_URING_CMD;
	sqe->user_data = 1234;

	cmd = (struct nvme_uring_cmd *)sqe->cmd;
	memset(cmd, 0, sizeof(struct nvme_uring_cmd));
	cmd->opcode = nvme_cmd_read;
	cmd->addr = (__u64)(uintptr_t)buffer;
	cmd->data_len = DATA_BUFLEN;
	cmd->nsid = NSID;

	slba = OFFSET >> LBA_SHIFT;
	nlb = (DATA_BUFLEN >> LBA_SHIFT) - 1;
	cmd->cdw10 = slba & 0xffffffff;
	cmd->cdw11 = slba >> 32;
	cmd->cdw12 = nlb;
	/* set the prchk (Guard, App, RefTag) bits in cdw12;
	 * 15 << 26 would set pract too */
	//cmd->cdw12 |= 15 << 26;
	cmd->cdw12 |= 7 << 26;

	cmd->metadata = ((__u64)(uintptr_t)md_buf) + MD_OFFSET;
	cmd->metadata_len = MD_BUFLEN;
	/* reftag */
	cmd->cdw14 = (__u32)slba;
	/* apptag mask and apptag */
	cmd->cdw15 = APPTAG_MASK << 16 | APPTAG;

	ret = io_uring_submit(&ring);
	if (ret != 1) {
		fprintf(stderr, "submit got %d, wanted %d\n", ret, 1);
		goto err;
	}

	ret = io_uring_wait_cqe(&ring, &cqe);
	if (ret) {
		fprintf(stderr, "wait_cqe=%d\n", ret);
		goto err;
	}
	if (cqe->res != 0) {
		fprintf(stderr, "cqe res %d, wanted success\n", cqe->res);
		goto err;
	}
	io_uring_cqe_seen(&ring, cqe);

	free_meta_buf(md_buf);
	close(fd);
	io_uring_queue_exit(&ring);
	return 0;
err:
	if (fd != -1)
		close(fd);
	io_uring_queue_exit(&ring);
	return 1;
}
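This should build with something like "gcc test.c -o test -luring",
assuming the liburing and libnvme development headers are installed, and
needs a namespace formatted with 4K data + 8b metadata.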