[PATCH v9 7/8] PCI/P2PDMA: Allow userspace VMA allocations through sysfs

Logan Gunthorpe logang at deltatee.com
Thu Sep 1 09:32:55 PDT 2022




On 2022-09-01 10:20, Greg Kroah-Hartman wrote:
> On Thu, Aug 25, 2022 at 09:24:24AM -0600, Logan Gunthorpe wrote:
>> Create a sysfs bin attribute called "allocate" under the existing
>> "p2pmem" group. The only allowable operation on this file is the mmap()
>> call.
>>
>> When mmap() is called on this attribute, the kernel allocates a chunk of
>> memory from the genalloc and inserts the pages into the VMA. The
>> dev_pagemap .page_free callback will indicate when these pages are no
>> longer used and they will be put back into the genalloc.
>>
>> On device unbind, remove the sysfs file before the memremap_pages are
>> cleaned up. This ensures unmap_mapping_range() is called on the file's
>> inode and no new mappings can be created.
>>
>> Signed-off-by: Logan Gunthorpe <logang at deltatee.com>
>> ---
>>  drivers/pci/p2pdma.c | 124 +++++++++++++++++++++++++++++++++++++++++++
>>  1 file changed, 124 insertions(+)
>>
>> diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
>> index 4496a7c5c478..a6ed6bbca214 100644
>> --- a/drivers/pci/p2pdma.c
>> +++ b/drivers/pci/p2pdma.c
>> @@ -89,6 +89,90 @@ static ssize_t published_show(struct device *dev, struct device_attribute *attr,
>>  }
>>  static DEVICE_ATTR_RO(published);
>>  
>> +static int p2pmem_alloc_mmap(struct file *filp, struct kobject *kobj,
>> +		struct bin_attribute *attr, struct vm_area_struct *vma)
>> +{
>> +	struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj));
>> +	size_t len = vma->vm_end - vma->vm_start;
>> +	struct pci_p2pdma *p2pdma;
>> +	struct percpu_ref *ref;
>> +	unsigned long vaddr;
>> +	void *kaddr;
>> +	int ret;
>> +
>> +	/* prevent private mappings from being established */
>> +	if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
>> +		pci_info_ratelimited(pdev,
>> +				     "%s: fail, attempted private mapping\n",
>> +				     current->comm);
>> +		return -EINVAL;
>> +	}
>> +
>> +	if (vma->vm_pgoff) {
>> +		pci_info_ratelimited(pdev,
>> +				     "%s: fail, attempted mapping with non-zero offset\n",
>> +				     current->comm);
>> +		return -EINVAL;
>> +	}
>> +
>> +	rcu_read_lock();
>> +	p2pdma = rcu_dereference(pdev->p2pdma);
>> +	if (!p2pdma) {
>> +		ret = -ENODEV;
>> +		goto out;
>> +	}
>> +
>> +	kaddr = (void *)gen_pool_alloc_owner(p2pdma->pool, len, (void **)&ref);
>> +	if (!kaddr) {
>> +		ret = -ENOMEM;
>> +		goto out;
>> +	}
>> +
>> +	/*
>> +	 * vm_insert_page() can sleep, so a reference is taken to mapping
>> +	 * such that rcu_read_unlock() can be done before inserting the
>> +	 * pages
>> +	 */
>> +	if (unlikely(!percpu_ref_tryget_live_rcu(ref))) {
>> +		ret = -ENODEV;
>> +		goto out_free_mem;
>> +	}
>> +	rcu_read_unlock();
>> +
>> +	for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
>> +		ret = vm_insert_page(vma, vaddr, virt_to_page(kaddr));
>> +		if (ret) {
>> +			gen_pool_free(p2pdma->pool, (uintptr_t)kaddr, len);
>> +			return ret;
>> +		}
>> +		percpu_ref_get(ref);
>> +		put_page(virt_to_page(kaddr));
>> +		kaddr += PAGE_SIZE;
>> +		len -= PAGE_SIZE;
>> +	}
>> +
>> +	percpu_ref_put(ref);
>> +
>> +	return 0;
>> +out_free_mem:
>> +	gen_pool_free(p2pdma->pool, (uintptr_t)kaddr, len);
>> +out:
>> +	rcu_read_unlock();
>> +	return ret;
>> +}
>> +
>> +static struct bin_attribute p2pmem_alloc_attr = {
>> +	.attr = { .name = "allocate", .mode = 0660 },
>> +	.mmap = p2pmem_alloc_mmap,
>> +	/*
>> +	 * Some places where we want to call mmap (ie. python) will check
>> +	 * that the file size is greater than the mmap size before allowing
>> +	 * the mmap to continue. To work around this, just set the size
>> +	 * to be very large.
>> +	 */
>> +	.size = SZ_1T,
>> +};
>> +
>>  static struct attribute *p2pmem_attrs[] = {
>>  	&dev_attr_size.attr,
>>  	&dev_attr_available.attr,
>> @@ -96,11 +180,32 @@ static struct attribute *p2pmem_attrs[] = {
>>  	NULL,
>>  };
>>  
>> +static struct bin_attribute *p2pmem_bin_attrs[] = {
>> +	&p2pmem_alloc_attr,
>> +	NULL,
>> +};
>> +
>>  static const struct attribute_group p2pmem_group = {
>>  	.attrs = p2pmem_attrs,
>> +	.bin_attrs = p2pmem_bin_attrs,
>>  	.name = "p2pmem",
>>  };
>>  
>> +static void p2pdma_page_free(struct page *page)
>> +{
>> +	struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page->pgmap);
>> +	struct percpu_ref *ref;
>> +
>> +	gen_pool_free_owner(pgmap->provider->p2pdma->pool,
>> +			    (uintptr_t)page_to_virt(page), PAGE_SIZE,
>> +			    (void **)&ref);
>> +	percpu_ref_put(ref);
>> +}
>> +
>> +static const struct dev_pagemap_ops p2pdma_pgmap_ops = {
>> +	.page_free = p2pdma_page_free,
>> +};
>> +
>>  static void pci_p2pdma_release(void *data)
>>  {
>>  	struct pci_dev *pdev = data;
>> @@ -152,6 +257,19 @@ static int pci_p2pdma_setup(struct pci_dev *pdev)
>>  	return error;
>>  }
>>  
>> +static void pci_p2pdma_unmap_mappings(void *data)
>> +{
>> +	struct pci_dev *pdev = data;
>> +
>> +	/*
>> +	 * Removing the alloc attribute from sysfs will call
>> +	 * unmap_mapping_range() on the inode, teardown any existing userspace
>> +	 * mappings and prevent new ones from being created.
>> +	 */
>> +	sysfs_remove_file_from_group(&pdev->dev.kobj, &p2pmem_alloc_attr.attr,
>> +				     p2pmem_group.name);
> 
> Wait, why are you manually removing the sysfs file here?  It's part of
> the group; if you do this then it is gone forever, right?  Why
> manually do this?  The sysfs core should handle this for you if the
> device is removed.

We have to make sure the mappings are all removed before the cleanup of
devm_memremap_pages(), which will wait for all the pages to be freed. If
we don't do this, any remaining userspace mapping will hang the cleanup
until userspace unmaps it itself.

> And worst case, just pass in the device, not the pci device.

Ok, I'll make that change for v10.

Logan



More information about the Linux-nvme mailing list