[Xen-devel] [PATCH v2 19/20] xen/privcmd: Add support for Linux 64KB page granularity

Fri Jul 17 05:50:30 PDT 2015

Hi Stefano,

On 16/07/15 18:12, Stefano Stabellini wrote:
> On Thu, 9 Jul 2015, Julien Grall wrote:
>> The hypercall interface (as well as the toolstack) is always using 4KB
>> page granularity. When the toolstack is asking for mapping a series of
>> guest PFN in a batch, it expects to have the page map contiguously in
>> its virtual memory.
>>
>> When Linux is using 64KB page granularity, the privcmd driver will have
>> to map multiple Xen PFN in a single Linux page.
>>
>> Note that this solution works on page granularity which is a multiple of
>> 4KB.
>>
>> Signed-off-by: Julien Grall <julien.grall at citrix.com>
>> Cc: Konrad Rzeszutek Wilk <konrad.wilk at oracle.com>
>> Cc: Boris Ostrovsky <boris.ostrovsky at oracle.com>
>> Cc: David Vrabel <david.vrabel at citrix.com>
>> ---
>> Changes in v2:
>>     - Use xen_apply_to_page
>> ---
>>  drivers/xen/privcmd.c   |   8 +--
>>  drivers/xen/xlate_mmu.c | 127 +++++++++++++++++++++++++++++++++---------------
>>  2 files changed, 92 insertions(+), 43 deletions(-)
>>
>> diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c
>> index 5a29616..e8714b4 100644
>> --- a/drivers/xen/privcmd.c
>> +++ b/drivers/xen/privcmd.c
>> @@ -446,7 +446,7 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
>>  		return -EINVAL;
>>  	}
>>  
>> -	nr_pages = m.num;
>> +	nr_pages = DIV_ROUND_UP_ULL(m.num, PAGE_SIZE / XEN_PAGE_SIZE);
>>  	if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT)))
>>  		return -EINVAL;
> 
> DIV_ROUND_UP is enough: neither argument is unsigned long long

I'm not sure why I used DIV_ROUND_UP_ULL here... I will switch to
DIV_ROUND_UP in the next version.

> 
>> @@ -494,7 +494,7 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
>>  			goto out_unlock;
>>  		}
>>  		if (xen_feature(XENFEAT_auto_translated_physmap)) {
>> -			ret = alloc_empty_pages(vma, m.num);
>> +			ret = alloc_empty_pages(vma, nr_pages);
>>  			if (ret < 0)
>>  				goto out_unlock;
>>  		} else
>> @@ -518,6 +518,7 @@ static long privcmd_ioctl_mmap_batch(void __user *udata, int version)
>>  	state.global_error  = 0;
>>  	state.version       = version;
>>  
>> +	BUILD_BUG_ON(((PAGE_SIZE / sizeof(xen_pfn_t)) % XEN_PFN_PER_PAGE) != 0);
>>  	/* mmap_batch_fn guarantees ret == 0 */
>>  	BUG_ON(traverse_pages_block(m.num, sizeof(xen_pfn_t),
>>  				    &pagelist, mmap_batch_fn, &state));
>> @@ -582,12 +583,13 @@ static void privcmd_close(struct vm_area_struct *vma)
>>  {
>>  	struct page **pages = vma->vm_private_data;
>>  	int numpgs = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
>> +	int nr_pfn = (vma->vm_end - vma->vm_start) >> XEN_PAGE_SHIFT;
>>  	int rc;
>>  
>>  	if (!xen_feature(XENFEAT_auto_translated_physmap) || !numpgs || !pages)
>>  		return;
>>  
>> -	rc = xen_unmap_domain_mfn_range(vma, numpgs, pages);
>> +	rc = xen_unmap_domain_mfn_range(vma, nr_pfn, pages);
>>  	if (rc == 0)
>>  		free_xenballooned_pages(numpgs, pages);
> 
> If you intend to pass the number of xen pages as nr argument to
> xen_unmap_domain_mfn_range, then I think that the changes to
> xen_xlate_unmap_gfn_range below are wrong.

Hmmm... right. I will fix it.

> 
> 
>>  	else
>> diff --git a/drivers/xen/xlate_mmu.c b/drivers/xen/xlate_mmu.c
>> index 58a5389..1fac17c 100644
>> --- a/drivers/xen/xlate_mmu.c
>> +++ b/drivers/xen/xlate_mmu.c
>> @@ -38,31 +38,9 @@
>>  #include <xen/interface/xen.h>
>>  #include <xen/interface/memory.h>
>>  
>> -/* map fgmfn of domid to lpfn in the current domain */
>> -static int map_foreign_page(unsigned long lpfn, unsigned long fgmfn,
>> -			    unsigned int domid)
>> -{
>> -	int rc;
>> -	struct xen_add_to_physmap_range xatp = {
>> -		.domid = DOMID_SELF,
>> -		.foreign_domid = domid,
>> -		.size = 1,
>> -		.space = XENMAPSPACE_gmfn_foreign,
>> -	};
>> -	xen_ulong_t idx = fgmfn;
>> -	xen_pfn_t gpfn = lpfn;
>> -	int err = 0;
>> -
>> -	set_xen_guest_handle(xatp.idxs, &idx);
>> -	set_xen_guest_handle(xatp.gpfns, &gpfn);
>> -	set_xen_guest_handle(xatp.errs, &err);
>> -
>> -	rc = HYPERVISOR_memory_op(XENMEM_add_to_physmap_range, &xatp);
>> -	return rc < 0 ? rc : err;
>> -}
>> -
>>  struct remap_data {
>>  	xen_pfn_t *fgmfn; /* foreign domain's gmfn */
>> +	xen_pfn_t *efgmfn; /* pointer to the end of the fgmfn array */
>>  	pgprot_t prot;
>>  	domid_t  domid;
>>  	struct vm_area_struct *vma;
>> @@ -71,24 +49,75 @@ struct remap_data {
>>  	struct xen_remap_mfn_info *info;
>>  	int *err_ptr;
>>  	int mapped;
>> +
>> +	/* Hypercall parameters */
>> +	int h_errs[XEN_PFN_PER_PAGE];
>> +	xen_ulong_t h_idxs[XEN_PFN_PER_PAGE];
>> +	xen_pfn_t h_gpfns[XEN_PFN_PER_PAGE];
> 
> I don't think you should be adding these fields to struct remap_data:
> struct remap_data is used to pass multi pages arguments from
> xen_xlate_remap_gfn_array to remap_pte_fn.
> 
> I think you need to introduce a different struct to pass per linux page
> arguments from remap_pte_fn to setup_hparams.

I didn't want to introduce a new structure in order to avoid allocating
it on the stack every time remap_pte_fn is called.

Maybe it is an optimization for nothing?

[...]

>> +	/* info->err_ptr expect to have one error status per Xen PFN */
>> +	for (i = 0; i < info->h_iter; i++) {
>> +		int err = (rc < 0) ? rc : info->h_errs[i];
>> +
>> +		*(info->err_ptr++) = err;
>> +		if (!err)
>> +			info->mapped++;
>>  	}
>> -	info->fgmfn++;
>> +
>> +	/*
>> +	 * Note: The hypercall will return 0 in most of the case if even if
>                                          ^ in most cases

Will fix it.

>> +	 * all the fgmfn are not mapped. We still have to update the pte
>        ^ not all the fgmfn are mapped.
> 
>> +	 * as the userspace may decide to continue.
>> +	 */
>> +	if (!rc)
>> +		set_pte_at(info->vma->vm_mm, addr, ptep, pte);
>>  
>>  	return 0;
>>  }
>> @@ -102,13 +131,14 @@ int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
>>  {
>>  	int err;
>>  	struct remap_data data;
>> -	unsigned long range = nr << PAGE_SHIFT;
>> +	unsigned long range = round_up(nr, XEN_PFN_PER_PAGE) << XEN_PAGE_SHIFT;
> 
> I would just BUG_ON(nr % XEN_PFN_PER_PAGE) and avoid the round_up;

As discussed IRL, the toolstack can request to map only 1 Xen page. So
the BUG_ON would always be hit.

Anyway, as you suggested IRL, I will replace the round_up by
DIV_ROUND_UP in the next version.

>>  	data.prot  = prot;
>>  	data.domid = domid;
>>  	data.vma   = vma;
>> @@ -123,21 +153,38 @@ int xen_xlate_remap_gfn_array(struct vm_area_struct *vma,
>>  }
>>  EXPORT_SYMBOL_GPL(xen_xlate_remap_gfn_array);
>>  
>> +static int unmap_gfn(struct page *page, unsigned long pfn, void *data)
>> +{
>> +	int *nr = data;
>> +	struct xen_remove_from_physmap xrp;
>> +
>> +	/* The Linux Page may not have been fully mapped to Xen */
>> +	if (!*nr)
>> +		return 0;
>> +
>> +	xrp.domid = DOMID_SELF;
>> +	xrp.gpfn = pfn;
>> +	(void)HYPERVISOR_memory_op(XENMEM_remove_from_physmap, &xrp);
>> +
>> +	(*nr)--;
> 
> I don't understand why you are passing nr as private argument. I would
> just call XENMEM_remove_from_physmap unconditionally here. Am I missing
> something? After all XENMEM_remove_from_physmap is just unmapping
> at 4K granularity, right?

Yes, but you may ask to remove only one 4KB page. When 64KB is in use, that
would mean calling the hypervisor 16 times for only 1 useful remove.

This is because the hypervisor doesn't provide a hypercall to remove a
list of PFNs, which is very unfortunate.

Although, as discussed IIRC, I can look into providing a new function
xen_apply_to_page_range which will handle the counter internally.

> 
> 
>> +	return 0;
>> +}
>> +
>>  int xen_xlate_unmap_gfn_range(struct vm_area_struct *vma,
>>  			      int nr, struct page **pages)
>>  {
>>  	int i;
>> +	int nr_page = round_up(nr, XEN_PFN_PER_PAGE);
> 
> If nr is the number of xen pages, then this should be:
> 
>     int nr_pages = DIV_ROUND_UP(nr, XEN_PFN_PER_PAGE);

Correct, I will fix it.

>> -	for (i = 0; i < nr; i++) {
>> -		struct xen_remove_from_physmap xrp;
>> -		unsigned long pfn;
>> +	for (i = 0; i < nr_page; i++) {
>> +		/* unmap_gfn guarantees ret == 0 */
>> +		BUG_ON(xen_apply_to_page(pages[i], unmap_gfn, &nr));
>> +	}
>>  
>> -		pfn = page_to_pfn(pages[i]);
>> +	/* We should have consume every xen page */
>                         ^ consumed

I will fix it.

Regards,

-- 
Julien Grall