[PATCH v3] arm64: Add support for DMA_ATTR_FORCE_CONTIGUOUS to IOMMU

Thu Feb 2 05:15:53 PST 2017

Hi Geert,

On 31/01/17 11:12, Geert Uytterhoeven wrote:
> Add support for allocating physically contiguous DMA buffers on arm64
> systems with an IOMMU.  This can be useful when two or more devices
> with different memory requirements are involved in buffer sharing.
> 
> Note that as this uses the CMA allocator, setting the
> DMA_ATTR_FORCE_CONTIGUOUS attribute has a runtime-dependency on
> CONFIG_DMA_CMA, just like on arm32.
> 
> For arm64 systems using swiotlb, no changes are needed to support the
> allocation of physically contiguous DMA buffers:
>   - swiotlb always uses physically contiguous buffers (up to
>     IO_TLB_SEGSIZE = 128 pages),
>   - arm64's __dma_alloc_coherent() already calls
>     dma_alloc_from_contiguous() when CMA is available.
> 
> Signed-off-by: Geert Uytterhoeven <geert+renesas at glider.be>
> Acked-by: Laurent Pinchart <laurent.pinchart at ideasonboard.com>
> ---
> v3:
>   - Add Acked-by,
>   - Update comment to "one of _4_ things",
>   - Call dma_alloc_from_contiguous() and iommu_dma_map_page() directly,
>     as suggested by Robin Murphy.
> 
> v2:
>   - New, handle dispatching in the arch (arm64) code, as requested by
>     Robin Murphy.
> ---
>  arch/arm64/mm/dma-mapping.c | 63 ++++++++++++++++++++++++++++++++++-----------
>  1 file changed, 48 insertions(+), 15 deletions(-)
> 
> diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
> index 1d7d5d2881db7c19..b314256fcee028ce 100644
> --- a/arch/arm64/mm/dma-mapping.c
> +++ b/arch/arm64/mm/dma-mapping.c
> @@ -577,20 +577,7 @@ static void *__iommu_alloc_attrs(struct device *dev, size_t size,
>  	 */
>  	gfp |= __GFP_ZERO;
>  
> -	if (gfpflags_allow_blocking(gfp)) {
> -		struct page **pages;
> -		pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL, coherent);
> -
> -		pages = iommu_dma_alloc(dev, iosize, gfp, attrs, ioprot,
> -					handle, flush_page);
> -		if (!pages)
> -			return NULL;
> -
> -		addr = dma_common_pages_remap(pages, size, VM_USERMAP, prot,
> -					      __builtin_return_address(0));
> -		if (!addr)
> -			iommu_dma_free(dev, pages, iosize, handle);
> -	} else {
> +	if (!gfpflags_allow_blocking(gfp)) {
>  		struct page *page;
>  		/*
>  		 * In atomic context we can't remap anything, so we'll only
> @@ -614,6 +601,45 @@ static void *__iommu_alloc_attrs(struct device *dev, size_t size,
>  				__free_from_pool(addr, size);
>  			addr = NULL;
>  		}
> +	} else if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) {
> +		pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL, coherent);
> +		struct page *page;
> +
> +		page = dma_alloc_from_contiguous(dev, size >> PAGE_SHIFT,
> +						 get_order(size));
> +		if (!page)
> +			return NULL;
> +
> +		*handle = iommu_dma_map_page(dev, page, 0, iosize, ioprot);
> +		if (iommu_dma_mapping_error(dev, *handle)) {
> +			dma_release_from_contiguous(dev, page,
> +						    size >> PAGE_SHIFT);
> +			return NULL;
> +		}
> +		if (!coherent)

I think we might be able to return early if coherent here, as the
non-IOMMU __dma_alloc() does. We still need a (cacheable) remap in the
normal coherent case to make whatever pages iommu_dma_alloc() scrapes
together appear contiguous, but that obviously isn't a concern here,
since a CMA allocation is already physically contiguous.
However, I'm not entirely confident that doing so wouldn't break (or at
least further complicate) the already-horrible "what the hell is this?"
logic in __iommu_free_attrs() below, so I'm inclined to leave it as-is.

> +			__dma_flush_area(page_to_virt(page), iosize);
> +
> +		addr = dma_common_contiguous_remap(page, size, VM_USERMAP,
> +						   prot,
> +						   __builtin_return_address(0));
> +		if (!addr) {
> +			iommu_dma_unmap_page(dev, *handle, iosize, 0, attrs);
> +			dma_release_from_contiguous(dev, page,
> +						    size >> PAGE_SHIFT);
> +		}
> +	} else {
> +		pgprot_t prot = __get_dma_pgprot(attrs, PAGE_KERNEL, coherent);
> +		struct page **pages;
> +
> +		pages = iommu_dma_alloc(dev, iosize, gfp, attrs, ioprot,
> +					handle, flush_page);
> +		if (!pages)
> +			return NULL;
> +
> +		addr = dma_common_pages_remap(pages, size, VM_USERMAP, prot,
> +					      __builtin_return_address(0));
> +		if (!addr)
> +			iommu_dma_free(dev, pages, iosize, handle);
>  	}
>  	return addr;
>  }
> @@ -625,7 +651,8 @@ static void __iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr,
>  
>  	size = PAGE_ALIGN(size);
>  	/*
> -	 * @cpu_addr will be one of 3 things depending on how it was allocated:
> +	 * @cpu_addr will be one of 4 things depending on how it was allocated:
> +	 * - A remapped array of pages for contiguous allocations.
>  	 * - A remapped array of pages from iommu_dma_alloc(), for all
>  	 *   non-atomic allocations.
>  	 * - A non-cacheable alias from the atomic pool, for atomic
> @@ -637,6 +664,12 @@ static void __iommu_free_attrs(struct device *dev, size_t size, void *cpu_addr,
>  	if (__in_atomic_pool(cpu_addr, size)) {
>  		iommu_dma_unmap_page(dev, handle, iosize, 0, 0);
>  		__free_from_pool(cpu_addr, size);
> +	} else if (attrs & DMA_ATTR_FORCE_CONTIGUOUS) {
> +		struct page *page = phys_to_page(dma_to_phys(dev, handle));

handle is the IOVA returned by the original iommu_dma_map_page() call,
not a direct-mapped DMA address that dma_to_phys() can translate back
to a physical address, so I can't see this working very well. I was
about to naively suggest virt_to_page(cpu_addr), except of course
cpu_addr isn't going to be a linear map address :(

I think that works out to another argument in favour of not trying to
skip the remap when coherent: if we always remap, cpu_addr is always a
vmalloc address, so we can consistently rely on
vmalloc_to_page(cpu_addr) here, and everything *should* be OK
(crosses fingers...)

Robin.

> +
> +		iommu_dma_unmap_page(dev, handle, iosize, 0, attrs);
> +		dma_release_from_contiguous(dev, page, size >> PAGE_SHIFT);
> +		dma_common_free_remap(cpu_addr, size, VM_USERMAP);
>  	} else if (is_vmalloc_addr(cpu_addr)){
>  		struct vm_struct *area = find_vm_area(cpu_addr);
>  
>