[PATCH v2] arm64: Revert L1_CACHE_SHIFT back to 6 (64-byte cache line size)

Robin Murphy robin.murphy at arm.com
Wed Feb 28 11:18:47 PST 2018


Hi Catalin,

On 28/02/18 18:47, Catalin Marinas wrote:
> Commit 97303480753e ("arm64: Increase the max granular size") increased
> the cache line size to 128 to match Cavium ThunderX, apparently for some
> performance benefit which could not be confirmed. This change, however,
> has an impact on network packet allocation in certain circumstances,
> requiring slightly more than a 4K page and causing a significant
> performance degradation.
> 
> This patch reverts L1_CACHE_SHIFT back to 6 (64-byte cache line) while
> keeping ARCH_DMA_MINALIGN at 128. The cache_line_size() function was
> changed to default to ARCH_DMA_MINALIGN in the absence of a meaningful
> CTR_EL0.CWG bit field.
> 
> In addition, if a system with ARCH_DMA_MINALIGN < CTR_EL0.CWG is
> detected, the kernel will force swiotlb bounce buffering for all
> non-coherent devices since DMA cache maintenance on sub-CWG ranges is
> not safe, leading to data corruption.
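
(Just to spell out the failure mode for anyone following along: on a
hypothetical part with CTR_EL0.CWG = 6, i.e. a 4 << 6 = 256-byte writeback
granule, two unrelated 128-byte-aligned kmalloc buffers can share a single
granule, so cache maintenance for a non-coherent DMA transfer to one of
them can clobber its neighbour - hence the bounce.)
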
> 
> Cc: Tirumalesh Chalamarla <tchalamarla at cavium.com>
> Cc: Timur Tabi <timur at codeaurora.org>
> Cc: Florian Fainelli <f.fainelli at gmail.com>
> Cc: Robin Murphy <robin.murphy at arm.com>
> Cc: Will Deacon <will.deacon at arm.com>
> Signed-off-by: Catalin Marinas <catalin.marinas at arm.com>
> ---
> 
> Changes since v1:
>   
> - Moved CWG check and swiotlb = 1 assignment to arm64_dma_init() following
>    Robin's comment (swiotlb is __ro_after_init)
> 
> - The swiotlb_noncoherent_bounce static key is now enabled in arm64_dma_init().
>    A platform with a large CWG but no non-coherent devices would see
>    dma_capable() become slightly slower, but this exercise is mostly
>    theoretical anyway.
> 
> - Removed Will's ack since I've made some small changes.
> 
>   arch/arm64/Kconfig                  |  1 +
>   arch/arm64/include/asm/cache.h      |  6 +++---
>   arch/arm64/include/asm/dma-direct.h | 43 +++++++++++++++++++++++++++++++++++++
>   arch/arm64/kernel/cpufeature.c      |  9 ++------
>   arch/arm64/mm/dma-mapping.c         | 17 +++++++++++++++
>   arch/arm64/mm/init.c                |  3 ++-
>   6 files changed, 68 insertions(+), 11 deletions(-)
>   create mode 100644 arch/arm64/include/asm/dma-direct.h
> 
> diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
> index 7381eeb7ef8e..655c0e99d9fa 100644
> --- a/arch/arm64/Kconfig
> +++ b/arch/arm64/Kconfig
> @@ -17,6 +17,7 @@ config ARM64
>   	select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA
>   	select ARCH_HAS_KCOV
>   	select ARCH_HAS_MEMBARRIER_SYNC_CORE
> +	select ARCH_HAS_PHYS_TO_DMA
>   	select ARCH_HAS_SET_MEMORY
>   	select ARCH_HAS_SG_CHAIN
>   	select ARCH_HAS_STRICT_KERNEL_RWX
> diff --git a/arch/arm64/include/asm/cache.h b/arch/arm64/include/asm/cache.h
> index ea9bb4e0e9bb..b2e6ece23713 100644
> --- a/arch/arm64/include/asm/cache.h
> +++ b/arch/arm64/include/asm/cache.h
> @@ -29,7 +29,7 @@
>   #define ICACHE_POLICY_VIPT	2
>   #define ICACHE_POLICY_PIPT	3
>   
> -#define L1_CACHE_SHIFT		7
> +#define L1_CACHE_SHIFT		(6)
>   #define L1_CACHE_BYTES		(1 << L1_CACHE_SHIFT)
>   
>   /*
> @@ -39,7 +39,7 @@
>    * cache before the transfer is done, causing old data to be seen by
>    * the CPU.
>    */
> -#define ARCH_DMA_MINALIGN	L1_CACHE_BYTES
> +#define ARCH_DMA_MINALIGN	(128)
>   
>   #ifndef __ASSEMBLY__
>   
> @@ -73,7 +73,7 @@ static inline u32 cache_type_cwg(void)
>   static inline int cache_line_size(void)
>   {
>   	u32 cwg = cache_type_cwg();
> -	return cwg ? 4 << cwg : L1_CACHE_BYTES;
> +	return cwg ? 4 << cwg : ARCH_DMA_MINALIGN;
>   }
>   
>   #endif	/* __ASSEMBLY__ */
> diff --git a/arch/arm64/include/asm/dma-direct.h b/arch/arm64/include/asm/dma-direct.h
> new file mode 100644
> index 000000000000..abb1b40ec751
> --- /dev/null
> +++ b/arch/arm64/include/asm/dma-direct.h
> @@ -0,0 +1,43 @@
> +/* SPDX-License-Identifier: GPL-2.0 */
> +#ifndef __ASM_DMA_DIRECT_H
> +#define __ASM_DMA_DIRECT_H
> +
> +#include <linux/jump_label.h>
> +#include <linux/swiotlb.h>
> +
> +#include <asm/cache.h>
> +
> +DECLARE_STATIC_KEY_FALSE(swiotlb_noncoherent_bounce);
> +
> +static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
> +{
> +	dma_addr_t dev_addr = (dma_addr_t)paddr;
> +
> +	return dev_addr - ((dma_addr_t)dev->dma_pfn_offset << PAGE_SHIFT);
> +}
> +
> +static inline phys_addr_t dma_to_phys(struct device *dev, dma_addr_t dev_addr)
> +{
> +	phys_addr_t paddr = (phys_addr_t)dev_addr;
> +
> +	return paddr + ((phys_addr_t)dev->dma_pfn_offset << PAGE_SHIFT);
> +}
> +
> +static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
> +{
> +	if (!dev->dma_mask)
> +		return false;
> +
> +	/*
> +	 * Force swiotlb buffer bouncing when ARCH_DMA_MINALIGN < CWG. The
> +	 * swiotlb bounce buffers are aligned to (1 << IO_TLB_SHIFT).
> +	 */

The relevance of the second half of that comment isn't entirely obvious 
- I assume you're referring to the fact that the IOTLB slab size happens 
to conveniently match the largest possible CWG?
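
For reference, the arithmetic I'm assuming there: the largest architectural
CWG value is 9 and IO_TLB_SHIFT is currently 11, so

	largest cache_line_size() = 4 << 9  = 2048 bytes
	swiotlb slab size         = 1 << 11 = 2048 bytes

i.e. a single bounce slab always covers at least one full writeback granule.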

I wonder somewhat if it's worth going even further down the ridiculously 
over-cautious route and adding a BUILD_BUG_ON(IO_TLB_SHIFT < 11), just 
so we'd get a heads-up in future if this could otherwise become silently 
broken...
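
Something like the below is what I have in mind - just a sketch, and the
placement next to the CWG check in arm64_dma_init() is only a suggestion:

	/* a swiotlb slab must cover the largest possible CWG (2KB) */
	BUILD_BUG_ON(IO_TLB_SHIFT < 11);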

Otherwise, the patch looks OK to me now - as horrible as it inherently is ;)

Acked-by: Robin Murphy <robin.murphy at arm.com>

> +	if (static_branch_unlikely(&swiotlb_noncoherent_bounce) &&
> +	    !is_device_dma_coherent(dev) &&
> +	    !is_swiotlb_buffer(dma_to_phys(dev, addr)))
> +		return false;
> +
> +	return addr + size - 1 <= *dev->dma_mask;
> +}
> +
> +#endif /* __ASM_DMA_DIRECT_H */
> diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
> index 2985a067fc13..702901eadf75 100644
> --- a/arch/arm64/kernel/cpufeature.c
> +++ b/arch/arm64/kernel/cpufeature.c
> @@ -1385,7 +1385,6 @@ bool this_cpu_has_cap(unsigned int cap)
>   void __init setup_cpu_features(void)
>   {
>   	u32 cwg;
> -	int cls;
>   
>   	/* Set the CPU feature capabilies */
>   	setup_feature_capabilities();
> @@ -1405,13 +1404,9 @@ void __init setup_cpu_features(void)
>   	 * Check for sane CTR_EL0.CWG value.
>   	 */
>   	cwg = cache_type_cwg();
> -	cls = cache_line_size();
>   	if (!cwg)
> -		pr_warn("No Cache Writeback Granule information, assuming cache line size %d\n",
> -			cls);
> -	if (L1_CACHE_BYTES < cls)
> -		pr_warn("L1_CACHE_BYTES smaller than the Cache Writeback Granule (%d < %d)\n",
> -			L1_CACHE_BYTES, cls);
> +		pr_warn("No Cache Writeback Granule information, assuming %d\n",
> +			ARCH_DMA_MINALIGN);
>   }
>   
>   static bool __maybe_unused
> diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c
> index a96ec0181818..1e9dac8684ca 100644
> --- a/arch/arm64/mm/dma-mapping.c
> +++ b/arch/arm64/mm/dma-mapping.c
> @@ -33,6 +33,7 @@
>   #include <asm/cacheflush.h>
>   
>   static int swiotlb __ro_after_init;
> +DEFINE_STATIC_KEY_FALSE(swiotlb_noncoherent_bounce);
>   
>   static pgprot_t __get_dma_pgprot(unsigned long attrs, pgprot_t prot,
>   				 bool coherent)
> @@ -504,6 +505,14 @@ static int __init arm64_dma_init(void)
>   	    max_pfn > (arm64_dma_phys_limit >> PAGE_SHIFT))
>   		swiotlb = 1;
>   
> +	if (WARN_TAINT(ARCH_DMA_MINALIGN < cache_line_size(),
> +		       TAINT_CPU_OUT_OF_SPEC,
> +		       "ARCH_DMA_MINALIGN smaller than CTR_EL0.CWG (%d < %d)",
> +		       ARCH_DMA_MINALIGN, cache_line_size())) {
> +		swiotlb = 1;
> +		static_branch_enable(&swiotlb_noncoherent_bounce);
> +	}
> +
>   	return atomic_pool_init();
>   }
>   arch_initcall(arm64_dma_init);
> @@ -882,6 +891,14 @@ static void __iommu_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
>   void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size,
>   			const struct iommu_ops *iommu, bool coherent)
>   {
> +	/*
> +	 * Enable swiotlb for buffer bouncing if ARCH_DMA_MINALIGN < CWG.
> +	 * dma_capable() forces the actual bounce if the device is
> +	 * non-coherent.
> +	 */
> +	if (static_branch_unlikely(&swiotlb_noncoherent_bounce) && !coherent)
> +		iommu = NULL;
> +
>   	if (!dev->dma_ops)
>   		dev->dma_ops = &arm64_swiotlb_dma_ops;
>   
> diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c
> index 9f3c47acf8ff..664acf177799 100644
> --- a/arch/arm64/mm/init.c
> +++ b/arch/arm64/mm/init.c
> @@ -586,7 +586,8 @@ static void __init free_unused_memmap(void)
>   void __init mem_init(void)
>   {
>   	if (swiotlb_force == SWIOTLB_FORCE ||
> -	    max_pfn > (arm64_dma_phys_limit >> PAGE_SHIFT))
> +	    max_pfn > (arm64_dma_phys_limit >> PAGE_SHIFT) ||
> +	    ARCH_DMA_MINALIGN < cache_line_size())
>   		swiotlb_init(1);
>   	else
>   		swiotlb_force = SWIOTLB_NO_FORCE;
> 


