[PATCH 1/2] ARM: lib: Add optimized memcpy with 64 byte pld size

Mon Mar 26 22:35:37 EDT 2012

On Tue, 27 Mar 2012, Boojin Kim wrote:

> This patch adds an optimized memcpy() for architectures that have a 64-byte PLD size.
> 
> Signed-off-by: Boojin Kim <boojin.kim at samsung.com>
> Cc: Russell King <rmk+kernel at arm.linux.org.uk>

This creates quite convoluted code.  If this is worth doing, we'll have 
to find a cleaner way to do this.

Could you please provide performance measurement numbers with and 
without this patch, and similarly for the next patch?

Did you try enabling the cache alignment code (the CALGN() paths)?  What 
performance difference, if any, did you see?

> ---
>  arch/arm/Kconfig             |    7 ++++++
>  arch/arm/lib/copy_template.S |   44 +++++++++++++++++++++++++++++++++--------
>  2 files changed, 42 insertions(+), 9 deletions(-)
> 
> diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
> index 8fec56d..ba306b3 100644
> --- a/arch/arm/Kconfig
> +++ b/arch/arm/Kconfig
> @@ -1132,6 +1132,13 @@ config ARM_NR_BANKS
>  	default 16 if ARCH_EP93XX
>  	default 8
> 
> +config ARM_PLD_SIZE
> +	int
> +	default 64 if ARCH_EXYNOS5
> +	default 32
> +	help
> +	  Configure preload size used on memcpy(). Select 64 for cortex-a15.
> +
>  config IWMMXT
>  	bool "Enable iWMMXt support"
>  	depends on CPU_XSCALE || CPU_XSC3 || CPU_MOHAWK || CPU_PJ4
> diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S
> index 805e3f8..7dc5b8c 100644
> --- a/arch/arm/lib/copy_template.S
> +++ b/arch/arm/lib/copy_template.S
> @@ -66,6 +66,7 @@
>   *	than one 32bit instruction in Thumb-2)
>   */
> 
> +#define PLDSIZE	(CONFIG_ARM_PLD_SIZE)
> 
>  		enter	r4, lr
> 
> @@ -90,19 +91,44 @@
>  	CALGN(	add	pc, r4, ip		)
> 
>  	PLD(	pld	[r1, #0]		)
> -2:	PLD(	subs	r2, r2, #96		)
> -	PLD(	pld	[r1, #28]		)
> +
> +#if (PLDSIZE == 64)
> +2:	PLD(	cmp	r2, #32)
> +	PLD(	blt	.32cpy)
> +.64cpy:	PLD(	subs	r2, r2, #(PLDSIZE*3+32)	)
> +	PLD(	pld	[r1, #PLDSIZE-4]	)
>  	PLD(	blt	4f			)
> -	PLD(	pld	[r1, #60]		)
> -	PLD(	pld	[r1, #92]		)
> +	PLD(	pld	[r1, #PLDSIZE*2-4]	)
> +	PLD(	pld	[r1, #PLDSIZE*3-4]	)
> +
> +3:	PLD(	pld	[r1, #PLDSIZE*4-4]	)
> +4:		ldr8w   r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> +		str8w   r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> +		ldr8w   r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> +		str8w   r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> +		subs	r2, r2, #PLDSIZE
> +		bge	3b
> +	PLD(	cmn	r2, #(PLDSIZE*3)	)
> +	PLD(	bge	4b			)
> +	PLD(	cmn	r2, #(PLDSIZE*4-32)	)
> +	PLD(	blt	5f)
> +.32cpy:		ldr8w   r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> +		str8w   r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> +#else
> +2:	PLD(	subs	r2, r2, #(PLDSIZE*3)	)
> +	PLD(	pld	[r1, #(PLDSIZE-4)]	)
> +	PLD(	blt	4f			)
> +	PLD(	pld	[r1, #(PLDSIZE*2-4)]	)
> +	PLD(	pld	[r1, #(PLDSIZE*3-4)]	)
> 
> -3:	PLD(	pld	[r1, #124]		)
> -4:		ldr8w	r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> -		subs	r2, r2, #32
> -		str8w	r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> +3:	PLD(	pld	[r1, #(PLDSIZE*4-4)]	)
> +4:		ldr8w   r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
> +		subs	r2, r2, #PLDSIZE
> +		str8w   r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
>  		bge	3b
> -	PLD(	cmn	r2, #96			)
> +	PLD(	cmn	r2, #(PLDSIZE*3)	)
>  	PLD(	bge	4b			)
> +#endif
> 
>  5:		ands	ip, r2, #28
>  		rsb	ip, ip, #32
> --
> 1.7.1
> 
> 
> 
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
>