[PATCHv2 1/6] arm64: lib: Implement optimized memcpy routine

Sun Apr 27 22:11:29 PDT 2014

From: "zhichang.yuan" <zhichang.yuan at linaro.org>

This patch, based on Linaro's Cortex Strings library, improves
the performance of the assembly optimized memcpy() function.

Signed-off-by: Zhichang Yuan <zhichang.yuan at linaro.org>
Signed-off-by: Deepak Saxena <dsaxena at linaro.org>
---
 arch/arm64/lib/memcpy.S |  192 +++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 170 insertions(+), 22 deletions(-)

diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
index 27b5003..8a9a96d 100644
--- a/arch/arm64/lib/memcpy.S
+++ b/arch/arm64/lib/memcpy.S
@@ -1,5 +1,13 @@
 /*
  * Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -16,6 +24,7 @@
 
 #include <linux/linkage.h>
 #include <asm/assembler.h>
+#include <asm/cache.h>
 
 /*
  * Copy a buffer from src to dest (alignment handled by the hardware)
@@ -27,27 +36,166 @@
  * Returns:
  *	x0 - dest
  */
+dstin	.req	x0
+src	.req	x1
+count	.req	x2
+tmp1	.req	x3
+tmp1w	.req	w3
+tmp2	.req	x4
+tmp2w	.req	w4
+tmp3	.req	x5
+tmp3w	.req	w5
+dst	.req	x6
+
+A_l	.req	x7
+A_h	.req	x8
+B_l	.req	x9
+B_h	.req	x10
+C_l	.req	x11
+C_h	.req	x12
+D_l	.req	x13
+D_h	.req	x14
+
 ENTRY(memcpy)
-	mov	x4, x0
-	subs	x2, x2, #8
-	b.mi	2f
-1:	ldr	x3, [x1], #8
-	subs	x2, x2, #8
-	str	x3, [x4], #8
-	b.pl	1b
-2:	adds	x2, x2, #4
-	b.mi	3f
-	ldr	w3, [x1], #4
-	sub	x2, x2, #4
-	str	w3, [x4], #4
-3:	adds	x2, x2, #2
-	b.mi	4f
-	ldrh	w3, [x1], #2
-	sub	x2, x2, #2
-	strh	w3, [x4], #2
-4:	adds	x2, x2, #1
-	b.mi	5f
-	ldrb	w3, [x1]
-	strb	w3, [x4]
-5:	ret
+	mov	dst, dstin
+	cmp	count, #16
+	/*When memory length is less than 16, the accessed are not aligned.*/
+	b.lo	.Ltiny15
+
+	neg	tmp2, src
+	ands	tmp2, tmp2, #15/* Bytes to reach alignment. */
+	b.eq	.LSrcAligned
+	sub	count, count, tmp2
+	/*
+	* Copy the leading memory data from src to dst in an increasing
+	* address order.By this way,the risk of overwritting the source
+	* memory data is eliminated when the distance between src and
+	* dst is less than 16. The memory accesses here are alignment.
+	*/
+	tbz	tmp2, #0, 1f
+	ldrb	tmp1w, [src], #1
+	strb	tmp1w, [dst], #1
+1:
+	tbz	tmp2, #1, 2f
+	ldrh	tmp1w, [src], #2
+	strh	tmp1w, [dst], #2
+2:
+	tbz	tmp2, #2, 3f
+	ldr	tmp1w, [src], #4
+	str	tmp1w, [dst], #4
+3:
+	tbz	tmp2, #3, .LSrcAligned
+	ldr	tmp1, [src],#8
+	str	tmp1, [dst],#8
+
+.LSrcAligned:
+	cmp	count, #64
+	b.ge	.Lcpy_over64
+	/*
+	* Deal with small copies quickly by dropping straight into the
+	* exit block.
+	*/
+.Ltail63:
+	/*
+	* Copy up to 48 bytes of data. At this point we only need the
+	* bottom 6 bits of count to be accurate.
+	*/
+	ands	tmp1, count, #0x30
+	b.eq	.Ltiny15
+	cmp	tmp1w, #0x20
+	b.eq	1f
+	b.lt	2f
+	ldp	A_l, A_h, [src], #16
+	stp	A_l, A_h, [dst], #16
+1:
+	ldp	A_l, A_h, [src], #16
+	stp	A_l, A_h, [dst], #16
+2:
+	ldp	A_l, A_h, [src], #16
+	stp	A_l, A_h, [dst], #16
+.Ltiny15:
+	/*
+	* Prefer to break one ldp/stp into several load/store to access
+	* memory in an increasing address order,rather than to load/store 16
+	* bytes from (src-16) to (dst-16) and to backward the src to aligned
+	* address,which way is used in original cortex memcpy. If keeping
+	* the original memcpy process here, memmove need to satisfy the
+	* precondition that src address is at least 16 bytes bigger than dst
+	* address,otherwise some source data will be overwritten when memove
+	* call memcpy directly. To make memmove simpler and decouple the
+	* memcpy's dependency on memmove, withdrew the original process.
+	*/
+	tbz	count, #3, 1f
+	ldr	tmp1, [src], #8
+	str	tmp1, [dst], #8
+1:
+	tbz	count, #2, 2f
+	ldr	tmp1w, [src], #4
+	str	tmp1w, [dst], #4
+2:
+	tbz	count, #1, 3f
+	ldrh	tmp1w, [src], #2
+	strh	tmp1w, [dst], #2
+3:
+	tbz	count, #0, .Lexitfunc
+	ldrb	tmp1w, [src]
+	strb	tmp1w, [dst]
+
+.Lexitfunc:
+	ret
+
+.Lcpy_over64:
+	subs	count, count, #128
+	b.ge	.Lcpy_body_large
+	/*
+	* Less than 128 bytes to copy, so handle 64 here and then jump
+	* to the tail.
+	*/
+	ldp	A_l, A_h, [src],#16
+	stp	A_l, A_h, [dst],#16
+	ldp	B_l, B_h, [src],#16
+	ldp	C_l, C_h, [src],#16
+	stp	B_l, B_h, [dst],#16
+	stp	C_l, C_h, [dst],#16
+	ldp	D_l, D_h, [src],#16
+	stp	D_l, D_h, [dst],#16
+
+	tst	count, #0x3f
+	b.ne	.Ltail63
+	ret
+
+	/*
+	* Critical loop.  Start at a new cache line boundary.  Assuming
+	* 64 bytes per line this ensures the entire loop is in one line.
+	*/
+	.p2align	L1_CACHE_SHIFT
+.Lcpy_body_large:
+	/* pre-get 64 bytes data. */
+	ldp	A_l, A_h, [src],#16
+	ldp	B_l, B_h, [src],#16
+	ldp	C_l, C_h, [src],#16
+	ldp	D_l, D_h, [src],#16
+1:
+	/*
+	* interlace the load of next 64 bytes data block with store of the last
+	* loaded 64 bytes data.
+	*/
+	stp	A_l, A_h, [dst],#16
+	ldp	A_l, A_h, [src],#16
+	stp	B_l, B_h, [dst],#16
+	ldp	B_l, B_h, [src],#16
+	stp	C_l, C_h, [dst],#16
+	ldp	C_l, C_h, [src],#16
+	stp	D_l, D_h, [dst],#16
+	ldp	D_l, D_h, [src],#16
+	subs	count, count, #64
+	b.ge	1b
+	stp	A_l, A_h, [dst],#16
+	stp	B_l, B_h, [dst],#16
+	stp	C_l, C_h, [dst],#16
+	stp	D_l, D_h, [dst],#16
+
+	tst	count, #0x3f
+	b.ne	.Ltail63
+	ret
 ENDPROC(memcpy)
-- 
1.7.9.5