[PATCH 2/3] arm64: lib: improve copy performance when size is ge 128 bytes

Yang Yingliang yangyingliang at huawei.com
Tue Mar 23 07:34:31 GMT 2021


When copying 128 bytes or more, src/dst is incremented after
every ldp/stp instruction, which costs extra time.
To improve this, only advance src/dst once after each
64 bytes have been loaded or stored.

Time to copy 4096 bytes on Kunpeng920 (ms):
Without this patch:
memcpy: 143.85 copy_from_user: 172.69 copy_to_user: 199.23

With this patch:
memcpy: 107.12 copy_from_user: 157.50 copy_to_user: 198.85

That is about a 25% improvement for memcpy().

Signed-off-by: Yang Yingliang <yangyingliang at huawei.com>
---
 arch/arm64/lib/copy_template.S | 36 +++++++++++++++++++---------------
 1 file changed, 20 insertions(+), 16 deletions(-)

diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
index 488df234c49a..c3cd6f84c9c0 100644
--- a/arch/arm64/lib/copy_template.S
+++ b/arch/arm64/lib/copy_template.S
@@ -152,29 +152,33 @@ D_h	.req	x14
 	.p2align	L1_CACHE_SHIFT
 .Lcpy_body_large:
 	/* pre-get 64 bytes data. */
-	ldp1	A_l, A_h, src, #16
-	ldp1	B_l, B_h, src, #16
-	ldp1	C_l, C_h, src, #16
-	ldp1	D_l, D_h, src, #16
+	ldp2	A_l, A_h, src, #0,  #8
+	ldp2	B_l, B_h, src, #16, #24
+	ldp2	C_l, C_h, src, #32, #40
+	ldp2	D_l, D_h, src, #48, #56
+	add	src, src, #64
 1:
 	/*
 	* interlace the load of next 64 bytes data block with store of the last
 	* loaded 64 bytes data.
 	*/
-	stp1	A_l, A_h, dst, #16
-	ldp1	A_l, A_h, src, #16
-	stp1	B_l, B_h, dst, #16
-	ldp1	B_l, B_h, src, #16
-	stp1	C_l, C_h, dst, #16
-	ldp1	C_l, C_h, src, #16
-	stp1	D_l, D_h, dst, #16
-	ldp1	D_l, D_h, src, #16
+	stp2	A_l, A_h, dst, #0,  #8
+	ldp2	A_l, A_h, src, #0,  #8
+	stp2	B_l, B_h, dst, #16, #24
+	ldp2	B_l, B_h, src, #16, #24
+	stp2	C_l, C_h, dst, #32, #40
+	ldp2	C_l, C_h, src, #32, #40
+	stp2	D_l, D_h, dst, #48, #56
+	ldp2	D_l, D_h, src, #48, #56
+	add	src, src, #64
+	add	dst, dst, #64
 	subs	count, count, #64
 	b.ge	1b
-	stp1	A_l, A_h, dst, #16
-	stp1	B_l, B_h, dst, #16
-	stp1	C_l, C_h, dst, #16
-	stp1	D_l, D_h, dst, #16
+	stp2	A_l, A_h, dst, #0,  #8
+	stp2	B_l, B_h, dst, #16, #24
+	stp2	C_l, C_h, dst, #32, #40
+	stp2	D_l, D_h, dst, #48, #56
+	add	dst, dst, #64
 
 	tst	count, #0x3f
 	b.ne	.Ltail63
-- 
2.25.1




More information about the linux-arm-kernel mailing list