[PATCH] arm64/lib: copy_page: s/stnp/stp

Jisheng Zhang jszhang at kernel.org
Wed Jun 12 17:18:12 PDT 2024


stnp performs a non-temporal store, which gives a hint to the memory system
that caching is not useful for this data. But the scenarios where
copy_page() is used may not have this implication, although I must admit
there are cases where stnp helps performance (the "good" case). In this good
case, we can rely on the HW write streaming mechanism in some
implementations such as cortex-a55 to detect the case and take action.

Testing with https://github.com/apinski-cavium/copy_page_benchmark shows
that this patch can reduce the time by about 3% on cortex-a55 platforms.

Signed-off-by: Jisheng Zhang <jszhang at kernel.org>
---
 arch/arm64/lib/copy_page.S | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/arch/arm64/lib/copy_page.S b/arch/arm64/lib/copy_page.S
index 6a56d7cf309d..4c74fe2d8bd6 100644
--- a/arch/arm64/lib/copy_page.S
+++ b/arch/arm64/lib/copy_page.S
@@ -32,21 +32,21 @@ SYM_FUNC_START(__pi_copy_page)
 1:
 	tst	x0, #(PAGE_SIZE - 1)
 
-	stnp	x2, x3, [x0, #-256]
+	stp	x2, x3, [x0, #-256]
 	ldp	x2, x3, [x1]
-	stnp	x4, x5, [x0, #16 - 256]
+	stp	x4, x5, [x0, #16 - 256]
 	ldp	x4, x5, [x1, #16]
-	stnp	x6, x7, [x0, #32 - 256]
+	stp	x6, x7, [x0, #32 - 256]
 	ldp	x6, x7, [x1, #32]
-	stnp	x8, x9, [x0, #48 - 256]
+	stp	x8, x9, [x0, #48 - 256]
 	ldp	x8, x9, [x1, #48]
-	stnp	x10, x11, [x0, #64 - 256]
+	stp	x10, x11, [x0, #64 - 256]
 	ldp	x10, x11, [x1, #64]
-	stnp	x12, x13, [x0, #80 - 256]
+	stp	x12, x13, [x0, #80 - 256]
 	ldp	x12, x13, [x1, #80]
-	stnp	x14, x15, [x0, #96 - 256]
+	stp	x14, x15, [x0, #96 - 256]
 	ldp	x14, x15, [x1, #96]
-	stnp	x16, x17, [x0, #112 - 256]
+	stp	x16, x17, [x0, #112 - 256]
 	ldp	x16, x17, [x1, #112]
 
 	add	x0, x0, #128
@@ -54,14 +54,14 @@ SYM_FUNC_START(__pi_copy_page)
 
 	b.ne	1b
 
-	stnp	x2, x3, [x0, #-256]
-	stnp	x4, x5, [x0, #16 - 256]
-	stnp	x6, x7, [x0, #32 - 256]
-	stnp	x8, x9, [x0, #48 - 256]
-	stnp	x10, x11, [x0, #64 - 256]
-	stnp	x12, x13, [x0, #80 - 256]
-	stnp	x14, x15, [x0, #96 - 256]
-	stnp	x16, x17, [x0, #112 - 256]
+	stp	x2, x3, [x0, #-256]
+	stp	x4, x5, [x0, #16 - 256]
+	stp	x6, x7, [x0, #32 - 256]
+	stp	x8, x9, [x0, #48 - 256]
+	stp	x10, x11, [x0, #64 - 256]
+	stp	x12, x13, [x0, #80 - 256]
+	stp	x14, x15, [x0, #96 - 256]
+	stp	x16, x17, [x0, #112 - 256]
 
 	ret
 SYM_FUNC_END(__pi_copy_page)
-- 
2.43.0




More information about the linux-arm-kernel mailing list