[RFT PATCH] arm64: atomics: prefetch the destination prior to LSE operations
Yicong Yang
yangyicong at huawei.com
Thu Jul 24 05:06:51 PDT 2025
From: Yicong Yang <yangyicong at hisilicon.com>
commit 0ea366f5e1b6 ("arm64: atomics: prefetch the destination word for write prior to stxr")
added a prefetch prior to the LL/SC operations due to performance concerns:
the cost of changing a cacheline from shared to exclusive state can be
significant. This is also true for LSE operations, so prefetch the
destination prior to LSE operations as well.
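For illustration, the pattern applied to each LSE helper boils down to the
following standalone sketch (simplified, not the actual kernel macro; the
helper name is made up for the example, and it assumes a CPU with FEAT_LSE
and building with -march=armv8.1-a or later):

  /* Illustrative only: mirrors what __lse_atomic_add() does after this patch. */
  static inline void example_atomic_add_lse(int i, int *counter)
  {
  	asm volatile(
  	"	prfm	pstl1strm, %[v]\n"	/* prefetch the line for write */
  	"	stadd	%w[i], %[v]\n"		/* LSE atomic add, no result needed */
  	: [v] "+Q" (*counter)
  	: [i] "r" (i));
  }

The pstl1strm hint is the same one the LL/SC fallbacks already use for their
prefetch before ldxr/stxr.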
Tested on my HIP08 server (2 * 64 CPUs) using `perf bench -r 100 futex all`,
which stresses the spinlock of the futex hash bucket:
                           6.16-rc7    patched
futex/hash (ops/sec)         171843     204757   +19.15%
futex/wake (ms)              0.4630     0.4216    +8.94%
futex/wake-parallel (ms)     0.0048     0.0039   +18.75%
futex/requeue (ms)           0.1487     0.1508    -1.41%
  (2nd validation)                      0.1484    +0.20%
futex/lock-pi (ops/sec)         125        126    +0.80%
For a single wake test with different numbers of threads, using `perf bench
-r 100 futex wake -t <threads>` (wake time in ms):
threads    6.16-rc7    patched
      1      0.0035     0.0032    +8.57%
     48      0.1454     0.1221   +16.02%
     96      0.3047     0.2304   +24.38%
    160      0.5489     0.5012    +8.69%
    192      0.6675     0.5906   +11.52%
    256      0.9445     0.8092   +14.33%
There is some variation between runs when the numbers are this close, but
overall the results look positive.
Signed-off-by: Yicong Yang <yangyicong at hisilicon.com>
---
RFT: sending this for testing and feedback, since I'm not sure whether the
gain is general or specific to certain implementations.
arch/arm64/include/asm/atomic_lse.h | 7 +++++++
arch/arm64/include/asm/cmpxchg.h | 3 ++-
2 files changed, 9 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
index 87f568a94e55..a45e49d5d857 100644
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -16,6 +16,7 @@ __lse_atomic_##op(int i, atomic_t *v) \
{ \
asm volatile( \
__LSE_PREAMBLE \
+ " prfm pstl1strm, %[v]\n" \
" " #asm_op " %w[i], %[v]\n" \
: [v] "+Q" (v->counter) \
: [i] "r" (i)); \
@@ -41,6 +42,7 @@ __lse_atomic_fetch_##op##name(int i, atomic_t *v) \
\
asm volatile( \
__LSE_PREAMBLE \
+ " prfm pstl1strm, %[v]\n" \
" " #asm_op #mb " %w[i], %w[old], %[v]" \
: [v] "+Q" (v->counter), \
[old] "=r" (old) \
@@ -123,6 +125,7 @@ __lse_atomic64_##op(s64 i, atomic64_t *v) \
{ \
asm volatile( \
__LSE_PREAMBLE \
+ " prfm pstl1strm, %[v]\n" \
" " #asm_op " %[i], %[v]\n" \
: [v] "+Q" (v->counter) \
: [i] "r" (i)); \
@@ -148,6 +151,7 @@ __lse_atomic64_fetch_##op##name(s64 i, atomic64_t *v) \
\
asm volatile( \
__LSE_PREAMBLE \
+ " prfm pstl1strm, %[v]\n" \
" " #asm_op #mb " %[i], %[old], %[v]" \
: [v] "+Q" (v->counter), \
[old] "=r" (old) \
@@ -230,6 +234,7 @@ static __always_inline s64 __lse_atomic64_dec_if_positive(atomic64_t *v)
asm volatile(
__LSE_PREAMBLE
+ " prfm pstl1strm, %[v]\n" \
"1: ldr %x[tmp], %[v]\n"
" subs %[ret], %x[tmp], #1\n"
" b.lt 2f\n"
@@ -253,6 +258,7 @@ __lse__cmpxchg_case_##name##sz(volatile void *ptr, \
{ \
asm volatile( \
__LSE_PREAMBLE \
+ " prfm pstl1strm, %[v]\n" \
" cas" #mb #sfx " %" #w "[old], %" #w "[new], %[v]\n" \
: [v] "+Q" (*(u##sz *)ptr), \
[old] "+r" (old) \
@@ -295,6 +301,7 @@ __lse__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new) \
\
asm volatile( \
__LSE_PREAMBLE \
+ " prfm pstl1strm, %[v]\n" \
" casp" #mb "\t%[old1], %[old2], %[new1], %[new2], %[v]\n"\
: [old1] "+&r" (x0), [old2] "+&r" (x1), \
[v] "+Q" (*(u128 *)ptr) \
diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h
index d7a540736741..daacbabeadb7 100644
--- a/arch/arm64/include/asm/cmpxchg.h
+++ b/arch/arm64/include/asm/cmpxchg.h
@@ -32,8 +32,9 @@ static inline u##sz __xchg_case_##name##sz(u##sz x, volatile void *ptr) \
" cbnz %w1, 1b\n" \
" " #mb, \
/* LSE atomics */ \
+ " prfm pstl1strm, %2\n" \
" swp" #acq_lse #rel #sfx "\t%" #w "3, %" #w "0, %2\n" \
- __nops(3) \
+ __nops(2) \
" " #nop_lse) \
: "=&r" (ret), "=&r" (tmp), "+Q" (*(u##sz *)ptr) \
: "r" (x) \
--
2.24.0