[RFT PATCH] arm64: atomics: prefetch the destination prior to LSE operations
Yicong Yang
yangyicong at huawei.com
Thu Jul 24 05:06:51 PDT 2025
From: Yicong Yang <yangyicong at hisilicon.com>
commit 0ea366f5e1b6 ("arm64: atomics: prefetch the destination word for write prior to stxr")
added a prefetch prior to the LL/SC operations due to performance concerns:
the cost of changing a cacheline from shared to exclusive state can be
significant. This is also true for LSE operations, so prefetch the
destination prior to LSE operations as well.
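For illustration, the pattern applied to each LSE helper boils down to the
following standalone sketch (simplified, not the actual kernel macro; the
helper name is made up for the example, and it assumes a CPU with FEAT_LSE
and building with -march=armv8.1-a or later):

  /* Illustrative only: mirrors what __lse_atomic_add() does after this patch. */
  static inline void example_atomic_add_lse(int i, int *counter)
  {
  	asm volatile(
  	"	prfm	pstl1strm, %[v]\n"	/* prefetch the line for write */
  	"	stadd	%w[i], %[v]\n"		/* LSE atomic add, no result needed */
  	: [v] "+Q" (*counter)
  	: [i] "r" (i));
  }

The pstl1strm hint is the same one the LL/SC fallbacks already use for their
prefetch before ldxr/stxr.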
Tested on my HIP08 server (2 * 64 CPUs) using `perf bench -r 100 futex all`,
which stresses the spinlock of the futex hash bucket:
                           6.16-rc7    patched
futex/hash (ops/sec)         171843     204757   +19.15%
futex/wake (ms)              0.4630     0.4216    +8.94%
futex/wake-parallel (ms)     0.0048     0.0039   +18.75%
futex/requeue (ms)           0.1487     0.1508    -1.41%
  (2nd validation)                      0.1484    +0.20%
futex/lock-pi (ops/sec)         125        126    +0.80%
For a single wake test with different numbers of threads, using `perf bench
-r 100 futex wake -t <threads>` (wake time in ms):
threads    6.16-rc7    patched
      1      0.0035     0.0032    +8.57%
     48      0.1454     0.1221   +16.02%
     96      0.3047     0.2304   +24.38%
    160      0.5489     0.5012    +8.69%
    192      0.6675     0.5906   +11.52%
    256      0.9445     0.8092   +14.33%
There is some variation between runs when the numbers are this close, but
overall the results look positive.
Signed-off-by: Yicong Yang <yangyicong at hisilicon.com>
---
RFT: sending this for testing and feedback, since I'm not sure whether the
gain is general or specific to certain implementations.
arch/arm64/include/asm/atomic_lse.h | 7 +++++++
arch/arm64/include/asm/cmpxchg.h | 3 ++-
2 files changed, 9 insertions(+), 1 deletion(-)
diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
index 87f568a94e55..a45e49d5d857 100644
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -16,6 +16,7 @@ __lse_atomic_##op(int i, atomic_t *v) \
{ \
asm volatile( \
__LSE_PREAMBLE \
+ " prfm pstl1strm, %[v]\n" \
" " #asm_op " %w[i], %[v]\n" \
: [v] "+Q" (v->counter) \
: [i] "r" (i)); \
@@ -41,6 +42,7 @@ __lse_atomic_fetch_##op##name(int i, atomic_t *v) \
\
asm volatile( \
__LSE_PREAMBLE \
+ " prfm pstl1strm, %[v]\n" \
" " #asm_op #mb " %w[i], %w[old], %[v]" \
: [v] "+Q" (v->counter), \
[old] "=r" (old) \
@@ -123,6 +125,7 @@ __lse_atomic64_##op(s64 i, atomic64_t *v) \
{ \
asm volatile( \
__LSE_PREAMBLE \
+ " prfm pstl1strm, %[v]\n" \
" " #asm_op " %[i], %[v]\n" \
: [v] "+Q" (v->counter) \
: [i] "r" (i)); \
@@ -148,6 +151,7 @@ __lse_atomic64_fetch_##op##name(s64 i, atomic64_t *v) \
\
asm volatile( \
__LSE_PREAMBLE \
+ " prfm pstl1strm, %[v]\n" \
" " #asm_op #mb " %[i], %[old], %[v]" \
: [v] "+Q" (v->counter), \
[old] "=r" (old) \
@@ -230,6 +234,7 @@ static __always_inline s64 __lse_atomic64_dec_if_positive(atomic64_t *v)
asm volatile(
__LSE_PREAMBLE
+ " prfm pstl1strm, %[v]\n" \
"1: ldr %x[tmp], %[v]\n"
" subs %[ret], %x[tmp], #1\n"
" b.lt 2f\n"
@@ -253,6 +258,7 @@ __lse__cmpxchg_case_##name##sz(volatile void *ptr, \
{ \
asm volatile( \
__LSE_PREAMBLE \
+ " prfm pstl1strm, %[v]\n" \
" cas" #mb #sfx " %" #w "[old], %" #w "[new], %[v]\n" \
: [v] "+Q" (*(u##sz *)ptr), \
[old] "+r" (old) \
@@ -295,6 +301,7 @@ __lse__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new) \
\
asm volatile( \
__LSE_PREAMBLE \
+ " prfm pstl1strm, %[v]\n" \
" casp" #mb "\t%[old1], %[old2], %[new1], %[new2], %[v]\n"\
: [old1] "+&r" (x0), [old2] "+&r" (x1), \
[v] "+Q" (*(u128 *)ptr) \
diff --git a/arch/arm64/include/asm/cmpxchg.h b/arch/arm64/include/asm/cmpxchg.h
index d7a540736741..daacbabeadb7 100644
--- a/arch/arm64/include/asm/cmpxchg.h
+++ b/arch/arm64/include/asm/cmpxchg.h
@@ -32,8 +32,9 @@ static inline u##sz __xchg_case_##name##sz(u##sz x, volatile void *ptr) \
" cbnz %w1, 1b\n" \
" " #mb, \
/* LSE atomics */ \
+ " prfm pstl1strm, %2\n" \
" swp" #acq_lse #rel #sfx "\t%" #w "3, %" #w "0, %2\n" \
- __nops(3) \
+ __nops(2) \
" " #nop_lse) \
: "=&r" (ret), "=&r" (tmp), "+Q" (*(u##sz *)ptr) \
: "r" (x) \
--
2.24.0