[PATCH RFC 2/2] arm64: atomics: add optimization support for implementations using near atomics
Yicong Yang
yangyicong at huawei.com
Fri Sep 19 02:17:47 PDT 2025
From: Yicong Yang <yangyicong at hisilicon.com>
Atomic operations can be implemented in the CPU (near atomics) or
in the interconnect or at the Subordinate holding the data (far
atomics) [1]. The former one can be further optimized by prefetching
the destination prior to the atomic operations, similar to the
optimization for LL/SC operations mentioned in
commit 0ea366f5e1b6 ("arm64: atomics: prefetch the destination word for write prior to stxr")
Add a Kconfig option ARM64_NEAR_ATOMICS_OPTIMIZATION to compile in the
optimization and a boot option arm64.near_atomics_optimization for
enabling it on platforms that implement near atomics.
Tested on a HIP09 server using `perf bench -r 100 futex all` which could
stress the spinlock of the futex hash bucket:
6.17-rc1 arm64.near_atomics_optimization
futex/hash(ops/sec) 418742 415325 -0.82%
futex/wake(ms) 0.3954 0.3414 13.66%
futex/wake-parallel(ms) 0.0063 0.0067 -6.35%
(2nd validation) 0.0062 1.59%
futex/requeue(ms) 0.156 0.1565 -0.32%
futex/lock-pi(ops/sec) 103 104 0.97%
For a single wake test with different numbers of threads using `perf bench
-r 100 futex wake -t <threads>`:
threads 6.17-rc1 arm64.near_atomics_optimization
1 0.0021 0.002 4.76%
48 0.0907 0.0761 16.10%
96 0.185 0.1561 15.62%
160 0.3983 0.3317 16.72%
192 0.4834 0.4308 10.88%
256 0.6342 0.5656 10.82%
There is some variation for close numbers but overall the results
look positive. For some more complex benchmarks like unixbench
and tbench, the results also appear to be positive.
For unixbench:
6.17-rc1 arm64.near_atomics_optimization
Dhrystone 2 using register variables 6422.8 6427.8 0.08%
Double-Precision Whetstone 1637.7 1637.7 0.00%
Execl Throughput 878.4 909.6 3.55%
File Copy 1024 bufsize 2000 maxblocks 1586.4 1603.8 1.10%
File Copy 256 bufsize 500 maxblocks 1022.6 1024.5 0.19%
File Copy 4096 bufsize 8000 maxblocks 2770.9 2935.6 5.94%
Pipe Throughput 552 552.4 0.07%
Pipe-based Context Switching 345.1 373.8 8.32%
Process Creation 394.5 410.5 4.06%
Shell Scripts (1 concurrent) 1602.2 1674.1 4.49%
Shell Scripts (8 concurrent) 5154.3 5382 4.42%
System Call Overhead 229.3 229.3 0.00%
======================
System Benchmarks Index Score 1159.6 1190.3 2.65%
For tbench:
threads 6.17-rc1 arm64.near_atomics_optimization
1: 267.4067 279.8757 ( 4.66%)
4: 1065.8133 1117.1133 ( 4.81%)
8: 2107.9800 2236.3200 ( 6.09%)
16: 4200.2300 4420.2733 ( 5.24%)
32: 8065.9033 8388.0267 ( 3.99%)
64: 14284.0333 14624.7000 ( 2.38%)
128: 7573.1800 8078.9900 ( 6.68%)
Though this is beneficial for near atomic implementations in most
cases tested, especially the latency sensitive ones, I did observe a
regression for extreme test cases like locktorture, where the throughput
reduced by 1~3% depending on the number of test threads.
[1] https://developer.arm.com/documentation/102714/0100/Atomic-fundamentals
Signed-off-by: Yicong Yang <yangyicong at hisilicon.com>
---
arch/arm64/Kconfig | 18 ++++++++++++++++
arch/arm64/include/asm/atomic_lse.h | 32 +++++++++++++++++++++++++++++
arch/arm64/kernel/cpufeature.c | 26 +++++++++++++++++++++++
arch/arm64/tools/cpucaps | 1 +
4 files changed, 77 insertions(+)
diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index e9bbfacc35a6..73b640f9d21b 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1925,6 +1925,24 @@ config ARM64_USE_LSE_ATOMICS
atomic routines. This incurs a small overhead on CPUs that do
not support these instructions.
+config ARM64_NEAR_ATOMICS_OPTIMIZATION
+ bool "Optimization for near atomics implementation"
+ depends on ARM64_USE_LSE_ATOMICS
+ default n
+ help
+ Atomic operations implemented in the CPU (near atomics) can be
+ optimized by prefetching the cacheline prior to the atomic
+ operations. Choose this to compile the optimization into the kernel.
+ You still need to enable the optimization at boot time with the
+ kernel command line option arm64.near_atomics_optimization based on
+ your system's implementation.
+
+ Say Y here to compile the optimization code. Say N if your system's
+ implementation is not near atomics or is unknown. You may refer to
+ https://developer.arm.com/documentation/102714/0100/Atomic-fundamentals
+ for the basic principles of near atomics and far atomics on arm64
+ CHI based systems.
+
endmenu # "ARMv8.1 architectural features"
menu "ARMv8.2 architectural features"
diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
index cb38c2120595..3e5b3fa02d37 100644
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -16,6 +16,10 @@ __lse_atomic_##op(int i, atomic_t *v) \
{ \
asm volatile( \
__LSE_PREAMBLE \
+ ALTERNATIVE( \
+ " nop\n", \
+ " prfm pstl1strm, %[v]\n", \
+ ARM64_HAS_NEAR_ATOM_OPT, CONFIG_ARM64_NEAR_ATOMICS_OPTIMIZATION)\
" " #asm_op " %w[i], %[v]\n" \
: [v] "+Q" (v->counter) \
: [i] "r" (i)); \
@@ -41,6 +45,10 @@ __lse_atomic_fetch_##op##name(int i, atomic_t *v) \
\
asm volatile( \
__LSE_PREAMBLE \
+ ALTERNATIVE( \
+ " nop\n", \
+ " prfm pstl1strm, %[v]\n", \
+ ARM64_HAS_NEAR_ATOM_OPT, CONFIG_ARM64_NEAR_ATOMICS_OPTIMIZATION)\
" " #asm_op #mb " %w[i], %w[old], %[v]" \
: [v] "+Q" (v->counter), \
[old] "=r" (old) \
@@ -123,6 +131,10 @@ __lse_atomic64_##op(s64 i, atomic64_t *v) \
{ \
asm volatile( \
__LSE_PREAMBLE \
+ ALTERNATIVE( \
+ " nop\n", \
+ " prfm pstl1strm, %[v]\n", \
+ ARM64_HAS_NEAR_ATOM_OPT, CONFIG_ARM64_NEAR_ATOMICS_OPTIMIZATION)\
" " #asm_op " %[i], %[v]\n" \
: [v] "+Q" (v->counter) \
: [i] "r" (i)); \
@@ -148,6 +160,10 @@ __lse_atomic64_fetch_##op##name(s64 i, atomic64_t *v) \
\
asm volatile( \
__LSE_PREAMBLE \
+ ALTERNATIVE( \
+ " nop\n", \
+ " prfm pstl1strm, %[v]\n", \
+ ARM64_HAS_NEAR_ATOM_OPT, CONFIG_ARM64_NEAR_ATOMICS_OPTIMIZATION)\
" " #asm_op #mb " %[i], %[old], %[v]" \
: [v] "+Q" (v->counter), \
[old] "=r" (old) \
@@ -230,6 +246,10 @@ static __always_inline s64 __lse_atomic64_dec_if_positive(atomic64_t *v)
asm volatile(
__LSE_PREAMBLE
+ ALTERNATIVE(
+ " nop\n",
+ " prfm pstl1strm, %[v]\n",
+ ARM64_HAS_NEAR_ATOM_OPT, CONFIG_ARM64_NEAR_ATOMICS_OPTIMIZATION)
"1: ldr %x[tmp], %[v]\n"
" subs %[ret], %x[tmp], #1\n"
" b.lt 2f\n"
@@ -253,6 +273,10 @@ __lse__xchg_case_##name##sz(u##sz new, volatile void *ptr) \
\
asm volatile( \
__LSE_PREAMBLE \
+ ALTERNATIVE( \
+ " nop\n", \
+ " prfm pstl1strm, %[v]\n", \
+ ARM64_HAS_NEAR_ATOM_OPT, CONFIG_ARM64_NEAR_ATOMICS_OPTIMIZATION)\
" swp" #mb #sfx "\t%" #w "[new], %" #w "[old], %[v]\n" \
: [old] "=r" (old), \
[v] "+Q" (*(u##sz *)ptr) \
@@ -289,6 +313,10 @@ __lse__cmpxchg_case_##name##sz(volatile void *ptr, \
{ \
asm volatile( \
__LSE_PREAMBLE \
+ ALTERNATIVE( \
+ " nop\n", \
+ " prfm pstl1strm, %[v]\n", \
+ ARM64_HAS_NEAR_ATOM_OPT, CONFIG_ARM64_NEAR_ATOMICS_OPTIMIZATION)\
" cas" #mb #sfx " %" #w "[old], %" #w "[new], %[v]\n" \
: [v] "+Q" (*(u##sz *)ptr), \
[old] "+r" (old) \
@@ -331,6 +359,10 @@ __lse__cmpxchg128##name(volatile u128 *ptr, u128 old, u128 new) \
\
asm volatile( \
__LSE_PREAMBLE \
+ ALTERNATIVE( \
+ " nop\n", \
+ " prfm pstl1strm, %[v]\n", \
+ ARM64_HAS_NEAR_ATOM_OPT, CONFIG_ARM64_NEAR_ATOMICS_OPTIMIZATION)\
" casp" #mb "\t%[old1], %[old2], %[new1], %[new2], %[v]\n"\
: [old1] "+&r" (x0), [old2] "+&r" (x1), \
[v] "+Q" (*(u128 *)ptr) \
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 9ad065f15f1d..021f9bed4671 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -2520,6 +2520,24 @@ test_has_mpam_hcr(const struct arm64_cpu_capabilities *entry, int scope)
return idr & MPAMIDR_EL1_HAS_HCR;
}
+#ifdef CONFIG_ARM64_NEAR_ATOMICS_OPTIMIZATION
+
+static bool near_atom_opt;
+
+static int __init early_near_atom_opt(char *p)
+{
+ return kstrtobool(p, &near_atom_opt);
+}
+early_param("arm64.near_atomics_optimization", early_near_atom_opt);
+
+static bool
+has_near_atom_opt(const struct arm64_cpu_capabilities *entry, int scope)
+{
+ return near_atom_opt;
+}
+
+#endif /* CONFIG_ARM64_NEAR_ATOMICS_OPTIMIZATION */
+
static const struct arm64_cpu_capabilities arm64_features[] = {
{
.capability = ARM64_ALWAYS_BOOT,
@@ -2580,6 +2598,14 @@ static const struct arm64_cpu_capabilities arm64_features[] = {
ARM64_CPUID_FIELDS(ID_AA64ISAR0_EL1, ATOMIC, IMP)
},
#endif /* CONFIG_ARM64_LSE_ATOMICS */
+#ifdef CONFIG_ARM64_NEAR_ATOMICS_OPTIMIZATION
+ {
+ .desc = "Optimization for near atomics implementation",
+ .capability = ARM64_HAS_NEAR_ATOM_OPT,
+ .type = ARM64_CPUCAP_SYSTEM_FEATURE,
+ .matches = has_near_atom_opt,
+ },
+#endif /* CONFIG_ARM64_NEAR_ATOMICS_OPTIMIZATION */
{
.desc = "Virtualization Host Extensions",
.capability = ARM64_HAS_VIRT_HOST_EXTN,
diff --git a/arch/arm64/tools/cpucaps b/arch/arm64/tools/cpucaps
index ef0b7946f5a4..75ffc451940d 100644
--- a/arch/arm64/tools/cpucaps
+++ b/arch/arm64/tools/cpucaps
@@ -44,6 +44,7 @@ HAS_HCX
HAS_LDAPR
HAS_LPA2
HAS_LSE_ATOMICS
HAS_MOPS
+HAS_NEAR_ATOM_OPT
HAS_NESTED_VIRT
HAS_BBML2_NOABORT
--
2.24.0
More information about the linux-arm-kernel
mailing list