[PATCH 4/5] arm64: atomics: lse: improve constraints for simple ops
Mark Rutland
mark.rutland at arm.com
Fri Dec 10 07:14:09 PST 2021
We have overly conservative assembly constraints for the basic FEAT_LSE
atomic instructions, and using more accurate and permissive constraints
will allow for better code generation.
The FEAT_LSE basic atomic instructions have come in two forms:
LD{op}{order}{size} <Rs>, <Rt>, [<Rn>]
ST{op}{order}{size} <Rs>, [<Rn>]
The ST* forms are aliases of the LD* forms where:
ST{op}{order}{size} <Rs>, [<Rn>]
Is:
LD{op}{order}{size} <Rs>, XZR, [<Rn>]
For either form, both <Rs> and <Rn> are read but not written back to,
and <Rt> is written with the original value of the memory location.
Where (<Rt> == <Rs>) or (<Rt> == <Rn>), <Rt> is written *after* the
other register value(s) are consumed. There are no UNPREDICTABLE or
CONSTRAINED UNPREDICTABLE behaviours when any pair of <Rs>, <Rt>, or
<Rn> are the same register.
Our current inline assembly always uses <Rs> == <Rt>, treating this
register as both an input and an output (using a '+r' constraint). This
forces the compiler to do some unnecessary register shuffling and/or
redundant value generation.
For example, the compiler cannot reuse the <Rs> value, and currently GCC
11.1.0 will compile:
__lse_atomic_add(1, a);
__lse_atomic_add(1, b);
__lse_atomic_add(1, c);
As:
mov w3, #0x1
mov w4, w3
stadd w4, [x0]
mov w0, w3
stadd w0, [x1]
stadd w3, [x2]
We can improve this with more accurate constraints, separating <Rs> and
<Rt>, where <Rs> is an input-only register ('r'), and <Rt> is an
output-only value ('=r'). As <Rt> is written back after <Rs> is
consumed, it does not need to be earlyclobber ('=&r'), leaving the
compiler free to use the same register for both <Rs> and <Rt> where this
is desirable.
At the same time, the redundant 'r' constraint for `v` is removed, as
the `+Q` constraint is sufficient.
With this change, the above example becomes:
mov w3, #0x1
stadd w3, [x0]
stadd w3, [x1]
stadd w3, [x2]
I've made this change for the non-value-returning and FETCH ops. The
RETURN ops have a multi-instruction sequence for which we cannot use the
same constraints, and a subsequent patch will rewrite hte RETURN ops in
terms of the FETCH ops, relying on the ability for the compiler to reuse
the <Rs> value.
This is intended as an optimization.
There should be no functional change as a result of this patch.
Signed-off-by: Mark Rutland <mark.rutland at arm.com>
Cc: Boqun Feng <boqun.feng at gmail.com>
Cc: Catalin Marinas <catalin.marinas at arm.com>
Cc: Peter Zijlstra <peterz at infradead.org>
Cc: Will Deacon <will at kernel.org>
---
arch/arm64/include/asm/atomic_lse.h | 30 +++++++++++++++++------------
1 file changed, 18 insertions(+), 12 deletions(-)
diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
index d707eafb7677..e4c5c4c34ce6 100644
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -16,8 +16,8 @@ static inline void __lse_atomic_##op(int i, atomic_t *v) \
asm volatile( \
__LSE_PREAMBLE \
" " #asm_op " %w[i], %[v]\n" \
- : [i] "+r" (i), [v] "+Q" (v->counter) \
- : "r" (v)); \
+ : [v] "+Q" (v->counter) \
+ : [i] "r" (i)); \
}
ATOMIC_OP(andnot, stclr)
@@ -35,14 +35,17 @@ static inline void __lse_atomic_sub(int i, atomic_t *v)
#define ATOMIC_FETCH_OP(name, mb, op, asm_op, cl...) \
static inline int __lse_atomic_fetch_##op##name(int i, atomic_t *v) \
{ \
+ int old; \
+ \
asm volatile( \
__LSE_PREAMBLE \
- " " #asm_op #mb " %w[i], %w[i], %[v]" \
- : [i] "+r" (i), [v] "+Q" (v->counter) \
- : "r" (v) \
+ " " #asm_op #mb " %w[i], %w[old], %[v]" \
+ : [v] "+Q" (v->counter), \
+ [old] "=r" (old) \
+ : [i] "r" (i) \
: cl); \
\
- return i; \
+ return old; \
}
#define ATOMIC_FETCH_OPS(op, asm_op) \
@@ -124,8 +127,8 @@ static inline void __lse_atomic64_##op(s64 i, atomic64_t *v) \
asm volatile( \
__LSE_PREAMBLE \
" " #asm_op " %[i], %[v]\n" \
- : [i] "+r" (i), [v] "+Q" (v->counter) \
- : "r" (v)); \
+ : [v] "+Q" (v->counter) \
+ : [i] "r" (i)); \
}
ATOMIC64_OP(andnot, stclr)
@@ -143,14 +146,17 @@ static inline void __lse_atomic64_sub(s64 i, atomic64_t *v)
#define ATOMIC64_FETCH_OP(name, mb, op, asm_op, cl...) \
static inline long __lse_atomic64_fetch_##op##name(s64 i, atomic64_t *v)\
{ \
+ s64 old; \
+ \
asm volatile( \
__LSE_PREAMBLE \
- " " #asm_op #mb " %[i], %[i], %[v]" \
- : [i] "+r" (i), [v] "+Q" (v->counter) \
- : "r" (v) \
+ " " #asm_op #mb " %[i], %[old], %[v]" \
+ : [v] "+Q" (v->counter), \
+ [old] "=r" (old) \
+ : [i] "r" (i) \
: cl); \
\
- return i; \
+ return old; \
}
#define ATOMIC64_FETCH_OPS(op, asm_op) \
--
2.30.2
More information about the linux-arm-kernel
mailing list