[PATCH V4 5/5] riscv: atomic: Optimize LRSC-pairs atomic ops with .aqrl annotation

Palmer Dabbelt palmer at dabbelt.com
Sat May 21 13:46:39 PDT 2022


On Wed, 04 May 2022 20:55:26 PDT (-0700), guoren at kernel.org wrote:
> From: Guo Ren <guoren at linux.alibaba.com>
>
> The current implementation is the same as 8e86f0b409a4
> ("arm64: atomics: fix use of acquire + release for full barrier
> semantics"). RISC-V can fold the acquire and release semantics into
> the SC instruction, which eliminates a fence instruction and gives
> better performance. Here is the related description from RISC-V ISA
> section 10.2, Load-Reserved/Store-Conditional Instructions:
>
>  - .aq:   The LR/SC sequence can be given acquire semantics by
>           setting the aq bit on the LR instruction.
>  - .rl:   The LR/SC sequence can be given release semantics by
>           setting the rl bit on the SC instruction.
>  - .aqrl: Setting the aq bit on the LR instruction, and setting
>           both the aq and the rl bit on the SC instruction makes
>           the LR/SC sequence sequentially consistent, meaning that
>           it cannot be reordered with earlier or later memory
>           operations from the same hart.
>
>  Software should not set the rl bit on an LR instruction unless
>  the aq bit is also set, nor should software set the aq bit on an
>  SC instruction unless the rl bit is also set. LR.rl and SC.aq
>  instructions are not guaranteed to provide any stronger ordering
>  than those with both bits clear, but may result in lower
>  performance.
>
> The only difference is that when sc.w/d.aqrl fails, it now has an
> additional .aq effect compared to before. But that is okay
> semantically, because an LR to an overlapping address cannot be
> reordered across the related SC.

IIUC that's not accurate, or at least wasn't in 2018.  The ISA tends to 
drift around a bit, so it's possible things have changed since then.  
5ce6c1f3535f ("riscv/atomic: Strengthen implementations with fences") 
describes the issue more specifically; that's when we added these 
fences.  There have certainly been complaints that these fences are too 
heavyweight for the HW to go fast, but IIUC it's the best option we have 
given the current set of memory model primitives we implement in the 
ISA (ie, there's more in RVWMO, but no way to encode that).
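
For reference, here is roughly what the two codings under discussion
look like when written out as standalone functions.  This is only a
sketch modeled on the cmpxchg.h hunk quoted below; the function names
are made up for illustration and are not kernel interfaces:

    /* Current mainline coding: release on the SC plus a trailing full fence. */
    static inline unsigned int
    riscv_cmpxchg32_fence(unsigned int *ptr, unsigned int old, unsigned int new)
    {
        unsigned int ret, rc;

        __asm__ __volatile__ (
            "0:	lr.w %0, %2\n"
            "	bne  %0, %z3, 1f\n"
            "	sc.w.rl %1, %z4, %2\n"
            "	bnez %1, 0b\n"
            "	fence rw, rw\n"
            "1:\n"
            : "=&r" (ret), "=&r" (rc), "+A" (*ptr)
            : "rJ" ((long)old), "rJ" (new)
            : "memory");
        return ret;
    }

    /* Proposed coding from this patch: fold the ordering into sc.w.aqrl. */
    static inline unsigned int
    riscv_cmpxchg32_aqrl(unsigned int *ptr, unsigned int old, unsigned int new)
    {
        unsigned int ret, rc;

        __asm__ __volatile__ (
            "0:	lr.w %0, %2\n"
            "	bne  %0, %z3, 1f\n"
            "	sc.w.aqrl %1, %z4, %2\n"
            "	bnez %1, 0b\n"
            "1:\n"
            : "=&r" (ret), "=&r" (rc), "+A" (*ptr)
            : "rJ" ((long)old), "rJ" (new)
            : "memory");
        return ret;
    }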

The others all look good, though, and as these are really all 
independent cleanups I'm going to go ahead and put those three on 
for-next.

There's also a bunch of checkpatch errors.  The ones about "*" seem 
spurious, but the alignment ones aren't.  Here are my fixups:

    diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h
    index 34f757dfc8f2..0bde499fa8bc 100644
    --- a/arch/riscv/include/asm/atomic.h
    +++ b/arch/riscv/include/asm/atomic.h
    @@ -86,9 +86,9 @@ ATOMIC_OPS(xor, xor,  i)
      * versions, while the logical ops only have fetch versions.
      */
     #define ATOMIC_FETCH_OP(op, asm_op, I, asm_type, c_type, prefix)	\
    -static __always_inline							\
    -c_type arch_atomic##prefix##_fetch_##op##_relaxed(c_type i,		\
    -					     atomic##prefix##_t *v)	\
    +static __always_inline c_type						\
    +arch_atomic##prefix##_fetch_##op##_relaxed(c_type i,			\
    +					   atomic##prefix##_t *v)	\
     {									\
     	register c_type ret;						\
     	__asm__ __volatile__ (						\
    @@ -98,9 +98,9 @@ c_type arch_atomic##prefix##_fetch_##op##_relaxed(c_type i,		\
     		: "memory");						\
     	return ret;							\
     }									\
    -static __always_inline							\
    -c_type arch_atomic##prefix##_fetch_##op##_acquire(c_type i,		\
    -					     atomic##prefix##_t *v)	\
    +static __always_inline c_type						\
    +arch_atomic##prefix##_fetch_##op##_acquire(c_type i,			\
    +					   atomic##prefix##_t *v)	\
     {									\
     	register c_type ret;						\
     	__asm__ __volatile__ (						\
    @@ -110,9 +110,9 @@ c_type arch_atomic##prefix##_fetch_##op##_acquire(c_type i,		\
     		: "memory");						\
     	return ret;							\
     }									\
    -static __always_inline							\
    -c_type arch_atomic##prefix##_fetch_##op##_release(c_type i,		\
    -					     atomic##prefix##_t *v)	\
    +static __always_inline c_type						\
    +arch_atomic##prefix##_fetch_##op##_release(c_type i,			\
    +					   atomic##prefix##_t *v)	\
     {									\
     	register c_type ret;						\
     	__asm__ __volatile__ (						\
    @@ -122,8 +122,8 @@ c_type arch_atomic##prefix##_fetch_##op##_release(c_type i,		\
     		: "memory");						\
     	return ret;							\
     }									\
    -static __always_inline							\
    -c_type arch_atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v)	\
    +static __always_inline c_type						\
    +arch_atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v)	\
     {									\
     	register c_type ret;						\
     	__asm__ __volatile__ (						\
    @@ -135,28 +135,28 @@ c_type arch_atomic##prefix##_fetch_##op(c_type i, atomic##prefix##_t *v)	\
     }
    
     #define ATOMIC_OP_RETURN(op, asm_op, c_op, I, asm_type, c_type, prefix)	\
    -static __always_inline							\
    -c_type arch_atomic##prefix##_##op##_return_relaxed(c_type i,		\
    -					      atomic##prefix##_t *v)	\
    +static __always_inline c_type						\
    +arch_atomic##prefix##_##op##_return_relaxed(c_type i,			\
    +					    atomic##prefix##_t *v)	\
     {									\
    -        return arch_atomic##prefix##_fetch_##op##_relaxed(i, v) c_op I;	\
    +	return arch_atomic##prefix##_fetch_##op##_relaxed(i, v) c_op I;	\
     }									\
    -static __always_inline							\
    -c_type arch_atomic##prefix##_##op##_return_acquire(c_type i,		\
    -					      atomic##prefix##_t *v)	\
    +static __always_inline c_type						\
    +arch_atomic##prefix##_##op##_return_acquire(c_type i,			\
    +					    atomic##prefix##_t *v)	\
     {									\
    -        return arch_atomic##prefix##_fetch_##op##_acquire(i, v) c_op I;	\
    +	return arch_atomic##prefix##_fetch_##op##_acquire(i, v) c_op I;	\
     }									\
    -static __always_inline							\
    -c_type arch_atomic##prefix##_##op##_return_release(c_type i,		\
    -					      atomic##prefix##_t *v)	\
    +static __always_inline c_type						\
    +arch_atomic##prefix##_##op##_return_release(c_type i,			\
    +					    atomic##prefix##_t *v)	\
     {									\
    -        return arch_atomic##prefix##_fetch_##op##_release(i, v) c_op I;	\
    +	return arch_atomic##prefix##_fetch_##op##_release(i, v) c_op I;	\
     }									\
    -static __always_inline							\
    -c_type arch_atomic##prefix##_##op##_return(c_type i, atomic##prefix##_t *v)	\
    +static __always_inline c_type						\
    +arch_atomic##prefix##_##op##_return(c_type i, atomic##prefix##_t *v)	\
     {									\
    -        return arch_atomic##prefix##_fetch_##op(i, v) c_op I;		\
    +	return arch_atomic##prefix##_fetch_##op(i, v) c_op I;		\
     }
    
     #ifdef CONFIG_GENERIC_ATOMIC64

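For context, here is roughly what one instantiation of the reflowed
macros expands to, hand-expanded for the add/word case.  This is only a
sketch: the amoadd.w asm template is paraphrased from the header rather
than taken from the hunks above.

    static __always_inline int
    arch_atomic_fetch_add_relaxed(int i, atomic_t *v)
    {
        register int ret;

        /* AMO form used by the relaxed fetch variant. */
        __asm__ __volatile__ (
            "	amoadd.w %1, %2, %0"
            : "+A" (v->counter), "=r" (ret)
            : "r" (i)
            : "memory");
        return ret;
    }

    static __always_inline int
    arch_atomic_add_return_relaxed(int i, atomic_t *v)
    {
        /* The _return form is just the fetch form plus the applied operation. */
        return arch_atomic_fetch_add_relaxed(i, v) + i;
    }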

>
> Signed-off-by: Guo Ren <guoren at linux.alibaba.com>
> Signed-off-by: Guo Ren <guoren at kernel.org>
> Cc: Palmer Dabbelt <palmer at dabbelt.com>
> Cc: Mark Rutland <mark.rutland at arm.com>
> Cc: Dan Lustig <dlustig at nvidia.com>
> Cc: Andrea Parri <parri.andrea at gmail.com>
> ---
>  arch/riscv/include/asm/atomic.h  | 24 ++++++++----------------
>  arch/riscv/include/asm/cmpxchg.h |  6 ++----
>  2 files changed, 10 insertions(+), 20 deletions(-)
>
> diff --git a/arch/riscv/include/asm/atomic.h b/arch/riscv/include/asm/atomic.h
> index 34f757dfc8f2..aef8aa9ac4f4 100644
> --- a/arch/riscv/include/asm/atomic.h
> +++ b/arch/riscv/include/asm/atomic.h
> @@ -269,9 +269,8 @@ static __always_inline int arch_atomic_fetch_add_unless(atomic_t *v, int a, int
>  		"0:	lr.w     %[p],  %[c]\n"
>  		"	beq      %[p],  %[u], 1f\n"
>  		"	add      %[rc], %[p], %[a]\n"
> -		"	sc.w.rl  %[rc], %[rc], %[c]\n"
> +		"	sc.w.aqrl  %[rc], %[rc], %[c]\n"
>  		"	bnez     %[rc], 0b\n"
> -		"	fence    rw, rw\n"
>  		"1:\n"
>  		: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
>  		: [a]"r" (a), [u]"r" (u)
> @@ -290,9 +289,8 @@ static __always_inline s64 arch_atomic64_fetch_add_unless(atomic64_t *v, s64 a,
>  		"0:	lr.d     %[p],  %[c]\n"
>  		"	beq      %[p],  %[u], 1f\n"
>  		"	add      %[rc], %[p], %[a]\n"
> -		"	sc.d.rl  %[rc], %[rc], %[c]\n"
> +		"	sc.d.aqrl  %[rc], %[rc], %[c]\n"
>  		"	bnez     %[rc], 0b\n"
> -		"	fence    rw, rw\n"
>  		"1:\n"
>  		: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
>  		: [a]"r" (a), [u]"r" (u)
> @@ -382,9 +380,8 @@ static __always_inline bool arch_atomic_inc_unless_negative(atomic_t *v)
>  		"0:	lr.w      %[p],  %[c]\n"
>  		"	bltz      %[p],  1f\n"
>  		"	addi      %[rc], %[p], 1\n"
> -		"	sc.w.rl   %[rc], %[rc], %[c]\n"
> +		"	sc.w.aqrl %[rc], %[rc], %[c]\n"
>  		"	bnez      %[rc], 0b\n"
> -		"	fence     rw, rw\n"
>  		"1:\n"
>  		: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
>  		:
> @@ -402,9 +399,8 @@ static __always_inline bool arch_atomic_dec_unless_positive(atomic_t *v)
>  		"0:	lr.w      %[p],  %[c]\n"
>  		"	bgtz      %[p],  1f\n"
>  		"	addi      %[rc], %[p], -1\n"
> -		"	sc.w.rl   %[rc], %[rc], %[c]\n"
> +		"	sc.w.aqrl %[rc], %[rc], %[c]\n"
>  		"	bnez      %[rc], 0b\n"
> -		"	fence     rw, rw\n"
>  		"1:\n"
>  		: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
>  		:
> @@ -422,9 +418,8 @@ static __always_inline int arch_atomic_dec_if_positive(atomic_t *v)
>  		"0:	lr.w     %[p],  %[c]\n"
>  		"	addi     %[rc], %[p], -1\n"
>  		"	bltz     %[rc], 1f\n"
> -		"	sc.w.rl  %[rc], %[rc], %[c]\n"
> +		"	sc.w.aqrl %[rc], %[rc], %[c]\n"
>  		"	bnez     %[rc], 0b\n"
> -		"	fence    rw, rw\n"
>  		"1:\n"
>  		: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
>  		:
> @@ -444,9 +439,8 @@ static __always_inline bool arch_atomic64_inc_unless_negative(atomic64_t *v)
>  		"0:	lr.d      %[p],  %[c]\n"
>  		"	bltz      %[p],  1f\n"
>  		"	addi      %[rc], %[p], 1\n"
> -		"	sc.d.rl   %[rc], %[rc], %[c]\n"
> +		"	sc.d.aqrl %[rc], %[rc], %[c]\n"
>  		"	bnez      %[rc], 0b\n"
> -		"	fence     rw, rw\n"
>  		"1:\n"
>  		: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
>  		:
> @@ -465,9 +459,8 @@ static __always_inline bool arch_atomic64_dec_unless_positive(atomic64_t *v)
>  		"0:	lr.d      %[p],  %[c]\n"
>  		"	bgtz      %[p],  1f\n"
>  		"	addi      %[rc], %[p], -1\n"
> -		"	sc.d.rl   %[rc], %[rc], %[c]\n"
> +		"	sc.d.aqrl %[rc], %[rc], %[c]\n"
>  		"	bnez      %[rc], 0b\n"
> -		"	fence     rw, rw\n"
>  		"1:\n"
>  		: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
>  		:
> @@ -486,9 +479,8 @@ static __always_inline s64 arch_atomic64_dec_if_positive(atomic64_t *v)
>  		"0:	lr.d     %[p],  %[c]\n"
>  		"	addi      %[rc], %[p], -1\n"
>  		"	bltz     %[rc], 1f\n"
> -		"	sc.d.rl  %[rc], %[rc], %[c]\n"
> +		"	sc.d.aqrl %[rc], %[rc], %[c]\n"
>  		"	bnez     %[rc], 0b\n"
> -		"	fence    rw, rw\n"
>  		"1:\n"
>  		: [p]"=&r" (prev), [rc]"=&r" (rc), [c]"+A" (v->counter)
>  		:
> diff --git a/arch/riscv/include/asm/cmpxchg.h b/arch/riscv/include/asm/cmpxchg.h
> index 1af8db92250b..9269fceb86e0 100644
> --- a/arch/riscv/include/asm/cmpxchg.h
> +++ b/arch/riscv/include/asm/cmpxchg.h
> @@ -307,9 +307,8 @@
>  		__asm__ __volatile__ (					\
>  			"0:	lr.w %0, %2\n"				\
>  			"	bne  %0, %z3, 1f\n"			\
> -			"	sc.w.rl %1, %z4, %2\n"			\
> +			"	sc.w.aqrl %1, %z4, %2\n"		\
>  			"	bnez %1, 0b\n"				\
> -			"	fence rw, rw\n"				\
>  			"1:\n"						\
>  			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
>  			: "rJ" ((long)__old), "rJ" (__new)		\
> @@ -319,9 +318,8 @@
>  		__asm__ __volatile__ (					\
>  			"0:	lr.d %0, %2\n"				\
>  			"	bne %0, %z3, 1f\n"			\
> -			"	sc.d.rl %1, %z4, %2\n"			\
> +			"	sc.d.aqrl %1, %z4, %2\n"		\
>  			"	bnez %1, 0b\n"				\
> -			"	fence rw, rw\n"				\
>  			"1:\n"						\
>  			: "=&r" (__ret), "=&r" (__rc), "+A" (*__ptr)	\
>  			: "rJ" (__old), "rJ" (__new)			\


