[PATCH 07/18] arm64: locks: patch in lse instructions when supported by the CPU
Catalin Marinas
catalin.marinas at arm.com
Tue Jul 21 09:53:39 PDT 2015
On Mon, Jul 13, 2015 at 10:25:08AM +0100, Will Deacon wrote:
> @@ -67,15 +78,25 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
> unsigned int tmp;
> arch_spinlock_t lockval;
>
> - asm volatile(
> -" prfm pstl1strm, %2\n"
> -"1: ldaxr %w0, %2\n"
> -" eor %w1, %w0, %w0, ror #16\n"
> -" cbnz %w1, 2f\n"
> -" add %w0, %w0, %3\n"
> -" stxr %w1, %w0, %2\n"
> -" cbnz %w1, 1b\n"
> -"2:"
> + asm volatile(ARM64_LSE_ATOMIC_INSN(
> + /* LL/SC */
> + " prfm pstl1strm, %2\n"
> + "1: ldaxr %w0, %2\n"
> + " eor %w1, %w0, %w0, ror #16\n"
> + " cbnz %w1, 2f\n"
> + " add %w0, %w0, %3\n"
> + " stxr %w1, %w0, %2\n"
> + " cbnz %w1, 1b\n"
> + "2:",
> + /* LSE atomics */
> + " ldar %w0, %2\n"
Do we still need an acquire if we fail to take the lock? (See the
sketch further down.)
> + " eor %w1, %w0, %w0, ror #16\n"
> + " cbnz %w1, 1f\n"
> + " add %w1, %w0, %3\n"
> + " casa %w0, %w1, %2\n"
> + " and %w1, %w1, #0xffff\n"
> + " eor %w1, %w1, %w0, lsr #16\n"
> + "1:")
> : "=&r" (lockval), "=&r" (tmp), "+Q" (*lock)
> : "I" (1 << TICKET_SHIFT)
> : "memory");
I wonder whether this is any faster with LSE. CAS has to re-load the
lock even though we already have the value loaded from the LDAR
(although the reload will most likely hit in the cache, and we do save
a cbnz that could be mispredicted). I guess we'll re-test once we have
some real hardware.
Does the prfm help in any way with the LDAR?
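FWIW, if it turns out that we only need acquire semantics when we
actually take the lock, the LSE fast path could use a plain LDR and
rely on the CASA for the acquire on success. Just a sketch of the LSE
alternative, untested (it would still need size-matching inside
ARM64_LSE_ATOMIC_INSN, and whether the prfm helps here is the open
question above):

	asm volatile(
	/* LSE atomics: acquire only via CASA on the successful path */
	"	prfm	pstl1strm, %2\n"		/* does this help here? */
	"	ldr	%w0, %2\n"			/* relaxed load of owner/next */
	"	eor	%w1, %w0, %w0, ror #16\n"
	"	cbnz	%w1, 1f\n"			/* owner != next: locked, fail */
	"	add	%w1, %w0, %3\n"			/* bump next */
	"	casa	%w0, %w1, %2\n"			/* acquire only if we win */
	"	and	%w1, %w1, #0xffff\n"
	"	eor	%w1, %w1, %w0, lsr #16\n"
	"1:"
	: "=&r" (lockval), "=&r" (tmp), "+Q" (*lock)
	: "I" (1 << TICKET_SHIFT)
	: "memory");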
> @@ -85,10 +106,19 @@ static inline int arch_spin_trylock(arch_spinlock_t *lock)
>
> static inline void arch_spin_unlock(arch_spinlock_t *lock)
> {
> - asm volatile(
> -" stlrh %w1, %0\n"
> - : "=Q" (lock->owner)
> - : "r" (lock->owner + 1)
> + unsigned long tmp;
> +
> + asm volatile(ARM64_LSE_ATOMIC_INSN(
> + /* LL/SC */
> + " ldr %w1, %0\n"
> + " add %w1, %w1, #1\n"
> + " stlrh %w1, %0",
> + /* LSE atomics */
> + " mov %w1, #1\n"
> + " nop\n"
> + " staddlh %w1, %0")
> + : "=Q" (lock->owner), "=&r" (tmp)
> + :
> : "memory");
> }
>
> @@ -125,11 +155,19 @@ static inline void arch_write_lock(arch_rwlock_t *rw)
>
> asm volatile(
> " sevl\n"
> + ARM64_LSE_ATOMIC_INSN(
> + /* LL/SC */
> "1: wfe\n"
> "2: ldaxr %w0, %1\n"
> " cbnz %w0, 1b\n"
> " stxr %w0, %w2, %1\n"
> - " cbnz %w0, 2b\n"
> + " cbnz %w0, 2b",
> + /* LSE atomics */
> + "1: wfe\n"
> + " mov %w0, wzr\n"
> + " casa %w0, %w2, %1\n"
> + " nop\n"
> + " cbnz %w0, 1b")
> : "=&r" (tmp), "+Q" (rw->lock)
> : "r" (0x80000000)
> : "memory");
With WFE in the LL/SC case, we rely on the LDAXR to set the exclusive
monitor, and an event is generated every time it gets cleared. With
CAS we no longer have this behaviour, so what guarantees an event to
wake up the WFE?
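Not necessarily something for this patch, but to illustrate the
concern: one way (untested) to keep the monitor-based wake-up would be
to keep an exclusive load in the wait loop purely to arm the monitor,
e.g.:

	asm volatile(
	"	sevl\n"
	"1:	wfe\n"
	"2:	ldaxr	%w0, %1\n"	/* also (re)arms the exclusive monitor */
	"	cbnz	%w0, 1b\n"	/* still held: sleep until it is cleared */
	"	casa	%w0, %w2, %1\n"	/* %w0 is zero here, so compare against 0 */
	"	cbnz	%w0, 2b"	/* lost the race: go round again */
	: "=&r" (tmp), "+Q" (rw->lock)
	: "r" (0x80000000)
	: "memory");

The LDAXR has no paired STXR; it is only there so that the store from
the unlocker clears the monitor and generates the wake-up event.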
> @@ -139,11 +177,16 @@ static inline int arch_write_trylock(arch_rwlock_t *rw)
> {
> unsigned int tmp;
>
> - asm volatile(
> + asm volatile(ARM64_LSE_ATOMIC_INSN(
> + /* LL/SC */
> " ldaxr %w0, %1\n"
> " cbnz %w0, 1f\n"
> " stxr %w0, %w2, %1\n"
Not a comment on your patch but on the original code: don't we need to
branch back to the ldaxr if the stxr fails, as we do in
arch_spin_trylock? The same comment applies to arch_read_trylock.
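Something like this for the LL/SC side (untested), with the return
value unchanged:

	asm volatile(
	/* LL/SC */
	"1:	ldaxr	%w0, %1\n"
	"	cbnz	%w0, 2f\n"	/* already locked: fail */
	"	stxr	%w0, %w2, %1\n"
	"	cbnz	%w0, 1b\n"	/* lost the exclusive: retry rather than fail */
	"2:"
	: "=&r" (tmp), "+Q" (rw->lock)
	: "r" (0x80000000)
	: "memory");

(The LSE alternative would then need an extra nop or two to keep the
two sequences the same size.)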
> - "1:\n"
> + "1:",
> + /* LSE atomics */
> + " mov %w0, wzr\n"
> + " casa %w0, %w2, %1\n"
> + " nop")
> : "=&r" (tmp), "+Q" (rw->lock)
> : "r" (0x80000000)
> : "memory");
> @@ -153,9 +196,10 @@ static inline int arch_write_trylock(arch_rwlock_t *rw)
>
> static inline void arch_write_unlock(arch_rwlock_t *rw)
> {
> - asm volatile(
> - " stlr %w1, %0\n"
> - : "=Q" (rw->lock) : "r" (0) : "memory");
> + asm volatile(ARM64_LSE_ATOMIC_INSN(
> + " stlr wzr, %0",
> + " swpl wzr, wzr, %0")
> + : "=Q" (rw->lock) :: "memory");
Is this any better than just STLR? We don't need the memory read that
the SWP implies.
> @@ -172,6 +216,10 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
> *
> * The memory barriers are implicit with the load-acquire and store-release
> * instructions.
> + *
> + * Note that in UNDEFINED cases, such as unlocking a lock twice, the LL/SC
> + * and LSE implementations may exhibit different behaviour (although this
> + * will have no effect on lockdep).
> */
> static inline void arch_read_lock(arch_rwlock_t *rw)
> {
> @@ -179,26 +227,43 @@ static inline void arch_read_lock(arch_rwlock_t *rw)
>
> asm volatile(
> " sevl\n"
> + ARM64_LSE_ATOMIC_INSN(
> + /* LL/SC */
> "1: wfe\n"
> "2: ldaxr %w0, %2\n"
> " add %w0, %w0, #1\n"
> " tbnz %w0, #31, 1b\n"
> " stxr %w1, %w0, %2\n"
> - " cbnz %w1, 2b\n"
> + " nop\n"
> + " cbnz %w1, 2b",
> + /* LSE atomics */
> + "1: wfe\n"
> + "2: ldr %w0, %2\n"
> + " adds %w1, %w0, #1\n"
> + " tbnz %w1, #31, 1b\n"
> + " casa %w0, %w1, %2\n"
> + " sbc %w0, %w1, %w0\n"
> + " cbnz %w0, 2b")
> : "=&r" (tmp), "=&r" (tmp2), "+Q" (rw->lock)
> :
> - : "memory");
> + : "cc", "memory");
Same comment here about WFE.
--
Catalin