[PATCH v3 11/13] ARM: cacheflush: avoid clobbering the frame pointer
Nick Desaulniers
ndesaulniers at google.com
Mon Feb 7 11:12:26 PST 2022
On Thu, Feb 3, 2022 at 12:22 AM Ard Biesheuvel <ardb at kernel.org> wrote:
>
> Thumb2 uses R7 rather than R11 as the frame pointer, and even if we
> rarely use a frame pointer to begin with when building in Thumb2 mode,
> there are cases where it is required by the compiler (Clang when
> inserting profiling hooks via -pg).
>
> However, preserving and restoring the frame pointer is risky, as any
> unhandled exceptions raised in the meantime will produce a bogus
> backtrace, and it would be better not to touch the frame pointer at all.
> This is the case even when CONFIG_FRAME_POINTER is not set, as the
> unwind directive used by the unwinder may also use R7 or R11 as the
> unwind anchor, even if the frame pointer is not managed strictly
> according to the frame pointer ABI.
>
> So let's tweak the cacheflush asm code not to clobber R7 or R11 at all,
> so that we can drop R7 from the clobber lists of the inline asm blocks
> that call these routines, and remove the code that preserves/restores
> R11.
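
It might be worth spelling out in the commit message what goes wrong for
callers today. As a toy illustration (mine, not from the patch; the body
is cut down from what v7_exit_coherency_flush() expands to): with
frame pointers or unwind info anchored on r7/r11, a Thumb-2 caller can't
simply name r7 in the clobber list, which is why the current code
saves/restores fp around the sequence instead:

    /*
     * Toy caller, names and reduced body are illustrative only. With
     * -mthumb and CONFIG_FRAME_POINTER=y, r7 is the frame pointer, and
     * listing it below is the incompatible part the old comment in
     * cacheflush.h refers to; this patch sidesteps the problem by not
     * touching r7/r11 in the callee at all, so the clobber isn't needed.
     */
    static inline void toy_exit_coherency_flush(void)
    {
            asm volatile(
                    "bl     v7_flush_dcache_all             \n\t"
                    : : : "r0", "r1", "r2", "r3", "r4", "r5", "r6",
                          "r7",   /* Thumb-2 frame pointer: can't go here */
                          "r9", "r10", "lr", "memory");
    }
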
>
> Signed-off-by: Ard Biesheuvel <ardb at kernel.org>
> ---
> arch/arm/include/asm/cacheflush.h | 12 ++----
> arch/arm/mach-exynos/mcpm-exynos.c | 6 +--
> arch/arm/mm/cache-v7.S | 40 +++++++++-----------
> 3 files changed, 23 insertions(+), 35 deletions(-)
>
> diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
> index e68fb879e4f9..d27782331556 100644
> --- a/arch/arm/include/asm/cacheflush.h
> +++ b/arch/arm/include/asm/cacheflush.h
> @@ -446,15 +446,10 @@ static inline void __sync_cache_range_r(volatile void *p, size_t size)
> * however some exceptions may exist. Caveat emptor.
> *
> * - The clobber list is dictated by the call to v7_flush_dcache_*.
> - * fp is preserved to the stack explicitly prior disabling the cache
> - * since adding it to the clobber list is incompatible with having
> - * CONFIG_FRAME_POINTER=y. ip is saved as well if ever r12-clobbering
> - * trampoline are inserted by the linker and to keep sp 64-bit aligned.
> */
> #define v7_exit_coherency_flush(level) \
> asm volatile( \
> ".arch armv7-a \n\t" \
> - "stmfd sp!, {fp, ip} \n\t" \
> "mrc p15, 0, r0, c1, c0, 0 @ get SCTLR \n\t" \
> "bic r0, r0, #"__stringify(CR_C)" \n\t" \
> "mcr p15, 0, r0, c1, c0, 0 @ set SCTLR \n\t" \
> @@ -464,10 +459,9 @@ static inline void __sync_cache_range_r(volatile void *p, size_t size)
> "bic r0, r0, #(1 << 6) @ disable local coherency \n\t" \
> "mcr p15, 0, r0, c1, c0, 1 @ set ACTLR \n\t" \
> "isb \n\t" \
> - "dsb \n\t" \
> - "ldmfd sp!, {fp, ip}" \
> - : : : "r0","r1","r2","r3","r4","r5","r6","r7", \
> - "r9","r10","lr","memory" )
> + "dsb" \
> + : : : "r0","r1","r2","r3","r4","r5","r6", \
> + "r9","r10","ip","lr","memory" )
>
> void flush_uprobe_xol_access(struct page *page, unsigned long uaddr,
> void *kaddr, unsigned long len);
> diff --git a/arch/arm/mach-exynos/mcpm-exynos.c b/arch/arm/mach-exynos/mcpm-exynos.c
> index cd861c57d5ad..fd0dbeb93357 100644
> --- a/arch/arm/mach-exynos/mcpm-exynos.c
> +++ b/arch/arm/mach-exynos/mcpm-exynos.c
> @@ -35,7 +35,6 @@ static bool secure_firmware __ro_after_init;
> */
> #define exynos_v7_exit_coherency_flush(level) \
> asm volatile( \
> - "stmfd sp!, {fp, ip}\n\t"\
> "mrc p15, 0, r0, c1, c0, 0 @ get SCTLR\n\t" \
> "bic r0, r0, #"__stringify(CR_C)"\n\t" \
> "mcr p15, 0, r0, c1, c0, 0 @ set SCTLR\n\t" \
> @@ -50,11 +49,10 @@ static bool secure_firmware __ro_after_init;
> "mcr p15, 0, r0, c1, c0, 1 @ set ACTLR\n\t" \
> "isb\n\t" \
> "dsb\n\t" \
> - "ldmfd sp!, {fp, ip}" \
> : \
> : "Ir" (pmu_base_addr + S5P_INFORM0) \
> - : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", \
> - "r9", "r10", "lr", "memory")
> + : "r0", "r1", "r2", "r3", "r4", "r5", "r6", \
> + "r9", "r10", "ip", "lr", "memory")
>
> static int exynos_cpu_powerup(unsigned int cpu, unsigned int cluster)
> {
> diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
> index 830bbfb26ca5..b9f55f0bc278 100644
> --- a/arch/arm/mm/cache-v7.S
> +++ b/arch/arm/mm/cache-v7.S
> @@ -90,7 +90,7 @@ ENDPROC(v7_flush_icache_all)
> *
> * Flush the D-cache up to the Level of Unification Inner Shareable
> *
> - * Corrupted registers: r0-r7, r9-r11 (r6 only in Thumb mode)
> + * Corrupted registers: r0-r6, r9-r10
> */
>
> ENTRY(v7_flush_dcache_louis)
> @@ -117,7 +117,7 @@ ENDPROC(v7_flush_dcache_louis)
> *
> * Flush the whole D-cache.
> *
> - * Corrupted registers: r0-r7, r9-r11 (r6 only in Thumb mode)
> + * Corrupted registers: r0-r6, r9-r10
> *
> * - mm - mm_struct describing address space
> */
> @@ -149,22 +149,22 @@ flush_levels:
> movw r4, #0x3ff
> ands r4, r4, r1, lsr #3 @ find maximum number on the way size
> clz r5, r4 @ find bit position of way size increment
> - movw r7, #0x7fff
> - ands r7, r7, r1, lsr #13 @ extract max number of the index size
> + movw r6, #0x7fff
> + and r1, r6, r1, lsr #13 @ extract max number of the index size
> + mov r6, #1
> + movne r4, r4, lsl r5 @ # of ways shifted into bits [31:...]
> + movne r6, r6, lsl r5 @ 1 shifted left by same amount
This group of changes was definitely trickier to review. Can you provide
more color on why the conditional `mov`s and the change in the loop
back-edge are needed? (I appreciate you explaining some of it on IRC; it
would be helpful to other reviewers to have that commentary on the
list.) The rest LGTM.
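
For what it's worth, here is my rough reading of the restructured loop
in C; the function and parameter names are mine, dccisw() is a stand-in
for the mcr to c7, c14, 2, and the register mapping in the comments is
my guess. Please correct me if I've misread it:

    /* Reviewer's sketch of the new flush_levels inner loops, not from
     * the patch. The way number is kept pre-shifted into its field and
     * r6 holds "one way" in that position, so the back-edge can step
     * the way down with a single subs/bcs instead of re-shifting it
     * on every iteration. */
    static void dccisw(unsigned int setway)
    {
            /* stands in for: mcr p15, 0, setway, c7, c14, 2 */
    }

    static void flush_level_sketch(unsigned int max_way,   /* r4 before the shift */
                                   unsigned int way_shift, /* r5 = clz(max_way)   */
                                   unsigned int max_set,   /* r1                  */
                                   unsigned int set_shift, /* r2                  */
                                   unsigned int level)     /* r10                 */
    {
            unsigned int way, way_dec;

            if (max_way) {                          /* the two movne's          */
                    way     = max_way << way_shift; /* ways in the top bits     */
                    way_dec = 1u << way_shift;      /* "one way" worth of step  */
            } else {
                    way     = 0;                    /* single way: shifts skipped */
                    way_dec = 1;
            }

            for (;;) {
                    int set;

                    for (set = max_set; set >= 0; set--)    /* loop2 */
                            dccisw(level | way | ((unsigned int)set << set_shift));
                    if (way < way_dec)  /* subs r4, r4, r6 borrows, bcs falls through */
                            break;
                    way -= way_dec;                         /* back to loop1 */
            }
    }
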
> loop1:
> - mov r9, r7 @ create working copy of max index
> + mov r9, r1 @ create working copy of max index
> loop2:
> - ARM( orr r11, r10, r4, lsl r5 ) @ factor way and cache number into r11
> - THUMB( lsl r6, r4, r5 )
> - THUMB( orr r11, r10, r6 ) @ factor way and cache number into r11
> - ARM( orr r11, r11, r9, lsl r2 ) @ factor index number into r11
> - THUMB( lsl r6, r9, r2 )
> - THUMB( orr r11, r11, r6 ) @ factor index number into r11
> - mcr p15, 0, r11, c7, c14, 2 @ clean & invalidate by set/way
> + mov r5, r9, lsl r2 @ factor way into r5
> + orr r5, r5, r4 @ factor index number into r5
> + orr r5, r5, r10 @ factor cache number into r5
> + mcr p15, 0, r5, c7, c14, 2 @ clean & invalidate by set/way
> subs r9, r9, #1 @ decrement the index
> bge loop2
> - subs r4, r4, #1 @ decrement the way
> - bge loop1
> + subs r4, r4, r6 @ decrement the way
> + bcs loop1
> skip:
> add r10, r10, #2 @ increment cache number
> cmp r3, r10
> @@ -192,14 +192,12 @@ ENDPROC(v7_flush_dcache_all)
> *
> */
> ENTRY(v7_flush_kern_cache_all)
> - ARM( stmfd sp!, {r4-r5, r7, r9-r11, lr} )
> - THUMB( stmfd sp!, {r4-r7, r9-r11, lr} )
> + stmfd sp!, {r4-r6, r9-r10, lr}
> bl v7_flush_dcache_all
> mov r0, #0
> ALT_SMP(mcr p15, 0, r0, c7, c1, 0) @ invalidate I-cache inner shareable
> ALT_UP(mcr p15, 0, r0, c7, c5, 0) @ I+BTB cache invalidate
> - ARM( ldmfd sp!, {r4-r5, r7, r9-r11, lr} )
> - THUMB( ldmfd sp!, {r4-r7, r9-r11, lr} )
> + ldmfd sp!, {r4-r6, r9-r10, lr}
> ret lr
> ENDPROC(v7_flush_kern_cache_all)
>
> @@ -210,14 +208,12 @@ ENDPROC(v7_flush_kern_cache_all)
> * Invalidate the I-cache to the point of unification.
> */
> ENTRY(v7_flush_kern_cache_louis)
> - ARM( stmfd sp!, {r4-r5, r7, r9-r11, lr} )
> - THUMB( stmfd sp!, {r4-r7, r9-r11, lr} )
> + stmfd sp!, {r4-r6, r9-r10, lr}
> bl v7_flush_dcache_louis
> mov r0, #0
> ALT_SMP(mcr p15, 0, r0, c7, c1, 0) @ invalidate I-cache inner shareable
> ALT_UP(mcr p15, 0, r0, c7, c5, 0) @ I+BTB cache invalidate
> - ARM( ldmfd sp!, {r4-r5, r7, r9-r11, lr} )
> - THUMB( ldmfd sp!, {r4-r7, r9-r11, lr} )
> + ldmfd sp!, {r4-r6, r9-r10, lr}
> ret lr
> ENDPROC(v7_flush_kern_cache_louis)
>
> --
> 2.30.2
>
--
Thanks,
~Nick Desaulniers