[PATCH v3 11/13] ARM: cacheflush: avoid clobbering the frame pointer
Nick Desaulniers
ndesaulniers at google.com
Mon Feb 7 11:12:26 PST 2022
On Thu, Feb 3, 2022 at 12:22 AM Ard Biesheuvel <ardb at kernel.org> wrote:
>
> Thumb2 uses R7 rather than R11 as the frame pointer, and even if we
> rarely use a frame pointer to begin with when building in Thumb2 mode,
> there are cases where it is required by the compiler (Clang when
> inserting profiling hooks via -pg).
>
> However, preserving and restoring the frame pointer is risky, as any
> unhandled exceptions raised in the meantime will produce a bogus
> backtrace, and it would be better not to touch the frame pointer at all.
> This is the case even when CONFIG_FRAME_POINTER is not set, as the
> unwind directive used by the unwinder may also use R7 or R11 as the
> unwind anchor, even if the frame pointer is not managed strictly
> according to the frame pointer ABI.
>
> So let's tweak the cacheflush asm code not to clobber R7 or R11 at all,
> so that we can drop R7 from the clobber lists of the inline asm blocks
> that call these routines, and remove the code that preserves/restores
> R11.
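
It might be worth spelling out in the commit message what goes wrong for
callers today. As a toy illustration (mine, not from the patch; the body
is cut down from what v7_exit_coherency_flush() expands to): with
frame pointers or unwind info anchored on r7/r11, a Thumb-2 caller can't
simply name r7 in the clobber list, which is why the current code
saves/restores fp around the sequence instead:

    /*
     * Toy caller, names and reduced body are illustrative only. With
     * -mthumb and CONFIG_FRAME_POINTER=y, r7 is the frame pointer, and
     * listing it below is the incompatible part the old comment in
     * cacheflush.h refers to; this patch sidesteps the problem by not
     * touching r7/r11 in the callee at all, so the clobber isn't needed.
     */
    static inline void toy_exit_coherency_flush(void)
    {
            asm volatile(
                    "bl     v7_flush_dcache_all             \n\t"
                    : : : "r0", "r1", "r2", "r3", "r4", "r5", "r6",
                          "r7",   /* Thumb-2 frame pointer: can't go here */
                          "r9", "r10", "lr", "memory");
    }
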
>
> Signed-off-by: Ard Biesheuvel <ardb at kernel.org>
> ---
> arch/arm/include/asm/cacheflush.h | 12 ++----
> arch/arm/mach-exynos/mcpm-exynos.c | 6 +--
> arch/arm/mm/cache-v7.S | 40 +++++++++-----------
> 3 files changed, 23 insertions(+), 35 deletions(-)
>
> diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h
> index e68fb879e4f9..d27782331556 100644
> --- a/arch/arm/include/asm/cacheflush.h
> +++ b/arch/arm/include/asm/cacheflush.h
> @@ -446,15 +446,10 @@ static inline void __sync_cache_range_r(volatile void *p, size_t size)
> * however some exceptions may exist. Caveat emptor.
> *
> * - The clobber list is dictated by the call to v7_flush_dcache_*.
> - * fp is preserved to the stack explicitly prior disabling the cache
> - * since adding it to the clobber list is incompatible with having
> - * CONFIG_FRAME_POINTER=y. ip is saved as well if ever r12-clobbering
> - * trampoline are inserted by the linker and to keep sp 64-bit aligned.
> */
> #define v7_exit_coherency_flush(level) \
> asm volatile( \
> ".arch armv7-a \n\t" \
> - "stmfd sp!, {fp, ip} \n\t" \
> "mrc p15, 0, r0, c1, c0, 0 @ get SCTLR \n\t" \
> "bic r0, r0, #"__stringify(CR_C)" \n\t" \
> "mcr p15, 0, r0, c1, c0, 0 @ set SCTLR \n\t" \
> @@ -464,10 +459,9 @@ static inline void __sync_cache_range_r(volatile void *p, size_t size)
> "bic r0, r0, #(1 << 6) @ disable local coherency \n\t" \
> "mcr p15, 0, r0, c1, c0, 1 @ set ACTLR \n\t" \
> "isb \n\t" \
> - "dsb \n\t" \
> - "ldmfd sp!, {fp, ip}" \
> - : : : "r0","r1","r2","r3","r4","r5","r6","r7", \
> - "r9","r10","lr","memory" )
> + "dsb" \
> + : : : "r0","r1","r2","r3","r4","r5","r6", \
> + "r9","r10","ip","lr","memory" )
>
> void flush_uprobe_xol_access(struct page *page, unsigned long uaddr,
> void *kaddr, unsigned long len);
> diff --git a/arch/arm/mach-exynos/mcpm-exynos.c b/arch/arm/mach-exynos/mcpm-exynos.c
> index cd861c57d5ad..fd0dbeb93357 100644
> --- a/arch/arm/mach-exynos/mcpm-exynos.c
> +++ b/arch/arm/mach-exynos/mcpm-exynos.c
> @@ -35,7 +35,6 @@ static bool secure_firmware __ro_after_init;
> */
> #define exynos_v7_exit_coherency_flush(level) \
> asm volatile( \
> - "stmfd sp!, {fp, ip}\n\t"\
> "mrc p15, 0, r0, c1, c0, 0 @ get SCTLR\n\t" \
> "bic r0, r0, #"__stringify(CR_C)"\n\t" \
> "mcr p15, 0, r0, c1, c0, 0 @ set SCTLR\n\t" \
> @@ -50,11 +49,10 @@ static bool secure_firmware __ro_after_init;
> "mcr p15, 0, r0, c1, c0, 1 @ set ACTLR\n\t" \
> "isb\n\t" \
> "dsb\n\t" \
> - "ldmfd sp!, {fp, ip}" \
> : \
> : "Ir" (pmu_base_addr + S5P_INFORM0) \
> - : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", \
> - "r9", "r10", "lr", "memory")
> + : "r0", "r1", "r2", "r3", "r4", "r5", "r6", \
> + "r9", "r10", "ip", "lr", "memory")
>
> static int exynos_cpu_powerup(unsigned int cpu, unsigned int cluster)
> {
> diff --git a/arch/arm/mm/cache-v7.S b/arch/arm/mm/cache-v7.S
> index 830bbfb26ca5..b9f55f0bc278 100644
> --- a/arch/arm/mm/cache-v7.S
> +++ b/arch/arm/mm/cache-v7.S
> @@ -90,7 +90,7 @@ ENDPROC(v7_flush_icache_all)
> *
> * Flush the D-cache up to the Level of Unification Inner Shareable
> *
> - * Corrupted registers: r0-r7, r9-r11 (r6 only in Thumb mode)
> + * Corrupted registers: r0-r6, r9-r10
> */
>
> ENTRY(v7_flush_dcache_louis)
> @@ -117,7 +117,7 @@ ENDPROC(v7_flush_dcache_louis)
> *
> * Flush the whole D-cache.
> *
> - * Corrupted registers: r0-r7, r9-r11 (r6 only in Thumb mode)
> + * Corrupted registers: r0-r6, r9-r10
> *
> * - mm - mm_struct describing address space
> */
> @@ -149,22 +149,22 @@ flush_levels:
> movw r4, #0x3ff
> ands r4, r4, r1, lsr #3 @ find maximum number on the way size
> clz r5, r4 @ find bit position of way size increment
> - movw r7, #0x7fff
> - ands r7, r7, r1, lsr #13 @ extract max number of the index size
> + movw r6, #0x7fff
> + and r1, r6, r1, lsr #13 @ extract max number of the index size
> + mov r6, #1
> + movne r4, r4, lsl r5 @ # of ways shifted into bits [31:...]
> + movne r6, r6, lsl r5 @ 1 shifted left by same amount
This group of changes was definitely trickier to review. Can you provide
more color on why the conditional `mov`s and the change in the loop
back-edge are needed? (I appreciate you explaining some of it on IRC; it
would be helpful to other reviewers to have that commentary on the
list.) The rest LGTM.
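
For what it's worth, here is my rough reading of the restructured loop
in C; the function and parameter names are mine, dccisw() is a stand-in
for the mcr to c7, c14, 2, and the register mapping in the comments is
my guess. Please correct me if I've misread it:

    /* Reviewer's sketch of the new flush_levels inner loops, not from
     * the patch. The way number is kept pre-shifted into its field and
     * r6 holds "one way" in that position, so the back-edge can step
     * the way down with a single subs/bcs instead of re-shifting it
     * on every iteration. */
    static void dccisw(unsigned int setway)
    {
            /* stands in for: mcr p15, 0, setway, c7, c14, 2 */
    }

    static void flush_level_sketch(unsigned int max_way,   /* r4 before the shift */
                                   unsigned int way_shift, /* r5 = clz(max_way)   */
                                   unsigned int max_set,   /* r1                  */
                                   unsigned int set_shift, /* r2                  */
                                   unsigned int level)     /* r10                 */
    {
            unsigned int way, way_dec;

            if (max_way) {                          /* the two movne's          */
                    way     = max_way << way_shift; /* ways in the top bits     */
                    way_dec = 1u << way_shift;      /* "one way" worth of step  */
            } else {
                    way     = 0;                    /* single way: shifts skipped */
                    way_dec = 1;
            }

            for (;;) {
                    int set;

                    for (set = max_set; set >= 0; set--)    /* loop2 */
                            dccisw(level | way | ((unsigned int)set << set_shift));
                    if (way < way_dec)  /* subs r4, r4, r6 borrows, bcs falls through */
                            break;
                    way -= way_dec;                         /* back to loop1 */
            }
    }
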
> loop1:
> - mov r9, r7 @ create working copy of max index
> + mov r9, r1 @ create working copy of max index
> loop2:
> - ARM( orr r11, r10, r4, lsl r5 ) @ factor way and cache number into r11
> - THUMB( lsl r6, r4, r5 )
> - THUMB( orr r11, r10, r6 ) @ factor way and cache number into r11
> - ARM( orr r11, r11, r9, lsl r2 ) @ factor index number into r11
> - THUMB( lsl r6, r9, r2 )
> - THUMB( orr r11, r11, r6 ) @ factor index number into r11
> - mcr p15, 0, r11, c7, c14, 2 @ clean & invalidate by set/way
> + mov r5, r9, lsl r2 @ factor way into r5
> + orr r5, r5, r4 @ factor index number into r5
> + orr r5, r5, r10 @ factor cache number into r5
> + mcr p15, 0, r5, c7, c14, 2 @ clean & invalidate by set/way
> subs r9, r9, #1 @ decrement the index
> bge loop2
> - subs r4, r4, #1 @ decrement the way
> - bge loop1
> + subs r4, r4, r6 @ decrement the way
> + bcs loop1
> skip:
> add r10, r10, #2 @ increment cache number
> cmp r3, r10
> @@ -192,14 +192,12 @@ ENDPROC(v7_flush_dcache_all)
> *
> */
> ENTRY(v7_flush_kern_cache_all)
> - ARM( stmfd sp!, {r4-r5, r7, r9-r11, lr} )
> - THUMB( stmfd sp!, {r4-r7, r9-r11, lr} )
> + stmfd sp!, {r4-r6, r9-r10, lr}
> bl v7_flush_dcache_all
> mov r0, #0
> ALT_SMP(mcr p15, 0, r0, c7, c1, 0) @ invalidate I-cache inner shareable
> ALT_UP(mcr p15, 0, r0, c7, c5, 0) @ I+BTB cache invalidate
> - ARM( ldmfd sp!, {r4-r5, r7, r9-r11, lr} )
> - THUMB( ldmfd sp!, {r4-r7, r9-r11, lr} )
> + ldmfd sp!, {r4-r6, r9-r10, lr}
> ret lr
> ENDPROC(v7_flush_kern_cache_all)
>
> @@ -210,14 +208,12 @@ ENDPROC(v7_flush_kern_cache_all)
> * Invalidate the I-cache to the point of unification.
> */
> ENTRY(v7_flush_kern_cache_louis)
> - ARM( stmfd sp!, {r4-r5, r7, r9-r11, lr} )
> - THUMB( stmfd sp!, {r4-r7, r9-r11, lr} )
> + stmfd sp!, {r4-r6, r9-r10, lr}
> bl v7_flush_dcache_louis
> mov r0, #0
> ALT_SMP(mcr p15, 0, r0, c7, c1, 0) @ invalidate I-cache inner shareable
> ALT_UP(mcr p15, 0, r0, c7, c5, 0) @ I+BTB cache invalidate
> - ARM( ldmfd sp!, {r4-r5, r7, r9-r11, lr} )
> - THUMB( ldmfd sp!, {r4-r7, r9-r11, lr} )
> + ldmfd sp!, {r4-r6, r9-r10, lr}
> ret lr
> ENDPROC(v7_flush_kern_cache_louis)
>
> --
> 2.30.2
>
--
Thanks,
~Nick Desaulniers