[PATCH] ARM: implement optimized percpu variable access

Rob Herring robherring2 at gmail.com
Fri Nov 23 12:06:07 EST 2012


On 11/22/2012 05:34 AM, Will Deacon wrote:
> Hi Rob,
> 
> On Sun, Nov 11, 2012 at 03:20:40AM +0000, Rob Herring wrote:
>> From: Rob Herring <rob.herring at calxeda.com>
>>
>> Use the previously unused TPIDRPRW register to store percpu offsets.
>> TPIDRPRW is only accessible in PL1, so it can only be used in the kernel.
>>
>> This saves two loads on each percpu variable access, which should
>> yield improved performance, though the gain has not been quantified.
>>
>> Signed-off-by: Rob Herring <rob.herring at calxeda.com>
>> ---
>>  arch/arm/include/asm/Kbuild   |    1 -
>>  arch/arm/include/asm/percpu.h |   44 +++++++++++++++++++++++++++++++++++++++++
>>  arch/arm/kernel/smp.c         |    3 +++
>>  3 files changed, 47 insertions(+), 1 deletion(-)
>>  create mode 100644 arch/arm/include/asm/percpu.h
> 
> Russell pointed out to me that this patch will break on v6 CPUs if they don't
> have the thread ID registers and we're running with SMP_ON_UP=y. Looking at
> the TRMs, the only case we care about is 1136 < r1p0, but it does indeed break
> there (I have a board on my desk).

Are there any non-ARM Ltd. cores without v6K that we need to worry
about? I wouldn't think there are many 1136s earlier than r1p0 out
there (your desk being an obvious exception).

> There are a few ways to fix this:
> 
> (1) Use the SMP alternates to patch the code when running on UP systems. I
>     tried this and the code is absolutely diabolical (see below).
> 
> (2) Rely on the registers being RAZ/WI rather than undef (which seems to be
>     the case on my board) and add on the pcpu delta manually. This is also
>     really horrible.
> 
> (3) Just make the thing depend on __LINUX_ARM_ARCH__ >= 7. Yes, we lose on
>     11MPCore, but we win on A8 and the code is much, much simpler.

I would lean towards this option. It really only has to depend on v6K
and !v6. We can refine the multi-platform selection to allow v7+v6K-only
builds in addition to v7+v6. I think we are going to have enough
v7-specific optimizations (gcc optimizations, Thumb-2, unaligned
accesses, etc.) that most people will do v7-only builds.
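
Concretely, I'd expect the percpu.h guard to end up as something like
this (a sketch, reusing the existing CONFIG_CPU_V6 symbol; any build
that can still include plain v6 just keeps the generic code):

	#if defined(CONFIG_SMP) && !defined(CONFIG_CPU_V6)
	/* TPIDRPRW-based set_my_cpu_offset()/__my_cpu_offset() here */
	#else
	#define set_my_cpu_offset(x)	do { } while (0)
	#endif

	#include <asm-generic/percpu.h>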

Rob

> 
> As an aside, you also need to make the asm block volatile in
> __my_cpu_offset -- I can see it being re-ordered before the set for
> secondary CPUs otherwise.
> 
> Will
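
Good catch on the volatile, by the way. Without it nothing stops the
compiler from hoisting the mrc above the point where a secondary CPU
writes its offset, so the reader wants to be something like this
(again just a sketch):

	static inline unsigned long __my_cpu_offset(void)
	{
		unsigned long off;
		/*
		 * volatile: the read must not be reordered or cached
		 * across set_my_cpu_offset() on the secondary boot path.
		 */
		asm volatile("mrc p15, 0, %0, c13, c0, 4" : "=r" (off));
		return off;
	}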
> 
> --->8
> 
> From b12ab049b864c2b6f0be1558c934d7213871b223 Mon Sep 17 00:00:00 2001
> From: Will Deacon <will.deacon at arm.com>
> Date: Wed, 21 Nov 2012 10:53:28 +0000
> Subject: [PATCH 1/2] ARM: smp: move SMP_ON_UP alternate macros to common
>  header file
> 
> The ALT_{UP,SMP} macros are used to patch instructions at runtime
> depending on whether we are running on an SMP platform or not. This is
> generally done in out-of-line assembly code, but there is also the need
> for this functionality in inline assembly (e.g. spinlocks).
> 
> This patch moves the macros into their own header file, which provides
> the correct definitions depending on __ASSEMBLY__.
> 
> Signed-off-by: Will Deacon <will.deacon at arm.com>
> ---
>  arch/arm/include/asm/assembler.h | 29 +------------------
>  arch/arm/include/asm/smp_on_up.h | 60 ++++++++++++++++++++++++++++++++++++++++
>  arch/arm/include/asm/spinlock.h  | 21 +++++---------
>  arch/arm/kernel/head.S           |  1 +
>  arch/arm/kernel/module.c         |  8 ++----
>  5 files changed, 71 insertions(+), 48 deletions(-)
>  create mode 100644 arch/arm/include/asm/smp_on_up.h
> 
> diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
> index 2ef9581..7da33e0 100644
> --- a/arch/arm/include/asm/assembler.h
> +++ b/arch/arm/include/asm/assembler.h
> @@ -23,6 +23,7 @@
>  #include <asm/ptrace.h>
>  #include <asm/domain.h>
>  #include <asm/opcodes-virt.h>
> +#include <asm/smp_on_up.h>
>  
>  #define IOMEM(x)	(x)
>  
> @@ -166,34 +167,6 @@
>  	.long	9999b,9001f;			\
>  	.popsection
>  
> -#ifdef CONFIG_SMP
> -#define ALT_SMP(instr...)					\
> -9998:	instr
> -/*
> - * Note: if you get assembler errors from ALT_UP() when building with
> - * CONFIG_THUMB2_KERNEL, you almost certainly need to use
> - * ALT_SMP( W(instr) ... )
> - */
> -#define ALT_UP(instr...)					\
> -	.pushsection ".alt.smp.init", "a"			;\
> -	.long	9998b						;\
> -9997:	instr							;\
> -	.if . - 9997b != 4					;\
> -		.error "ALT_UP() content must assemble to exactly 4 bytes";\
> -	.endif							;\
> -	.popsection
> -#define ALT_UP_B(label)					\
> -	.equ	up_b_offset, label - 9998b			;\
> -	.pushsection ".alt.smp.init", "a"			;\
> -	.long	9998b						;\
> -	W(b)	. + up_b_offset					;\
> -	.popsection
> -#else
> -#define ALT_SMP(instr...)
> -#define ALT_UP(instr...) instr
> -#define ALT_UP_B(label) b label
> -#endif
> -
>  /*
>   * Instruction barrier
>   */
> diff --git a/arch/arm/include/asm/smp_on_up.h b/arch/arm/include/asm/smp_on_up.h
> new file mode 100644
> index 0000000..cc9c527
> --- /dev/null
> +++ b/arch/arm/include/asm/smp_on_up.h
> @@ -0,0 +1,60 @@
> +#ifndef __ASM_ARM_SMP_ON_UP_H_
> +#define __ASM_ARM_SMP_ON_UP_H_
> +
> +#ifndef __ASSEMBLY__
> +#ifdef CONFIG_SMP_ON_UP
> +extern int fixup_smp(const void *addr, unsigned long size);
> +#else
> +static inline int fixup_smp(const void *addr, unsigned long size)
> +{
> +	return -EINVAL;
> +}
> +#endif	/* CONFIG_SMP_ON_UP */
> +#endif	/* __ASSEMBLY__ */
> +
> +/*
> + * Note: if you get assembler errors from ALT_UP() when building with
> + * CONFIG_THUMB2_KERNEL, you almost certainly need to use
> + * ALT_SMP( W(instr) ... )
> + */
> +#ifdef CONFIG_SMP
> +#ifndef __ASSEMBLY__
> +
> +#define ALT_SMP(instr)						\
> +	"9998:	" instr "\n"
> +
> +#define ALT_UP(instr)						\
> +	"	.pushsection \".alt.smp.init\", \"a\"\n"	\
> +	"	.long	9998b\n"				\
> +	"	" instr "\n"					\
> +	"	.popsection\n"
> +
> +#else
> +
> +#define ALT_SMP(instr...)					\
> +9998:	instr
> +
> +#define ALT_UP(instr...)					\
> +	.pushsection ".alt.smp.init", "a"			;\
> +	.long	9998b						;\
> +9997:	instr							;\
> +	.if . - 9997b != 4					;\
> +		.error "ALT_UP() content must assemble to exactly 4 bytes";\
> +	.endif							;\
> +	.popsection
> +
> +#define ALT_UP_B(label)						\
> +	.equ	up_b_offset, label - 9998b			;\
> +	.pushsection ".alt.smp.init", "a"			;\
> +	.long	9998b						;\
> +	W(b)	. + up_b_offset					;\
> +	.popsection
> +
> +#endif	/* __ASSEMBLY__ */
> +#else
> +#define ALT_SMP(instr...)
> +#define ALT_UP(instr...) instr
> +#define ALT_UP_B(label) b label
> +#endif	/* CONFIG_SMP */
> +
> +#endif	/* __ASM_ARM_SMP_ON_UP_H_ */
> diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h
> index b4ca707..58d2f42 100644
> --- a/arch/arm/include/asm/spinlock.h
> +++ b/arch/arm/include/asm/spinlock.h
> @@ -6,20 +6,14 @@
>  #endif
>  
>  #include <asm/processor.h>
> +#include <asm/smp_on_up.h>
>  
>  /*
>   * sev and wfe are ARMv6K extensions.  Uniprocessor ARMv6 may not have the K
>   * extensions, so when running on UP, we have to patch these instructions away.
>   */
> -#define ALT_SMP(smp, up)					\
> -	"9998:	" smp "\n"					\
> -	"	.pushsection \".alt.smp.init\", \"a\"\n"	\
> -	"	.long	9998b\n"				\
> -	"	" up "\n"					\
> -	"	.popsection\n"
> -
>  #ifdef CONFIG_THUMB2_KERNEL
> -#define SEV		ALT_SMP("sev.w", "nop.w")
> +#define SEV		ALT_SMP("sev.w")	ALT_UP("nop.w")
>  /*
>   * For Thumb-2, special care is needed to ensure that the conditional WFE
>   * instruction really does assemble to exactly 4 bytes (as required by
> @@ -33,13 +27,12 @@
>   */
>  #define WFE(cond)	ALT_SMP(		\
>  	"it " cond "\n\t"			\
> -	"wfe" cond ".n",			\
> -						\
> -	"nop.w"					\
> -)
> +	"wfe" cond ".n")			\
> +			ALT_UP(			\
> +	"nop.w")
>  #else
> -#define SEV		ALT_SMP("sev", "nop")
> -#define WFE(cond)	ALT_SMP("wfe" cond, "nop")
> +#define SEV		ALT_SMP("sev")		ALT_UP("nop")
> +#define WFE(cond)	ALT_SMP("wfe" cond)	ALT_UP("nop")
>  #endif
>  
>  static inline void dsb_sev(void)
> diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S
> index 4eee351..c44b51f 100644
> --- a/arch/arm/kernel/head.S
> +++ b/arch/arm/kernel/head.S
> @@ -495,6 +495,7 @@ smp_on_up:
>  	.text
>  __do_fixup_smp_on_up:
>  	cmp	r4, r5
> +	movhs	r0, #0
>  	movhs	pc, lr
>  	ldmia	r4!, {r0, r6}
>   ARM(	str	r6, [r0, r3]	)
> diff --git a/arch/arm/kernel/module.c b/arch/arm/kernel/module.c
> index 1e9be5d..f2299d0 100644
> --- a/arch/arm/kernel/module.c
> +++ b/arch/arm/kernel/module.c
> @@ -23,6 +23,7 @@
>  #include <asm/pgtable.h>
>  #include <asm/sections.h>
>  #include <asm/smp_plat.h>
> +#include <asm/smp_on_up.h>
>  #include <asm/unwind.h>
>  
>  #ifdef CONFIG_XIP_KERNEL
> @@ -266,7 +267,6 @@ static const Elf_Shdr *find_mod_section(const Elf32_Ehdr *hdr,
>  }
>  
>  extern void fixup_pv_table(const void *, unsigned long);
> -extern void fixup_smp(const void *, unsigned long);
>  
>  int module_finalize(const Elf32_Ehdr *hdr, const Elf_Shdr *sechdrs,
>  		    struct module *mod)
> @@ -323,11 +323,7 @@ int module_finalize(const Elf32_Ehdr *hdr, const Elf_Shdr *sechdrs,
>  #endif
>  	s = find_mod_section(hdr, sechdrs, ".alt.smp.init");
>  	if (s && !is_smp())
> -#ifdef CONFIG_SMP_ON_UP
> -		fixup_smp((void *)s->sh_addr, s->sh_size);
> -#else
> -		return -EINVAL;
> -#endif
> +		return fixup_smp((void *)s->sh_addr, s->sh_size);
>  	return 0;
>  }
>  
> 