[PATCH v2 3/5] ARM: use generic unaligned.h
Shawn Guo
shawn.guo at linaro.org
Mon Oct 8 12:43:57 EDT 2012
This patch has been merged into mainline as commit below.
d25c881 ARM: 7493/1: use generic unaligned.h
It introduces a regression for me. Check out the commit on mainline,
build a v7 only kernel (imx5/6) with imx_v6_v7_defconfig, the kernel
halts in decompressor. But v6/v7 kernel (imx3/5/6) works fine. The
kernel built on the parent commit below works all fine.
ef1c209 ARM: 7492/1: add strstr declaration for decompressors
Shawn
On Sat, Aug 04, 2012 at 08:23:58PM -0500, Rob Herring wrote:
> From: Rob Herring <rob.herring at calxeda.com>
>
> This moves ARM over to the asm-generic/unaligned.h header. This has the
> benefit of better code generated especially for ARMv7 on gcc 4.7+
> compilers.
>
> As Arnd Bergmann points out: The asm-generic version uses the "struct"
> version for native-endian unaligned access and the "byteshift" version
> for the opposite endianness. The current ARM version however uses the
> "byteshift" implementation for both.
>
> Thanks to Nicolas Pitre for the excellent analysis:
>
> Test case:
>
> int foo (int *x) { return get_unaligned(x); }
> long long bar (long long *x) { return get_unaligned(x); }
>
> With the current ARM version:
>
> foo:
> ldrb r3, [r0, #2] @ zero_extendqisi2 @ MEM[(const u8 *)x_1(D) + 2B], MEM[(const u8 *)x_1(D) + 2B]
> ldrb r1, [r0, #1] @ zero_extendqisi2 @ MEM[(const u8 *)x_1(D) + 1B], MEM[(const u8 *)x_1(D) + 1B]
> ldrb r2, [r0, #0] @ zero_extendqisi2 @ MEM[(const u8 *)x_1(D)], MEM[(const u8 *)x_1(D)]
> mov r3, r3, asl #16 @ tmp154, MEM[(const u8 *)x_1(D) + 2B],
> ldrb r0, [r0, #3] @ zero_extendqisi2 @ MEM[(const u8 *)x_1(D) + 3B], MEM[(const u8 *)x_1(D) + 3B]
> orr r3, r3, r1, asl #8 @, tmp155, tmp154, MEM[(const u8 *)x_1(D) + 1B],
> orr r3, r3, r2 @ tmp157, tmp155, MEM[(const u8 *)x_1(D)]
> orr r0, r3, r0, asl #24 @,, tmp157, MEM[(const u8 *)x_1(D) + 3B],
> bx lr @
>
> bar:
> stmfd sp!, {r4, r5, r6, r7} @,
> mov r2, #0 @ tmp184,
> ldrb r5, [r0, #6] @ zero_extendqisi2 @ MEM[(const u8 *)x_1(D) + 6B], MEM[(const u8 *)x_1(D) + 6B]
> ldrb r4, [r0, #5] @ zero_extendqisi2 @ MEM[(const u8 *)x_1(D) + 5B], MEM[(const u8 *)x_1(D) + 5B]
> ldrb ip, [r0, #2] @ zero_extendqisi2 @ MEM[(const u8 *)x_1(D) + 2B], MEM[(const u8 *)x_1(D) + 2B]
> ldrb r1, [r0, #4] @ zero_extendqisi2 @ MEM[(const u8 *)x_1(D) + 4B], MEM[(const u8 *)x_1(D) + 4B]
> mov r5, r5, asl #16 @ tmp175, MEM[(const u8 *)x_1(D) + 6B],
> ldrb r7, [r0, #1] @ zero_extendqisi2 @ MEM[(const u8 *)x_1(D) + 1B], MEM[(const u8 *)x_1(D) + 1B]
> orr r5, r5, r4, asl #8 @, tmp176, tmp175, MEM[(const u8 *)x_1(D) + 5B],
> ldrb r6, [r0, #7] @ zero_extendqisi2 @ MEM[(const u8 *)x_1(D) + 7B], MEM[(const u8 *)x_1(D) + 7B]
> orr r5, r5, r1 @ tmp178, tmp176, MEM[(const u8 *)x_1(D) + 4B]
> ldrb r4, [r0, #0] @ zero_extendqisi2 @ MEM[(const u8 *)x_1(D)], MEM[(const u8 *)x_1(D)]
> mov ip, ip, asl #16 @ tmp188, MEM[(const u8 *)x_1(D) + 2B],
> ldrb r1, [r0, #3] @ zero_extendqisi2 @ MEM[(const u8 *)x_1(D) + 3B], MEM[(const u8 *)x_1(D) + 3B]
> orr ip, ip, r7, asl #8 @, tmp189, tmp188, MEM[(const u8 *)x_1(D) + 1B],
> orr r3, r5, r6, asl #24 @,, tmp178, MEM[(const u8 *)x_1(D) + 7B],
> orr ip, ip, r4 @ tmp191, tmp189, MEM[(const u8 *)x_1(D)]
> orr ip, ip, r1, asl #24 @, tmp194, tmp191, MEM[(const u8 *)x_1(D) + 3B],
> mov r1, r3 @,
> orr r0, r2, ip @ tmp171, tmp184, tmp194
> ldmfd sp!, {r4, r5, r6, r7}
> bx lr
>
> In both cases the code is slightly suboptimal. One may wonder why
> wasting r2 with the constant 0 in the second case for example. And all
> the mov's could be folded in subsequent orr's, etc.
>
> Now with the asm-generic version:
>
> foo:
> ldr r0, [r0, #0] @ unaligned @,* x
> bx lr @
>
> bar:
> mov r3, r0 @ x, x
> ldr r0, [r0, #0] @ unaligned @,* x
> ldr r1, [r3, #4] @ unaligned @,
> bx lr @
>
> This is way better of course, but only because this was compiled for
> ARMv7. In this case the compiler knows that the hardware can do
> unaligned word access. This isn't that obvious for foo(), but if we
> remove the get_unaligned() from bar as follows:
>
> long long bar (long long *x) {return *x; }
>
> then the resulting code is:
>
> bar:
> ldmia r0, {r0, r1} @ x,,
> bx lr @
>
> So this proves that the presumed aligned vs unaligned cases do have
> influence on the instructions the compiler may use and that the above
> unaligned code results are not just an accident.
>
> Still... this isn't fully conclusive without at least looking at the
> resulting assembly from a pre-ARMv6 compilation. Let's see with an
> ARMv5 target:
>
> foo:
> ldrb r3, [r0, #0] @ zero_extendqisi2 @ tmp139,* x
> ldrb r1, [r0, #1] @ zero_extendqisi2 @ tmp140,
> ldrb r2, [r0, #2] @ zero_extendqisi2 @ tmp143,
> ldrb r0, [r0, #3] @ zero_extendqisi2 @ tmp146,
> orr r3, r3, r1, asl #8 @, tmp142, tmp139, tmp140,
> orr r3, r3, r2, asl #16 @, tmp145, tmp142, tmp143,
> orr r0, r3, r0, asl #24 @,, tmp145, tmp146,
> bx lr @
>
> bar:
> stmfd sp!, {r4, r5, r6, r7} @,
> ldrb r2, [r0, #0] @ zero_extendqisi2 @ tmp139,* x
> ldrb r7, [r0, #1] @ zero_extendqisi2 @ tmp140,
> ldrb r3, [r0, #4] @ zero_extendqisi2 @ tmp149,
> ldrb r6, [r0, #5] @ zero_extendqisi2 @ tmp150,
> ldrb r5, [r0, #2] @ zero_extendqisi2 @ tmp143,
> ldrb r4, [r0, #6] @ zero_extendqisi2 @ tmp153,
> ldrb r1, [r0, #7] @ zero_extendqisi2 @ tmp156,
> ldrb ip, [r0, #3] @ zero_extendqisi2 @ tmp146,
> orr r2, r2, r7, asl #8 @, tmp142, tmp139, tmp140,
> orr r3, r3, r6, asl #8 @, tmp152, tmp149, tmp150,
> orr r2, r2, r5, asl #16 @, tmp145, tmp142, tmp143,
> orr r3, r3, r4, asl #16 @, tmp155, tmp152, tmp153,
> orr r0, r2, ip, asl #24 @,, tmp145, tmp146,
> orr r1, r3, r1, asl #24 @,, tmp155, tmp156,
> ldmfd sp!, {r4, r5, r6, r7}
> bx lr
>
> Compared to the initial results, this is really nicely optimized and I
> couldn't do much better if I were to hand code it myself.
>
> Signed-off-by: Rob Herring <rob.herring at calxeda.com>
> Reviewed-by: Nicolas Pitre <nico at linaro.org>
> ---
> arch/arm/include/asm/Kbuild | 1 +
> arch/arm/include/asm/unaligned.h | 19 -------------------
> 2 files changed, 1 insertion(+), 19 deletions(-)
> delete mode 100644 arch/arm/include/asm/unaligned.h
>
> diff --git a/arch/arm/include/asm/Kbuild b/arch/arm/include/asm/Kbuild
> index 6cc8a13..a86cabf 100644
> --- a/arch/arm/include/asm/Kbuild
> +++ b/arch/arm/include/asm/Kbuild
> @@ -33,3 +33,4 @@ generic-y += sockios.h
> generic-y += termbits.h
> generic-y += timex.h
> generic-y += types.h
> +generic-y += unaligned.h
> diff --git a/arch/arm/include/asm/unaligned.h b/arch/arm/include/asm/unaligned.h
> deleted file mode 100644
> index 44593a8..0000000
> --- a/arch/arm/include/asm/unaligned.h
> +++ /dev/null
> @@ -1,19 +0,0 @@
> -#ifndef _ASM_ARM_UNALIGNED_H
> -#define _ASM_ARM_UNALIGNED_H
> -
> -#include <linux/unaligned/le_byteshift.h>
> -#include <linux/unaligned/be_byteshift.h>
> -#include <linux/unaligned/generic.h>
> -
> -/*
> - * Select endianness
> - */
> -#ifndef __ARMEB__
> -#define get_unaligned __get_unaligned_le
> -#define put_unaligned __put_unaligned_le
> -#else
> -#define get_unaligned __get_unaligned_be
> -#define put_unaligned __put_unaligned_be
> -#endif
> -
> -#endif /* _ASM_ARM_UNALIGNED_H */
> --
> 1.7.9.5
>
>
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
More information about the linux-arm-kernel
mailing list