[PATCH] ARM: get rid of __memzero()
Ard Biesheuvel
ard.biesheuvel at linaro.org
Fri Jan 19 04:43:53 PST 2018
On 19 January 2018 at 03:53, Nicolas Pitre <nicolas.pitre at linaro.org> wrote:
> The __memzero assembly code is almost identical to memset's except for
> two orr instructions. The runtime performance of __memzero(p, n) and
> memset(p, 0, n) is accordingly almost identical.
>
> However, the memset() macro that guards against a zero length and calls
> __memzero at compile time when the fill value is a constant zero
> interferes with compiler optimizations.
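
A minimal sketch of the effect (the structure and function below are made
up for illustration, not taken from the kernel tree): once memset()
resolves to the ordinary builtin rather than the macro removed in
asm/string.h below, GCC can expand a small constant-size zeroing entirely
inline instead of emitting an out-of-line __memzero() call.

    #include <string.h>

    /* hypothetical example structure, for illustration only */
    struct small_hdr {
            unsigned int id;
            unsigned char pad[12];
    };

    static void clear_hdr(struct small_hdr *h)
    {
            /*
             * With memset() visible as a plain builtin, GCC typically
             * lowers this constant-size zeroing to a few inline word
             * stores; routed through the old macro it became a call
             * to __memzero() instead.
             */
            memset(h, 0, sizeof(*h));
    }
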
>
> Arnd found that the test against a zero length brings up some new
> warnings with gcc v8:
>
> https://gcc.gnu.org/bugzilla/show_bug.cgi?id=82103
>
> And successively removing the test against a zero length and the call
> to __memzero optimization produces the following kernel sizes for
> defconfig with gcc 6:
>
>     text    data    bss      dec     hex filename
> 12248142 6278960 413588 18940690 1210312 vmlinux.orig
> 12244474 6278960 413588 18937022 120f4be vmlinux.no_zero_test
> 12239160 6278960 413588 18931708 120dffc vmlinux.no_memzero
>
> So it is probably not worth keeping __memzero around given that the
> compiler can do a better job at inlining trivial memset(p,0,n) on its
> own. And the memset code already handles a zero length just fine.
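
For reference, a byte-wise memset() along the lines of the decompressor's
C version quoted further down is naturally a no-op for a zero length, so
the zero-length test in the old macro buys nothing (minimal sketch, not
the kernel's optimized arch/arm/lib/memset.S):

    #include <stddef.h>

    /* with count == 0 the loop body never runs and the pointer is
     * returned unchanged, so no explicit zero-length guard is needed */
    void *memset_sketch(void *s, int c, size_t count)
    {
            char *xs = s;

            while (count--)
                    *xs++ = c;
            return s;
    }
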
>
> Suggested-by: Arnd Bergmann <arnd at arndb.de>
> Signed-off-by: Nicolas Pitre <nico at linaro.org>
Acked-by: Ard Biesheuvel <ard.biesheuvel at linaro.org>
>
> diff --git a/arch/arm/boot/compressed/string.c b/arch/arm/boot/compressed/string.c
> index 309e1bbad7..13c90abc68 100644
> --- a/arch/arm/boot/compressed/string.c
> +++ b/arch/arm/boot/compressed/string.c
> @@ -130,8 +130,3 @@ void *memset(void *s, int c, size_t count)
> *xs++ = c;
> return s;
> }
> -
> -void __memzero(void *s, size_t count)
> -{
> - memset(s, 0, count);
> -}
> diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h
> index f54a3136aa..111a1d8a41 100644
> --- a/arch/arm/include/asm/string.h
> +++ b/arch/arm/include/asm/string.h
> @@ -39,18 +39,4 @@ static inline void *memset64(uint64_t *p, uint64_t v, __kernel_size_t n)
> return __memset64(p, v, n * 8, v >> 32);
> }
>
> -extern void __memzero(void *ptr, __kernel_size_t n);
> -
> -#define memset(p,v,n) \
> - ({ \
> - void *__p = (p); size_t __n = n; \
> - if ((__n) != 0) { \
> - if (__builtin_constant_p((v)) && (v) == 0) \
> - __memzero((__p),(__n)); \
> - else \
> - memset((__p),(v),(__n)); \
> - } \
> - (__p); \
> - })
> -
> #endif
> diff --git a/arch/arm/kernel/armksyms.c b/arch/arm/kernel/armksyms.c
> index 5266fd9ad6..783fbb4de5 100644
> --- a/arch/arm/kernel/armksyms.c
> +++ b/arch/arm/kernel/armksyms.c
> @@ -92,7 +92,6 @@ EXPORT_SYMBOL(__memset64);
> EXPORT_SYMBOL(memcpy);
> EXPORT_SYMBOL(memmove);
> EXPORT_SYMBOL(memchr);
> -EXPORT_SYMBOL(__memzero);
>
> EXPORT_SYMBOL(mmioset);
> EXPORT_SYMBOL(mmiocpy);
> diff --git a/arch/arm/kernel/head-common.S b/arch/arm/kernel/head-common.S
> index 21dde771a7..d7eb3c4db5 100644
> --- a/arch/arm/kernel/head-common.S
> +++ b/arch/arm/kernel/head-common.S
> @@ -105,8 +105,9 @@ __mmap_switched:
> ARM( ldmia r4!, {r0, r1, sp} )
> THUMB( ldmia r4!, {r0, r1, r3} )
> THUMB( mov sp, r3 )
> - sub r1, r1, r0
> - bl __memzero @ clear .bss
> + sub r2, r1, r0
> + mov r1, #0
> + bl memset @ clear .bss
>
> ldmia r4, {r0, r1, r2, r3}
> str r9, [r0] @ Save processor ID
> diff --git a/arch/arm/lib/Makefile b/arch/arm/lib/Makefile
> index 4cb0b9624d..ad25fd1872 100644
> --- a/arch/arm/lib/Makefile
> +++ b/arch/arm/lib/Makefile
> @@ -8,7 +8,7 @@
> lib-y := backtrace.o changebit.o csumipv6.o csumpartial.o \
> csumpartialcopy.o csumpartialcopyuser.o clearbit.o \
> delay.o delay-loop.o findbit.o memchr.o memcpy.o \
> - memmove.o memset.o memzero.o setbit.o \
> + memmove.o memset.o setbit.o \
> strchr.o strrchr.o \
> testchangebit.o testclearbit.o testsetbit.o \
> ashldi3.o ashrdi3.o lshrdi3.o muldi3.o \
> diff --git a/arch/arm/lib/memzero.S b/arch/arm/lib/memzero.S
> deleted file mode 100644
> index 0eded952e0..0000000000
> --- a/arch/arm/lib/memzero.S
> +++ /dev/null
> @@ -1,137 +0,0 @@
> -/*
> - * linux/arch/arm/lib/memzero.S
> - *
> - * Copyright (C) 1995-2000 Russell King
> - *
> - * This program is free software; you can redistribute it and/or modify
> - * it under the terms of the GNU General Public License version 2 as
> - * published by the Free Software Foundation.
> - */
> -#include <linux/linkage.h>
> -#include <asm/assembler.h>
> -#include <asm/unwind.h>
> -
> - .text
> - .align 5
> - .word 0
> -/*
> - * Align the pointer in r0. r3 contains the number of bytes that we are
> - * mis-aligned by, and r1 is the number of bytes. If r1 < 4, then we
> - * don't bother; we use byte stores instead.
> - */
> -UNWIND( .fnstart )
> -1: subs r1, r1, #4 @ 1 do we have enough
> - blt 5f @ 1 bytes to align with?
> - cmp r3, #2 @ 1
> - strltb r2, [r0], #1 @ 1
> - strleb r2, [r0], #1 @ 1
> - strb r2, [r0], #1 @ 1
> - add r1, r1, r3 @ 1 (r1 = r1 - (4 - r3))
> -/*
> - * The pointer is now aligned and the length is adjusted. Try doing the
> - * memzero again.
> - */
> -
> -ENTRY(__memzero)
> - mov r2, #0 @ 1
> - ands r3, r0, #3 @ 1 unaligned?
> - bne 1b @ 1
> -/*
> - * r3 = 0, and we know that the pointer in r0 is aligned to a word boundary.
> - */
> - cmp r1, #16 @ 1 we can skip this chunk if we
> - blt 4f @ 1 have < 16 bytes
> -
> -#if ! CALGN(1)+0
> -
> -/*
> - * We need an extra register for this loop - save the return address and
> - * use the LR
> - */
> - str lr, [sp, #-4]! @ 1
> -UNWIND( .fnend )
> -UNWIND( .fnstart )
> -UNWIND( .save {lr} )
> - mov ip, r2 @ 1
> - mov lr, r2 @ 1
> -
> -3: subs r1, r1, #64 @ 1 write 32 bytes out per loop
> - stmgeia r0!, {r2, r3, ip, lr} @ 4
> - stmgeia r0!, {r2, r3, ip, lr} @ 4
> - stmgeia r0!, {r2, r3, ip, lr} @ 4
> - stmgeia r0!, {r2, r3, ip, lr} @ 4
> - bgt 3b @ 1
> - ldmeqfd sp!, {pc} @ 1/2 quick exit
> -/*
> - * No need to correct the count; we're only testing bits from now on
> - */
> - tst r1, #32 @ 1
> - stmneia r0!, {r2, r3, ip, lr} @ 4
> - stmneia r0!, {r2, r3, ip, lr} @ 4
> - tst r1, #16 @ 1 16 bytes or more?
> - stmneia r0!, {r2, r3, ip, lr} @ 4
> - ldr lr, [sp], #4 @ 1
> -UNWIND( .fnend )
> -
> -#else
> -
> -/*
> - * This version aligns the destination pointer in order to write
> - * whole cache lines at once.
> - */
> -
> - stmfd sp!, {r4-r7, lr}
> -UNWIND( .fnend )
> -UNWIND( .fnstart )
> -UNWIND( .save {r4-r7, lr} )
> - mov r4, r2
> - mov r5, r2
> - mov r6, r2
> - mov r7, r2
> - mov ip, r2
> - mov lr, r2
> -
> - cmp r1, #96
> - andgts ip, r0, #31
> - ble 3f
> -
> - rsb ip, ip, #32
> - sub r1, r1, ip
> - movs ip, ip, lsl #(32 - 4)
> - stmcsia r0!, {r4, r5, r6, r7}
> - stmmiia r0!, {r4, r5}
> - movs ip, ip, lsl #2
> - strcs r2, [r0], #4
> -
> -3: subs r1, r1, #64
> - stmgeia r0!, {r2-r7, ip, lr}
> - stmgeia r0!, {r2-r7, ip, lr}
> - bgt 3b
> - ldmeqfd sp!, {r4-r7, pc}
> -
> - tst r1, #32
> - stmneia r0!, {r2-r7, ip, lr}
> - tst r1, #16
> - stmneia r0!, {r4-r7}
> - ldmfd sp!, {r4-r7, lr}
> -UNWIND( .fnend )
> -
> -#endif
> -
> -UNWIND( .fnstart )
> -4: tst r1, #8 @ 1 8 bytes or more?
> - stmneia r0!, {r2, r3} @ 2
> - tst r1, #4 @ 1 4 bytes or more?
> - strne r2, [r0], #4 @ 1
> -/*
> - * When we get here, we've got less than 4 bytes to zero. We
> - * may have an unaligned pointer as well.
> - */
> -5: tst r1, #2 @ 1 2 bytes or more?
> - strneb r2, [r0], #1 @ 1
> - strneb r2, [r0], #1 @ 1
> - tst r1, #1 @ 1 a byte left over
> - strneb r2, [r0], #1 @ 1
> - ret lr @ 1
> -UNWIND( .fnend )
> -ENDPROC(__memzero)
> diff --git a/drivers/dma/imx-dma.c b/drivers/dma/imx-dma.c
> index 331f863c60..715b39ae5a 100644
> --- a/drivers/dma/imx-dma.c
> +++ b/drivers/dma/imx-dma.c
> @@ -765,7 +765,7 @@ static int imxdma_alloc_chan_resources(struct dma_chan *chan)
> desc = kzalloc(sizeof(*desc), GFP_KERNEL);
> if (!desc)
> break;
> - __memzero(&desc->desc, sizeof(struct dma_async_tx_descriptor));
> + memset(&desc->desc, 0, sizeof(struct dma_async_tx_descriptor));
> dma_async_tx_descriptor_init(&desc->desc, chan);
> desc->desc.tx_submit = imxdma_tx_submit;
> /* txd.flags will be overwritten in prep funcs */