[PATCH v2 10/10] ARM: add optimized memmove
Marco Felsch
m.felsch at pengutronix.de
Thu Sep 26 22:12:15 PDT 2024
Hi Sascha,
On 24-09-26, Sascha Hauer wrote:
> Until now there has been no assembler optimized version of memmove() for
> ARM. Add this from Linux-6.10 for both ARM32 and ARM64. This also updates
> memcpy() for ARM64 from Linux-6.10.
Out of curiosity, did you make any performance measurements?
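
If it helps, a quick-and-dirty comparison could look roughly like the sketch
below. This is illustration only: time_one and do_movebench are made-up
names, the loop count and buffer size are arbitrary, and it assumes
get_time_ns() plus the __default_memmove()/memmove() split this series
introduces.

#include <common.h>
#include <clock.h>
#include <errno.h>
#include <malloc.h>
#include <string.h>

static void time_one(void *(*fn)(void *, const void *, size_t),
		     const char *name, u8 *buf, size_t len)
{
	uint64_t start = get_time_ns();
	int i;

	/* overlap by one byte to force the backward-copy path */
	for (i = 0; i < 64; i++)
		fn(buf + 1, buf, len - 1);

	printf("%s: %llu ns for 64 moves of %lu bytes\n", name,
	       (unsigned long long)(get_time_ns() - start),
	       (unsigned long)(len - 1));
}

static int do_movebench(void)
{
	const size_t len = 1024 * 1024;
	u8 *buf = malloc(len);

	if (!buf)
		return -ENOMEM;

	memset(buf, 0x5a, len);
	time_one(__default_memmove, "generic  ", buf, len);
	time_one(memmove,           "optimized", buf, len);

	free(buf);

	return 0;
}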
Regards,
Marco
> Reviewed-by: Ahmad Fatoum <a.fatoum at pengutronix.de>
> Signed-off-by: Sascha Hauer <s.hauer at pengutronix.de>
> ---
> arch/arm/include/asm/cache.h | 8 ++
> arch/arm/include/asm/string.h | 4 +-
> arch/arm/lib32/Makefile | 1 +
> arch/arm/lib32/memmove.S | 206 +++++++++++++++++++++++++++++++
> arch/arm/lib64/copy_template.S | 180 ---------------------------
> arch/arm/lib64/memcpy.S | 274 ++++++++++++++++++++++++++++++++++-------
> arch/arm/lib64/memset.S | 18 +--
> arch/arm/lib64/string.c | 17 +++
> include/string.h | 2 +
> lib/string.c | 1 -
> 10 files changed, 478 insertions(+), 233 deletions(-)
>
> diff --git a/arch/arm/include/asm/cache.h b/arch/arm/include/asm/cache.h
> index 261c30129a..dd022c1f23 100644
> --- a/arch/arm/include/asm/cache.h
> +++ b/arch/arm/include/asm/cache.h
> @@ -3,6 +3,13 @@
> #ifndef __ASM_CACHE_H
> #define __ASM_CACHE_H
>
> +#ifdef CONFIG_CPU_64
> +#define L1_CACHE_SHIFT (6)
> +#define L1_CACHE_BYTES (1 << L1_CACHE_SHIFT)
> +#endif
> +
> +#ifndef __ASSEMBLY__
> +
> void v8_invalidate_icache_all(void);
> void v8_flush_dcache_all(void);
> void v8_invalidate_dcache_all(void);
> @@ -25,5 +32,6 @@ void arm_early_mmu_cache_invalidate(void);
> void sync_caches_for_execution(void);
>
> #include <asm-generic/cache.h>
> +#endif
>
> #endif
> diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h
> index 2322b846b2..f79392e53d 100644
> --- a/arch/arm/include/asm/string.h
> +++ b/arch/arm/include/asm/string.h
> @@ -9,10 +9,12 @@
> extern void *memcpy(void *, const void *, __kernel_size_t);
> #define __HAVE_ARCH_MEMSET
> extern void *memset(void *, int, __kernel_size_t);
> -
> +#define __HAVE_ARCH_MEMMOVE
> +extern void *memmove(void *, const void *, __kernel_size_t);
> #endif
>
> extern void *__memcpy(void *, const void *, __kernel_size_t);
> extern void *__memset(void *, int, __kernel_size_t);
> +extern void *__memmove(void *, const void *, __kernel_size_t);
>
> #endif
> diff --git a/arch/arm/lib32/Makefile b/arch/arm/lib32/Makefile
> index 511a029062..a139a80fb8 100644
> --- a/arch/arm/lib32/Makefile
> +++ b/arch/arm/lib32/Makefile
> @@ -21,6 +21,7 @@ obj-y += lshrdi3.o
> obj-y += runtime-offset.o
> pbl-y += runtime-offset.o
> obj-$(CONFIG_ARM_OPTIMZED_STRING_FUNCTIONS) += memcpy.o
> +obj-$(CONFIG_ARM_OPTIMZED_STRING_FUNCTIONS) += memmove.o
> obj-$(CONFIG_ARM_OPTIMZED_STRING_FUNCTIONS) += memset.o
> obj-$(CONFIG_ARM_UNWIND) += unwind.o
> obj-$(CONFIG_MODULES) += module.o
> diff --git a/arch/arm/lib32/memmove.S b/arch/arm/lib32/memmove.S
> new file mode 100644
> index 0000000000..6410554039
> --- /dev/null
> +++ b/arch/arm/lib32/memmove.S
> @@ -0,0 +1,206 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * linux/arch/arm/lib/memmove.S
> + *
> + * Author: Nicolas Pitre
> + * Created: Sep 28, 2005
> + * Copyright: (C) MontaVista Software Inc.
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/assembler.h>
> +#include <asm/unwind.h>
> +
> + .text
> +
> +/*
> + * Prototype: void *memmove(void *dest, const void *src, size_t n);
> + *
> + * Note:
> + *
> + * If the memory regions don't overlap, we simply branch to memcpy which is
> + * normally a bit faster. Otherwise the copy is done going downwards. This
> + * is a transposition of the code from copy_template.S but with the copy
> + * occurring in the opposite direction.
> + */
> +
> +ENTRY(__memmove)
> +WEAK(memmove)
> + UNWIND( .fnstart )
> +
> + subs ip, r0, r1
> + cmphi r2, ip
> + bls __memcpy
> + UNWIND( .fnend )
> +
> + UNWIND( .fnstart )
> + UNWIND( .save {r0, r4, fpreg, lr} )
> + stmfd sp!, {r0, r4, UNWIND(fpreg,) lr}
> + UNWIND( .setfp fpreg, sp )
> + UNWIND( mov fpreg, sp )
> + add r1, r1, r2
> + add r0, r0, r2
> + subs r2, r2, #4
> + blt 8f
> + ands ip, r0, #3
> + PLD( pld [r1, #-4] )
> + bne 9f
> + ands ip, r1, #3
> + bne 10f
> +
> +1: subs r2, r2, #(28)
> + stmfd sp!, {r5, r6, r8, r9}
> + blt 5f
> +
> + CALGN( ands ip, r0, #31 )
> + CALGN( sbcsne r4, ip, r2 ) @ C is always set here
> + CALGN( bcs 2f )
> + CALGN( adr r4, 6f )
> + CALGN( subs r2, r2, ip ) @ C is set here
> + CALGN( rsb ip, ip, #32 )
> + CALGN( add pc, r4, ip )
> +
> + PLD( pld [r1, #-4] )
> +2: PLD( subs r2, r2, #96 )
> + PLD( pld [r1, #-32] )
> + PLD( blt 4f )
> + PLD( pld [r1, #-64] )
> + PLD( pld [r1, #-96] )
> +
> +3: PLD( pld [r1, #-128] )
> +4: ldmdb r1!, {r3, r4, r5, r6, r8, r9, ip, lr}
> + subs r2, r2, #32
> + stmdb r0!, {r3, r4, r5, r6, r8, r9, ip, lr}
> + bge 3b
> + PLD( cmn r2, #96 )
> + PLD( bge 4b )
> +
> +5: ands ip, r2, #28
> + rsb ip, ip, #32
> + addne pc, pc, ip @ C is always clear here
> + b 7f
> +6: W(nop)
> + W(ldr) r3, [r1, #-4]!
> + W(ldr) r4, [r1, #-4]!
> + W(ldr) r5, [r1, #-4]!
> + W(ldr) r6, [r1, #-4]!
> + W(ldr) r8, [r1, #-4]!
> + W(ldr) r9, [r1, #-4]!
> + W(ldr) lr, [r1, #-4]!
> +
> + add pc, pc, ip
> + nop
> + W(nop)
> + W(str) r3, [r0, #-4]!
> + W(str) r4, [r0, #-4]!
> + W(str) r5, [r0, #-4]!
> + W(str) r6, [r0, #-4]!
> + W(str) r8, [r0, #-4]!
> + W(str) r9, [r0, #-4]!
> + W(str) lr, [r0, #-4]!
> +
> + CALGN( bcs 2b )
> +
> +7: ldmfd sp!, {r5, r6, r8, r9}
> +
> +8: movs r2, r2, lsl #31
> + ldrbne r3, [r1, #-1]!
> + ldrbcs r4, [r1, #-1]!
> + ldrbcs ip, [r1, #-1]
> + strbne r3, [r0, #-1]!
> + strbcs r4, [r0, #-1]!
> + strbcs ip, [r0, #-1]
> + ldmfd sp!, {r0, r4, UNWIND(fpreg,) pc}
> +
> +9: cmp ip, #2
> + ldrbgt r3, [r1, #-1]!
> + ldrbge r4, [r1, #-1]!
> + ldrb lr, [r1, #-1]!
> + strbgt r3, [r0, #-1]!
> + strbge r4, [r0, #-1]!
> + subs r2, r2, ip
> + strb lr, [r0, #-1]!
> + blt 8b
> + ands ip, r1, #3
> + beq 1b
> +
> +10: bic r1, r1, #3
> + cmp ip, #2
> + ldr r3, [r1, #0]
> + beq 17f
> + blt 18f
> +
> +
> + .macro backward_copy_shift push pull
> +
> + subs r2, r2, #28
> + blt 14f
> +
> + CALGN( ands ip, r0, #31 )
> + CALGN( sbcsne r4, ip, r2 ) @ C is always set here
> + CALGN( subcc r2, r2, ip )
> + CALGN( bcc 15f )
> +
> +11: stmfd sp!, {r5, r6, r8 - r10}
> +
> + PLD( pld [r1, #-4] )
> + PLD( subs r2, r2, #96 )
> + PLD( pld [r1, #-32] )
> + PLD( blt 13f )
> + PLD( pld [r1, #-64] )
> + PLD( pld [r1, #-96] )
> +
> +12: PLD( pld [r1, #-128] )
> +13: ldmdb r1!, {r8, r9, r10, ip}
> + mov lr, r3, lspush #\push
> + subs r2, r2, #32
> + ldmdb r1!, {r3, r4, r5, r6}
> + orr lr, lr, ip, lspull #\pull
> + mov ip, ip, lspush #\push
> + orr ip, ip, r10, lspull #\pull
> + mov r10, r10, lspush #\push
> + orr r10, r10, r9, lspull #\pull
> + mov r9, r9, lspush #\push
> + orr r9, r9, r8, lspull #\pull
> + mov r8, r8, lspush #\push
> + orr r8, r8, r6, lspull #\pull
> + mov r6, r6, lspush #\push
> + orr r6, r6, r5, lspull #\pull
> + mov r5, r5, lspush #\push
> + orr r5, r5, r4, lspull #\pull
> + mov r4, r4, lspush #\push
> + orr r4, r4, r3, lspull #\pull
> + stmdb r0!, {r4 - r6, r8 - r10, ip, lr}
> + bge 12b
> + PLD( cmn r2, #96 )
> + PLD( bge 13b )
> +
> + ldmfd sp!, {r5, r6, r8 - r10}
> +
> +14: ands ip, r2, #28
> + beq 16f
> +
> +15: mov lr, r3, lspush #\push
> + ldr r3, [r1, #-4]!
> + subs ip, ip, #4
> + orr lr, lr, r3, lspull #\pull
> + str lr, [r0, #-4]!
> + bgt 15b
> + CALGN( cmp r2, #0 )
> + CALGN( bge 11b )
> +
> +16: add r1, r1, #(\pull / 8)
> + b 8b
> +
> + .endm
> +
> +
> + backward_copy_shift push=8 pull=24
> +
> +17: backward_copy_shift push=16 pull=16
> +
> +18: backward_copy_shift push=24 pull=8
> +
> + UNWIND( .fnend )
> +ENDPROC(memmove)
> +ENDPROC(__memmove)
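
As a side note for anyone skimming the assembly: the three-instruction entry
check above (subs ip, r0, r1 / cmphi r2, ip / bls __memcpy) corresponds
roughly to the C below. Sketch for illustration only, not code from the
patch; memmove_sketch is a made-up name.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * A forward copy is safe whenever the destination does not start inside
 * the first n bytes of the source, so only genuinely overlapping moves
 * pay for the downward copy path.
 */
static void *memmove_sketch(void *dest, const void *src, size_t n)
{
	unsigned char *d = dest;
	const unsigned char *s = src;

	if ((uintptr_t)d - (uintptr_t)s >= n)	/* the "bls __memcpy" case */
		return memcpy(dest, src, n);

	d += n;					/* otherwise copy downwards, */
	s += n;					/* as the assembly does      */
	while (n--)
		*--d = *--s;

	return dest;
}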
> diff --git a/arch/arm/lib64/copy_template.S b/arch/arm/lib64/copy_template.S
> deleted file mode 100644
> index 8e4ff059d1..0000000000
> --- a/arch/arm/lib64/copy_template.S
> +++ /dev/null
> @@ -1,180 +0,0 @@
> -/* SPDX-License-Identifier: GPL-2.0-only */
> -/* SPDX-FileCopyrightText: 2013 ARM Ltd. */
> -/* SPDX-FileCopyrightText: 2013 Linaro */
> -
> -/*
> - * This code is based on glibc cortex strings work originally authored by Linaro
> - * and re-licensed under GPLv2 for the Linux kernel. The original code can
> - * be found @
> - *
> - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
> - * files/head:/src/aarch64/
> - */
> -
> -/*
> - * Copy a buffer from src to dest (alignment handled by the hardware)
> - *
> - * Parameters:
> - * x0 - dest
> - * x1 - src
> - * x2 - n
> - * Returns:
> - * x0 - dest
> - */
> -dstin .req x0
> -src .req x1
> -count .req x2
> -tmp1 .req x3
> -tmp1w .req w3
> -tmp2 .req x4
> -tmp2w .req w4
> -dst .req x6
> -
> -A_l .req x7
> -A_h .req x8
> -B_l .req x9
> -B_h .req x10
> -C_l .req x11
> -C_h .req x12
> -D_l .req x13
> -D_h .req x14
> -
> - mov dst, dstin
> - cmp count, #16
> - /*When memory length is less than 16, the accessed are not aligned.*/
> - b.lo .Ltiny15
> -
> - neg tmp2, src
> - ands tmp2, tmp2, #15/* Bytes to reach alignment. */
> - b.eq .LSrcAligned
> - sub count, count, tmp2
> - /*
> - * Copy the leading memory data from src to dst in an increasing
> - * address order.By this way,the risk of overwritting the source
> - * memory data is eliminated when the distance between src and
> - * dst is less than 16. The memory accesses here are alignment.
> - */
> - tbz tmp2, #0, 1f
> - ldrb1 tmp1w, src, #1
> - strb1 tmp1w, dst, #1
> -1:
> - tbz tmp2, #1, 2f
> - ldrh1 tmp1w, src, #2
> - strh1 tmp1w, dst, #2
> -2:
> - tbz tmp2, #2, 3f
> - ldr1 tmp1w, src, #4
> - str1 tmp1w, dst, #4
> -3:
> - tbz tmp2, #3, .LSrcAligned
> - ldr1 tmp1, src, #8
> - str1 tmp1, dst, #8
> -
> -.LSrcAligned:
> - cmp count, #64
> - b.ge .Lcpy_over64
> - /*
> - * Deal with small copies quickly by dropping straight into the
> - * exit block.
> - */
> -.Ltail63:
> - /*
> - * Copy up to 48 bytes of data. At this point we only need the
> - * bottom 6 bits of count to be accurate.
> - */
> - ands tmp1, count, #0x30
> - b.eq .Ltiny15
> - cmp tmp1w, #0x20
> - b.eq 1f
> - b.lt 2f
> - ldp1 A_l, A_h, src, #16
> - stp1 A_l, A_h, dst, #16
> -1:
> - ldp1 A_l, A_h, src, #16
> - stp1 A_l, A_h, dst, #16
> -2:
> - ldp1 A_l, A_h, src, #16
> - stp1 A_l, A_h, dst, #16
> -.Ltiny15:
> - /*
> - * Prefer to break one ldp/stp into several load/store to access
> - * memory in an increasing address order,rather than to load/store 16
> - * bytes from (src-16) to (dst-16) and to backward the src to aligned
> - * address,which way is used in original cortex memcpy. If keeping
> - * the original memcpy process here, memmove need to satisfy the
> - * precondition that src address is at least 16 bytes bigger than dst
> - * address,otherwise some source data will be overwritten when memove
> - * call memcpy directly. To make memmove simpler and decouple the
> - * memcpy's dependency on memmove, withdrew the original process.
> - */
> - tbz count, #3, 1f
> - ldr1 tmp1, src, #8
> - str1 tmp1, dst, #8
> -1:
> - tbz count, #2, 2f
> - ldr1 tmp1w, src, #4
> - str1 tmp1w, dst, #4
> -2:
> - tbz count, #1, 3f
> - ldrh1 tmp1w, src, #2
> - strh1 tmp1w, dst, #2
> -3:
> - tbz count, #0, .Lexitfunc
> - ldrb1 tmp1w, src, #1
> - strb1 tmp1w, dst, #1
> -
> - b .Lexitfunc
> -
> -.Lcpy_over64:
> - subs count, count, #128
> - b.ge .Lcpy_body_large
> - /*
> - * Less than 128 bytes to copy, so handle 64 here and then jump
> - * to the tail.
> - */
> - ldp1 A_l, A_h, src, #16
> - stp1 A_l, A_h, dst, #16
> - ldp1 B_l, B_h, src, #16
> - ldp1 C_l, C_h, src, #16
> - stp1 B_l, B_h, dst, #16
> - stp1 C_l, C_h, dst, #16
> - ldp1 D_l, D_h, src, #16
> - stp1 D_l, D_h, dst, #16
> -
> - tst count, #0x3f
> - b.ne .Ltail63
> - b .Lexitfunc
> -
> - /*
> - * Critical loop. Start at a new cache line boundary. Assuming
> - * 64 bytes per line this ensures the entire loop is in one line.
> - */
> -.Lcpy_body_large:
> - /* pre-get 64 bytes data. */
> - ldp1 A_l, A_h, src, #16
> - ldp1 B_l, B_h, src, #16
> - ldp1 C_l, C_h, src, #16
> - ldp1 D_l, D_h, src, #16
> -1:
> - /*
> - * interlace the load of next 64 bytes data block with store of the last
> - * loaded 64 bytes data.
> - */
> - stp1 A_l, A_h, dst, #16
> - ldp1 A_l, A_h, src, #16
> - stp1 B_l, B_h, dst, #16
> - ldp1 B_l, B_h, src, #16
> - stp1 C_l, C_h, dst, #16
> - ldp1 C_l, C_h, src, #16
> - stp1 D_l, D_h, dst, #16
> - ldp1 D_l, D_h, src, #16
> - subs count, count, #64
> - b.ge 1b
> - stp1 A_l, A_h, dst, #16
> - stp1 B_l, B_h, dst, #16
> - stp1 C_l, C_h, dst, #16
> - stp1 D_l, D_h, dst, #16
> -
> - tst count, #0x3f
> - b.ne .Ltail63
> -.Lexitfunc:
> diff --git a/arch/arm/lib64/memcpy.S b/arch/arm/lib64/memcpy.S
> index 92845b25a6..98b453d3fd 100644
> --- a/arch/arm/lib64/memcpy.S
> +++ b/arch/arm/lib64/memcpy.S
> @@ -1,63 +1,249 @@
> /* SPDX-License-Identifier: GPL-2.0-only */
> -/* SPDX-FileCopyrightText: 2013 ARM Ltd. */
> -/* SPDX-FileCopyrightText: 2013 Linaro */
> -
> /*
> - * This code is based on glibc cortex strings work originally authored by Linaro
> - * and re-licensed under GPLv2 for the Linux kernel. The original code can
> - * be found @
> + * Copyright (c) 2012-2021, Arm Limited.
> *
> - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
> - * files/head:/src/aarch64/
> + * Adapted from the original at:
> + * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
> */
>
> #include <linux/linkage.h>
> #include <asm/assembler.h>
>
> -/*
> - * Copy a buffer from src to dest (alignment handled by the hardware)
> +/* Assumptions:
> + *
> + * ARMv8-a, AArch64, unaligned accesses.
> *
> - * Parameters:
> - * x0 - dest
> - * x1 - src
> - * x2 - n
> - * Returns:
> - * x0 - dest
> */
> - .macro ldrb1 ptr, regB, val
> - ldrb \ptr, [\regB], \val
> - .endm
>
> - .macro strb1 ptr, regB, val
> - strb \ptr, [\regB], \val
> - .endm
> +#define L(label) .L ## label
> +
> +#define dstin x0
> +#define src x1
> +#define count x2
> +#define dst x3
> +#define srcend x4
> +#define dstend x5
> +#define A_l x6
> +#define A_lw w6
> +#define A_h x7
> +#define B_l x8
> +#define B_lw w8
> +#define B_h x9
> +#define C_l x10
> +#define C_lw w10
> +#define C_h x11
> +#define D_l x12
> +#define D_h x13
> +#define E_l x14
> +#define E_h x15
> +#define F_l x16
> +#define F_h x17
> +#define G_l count
> +#define G_h dst
> +#define H_l src
> +#define H_h srcend
> +#define tmp1 x14
> +
> +/* This implementation handles overlaps and supports both memcpy and memmove
> + from a single entry point. It uses unaligned accesses and branchless
> + sequences to keep the code small, simple and improve performance.
> +
> + Copies are split into 3 main cases: small copies of up to 32 bytes, medium
> + copies of up to 128 bytes, and large copies. The overhead of the overlap
> + check is negligible since it is only required for large copies.
> +
> + Large copies use a software pipelined loop processing 64 bytes per iteration.
> + The destination pointer is 16-byte aligned to minimize unaligned accesses.
> + The loop tail is handled by always copying 64 bytes from the end.
> +*/
> +
> +SYM_FUNC_START(__pi_memcpy)
> + add srcend, src, count
> + add dstend, dstin, count
> + cmp count, 128
> + b.hi L(copy_long)
> + cmp count, 32
> + b.hi L(copy32_128)
> +
> + /* Small copies: 0..32 bytes. */
> + cmp count, 16
> + b.lo L(copy16)
> + ldp A_l, A_h, [src]
> + ldp D_l, D_h, [srcend, -16]
> + stp A_l, A_h, [dstin]
> + stp D_l, D_h, [dstend, -16]
> + ret
> +
> + /* Copy 8-15 bytes. */
> +L(copy16):
> + tbz count, 3, L(copy8)
> + ldr A_l, [src]
> + ldr A_h, [srcend, -8]
> + str A_l, [dstin]
> + str A_h, [dstend, -8]
> + ret
> +
> + .p2align 3
> + /* Copy 4-7 bytes. */
> +L(copy8):
> + tbz count, 2, L(copy4)
> + ldr A_lw, [src]
> + ldr B_lw, [srcend, -4]
> + str A_lw, [dstin]
> + str B_lw, [dstend, -4]
> + ret
>
> - .macro ldrh1 ptr, regB, val
> - ldrh \ptr, [\regB], \val
> - .endm
> + /* Copy 0..3 bytes using a branchless sequence. */
> +L(copy4):
> + cbz count, L(copy0)
> + lsr tmp1, count, 1
> + ldrb A_lw, [src]
> + ldrb C_lw, [srcend, -1]
> + ldrb B_lw, [src, tmp1]
> + strb A_lw, [dstin]
> + strb B_lw, [dstin, tmp1]
> + strb C_lw, [dstend, -1]
> +L(copy0):
> + ret
>
> - .macro strh1 ptr, regB, val
> - strh \ptr, [\regB], \val
> - .endm
> + .p2align 4
> + /* Medium copies: 33..128 bytes. */
> +L(copy32_128):
> + ldp A_l, A_h, [src]
> + ldp B_l, B_h, [src, 16]
> + ldp C_l, C_h, [srcend, -32]
> + ldp D_l, D_h, [srcend, -16]
> + cmp count, 64
> + b.hi L(copy128)
> + stp A_l, A_h, [dstin]
> + stp B_l, B_h, [dstin, 16]
> + stp C_l, C_h, [dstend, -32]
> + stp D_l, D_h, [dstend, -16]
> + ret
>
> - .macro ldr1 ptr, regB, val
> - ldr \ptr, [\regB], \val
> - .endm
> + .p2align 4
> + /* Copy 65..128 bytes. */
> +L(copy128):
> + ldp E_l, E_h, [src, 32]
> + ldp F_l, F_h, [src, 48]
> + cmp count, 96
> + b.ls L(copy96)
> + ldp G_l, G_h, [srcend, -64]
> + ldp H_l, H_h, [srcend, -48]
> + stp G_l, G_h, [dstend, -64]
> + stp H_l, H_h, [dstend, -48]
> +L(copy96):
> + stp A_l, A_h, [dstin]
> + stp B_l, B_h, [dstin, 16]
> + stp E_l, E_h, [dstin, 32]
> + stp F_l, F_h, [dstin, 48]
> + stp C_l, C_h, [dstend, -32]
> + stp D_l, D_h, [dstend, -16]
> + ret
>
> - .macro str1 ptr, regB, val
> - str \ptr, [\regB], \val
> - .endm
> + .p2align 4
> + /* Copy more than 128 bytes. */
> +L(copy_long):
> + /* Use backwards copy if there is an overlap. */
> + sub tmp1, dstin, src
> + cbz tmp1, L(copy0)
> + cmp tmp1, count
> + b.lo L(copy_long_backwards)
>
> - .macro ldp1 ptr, regB, regC, val
> - ldp \ptr, \regB, [\regC], \val
> - .endm
> + /* Copy 16 bytes and then align dst to 16-byte alignment. */
>
> - .macro stp1 ptr, regB, regC, val
> - stp \ptr, \regB, [\regC], \val
> - .endm
> + ldp D_l, D_h, [src]
> + and tmp1, dstin, 15
> + bic dst, dstin, 15
> + sub src, src, tmp1
> + add count, count, tmp1 /* Count is now 16 too large. */
> + ldp A_l, A_h, [src, 16]
> + stp D_l, D_h, [dstin]
> + ldp B_l, B_h, [src, 32]
> + ldp C_l, C_h, [src, 48]
> + ldp D_l, D_h, [src, 64]!
> + subs count, count, 128 + 16 /* Test and readjust count. */
> + b.ls L(copy64_from_end)
>
> - .weak __arch_memcpy
> -ENTRY(__arch_memcpy)
> -#include "copy_template.S"
> +L(loop64):
> + stp A_l, A_h, [dst, 16]
> + ldp A_l, A_h, [src, 16]
> + stp B_l, B_h, [dst, 32]
> + ldp B_l, B_h, [src, 32]
> + stp C_l, C_h, [dst, 48]
> + ldp C_l, C_h, [src, 48]
> + stp D_l, D_h, [dst, 64]!
> + ldp D_l, D_h, [src, 64]!
> + subs count, count, 64
> + b.hi L(loop64)
> +
> + /* Write the last iteration and copy 64 bytes from the end. */
> +L(copy64_from_end):
> + ldp E_l, E_h, [srcend, -64]
> + stp A_l, A_h, [dst, 16]
> + ldp A_l, A_h, [srcend, -48]
> + stp B_l, B_h, [dst, 32]
> + ldp B_l, B_h, [srcend, -32]
> + stp C_l, C_h, [dst, 48]
> + ldp C_l, C_h, [srcend, -16]
> + stp D_l, D_h, [dst, 64]
> + stp E_l, E_h, [dstend, -64]
> + stp A_l, A_h, [dstend, -48]
> + stp B_l, B_h, [dstend, -32]
> + stp C_l, C_h, [dstend, -16]
> ret
> -ENDPROC(__arch_memcpy)
> +
> + .p2align 4
> +
> + /* Large backwards copy for overlapping copies.
> + Copy 16 bytes and then align dst to 16-byte alignment. */
> +L(copy_long_backwards):
> + ldp D_l, D_h, [srcend, -16]
> + and tmp1, dstend, 15
> + sub srcend, srcend, tmp1
> + sub count, count, tmp1
> + ldp A_l, A_h, [srcend, -16]
> + stp D_l, D_h, [dstend, -16]
> + ldp B_l, B_h, [srcend, -32]
> + ldp C_l, C_h, [srcend, -48]
> + ldp D_l, D_h, [srcend, -64]!
> + sub dstend, dstend, tmp1
> + subs count, count, 128
> + b.ls L(copy64_from_start)
> +
> +L(loop64_backwards):
> + stp A_l, A_h, [dstend, -16]
> + ldp A_l, A_h, [srcend, -16]
> + stp B_l, B_h, [dstend, -32]
> + ldp B_l, B_h, [srcend, -32]
> + stp C_l, C_h, [dstend, -48]
> + ldp C_l, C_h, [srcend, -48]
> + stp D_l, D_h, [dstend, -64]!
> + ldp D_l, D_h, [srcend, -64]!
> + subs count, count, 64
> + b.hi L(loop64_backwards)
> +
> + /* Write the last iteration and copy 64 bytes from the start. */
> +L(copy64_from_start):
> + ldp G_l, G_h, [src, 48]
> + stp A_l, A_h, [dstend, -16]
> + ldp A_l, A_h, [src, 32]
> + stp B_l, B_h, [dstend, -32]
> + ldp B_l, B_h, [src, 16]
> + stp C_l, C_h, [dstend, -48]
> + ldp C_l, C_h, [src]
> + stp D_l, D_h, [dstend, -64]
> + stp G_l, G_h, [dstin, 48]
> + stp A_l, A_h, [dstin, 32]
> + stp B_l, B_h, [dstin, 16]
> + stp C_l, C_h, [dstin]
> + ret
> +SYM_FUNC_END(__pi_memcpy)
> +
> +SYM_FUNC_ALIAS(__arch_memcpy, __pi_memcpy)
> +SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
> +
> +SYM_FUNC_ALIAS(__pi_memmove, __pi_memcpy)
> +
> +SYM_FUNC_ALIAS(__arch_memmove, __pi_memmove)
> +SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
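
One detail of the imported memcpy that is easy to miss: each size class loads
the leading and the trailing chunk relative to both buffer ends before storing
anything, so there is never a per-byte tail loop. For the 16..32 byte case the
idea in C is roughly the following; sketch only, copy16_32 is a made-up name
and not part of the patch.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/*
 * Valid for 16 <= n <= 32: the first and the last 16-byte block may
 * overlap in the middle, but together they always cover the whole
 * range, and loading both before storing anything also keeps the copy
 * correct for overlapping src/dst.
 */
static void copy16_32(void *dst, const void *src, size_t n)
{
	uint8_t head[16], tail[16];

	/* ldp A_l, A_h, [src]      /  ldp D_l, D_h, [srcend, -16] */
	memcpy(head, src, 16);
	memcpy(tail, (const uint8_t *)src + n - 16, 16);

	/* stp A_l, A_h, [dstin]    /  stp D_l, D_h, [dstend, -16] */
	memcpy(dst, head, 16);
	memcpy((uint8_t *)dst + n - 16, tail, 16);
}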
> diff --git a/arch/arm/lib64/memset.S b/arch/arm/lib64/memset.S
> index ff201750f1..f059203983 100644
> --- a/arch/arm/lib64/memset.S
> +++ b/arch/arm/lib64/memset.S
> @@ -1,10 +1,9 @@
> /* SPDX-License-Identifier: GPL-2.0-only */
> -/* SPDX-FileCopyrightText: 2013 ARM Ltd. */
> -/* SPDX-FileCopyrightText: 2013 Linaro */
> -
> /*
> + * Copyright (C) 2013 ARM Ltd.
> + * Copyright (C) 2013 Linaro.
> + *
> * This code is based on glibc cortex strings work originally authored by Linaro
> - * and re-licensed under GPLv2 for the Linux kernel. The original code can
> * be found @
> *
> * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
> @@ -13,6 +12,7 @@
>
> #include <linux/linkage.h>
> #include <asm/assembler.h>
> +#include <asm/cache.h>
>
> /*
> * Fill in the buffer with character c (alignment handled by the hardware)
> @@ -42,8 +42,7 @@ dst .req x8
> tmp3w .req w9
> tmp3 .req x9
>
> - .weak memset
> -ENTRY(__arch_memset)
> +SYM_FUNC_START(__pi_memset)
> mov dst, dstin /* Preserve return value. */
> and A_lw, val, #255
> orr A_lw, A_lw, A_lw, lsl #8
> @@ -115,6 +114,7 @@ ENTRY(__arch_memset)
> * Critical loop. Start at a new cache line boundary. Assuming
> * 64 bytes per line, this ensures the entire loop is in one line.
> */
> + .p2align L1_CACHE_SHIFT
> .Lnot_short:
> sub dst, dst, #16/* Pre-bias. */
> sub count, count, #64
> @@ -201,4 +201,8 @@ ENTRY(__arch_memset)
> ands count, count, zva_bits_x
> b.ne .Ltail_maybe_long
> ret
> -ENDPROC(__arch_memset)
> +SYM_FUNC_END(__pi_memset)
> +
> +SYM_FUNC_ALIAS(__arch_memset, __pi_memset)
> +
> +SYM_FUNC_ALIAS_WEAK(memset, __pi_memset)
> diff --git a/arch/arm/lib64/string.c b/arch/arm/lib64/string.c
> index 938790e1a9..c7954d6efe 100644
> --- a/arch/arm/lib64/string.c
> +++ b/arch/arm/lib64/string.c
> @@ -6,6 +6,7 @@
>
> void *__arch_memset(void *dst, int c, __kernel_size_t size);
> void *__arch_memcpy(void * dest, const void *src, size_t count);
> +void *__arch_memmove(void * dest, const void *src, size_t count);
>
> static __prereloc void *_memset(void *dst, int c, __kernel_size_t size)
> {
> @@ -38,3 +39,19 @@ void __weak *memcpy(void * dest, const void *src, size_t count)
>
> void *__memcpy(void * dest, const void *src, size_t count)
> __alias(_memcpy);
> +
> +static void *_memmove(void * dest, const void *src, size_t count)
> +{
> + if (likely(get_cr() & CR_M))
> + return __arch_memmove(dest, src, count);
> +
> + return __default_memmove(dest, src, count);
> +}
> +
> +void __weak *memmove(void * dest, const void *src, size_t count)
> +{
> + return _memmove(dest, src, count);
> +}
> +
> +void *__memmove(void * dest, const void *src, size_t count)
> + __alias(_memmove);
> diff --git a/include/string.h b/include/string.h
> index cbe6eddf7f..986ccd83dd 100644
> --- a/include/string.h
> +++ b/include/string.h
> @@ -17,6 +17,8 @@ void *__nokasan_default_memset(void *, int, __kernel_size_t);
> void *__default_memcpy(void * dest,const void *src,size_t count);
> void *__nokasan_default_memcpy(void * dest,const void *src,size_t count);
>
> +void *__default_memmove(void * dest,const void *src,size_t count);
> +
> char *parse_assignment(char *str);
>
> int strverscmp(const char *a, const char *b);
> diff --git a/lib/string.c b/lib/string.c
> index 98dd3cffdd..50c2016c2b 100644
> --- a/lib/string.c
> +++ b/lib/string.c
> @@ -701,7 +701,6 @@ void *memmove(void * dest, const void *src, size_t count)
> void *__memmove(void * dest, const void *src, size_t count)
> __alias(__default_memmove);
> #endif
> -EXPORT_SYMBOL(memmove);
>
> #ifndef __HAVE_ARCH_MEMCMP
> /**
>
> --
> 2.39.5
>
>
>