[PATCH v2 10/10] ARM: add optimized memmove

Marco Felsch m.felsch at pengutronix.de
Thu Sep 26 22:12:15 PDT 2024


Hi Sascha,

On 24-09-26, Sascha Hauer wrote:
> Until now there has been no assembler optimized version of memmove() for
> ARM. Add this from Linux-6.10 for both ARM32 and ARM64. This also updates
> memcpy() for ARM64 from Linux-6.10.

out of curiosity, did you make any performance measurements?

Regards,
  Marco

> Reviewed-by: Ahmad Fatoum <a.fatoum at pengutronix.de>
> Signed-off-by: Sascha Hauer <s.hauer at pengutronix.de>
> ---
>  arch/arm/include/asm/cache.h   |   8 ++
>  arch/arm/include/asm/string.h  |   4 +-
>  arch/arm/lib32/Makefile        |   1 +
>  arch/arm/lib32/memmove.S       | 206 +++++++++++++++++++++++++++++++
>  arch/arm/lib64/copy_template.S | 180 ---------------------------
>  arch/arm/lib64/memcpy.S        | 274 ++++++++++++++++++++++++++++++++++-------
>  arch/arm/lib64/memset.S        |  18 +--
>  arch/arm/lib64/string.c        |  17 +++
>  include/string.h               |   2 +
>  lib/string.c                   |   1 -
>  10 files changed, 478 insertions(+), 233 deletions(-)
> 
> diff --git a/arch/arm/include/asm/cache.h b/arch/arm/include/asm/cache.h
> index 261c30129a..dd022c1f23 100644
> --- a/arch/arm/include/asm/cache.h
> +++ b/arch/arm/include/asm/cache.h
> @@ -3,6 +3,13 @@
>  #ifndef __ASM_CACHE_H
>  #define __ASM_CACHE_H
>  
> +#ifdef CONFIG_CPU_64
> +#define L1_CACHE_SHIFT		(6)
> +#define L1_CACHE_BYTES		(1 << L1_CACHE_SHIFT)
> +#endif
> +
> +#ifndef __ASSEMBLY__
> +
>  void v8_invalidate_icache_all(void);
>  void v8_flush_dcache_all(void);
>  void v8_invalidate_dcache_all(void);
> @@ -25,5 +32,6 @@ void arm_early_mmu_cache_invalidate(void);
>  void sync_caches_for_execution(void);
>  
>  #include <asm-generic/cache.h>
> +#endif
>  
>  #endif
> diff --git a/arch/arm/include/asm/string.h b/arch/arm/include/asm/string.h
> index 2322b846b2..f79392e53d 100644
> --- a/arch/arm/include/asm/string.h
> +++ b/arch/arm/include/asm/string.h
> @@ -9,10 +9,12 @@
>  extern void *memcpy(void *, const void *, __kernel_size_t);
>  #define __HAVE_ARCH_MEMSET
>  extern void *memset(void *, int, __kernel_size_t);
> -
> +#define __HAVE_ARCH_MEMMOVE
> +extern void *memmove(void *, const void *, __kernel_size_t);
>  #endif
>  
>  extern void *__memcpy(void *, const void *, __kernel_size_t);
>  extern void *__memset(void *, int, __kernel_size_t);
> +extern void *__memmove(void *, const void *, __kernel_size_t);
>  
>  #endif
> diff --git a/arch/arm/lib32/Makefile b/arch/arm/lib32/Makefile
> index 511a029062..a139a80fb8 100644
> --- a/arch/arm/lib32/Makefile
> +++ b/arch/arm/lib32/Makefile
> @@ -21,6 +21,7 @@ obj-y	+= lshrdi3.o
>  obj-y	+= runtime-offset.o
>  pbl-y	+= runtime-offset.o
>  obj-$(CONFIG_ARM_OPTIMZED_STRING_FUNCTIONS)	+= memcpy.o
> +obj-$(CONFIG_ARM_OPTIMZED_STRING_FUNCTIONS)	+= memmove.o
>  obj-$(CONFIG_ARM_OPTIMZED_STRING_FUNCTIONS)	+= memset.o
>  obj-$(CONFIG_ARM_UNWIND) += unwind.o
>  obj-$(CONFIG_MODULES) += module.o
> diff --git a/arch/arm/lib32/memmove.S b/arch/arm/lib32/memmove.S
> new file mode 100644
> index 0000000000..6410554039
> --- /dev/null
> +++ b/arch/arm/lib32/memmove.S
> @@ -0,0 +1,206 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + *  linux/arch/arm/lib/memmove.S
> + *
> + *  Author:	Nicolas Pitre
> + *  Created:	Sep 28, 2005
> + *  Copyright:	(C) MontaVista Software Inc.
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/assembler.h>
> +#include <asm/unwind.h>
> +
> +		.text
> +
> +/*
> + * Prototype: void *memmove(void *dest, const void *src, size_t n);
> + *
> + * Note:
> + *
> + * If the memory regions don't overlap, we simply branch to memcpy which is
> + * normally a bit faster. Otherwise the copy is done going downwards.  This
> + * is a transposition of the code from copy_template.S but with the copy
> + * occurring in the opposite direction.
> + */
> +
> +ENTRY(__memmove)
> +WEAK(memmove)
> +	UNWIND(	.fnstart			)
> +
> +		subs	ip, r0, r1
> +		cmphi	r2, ip
> +		bls	__memcpy
> +	UNWIND(	.fnend				)
> +
> +	UNWIND(	.fnstart			)
> +	UNWIND(	.save	{r0, r4, fpreg, lr}	)
> +		stmfd	sp!, {r0, r4, UNWIND(fpreg,) lr}
> +	UNWIND(	.setfp	fpreg, sp		)
> +	UNWIND(	mov	fpreg, sp		)
> +		add	r1, r1, r2
> +		add	r0, r0, r2
> +		subs	r2, r2, #4
> +		blt	8f
> +		ands	ip, r0, #3
> +	PLD(	pld	[r1, #-4]		)
> +		bne	9f
> +		ands	ip, r1, #3
> +		bne	10f
> +
> +1:		subs	r2, r2, #(28)
> +		stmfd	sp!, {r5, r6, r8, r9}
> +		blt	5f
> +
> +	CALGN(	ands	ip, r0, #31		)
> +	CALGN(	sbcsne	r4, ip, r2		)  @ C is always set here
> +	CALGN(	bcs	2f			)
> +	CALGN(	adr	r4, 6f			)
> +	CALGN(	subs	r2, r2, ip		)  @ C is set here
> +	CALGN(	rsb	ip, ip, #32		)
> +	CALGN(	add	pc, r4, ip		)
> +
> +	PLD(	pld	[r1, #-4]		)
> +2:	PLD(	subs	r2, r2, #96		)
> +	PLD(	pld	[r1, #-32]		)
> +	PLD(	blt	4f			)
> +	PLD(	pld	[r1, #-64]		)
> +	PLD(	pld	[r1, #-96]		)
> +
> +3:	PLD(	pld	[r1, #-128]		)
> +4:		ldmdb	r1!, {r3, r4, r5, r6, r8, r9, ip, lr}
> +		subs	r2, r2, #32
> +		stmdb	r0!, {r3, r4, r5, r6, r8, r9, ip, lr}
> +		bge	3b
> +	PLD(	cmn	r2, #96			)
> +	PLD(	bge	4b			)
> +
> +5:		ands	ip, r2, #28
> +		rsb	ip, ip, #32
> +		addne	pc, pc, ip		@ C is always clear here
> +		b	7f
> +6:		W(nop)
> +		W(ldr)	r3, [r1, #-4]!
> +		W(ldr)	r4, [r1, #-4]!
> +		W(ldr)	r5, [r1, #-4]!
> +		W(ldr)	r6, [r1, #-4]!
> +		W(ldr)	r8, [r1, #-4]!
> +		W(ldr)	r9, [r1, #-4]!
> +		W(ldr)	lr, [r1, #-4]!
> +
> +		add	pc, pc, ip
> +		nop
> +		W(nop)
> +		W(str)	r3, [r0, #-4]!
> +		W(str)	r4, [r0, #-4]!
> +		W(str)	r5, [r0, #-4]!
> +		W(str)	r6, [r0, #-4]!
> +		W(str)	r8, [r0, #-4]!
> +		W(str)	r9, [r0, #-4]!
> +		W(str)	lr, [r0, #-4]!
> +
> +	CALGN(	bcs	2b			)
> +
> +7:		ldmfd	sp!, {r5, r6, r8, r9}
> +
> +8:		movs	r2, r2, lsl #31
> +		ldrbne	r3, [r1, #-1]!
> +		ldrbcs	r4, [r1, #-1]!
> +		ldrbcs	ip, [r1, #-1]
> +		strbne	r3, [r0, #-1]!
> +		strbcs	r4, [r0, #-1]!
> +		strbcs	ip, [r0, #-1]
> +		ldmfd	sp!, {r0, r4, UNWIND(fpreg,) pc}
> +
> +9:		cmp	ip, #2
> +		ldrbgt	r3, [r1, #-1]!
> +		ldrbge	r4, [r1, #-1]!
> +		ldrb	lr, [r1, #-1]!
> +		strbgt	r3, [r0, #-1]!
> +		strbge	r4, [r0, #-1]!
> +		subs	r2, r2, ip
> +		strb	lr, [r0, #-1]!
> +		blt	8b
> +		ands	ip, r1, #3
> +		beq	1b
> +
> +10:		bic	r1, r1, #3
> +		cmp	ip, #2
> +		ldr	r3, [r1, #0]
> +		beq	17f
> +		blt	18f
> +
> +
> +		.macro	backward_copy_shift push pull
> +
> +		subs	r2, r2, #28
> +		blt	14f
> +
> +	CALGN(	ands	ip, r0, #31		)
> +	CALGN(	sbcsne	r4, ip, r2		)  @ C is always set here
> +	CALGN(	subcc	r2, r2, ip		)
> +	CALGN(	bcc	15f			)
> +
> +11:		stmfd	sp!, {r5, r6, r8 - r10}
> +
> +	PLD(	pld	[r1, #-4]		)
> +	PLD(	subs	r2, r2, #96		)
> +	PLD(	pld	[r1, #-32]		)
> +	PLD(	blt	13f			)
> +	PLD(	pld	[r1, #-64]		)
> +	PLD(	pld	[r1, #-96]		)
> +
> +12:	PLD(	pld	[r1, #-128]		)
> +13:		ldmdb   r1!, {r8, r9, r10, ip}
> +		mov     lr, r3, lspush #\push
> +		subs    r2, r2, #32
> +		ldmdb   r1!, {r3, r4, r5, r6}
> +		orr     lr, lr, ip, lspull #\pull
> +		mov     ip, ip, lspush #\push
> +		orr     ip, ip, r10, lspull #\pull
> +		mov     r10, r10, lspush #\push
> +		orr     r10, r10, r9, lspull #\pull
> +		mov     r9, r9, lspush #\push
> +		orr     r9, r9, r8, lspull #\pull
> +		mov     r8, r8, lspush #\push
> +		orr     r8, r8, r6, lspull #\pull
> +		mov     r6, r6, lspush #\push
> +		orr     r6, r6, r5, lspull #\pull
> +		mov     r5, r5, lspush #\push
> +		orr     r5, r5, r4, lspull #\pull
> +		mov     r4, r4, lspush #\push
> +		orr     r4, r4, r3, lspull #\pull
> +		stmdb   r0!, {r4 - r6, r8 - r10, ip, lr}
> +		bge	12b
> +	PLD(	cmn	r2, #96			)
> +	PLD(	bge	13b			)
> +
> +		ldmfd	sp!, {r5, r6, r8 - r10}
> +
> +14:		ands	ip, r2, #28
> +		beq	16f
> +
> +15:		mov     lr, r3, lspush #\push
> +		ldr	r3, [r1, #-4]!
> +		subs	ip, ip, #4
> +		orr	lr, lr, r3, lspull #\pull
> +		str	lr, [r0, #-4]!
> +		bgt	15b
> +	CALGN(	cmp	r2, #0			)
> +	CALGN(	bge	11b			)
> +
> +16:		add	r1, r1, #(\pull / 8)
> +		b	8b
> +
> +		.endm
> +
> +
> +		backward_copy_shift	push=8	pull=24
> +
> +17:		backward_copy_shift	push=16	pull=16
> +
> +18:		backward_copy_shift	push=24	pull=8
> +
> +	UNWIND(	.fnend				)
> +ENDPROC(memmove)
> +ENDPROC(__memmove)
> diff --git a/arch/arm/lib64/copy_template.S b/arch/arm/lib64/copy_template.S
> deleted file mode 100644
> index 8e4ff059d1..0000000000
> --- a/arch/arm/lib64/copy_template.S
> +++ /dev/null
> @@ -1,180 +0,0 @@
> -/* SPDX-License-Identifier: GPL-2.0-only */
> -/* SPDX-FileCopyrightText: 2013 ARM Ltd. */
> -/* SPDX-FileCopyrightText: 2013 Linaro */
> -
> -/*
> - * This code is based on glibc cortex strings work originally authored by Linaro
> - * and re-licensed under GPLv2 for the Linux kernel. The original code can
> - * be found @
> - *
> - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
> - * files/head:/src/aarch64/
> - */
> -
> -/*
> - * Copy a buffer from src to dest (alignment handled by the hardware)
> - *
> - * Parameters:
> - *	x0 - dest
> - *	x1 - src
> - *	x2 - n
> - * Returns:
> - *	x0 - dest
> - */
> -dstin	.req	x0
> -src	.req	x1
> -count	.req	x2
> -tmp1	.req	x3
> -tmp1w	.req	w3
> -tmp2	.req	x4
> -tmp2w	.req	w4
> -dst	.req	x6
> -
> -A_l	.req	x7
> -A_h	.req	x8
> -B_l	.req	x9
> -B_h	.req	x10
> -C_l	.req	x11
> -C_h	.req	x12
> -D_l	.req	x13
> -D_h	.req	x14
> -
> -	mov	dst, dstin
> -	cmp	count, #16
> -	/*When memory length is less than 16, the accessed are not aligned.*/
> -	b.lo	.Ltiny15
> -
> -	neg	tmp2, src
> -	ands	tmp2, tmp2, #15/* Bytes to reach alignment. */
> -	b.eq	.LSrcAligned
> -	sub	count, count, tmp2
> -	/*
> -	* Copy the leading memory data from src to dst in an increasing
> -	* address order.By this way,the risk of overwritting the source
> -	* memory data is eliminated when the distance between src and
> -	* dst is less than 16. The memory accesses here are alignment.
> -	*/
> -	tbz	tmp2, #0, 1f
> -	ldrb1	tmp1w, src, #1
> -	strb1	tmp1w, dst, #1
> -1:
> -	tbz	tmp2, #1, 2f
> -	ldrh1	tmp1w, src, #2
> -	strh1	tmp1w, dst, #2
> -2:
> -	tbz	tmp2, #2, 3f
> -	ldr1	tmp1w, src, #4
> -	str1	tmp1w, dst, #4
> -3:
> -	tbz	tmp2, #3, .LSrcAligned
> -	ldr1	tmp1, src, #8
> -	str1	tmp1, dst, #8
> -
> -.LSrcAligned:
> -	cmp	count, #64
> -	b.ge	.Lcpy_over64
> -	/*
> -	* Deal with small copies quickly by dropping straight into the
> -	* exit block.
> -	*/
> -.Ltail63:
> -	/*
> -	* Copy up to 48 bytes of data. At this point we only need the
> -	* bottom 6 bits of count to be accurate.
> -	*/
> -	ands	tmp1, count, #0x30
> -	b.eq	.Ltiny15
> -	cmp	tmp1w, #0x20
> -	b.eq	1f
> -	b.lt	2f
> -	ldp1	A_l, A_h, src, #16
> -	stp1	A_l, A_h, dst, #16
> -1:
> -	ldp1	A_l, A_h, src, #16
> -	stp1	A_l, A_h, dst, #16
> -2:
> -	ldp1	A_l, A_h, src, #16
> -	stp1	A_l, A_h, dst, #16
> -.Ltiny15:
> -	/*
> -	* Prefer to break one ldp/stp into several load/store to access
> -	* memory in an increasing address order,rather than to load/store 16
> -	* bytes from (src-16) to (dst-16) and to backward the src to aligned
> -	* address,which way is used in original cortex memcpy. If keeping
> -	* the original memcpy process here, memmove need to satisfy the
> -	* precondition that src address is at least 16 bytes bigger than dst
> -	* address,otherwise some source data will be overwritten when memove
> -	* call memcpy directly. To make memmove simpler and decouple the
> -	* memcpy's dependency on memmove, withdrew the original process.
> -	*/
> -	tbz	count, #3, 1f
> -	ldr1	tmp1, src, #8
> -	str1	tmp1, dst, #8
> -1:
> -	tbz	count, #2, 2f
> -	ldr1	tmp1w, src, #4
> -	str1	tmp1w, dst, #4
> -2:
> -	tbz	count, #1, 3f
> -	ldrh1	tmp1w, src, #2
> -	strh1	tmp1w, dst, #2
> -3:
> -	tbz	count, #0, .Lexitfunc
> -	ldrb1	tmp1w, src, #1
> -	strb1	tmp1w, dst, #1
> -
> -	b	.Lexitfunc
> -
> -.Lcpy_over64:
> -	subs	count, count, #128
> -	b.ge	.Lcpy_body_large
> -	/*
> -	* Less than 128 bytes to copy, so handle 64 here and then jump
> -	* to the tail.
> -	*/
> -	ldp1	A_l, A_h, src, #16
> -	stp1	A_l, A_h, dst, #16
> -	ldp1	B_l, B_h, src, #16
> -	ldp1	C_l, C_h, src, #16
> -	stp1	B_l, B_h, dst, #16
> -	stp1	C_l, C_h, dst, #16
> -	ldp1	D_l, D_h, src, #16
> -	stp1	D_l, D_h, dst, #16
> -
> -	tst	count, #0x3f
> -	b.ne	.Ltail63
> -	b	.Lexitfunc
> -
> -	/*
> -	* Critical loop.  Start at a new cache line boundary.  Assuming
> -	* 64 bytes per line this ensures the entire loop is in one line.
> -	*/
> -.Lcpy_body_large:
> -	/* pre-get 64 bytes data. */
> -	ldp1	A_l, A_h, src, #16
> -	ldp1	B_l, B_h, src, #16
> -	ldp1	C_l, C_h, src, #16
> -	ldp1	D_l, D_h, src, #16
> -1:
> -	/*
> -	* interlace the load of next 64 bytes data block with store of the last
> -	* loaded 64 bytes data.
> -	*/
> -	stp1	A_l, A_h, dst, #16
> -	ldp1	A_l, A_h, src, #16
> -	stp1	B_l, B_h, dst, #16
> -	ldp1	B_l, B_h, src, #16
> -	stp1	C_l, C_h, dst, #16
> -	ldp1	C_l, C_h, src, #16
> -	stp1	D_l, D_h, dst, #16
> -	ldp1	D_l, D_h, src, #16
> -	subs	count, count, #64
> -	b.ge	1b
> -	stp1	A_l, A_h, dst, #16
> -	stp1	B_l, B_h, dst, #16
> -	stp1	C_l, C_h, dst, #16
> -	stp1	D_l, D_h, dst, #16
> -
> -	tst	count, #0x3f
> -	b.ne	.Ltail63
> -.Lexitfunc:
> diff --git a/arch/arm/lib64/memcpy.S b/arch/arm/lib64/memcpy.S
> index 92845b25a6..98b453d3fd 100644
> --- a/arch/arm/lib64/memcpy.S
> +++ b/arch/arm/lib64/memcpy.S
> @@ -1,63 +1,249 @@
>  /* SPDX-License-Identifier: GPL-2.0-only */
> -/* SPDX-FileCopyrightText: 2013 ARM Ltd. */
> -/* SPDX-FileCopyrightText: 2013 Linaro */
> -
>  /*
> - * This code is based on glibc cortex strings work originally authored by Linaro
> - * and re-licensed under GPLv2 for the Linux kernel. The original code can
> - * be found @
> + * Copyright (c) 2012-2021, Arm Limited.
>   *
> - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
> - * files/head:/src/aarch64/
> + * Adapted from the original at:
> + * https://github.com/ARM-software/optimized-routines/blob/afd6244a1f8d9229/string/aarch64/memcpy.S
>   */
>  
>  #include <linux/linkage.h>
>  #include <asm/assembler.h>
>  
> -/*
> - * Copy a buffer from src to dest (alignment handled by the hardware)
> +/* Assumptions:
> + *
> + * ARMv8-a, AArch64, unaligned accesses.
>   *
> - * Parameters:
> - *	x0 - dest
> - *	x1 - src
> - *	x2 - n
> - * Returns:
> - *	x0 - dest
>   */
> -	.macro ldrb1 ptr, regB, val
> -	ldrb  \ptr, [\regB], \val
> -	.endm
>  
> -	.macro strb1 ptr, regB, val
> -	strb \ptr, [\regB], \val
> -	.endm
> +#define L(label) .L ## label
> +
> +#define dstin	x0
> +#define src	x1
> +#define count	x2
> +#define dst	x3
> +#define srcend	x4
> +#define dstend	x5
> +#define A_l	x6
> +#define A_lw	w6
> +#define A_h	x7
> +#define B_l	x8
> +#define B_lw	w8
> +#define B_h	x9
> +#define C_l	x10
> +#define C_lw	w10
> +#define C_h	x11
> +#define D_l	x12
> +#define D_h	x13
> +#define E_l	x14
> +#define E_h	x15
> +#define F_l	x16
> +#define F_h	x17
> +#define G_l	count
> +#define G_h	dst
> +#define H_l	src
> +#define H_h	srcend
> +#define tmp1	x14
> +
> +/* This implementation handles overlaps and supports both memcpy and memmove
> +   from a single entry point.  It uses unaligned accesses and branchless
> +   sequences to keep the code small, simple and improve performance.
> +
> +   Copies are split into 3 main cases: small copies of up to 32 bytes, medium
> +   copies of up to 128 bytes, and large copies.  The overhead of the overlap
> +   check is negligible since it is only required for large copies.
> +
> +   Large copies use a software pipelined loop processing 64 bytes per iteration.
> +   The destination pointer is 16-byte aligned to minimize unaligned accesses.
> +   The loop tail is handled by always copying 64 bytes from the end.
> +*/
> +
> +SYM_FUNC_START(__pi_memcpy)
> +	add	srcend, src, count
> +	add	dstend, dstin, count
> +	cmp	count, 128
> +	b.hi	L(copy_long)
> +	cmp	count, 32
> +	b.hi	L(copy32_128)
> +
> +	/* Small copies: 0..32 bytes.  */
> +	cmp	count, 16
> +	b.lo	L(copy16)
> +	ldp	A_l, A_h, [src]
> +	ldp	D_l, D_h, [srcend, -16]
> +	stp	A_l, A_h, [dstin]
> +	stp	D_l, D_h, [dstend, -16]
> +	ret
> +
> +	/* Copy 8-15 bytes.  */
> +L(copy16):
> +	tbz	count, 3, L(copy8)
> +	ldr	A_l, [src]
> +	ldr	A_h, [srcend, -8]
> +	str	A_l, [dstin]
> +	str	A_h, [dstend, -8]
> +	ret
> +
> +	.p2align 3
> +	/* Copy 4-7 bytes.  */
> +L(copy8):
> +	tbz	count, 2, L(copy4)
> +	ldr	A_lw, [src]
> +	ldr	B_lw, [srcend, -4]
> +	str	A_lw, [dstin]
> +	str	B_lw, [dstend, -4]
> +	ret
>  
> -	.macro ldrh1 ptr, regB, val
> -	ldrh  \ptr, [\regB], \val
> -	.endm
> +	/* Copy 0..3 bytes using a branchless sequence.  */
> +L(copy4):
> +	cbz	count, L(copy0)
> +	lsr	tmp1, count, 1
> +	ldrb	A_lw, [src]
> +	ldrb	C_lw, [srcend, -1]
> +	ldrb	B_lw, [src, tmp1]
> +	strb	A_lw, [dstin]
> +	strb	B_lw, [dstin, tmp1]
> +	strb	C_lw, [dstend, -1]
> +L(copy0):
> +	ret
>  
> -	.macro strh1 ptr, regB, val
> -	strh \ptr, [\regB], \val
> -	.endm
> +	.p2align 4
> +	/* Medium copies: 33..128 bytes.  */
> +L(copy32_128):
> +	ldp	A_l, A_h, [src]
> +	ldp	B_l, B_h, [src, 16]
> +	ldp	C_l, C_h, [srcend, -32]
> +	ldp	D_l, D_h, [srcend, -16]
> +	cmp	count, 64
> +	b.hi	L(copy128)
> +	stp	A_l, A_h, [dstin]
> +	stp	B_l, B_h, [dstin, 16]
> +	stp	C_l, C_h, [dstend, -32]
> +	stp	D_l, D_h, [dstend, -16]
> +	ret
>  
> -	.macro ldr1 ptr, regB, val
> -	ldr \ptr, [\regB], \val
> -	.endm
> +	.p2align 4
> +	/* Copy 65..128 bytes.  */
> +L(copy128):
> +	ldp	E_l, E_h, [src, 32]
> +	ldp	F_l, F_h, [src, 48]
> +	cmp	count, 96
> +	b.ls	L(copy96)
> +	ldp	G_l, G_h, [srcend, -64]
> +	ldp	H_l, H_h, [srcend, -48]
> +	stp	G_l, G_h, [dstend, -64]
> +	stp	H_l, H_h, [dstend, -48]
> +L(copy96):
> +	stp	A_l, A_h, [dstin]
> +	stp	B_l, B_h, [dstin, 16]
> +	stp	E_l, E_h, [dstin, 32]
> +	stp	F_l, F_h, [dstin, 48]
> +	stp	C_l, C_h, [dstend, -32]
> +	stp	D_l, D_h, [dstend, -16]
> +	ret
>  
> -	.macro str1 ptr, regB, val
> -	str \ptr, [\regB], \val
> -	.endm
> +	.p2align 4
> +	/* Copy more than 128 bytes.  */
> +L(copy_long):
> +	/* Use backwards copy if there is an overlap.  */
> +	sub	tmp1, dstin, src
> +	cbz	tmp1, L(copy0)
> +	cmp	tmp1, count
> +	b.lo	L(copy_long_backwards)
>  
> -	.macro ldp1 ptr, regB, regC, val
> -	ldp \ptr, \regB, [\regC], \val
> -	.endm
> +	/* Copy 16 bytes and then align dst to 16-byte alignment.  */
>  
> -	.macro stp1 ptr, regB, regC, val
> -	stp \ptr, \regB, [\regC], \val
> -	.endm
> +	ldp	D_l, D_h, [src]
> +	and	tmp1, dstin, 15
> +	bic	dst, dstin, 15
> +	sub	src, src, tmp1
> +	add	count, count, tmp1	/* Count is now 16 too large.  */
> +	ldp	A_l, A_h, [src, 16]
> +	stp	D_l, D_h, [dstin]
> +	ldp	B_l, B_h, [src, 32]
> +	ldp	C_l, C_h, [src, 48]
> +	ldp	D_l, D_h, [src, 64]!
> +	subs	count, count, 128 + 16	/* Test and readjust count.  */
> +	b.ls	L(copy64_from_end)
>  
> -	.weak __arch_memcpy
> -ENTRY(__arch_memcpy)
> -#include "copy_template.S"
> +L(loop64):
> +	stp	A_l, A_h, [dst, 16]
> +	ldp	A_l, A_h, [src, 16]
> +	stp	B_l, B_h, [dst, 32]
> +	ldp	B_l, B_h, [src, 32]
> +	stp	C_l, C_h, [dst, 48]
> +	ldp	C_l, C_h, [src, 48]
> +	stp	D_l, D_h, [dst, 64]!
> +	ldp	D_l, D_h, [src, 64]!
> +	subs	count, count, 64
> +	b.hi	L(loop64)
> +
> +	/* Write the last iteration and copy 64 bytes from the end.  */
> +L(copy64_from_end):
> +	ldp	E_l, E_h, [srcend, -64]
> +	stp	A_l, A_h, [dst, 16]
> +	ldp	A_l, A_h, [srcend, -48]
> +	stp	B_l, B_h, [dst, 32]
> +	ldp	B_l, B_h, [srcend, -32]
> +	stp	C_l, C_h, [dst, 48]
> +	ldp	C_l, C_h, [srcend, -16]
> +	stp	D_l, D_h, [dst, 64]
> +	stp	E_l, E_h, [dstend, -64]
> +	stp	A_l, A_h, [dstend, -48]
> +	stp	B_l, B_h, [dstend, -32]
> +	stp	C_l, C_h, [dstend, -16]
>  	ret
> -ENDPROC(__arch_memcpy)
> +
> +	.p2align 4
> +
> +	/* Large backwards copy for overlapping copies.
> +	   Copy 16 bytes and then align dst to 16-byte alignment.  */
> +L(copy_long_backwards):
> +	ldp	D_l, D_h, [srcend, -16]
> +	and	tmp1, dstend, 15
> +	sub	srcend, srcend, tmp1
> +	sub	count, count, tmp1
> +	ldp	A_l, A_h, [srcend, -16]
> +	stp	D_l, D_h, [dstend, -16]
> +	ldp	B_l, B_h, [srcend, -32]
> +	ldp	C_l, C_h, [srcend, -48]
> +	ldp	D_l, D_h, [srcend, -64]!
> +	sub	dstend, dstend, tmp1
> +	subs	count, count, 128
> +	b.ls	L(copy64_from_start)
> +
> +L(loop64_backwards):
> +	stp	A_l, A_h, [dstend, -16]
> +	ldp	A_l, A_h, [srcend, -16]
> +	stp	B_l, B_h, [dstend, -32]
> +	ldp	B_l, B_h, [srcend, -32]
> +	stp	C_l, C_h, [dstend, -48]
> +	ldp	C_l, C_h, [srcend, -48]
> +	stp	D_l, D_h, [dstend, -64]!
> +	ldp	D_l, D_h, [srcend, -64]!
> +	subs	count, count, 64
> +	b.hi	L(loop64_backwards)
> +
> +	/* Write the last iteration and copy 64 bytes from the start.  */
> +L(copy64_from_start):
> +	ldp	G_l, G_h, [src, 48]
> +	stp	A_l, A_h, [dstend, -16]
> +	ldp	A_l, A_h, [src, 32]
> +	stp	B_l, B_h, [dstend, -32]
> +	ldp	B_l, B_h, [src, 16]
> +	stp	C_l, C_h, [dstend, -48]
> +	ldp	C_l, C_h, [src]
> +	stp	D_l, D_h, [dstend, -64]
> +	stp	G_l, G_h, [dstin, 48]
> +	stp	A_l, A_h, [dstin, 32]
> +	stp	B_l, B_h, [dstin, 16]
> +	stp	C_l, C_h, [dstin]
> +	ret
> +SYM_FUNC_END(__pi_memcpy)
> +
> +SYM_FUNC_ALIAS(__arch_memcpy, __pi_memcpy)
> +SYM_FUNC_ALIAS_WEAK(memcpy, __memcpy)
> +
> +SYM_FUNC_ALIAS(__pi_memmove, __pi_memcpy)
> +
> +SYM_FUNC_ALIAS(__arch_memmove, __pi_memmove)
> +SYM_FUNC_ALIAS_WEAK(memmove, __memmove)
> diff --git a/arch/arm/lib64/memset.S b/arch/arm/lib64/memset.S
> index ff201750f1..f059203983 100644
> --- a/arch/arm/lib64/memset.S
> +++ b/arch/arm/lib64/memset.S
> @@ -1,10 +1,9 @@
>  /* SPDX-License-Identifier: GPL-2.0-only */
> -/* SPDX-FileCopyrightText: 2013 ARM Ltd. */
> -/* SPDX-FileCopyrightText: 2013 Linaro */
> -
>  /*
> + * Copyright (C) 2013 ARM Ltd.
> + * Copyright (C) 2013 Linaro.
> + *
>   * This code is based on glibc cortex strings work originally authored by Linaro
> - * and re-licensed under GPLv2 for the Linux kernel. The original code can
>   * be found @
>   *
>   * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
> @@ -13,6 +12,7 @@
>  
>  #include <linux/linkage.h>
>  #include <asm/assembler.h>
> +#include <asm/cache.h>
>  
>  /*
>   * Fill in the buffer with character c (alignment handled by the hardware)
> @@ -42,8 +42,7 @@ dst		.req	x8
>  tmp3w		.req	w9
>  tmp3		.req	x9
>  
> -	.weak memset
> -ENTRY(__arch_memset)
> +SYM_FUNC_START(__pi_memset)
>  	mov	dst, dstin	/* Preserve return value.  */
>  	and	A_lw, val, #255
>  	orr	A_lw, A_lw, A_lw, lsl #8
> @@ -115,6 +114,7 @@ ENTRY(__arch_memset)
>  	* Critical loop. Start at a new cache line boundary. Assuming
>  	* 64 bytes per line, this ensures the entire loop is in one line.
>  	*/
> +	.p2align	L1_CACHE_SHIFT
>  .Lnot_short:
>  	sub	dst, dst, #16/* Pre-bias.  */
>  	sub	count, count, #64
> @@ -201,4 +201,8 @@ ENTRY(__arch_memset)
>  	ands	count, count, zva_bits_x
>  	b.ne	.Ltail_maybe_long
>  	ret
> -ENDPROC(__arch_memset)
> +SYM_FUNC_END(__pi_memset)
> +
> +SYM_FUNC_ALIAS(__arch_memset, __pi_memset)
> +
> +SYM_FUNC_ALIAS_WEAK(memset, __pi_memset)
> diff --git a/arch/arm/lib64/string.c b/arch/arm/lib64/string.c
> index 938790e1a9..c7954d6efe 100644
> --- a/arch/arm/lib64/string.c
> +++ b/arch/arm/lib64/string.c
> @@ -6,6 +6,7 @@
>  
>  void *__arch_memset(void *dst, int c, __kernel_size_t size);
>  void *__arch_memcpy(void * dest, const void *src, size_t count);
> +void *__arch_memmove(void * dest, const void *src, size_t count);
>  
>  static __prereloc void *_memset(void *dst, int c, __kernel_size_t size)
>  {
> @@ -38,3 +39,19 @@ void __weak *memcpy(void * dest, const void *src, size_t count)
>  
>  void *__memcpy(void * dest, const void *src, size_t count)
>  	__alias(_memcpy);
> +
> +static void *_memmove(void * dest, const void *src, size_t count)
> +{
> +	if (likely(get_cr() & CR_M))
> +		return __arch_memmove(dest, src, count);
> +
> +	return __default_memmove(dest, src, count);
> +}
> +
> +void __weak *memmove(void * dest, const void *src, size_t count)
> +{
> +	return _memmove(dest, src, count);
> +}
> +
> +void *__memmove(void * dest, const void *src, size_t count)
> +	__alias(_memmove);
> diff --git a/include/string.h b/include/string.h
> index cbe6eddf7f..986ccd83dd 100644
> --- a/include/string.h
> +++ b/include/string.h
> @@ -17,6 +17,8 @@ void *__nokasan_default_memset(void *, int, __kernel_size_t);
>  void *__default_memcpy(void * dest,const void *src,size_t count);
>  void *__nokasan_default_memcpy(void * dest,const void *src,size_t count);
>  
> +void *__default_memmove(void * dest,const void *src,size_t count);
> +
>  char *parse_assignment(char *str);
>  
>  int strverscmp(const char *a, const char *b);
> diff --git a/lib/string.c b/lib/string.c
> index 98dd3cffdd..50c2016c2b 100644
> --- a/lib/string.c
> +++ b/lib/string.c
> @@ -701,7 +701,6 @@ void *memmove(void * dest, const void *src, size_t count)
>  void *__memmove(void * dest, const void *src, size_t count)
>  	__alias(__default_memmove);
>  #endif
> -EXPORT_SYMBOL(memmove);
>  
>  #ifndef __HAVE_ARCH_MEMCMP
>  /**
> 
> -- 
> 2.39.5
> 
> 
> 



More information about the barebox mailing list