[PATCH v2 08/10] ARM: update memcpy.S and memset.S from Linux
Sascha Hauer
s.hauer at pengutronix.de
Thu Sep 26 04:17:10 PDT 2024
This updates the assembler optimized memcpy() and memset() functions
from Linux-6.10.
Reviewed-by: Ahmad Fatoum <a.fatoum at pengutronix.de>
Signed-off-by: Sascha Hauer <s.hauer at pengutronix.de>
---
arch/arm/include/asm/assembler.h | 6 +++
arch/arm/lib32/copy_template.S | 58 +++++++++++++-----------
arch/arm/lib32/memcpy.S | 30 +++++++------
arch/arm/lib32/memset.S | 96 +++++++++++++++++++++++++---------------
4 files changed, 117 insertions(+), 73 deletions(-)
diff --git a/arch/arm/include/asm/assembler.h b/arch/arm/include/asm/assembler.h
index e8f5625a0a..c84c8ec734 100644
--- a/arch/arm/include/asm/assembler.h
+++ b/arch/arm/include/asm/assembler.h
@@ -67,6 +67,12 @@
#define CALGN(code...)
#endif
+#ifndef CONFIG_CPU_64
+/* the frame pointer used for stack unwinding */
+ARM( fpreg .req r11 )
+THUMB( fpreg .req r7 )
+#endif
+
/*
* Enable and disable interrupts
*/
diff --git a/arch/arm/lib32/copy_template.S b/arch/arm/lib32/copy_template.S
index 897e3db3ff..777e185701 100644
--- a/arch/arm/lib32/copy_template.S
+++ b/arch/arm/lib32/copy_template.S
@@ -1,5 +1,5 @@
/* SPDX-License-Identifier: GPL-2.0-only */
-/* SPDX-FileCopyrightText: 2005 MontaVista Software, Inc (Nicolas Pitre)
+/* SPDX-FileCopyrightText: 2005 MontaVista Software, Inc (Nicolas Pitre) */
/*
* linux/arch/arm/lib/copy_template.s
@@ -48,6 +48,12 @@
* data as needed by the implementation including this code. Called
* upon code entry.
*
+ * usave reg1 reg2
+ *
+ * Unwind annotation macro is corresponding for 'enter' macro.
+ * It tell unwinder that preserved some provided registers on the stack
+ * and additional data by a prior 'enter' macro.
+ *
* exit reg1 reg2
*
* Restore registers with the values previously saved with the
@@ -61,8 +67,10 @@
* than one 32bit instruction in Thumb-2)
*/
-
- enter r4, lr
+ UNWIND( .fnstart )
+ enter r4, UNWIND(fpreg,) lr
+ UNWIND( .setfp fpreg, sp )
+ UNWIND( mov fpreg, sp )
subs r2, r2, #4
blt 8f
@@ -73,12 +81,12 @@
bne 10f
1: subs r2, r2, #(28)
- stmfd sp!, {r5 - r8}
+ stmfd sp!, {r5, r6, r8, r9}
blt 5f
CALGN( ands ip, r0, #31 )
CALGN( rsb r3, ip, #32 )
- CALGN( sbcnes r4, r3, r2 ) @ C is always set here
+ CALGN( sbcsne r4, r3, r2 ) @ C is always set here
CALGN( bcs 2f )
CALGN( adr r4, 6f )
CALGN( subs r2, r2, r3 ) @ C gets set
@@ -92,9 +100,9 @@
PLD( pld [r1, #92] )
3: PLD( pld [r1, #124] )
-4: ldr8w r1, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+4: ldr8w r1, r3, r4, r5, r6, r8, r9, ip, lr, abort=20f
subs r2, r2, #32
- str8w r0, r3, r4, r5, r6, r7, r8, ip, lr, abort=20f
+ str8w r0, r3, r4, r5, r6, r8, r9, ip, lr, abort=20f
bge 3b
PLD( cmn r2, #96 )
PLD( bge 4b )
@@ -114,8 +122,8 @@
ldr1w r1, r4, abort=20f
ldr1w r1, r5, abort=20f
ldr1w r1, r6, abort=20f
- ldr1w r1, r7, abort=20f
ldr1w r1, r8, abort=20f
+ ldr1w r1, r9, abort=20f
ldr1w r1, lr, abort=20f
#if LDR1W_SHIFT < STR1W_SHIFT
@@ -132,13 +140,13 @@
str1w r0, r4, abort=20f
str1w r0, r5, abort=20f
str1w r0, r6, abort=20f
- str1w r0, r7, abort=20f
str1w r0, r8, abort=20f
+ str1w r0, r9, abort=20f
str1w r0, lr, abort=20f
CALGN( bcs 2b )
-7: ldmfd sp!, {r5 - r8}
+7: ldmfd sp!, {r5, r6, r8, r9}
8: movs r2, r2, lsl #31
ldr1b r1, r3, ne, abort=21f
@@ -148,7 +156,7 @@
str1b r0, r4, cs, abort=21f
str1b r0, ip, cs, abort=21f
- exit r4, pc
+ exit r4, UNWIND(fpreg,) pc
9: rsb ip, ip, #4
cmp ip, #2
@@ -177,11 +185,11 @@
CALGN( ands ip, r0, #31 )
CALGN( rsb ip, ip, #32 )
- CALGN( sbcnes r4, ip, r2 ) @ C is always set here
+ CALGN( sbcsne r4, ip, r2 ) @ C is always set here
CALGN( subcc r2, r2, ip )
CALGN( bcc 15f )
-11: stmfd sp!, {r5 - r9}
+11: stmfd sp!, {r5, r6, r8 - r10}
PLD( pld [r1, #0] )
PLD( subs r2, r2, #96 )
@@ -191,31 +199,31 @@
PLD( pld [r1, #92] )
12: PLD( pld [r1, #124] )
-13: ldr4w r1, r4, r5, r6, r7, abort=19f
+13: ldr4w r1, r4, r5, r6, r8, abort=19f
mov r3, lr, lspull #\pull
subs r2, r2, #32
- ldr4w r1, r8, r9, ip, lr, abort=19f
+ ldr4w r1, r9, r10, ip, lr, abort=19f
orr r3, r3, r4, lspush #\push
mov r4, r4, lspull #\pull
orr r4, r4, r5, lspush #\push
mov r5, r5, lspull #\pull
orr r5, r5, r6, lspush #\push
mov r6, r6, lspull #\pull
- orr r6, r6, r7, lspush #\push
- mov r7, r7, lspull #\pull
- orr r7, r7, r8, lspush #\push
+ orr r6, r6, r8, lspush #\push
mov r8, r8, lspull #\pull
orr r8, r8, r9, lspush #\push
mov r9, r9, lspull #\pull
- orr r9, r9, ip, lspush #\push
+ orr r9, r9, r10, lspush #\push
+ mov r10, r10, lspull #\pull
+ orr r10, r10, ip, lspush #\push
mov ip, ip, lspull #\pull
orr ip, ip, lr, lspush #\push
- str8w r0, r3, r4, r5, r6, r7, r8, r9, ip, , abort=19f
+ str8w r0, r3, r4, r5, r6, r8, r9, r10, ip, abort=19f
bge 12b
PLD( cmn r2, #96 )
PLD( bge 13b )
- ldmfd sp!, {r5 - r9}
+ ldmfd sp!, {r5, r6, r8 - r10}
14: ands ip, r2, #28
beq 16f
@@ -241,6 +249,7 @@
18: forward_copy_shift pull=24 push=8
+ UNWIND( .fnend )
/*
* Abort preamble and completion macros.
@@ -250,14 +259,13 @@
*/
.macro copy_abort_preamble
-19: ldmfd sp!, {r5 - r9}
+19: ldmfd sp!, {r5, r6, r8 - r10}
b 21f
-20: ldmfd sp!, {r5 - r8}
+20: ldmfd sp!, {r5, r6, r8, r9}
21:
.endm
.macro copy_abort_end
- ldmfd sp!, {r4, pc}
+ ldmfd sp!, {r4, UNWIND(fpreg,) pc}
.endm
-
diff --git a/arch/arm/lib32/memcpy.S b/arch/arm/lib32/memcpy.S
index d40296e4bf..90f2b645aa 100644
--- a/arch/arm/lib32/memcpy.S
+++ b/arch/arm/lib32/memcpy.S
@@ -1,12 +1,15 @@
/* SPDX-License-Identifier: GPL-2.0-only */
-/* SPDX-FileCopyrightText: 2005 MontaVista Software, Inc (Nicolas Pitre)
-
/*
- * linux/arch/arm/lib/memcpy.S
+ * linux/arch/arm/lib/memcpy.S
+ *
+ * Author: Nicolas Pitre
+ * Created: Sep 28, 2005
+ * Copyright: MontaVista Software, Inc.
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
+#include <asm/unwind.h>
#define LDR1W_SHIFT 0
#define STR1W_SHIFT 0
@@ -24,7 +27,7 @@
.endm
.macro ldr1b ptr reg cond=al abort
- ldr\cond\()b \reg, [\ptr], #1
+ ldrb\cond \reg, [\ptr], #1
.endm
.macro str1w ptr reg abort
@@ -36,27 +39,28 @@
.endm
.macro str1b ptr reg cond=al abort
- str\cond\()b \reg, [\ptr], #1
+ strb\cond \reg, [\ptr], #1
.endm
- .macro enter reg1 reg2
- stmdb sp!, {r0, \reg1, \reg2}
+ .macro enter regs:vararg
+UNWIND( .save {r0, \regs} )
+ stmdb sp!, {r0, \regs}
.endm
- .macro exit reg1 reg2
- ldmfd sp!, {r0, \reg1, \reg2}
+ .macro exit regs:vararg
+ ldmfd sp!, {r0, \regs}
.endm
.text
/* Prototype: void *memcpy(void *dest, const void *src, size_t n); */
-.weak memcpy
-ENTRY(memcpy)
ENTRY(__memcpy)
+ENTRY(mmiocpy)
+WEAK(memcpy)
#include "copy_template.S"
-ENDPROC(__memcpy)
ENDPROC(memcpy)
-
+ENDPROC(mmiocpy)
+ENDPROC(__memcpy)
diff --git a/arch/arm/lib32/memset.S b/arch/arm/lib32/memset.S
index 4ba74e0c6c..de75ae4d5a 100644
--- a/arch/arm/lib32/memset.S
+++ b/arch/arm/lib32/memset.S
@@ -1,19 +1,23 @@
/* SPDX-License-Identifier: GPL-2.0-only */
-/* SPDX-FileCopyrightText: 1995-2000 Russell King */
-
/*
- * linux/arch/arm/lib/memset.S
- * ASM optimised string functions
+ * linux/arch/arm/lib/memset.S
+ *
+ * Copyright (C) 1995-2000 Russell King
+ *
+ * ASM optimised string functions
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
+#include <asm/unwind.h>
.text
.align 5
-.weak memset
ENTRY(__memset)
-ENTRY(memset)
+ENTRY(mmioset)
+WEAK(memset)
+UNWIND( .fnstart )
+ and r1, r1, #255 @ cast to unsigned char
ands r3, r0, #3 @ 1 unaligned?
mov ip, r0 @ preserve r0 as return value
bne 6f @ 1
@@ -23,34 +27,38 @@ ENTRY(memset)
1: orr r1, r1, r1, lsl #8
orr r1, r1, r1, lsl #16
mov r3, r1
- cmp r2, #16
+7: cmp r2, #16
blt 4f
+UNWIND( .fnend )
#if ! CALGN(1)+0
/*
- * We need an 2 extra registers for this loop - use r8 and the LR
+ * We need 2 extra registers for this loop - use r8 and the LR
*/
+UNWIND( .fnstart )
+UNWIND( .save {r8, lr} )
stmfd sp!, {r8, lr}
mov r8, r1
- mov lr, r1
+ mov lr, r3
2: subs r2, r2, #64
- stmgeia ip!, {r1, r3, r8, lr} @ 64 bytes at a time.
- stmgeia ip!, {r1, r3, r8, lr}
- stmgeia ip!, {r1, r3, r8, lr}
- stmgeia ip!, {r1, r3, r8, lr}
+ stmiage ip!, {r1, r3, r8, lr} @ 64 bytes at a time.
+ stmiage ip!, {r1, r3, r8, lr}
+ stmiage ip!, {r1, r3, r8, lr}
+ stmiage ip!, {r1, r3, r8, lr}
bgt 2b
- ldmeqfd sp!, {r8, pc} @ Now <64 bytes to go.
+ ldmfdeq sp!, {r8, pc} @ Now <64 bytes to go.
/*
* No need to correct the count; we're only testing bits from now on
*/
tst r2, #32
- stmneia ip!, {r1, r3, r8, lr}
- stmneia ip!, {r1, r3, r8, lr}
+ stmiane ip!, {r1, r3, r8, lr}
+ stmiane ip!, {r1, r3, r8, lr}
tst r2, #16
- stmneia ip!, {r1, r3, r8, lr}
+ stmiane ip!, {r1, r3, r8, lr}
ldmfd sp!, {r8, lr}
+UNWIND( .fnend )
#else
@@ -59,13 +67,15 @@ ENTRY(memset)
* whole cache lines at once.
*/
+UNWIND( .fnstart )
+UNWIND( .save {r4-r8, lr} )
stmfd sp!, {r4-r8, lr}
mov r4, r1
- mov r5, r1
+ mov r5, r3
mov r6, r1
- mov r7, r1
+ mov r7, r3
mov r8, r1
- mov lr, r1
+ mov lr, r3
cmp r2, #96
tstgt ip, #31
@@ -75,48 +85,64 @@ ENTRY(memset)
rsb r8, r8, #32
sub r2, r2, r8
movs r8, r8, lsl #(32 - 4)
- stmcsia ip!, {r4, r5, r6, r7}
- stmmiia ip!, {r4, r5}
+ stmiacs ip!, {r4, r5, r6, r7}
+ stmiami ip!, {r4, r5}
tst r8, #(1 << 30)
mov r8, r1
strne r1, [ip], #4
3: subs r2, r2, #64
- stmgeia ip!, {r1, r3-r8, lr}
- stmgeia ip!, {r1, r3-r8, lr}
+ stmiage ip!, {r1, r3-r8, lr}
+ stmiage ip!, {r1, r3-r8, lr}
bgt 3b
- ldmeqfd sp!, {r4-r8, pc}
+ ldmfdeq sp!, {r4-r8, pc}
tst r2, #32
- stmneia ip!, {r1, r3-r8, lr}
+ stmiane ip!, {r1, r3-r8, lr}
tst r2, #16
- stmneia ip!, {r4-r7}
+ stmiane ip!, {r4-r7}
ldmfd sp!, {r4-r8, lr}
+UNWIND( .fnend )
#endif
+UNWIND( .fnstart )
4: tst r2, #8
- stmneia ip!, {r1, r3}
+ stmiane ip!, {r1, r3}
tst r2, #4
strne r1, [ip], #4
/*
- * When we get here, we've got less than 4 bytes to zero. We
+ * When we get here, we've got less than 4 bytes to set. We
* may have an unaligned pointer as well.
*/
5: tst r2, #2
- strneb r1, [ip], #1
- strneb r1, [ip], #1
+ strbne r1, [ip], #1
+ strbne r1, [ip], #1
tst r2, #1
- strneb r1, [ip], #1
- mov pc, lr
+ strbne r1, [ip], #1
+ ret lr
6: subs r2, r2, #4 @ 1 do we have enough
blt 5b @ 1 bytes to align with?
cmp r3, #2 @ 1
- strltb r1, [ip], #1 @ 1
- strleb r1, [ip], #1 @ 1
+ strblt r1, [ip], #1 @ 1
+ strble r1, [ip], #1 @ 1
strb r1, [ip], #1 @ 1
add r2, r2, r3 @ 1 (r2 = r2 - (4 - r3))
b 1b
+UNWIND( .fnend )
ENDPROC(memset)
+ENDPROC(mmioset)
ENDPROC(__memset)
+
+ENTRY(__memset32)
+UNWIND( .fnstart )
+ mov r3, r1 @ copy r1 to r3 and fall into memset64
+UNWIND( .fnend )
+ENDPROC(__memset32)
+ENTRY(__memset64)
+UNWIND( .fnstart )
+ mov ip, r0 @ preserve r0 as return value
+ b 7b @ jump into the middle of memset
+UNWIND( .fnend )
+ENDPROC(__memset64)
--
2.39.5
More information about the barebox
mailing list