[PATCH 2/4] ARM: copy_template.S: rework the unaligned copy loop

Nicolas Pitre nico at fluxnic.net
Thu Mar 29 00:00:22 EDT 2012


From: Nicolas Pitre <nicolas.pitre at linaro.org>

Let's rework the unaligned copy loop to enforce a range of contiguous
registers starting from an even register, and to use a single ldr8w
construct instead of two ldr4w's.  There are no users of ldr4w anymore,
so its various definitions are removed.

By using one additional temporary register, it is possible to have the
same register set for the loads and the stores, and to make the loop
friendlier to superscalar CPUs at the same time.

Signed-off-by: Nicolas Pitre <nico at linaro.org>
---
 arch/arm/lib/copy_from_user.S |   11 +++----
 arch/arm/lib/copy_template.S  |   57 ++++++++++++++++++++---------------------
 arch/arm/lib/copy_to_user.S   |    4 ---
 arch/arm/lib/memcpy.S         |    4 ---
 4 files changed, 33 insertions(+), 43 deletions(-)

diff --git a/arch/arm/lib/copy_from_user.S b/arch/arm/lib/copy_from_user.S
index 66a477a3e3..d1df0ec62b 100644
--- a/arch/arm/lib/copy_from_user.S
+++ b/arch/arm/lib/copy_from_user.S
@@ -44,16 +44,15 @@
 	ldrusr	\reg, \ptr, 4, abort=\abort
 	.endm
 
-	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
+	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
 	ldr1w \ptr, \reg1, \abort
 	ldr1w \ptr, \reg2, \abort
 	ldr1w \ptr, \reg3, \abort
 	ldr1w \ptr, \reg4, \abort
-	.endm
-
-	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
-	ldr4w \ptr, \reg1, \reg2, \reg3, \reg4, \abort
-	ldr4w \ptr, \reg5, \reg6, \reg7, \reg8, \abort
+	ldr1w \ptr, \reg5, \abort
+	ldr1w \ptr, \reg6, \abort
+	ldr1w \ptr, \reg7, \abort
+	ldr1w \ptr, \reg8, \abort
 	.endm
 
 	.macro ldr1b ptr reg cond=al abort
diff --git a/arch/arm/lib/copy_template.S b/arch/arm/lib/copy_template.S
index 7244dcef0d..84e94cd48c 100644
--- a/arch/arm/lib/copy_template.S
+++ b/arch/arm/lib/copy_template.S
@@ -27,10 +27,9 @@
  *	This loads one word from 'ptr', stores it in 'reg' and increments
  *	'ptr' to the next word. The 'abort' argument is used for fixup tables.
  *
- * ldr4w ptr reg1 reg2 reg3 reg4 abort
  * ldr8w ptr, reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
  *
- *	This loads four or eight words starting from 'ptr', stores them
+ *	This loads eight words starting from 'ptr', stores them
  *	in provided registers and increments 'ptr' past those words.
  *	The'abort' argument is used for fixup tables.
  *
@@ -63,7 +62,7 @@
  *
  *	Correction to be applied to the "ip" register when branching into
  *	the ldr1w or str1w instructions (some of these macros may expand to
- *	than one 32bit instruction in Thumb-2)
+ *	more than one 32bit instruction in Thumb-2)
  */
 
 
@@ -170,7 +169,7 @@
 
 10:		bic	r1, r1, #3
 		cmp	ip, #2
-		ldr1w	r1, ip, abort=21f
+		ldr1w	r1, r2, abort=21f
 		beq	17f
 		bgt	18f
 
@@ -178,6 +177,7 @@
 		.macro	forward_copy_shift pull push
 
 		subs	lr, lr, #28
+		mov	ip, r2, pull #\pull
 		blt	14f
 
 	CALGN(	ands	r3, r0, #31		)
@@ -186,7 +186,7 @@
 	CALGN(	subcc	lr, lr, r3		)
 	CALGN(	bcc	15f			)
 
-11:		stmfd	sp!, {r5 - r9}
+11:		stmfd	sp!, {r5 - sl}
 
 	PLD(	pld	[r1, #0]		)
 	PLD(	subs	lr, lr, #96		)
@@ -196,40 +196,39 @@
 	PLD(	pld	[r1, #92]		)
 
 12:	PLD(	pld	[r1, #124]		)
-13:		ldr4w	r1, r3, r4, r5, r6, abort=19f
-		mov	r2, ip, pull #\pull
+13:		ldr8w	r1, r2, r3, r4, r5, r6, r7, r8, r9, abort=19f
 		subs	lr, lr, #32
-		ldr4w	r1, r7, r8, r9, ip, abort=19f
-		orr	r2, r2, r3, push #\push
-		mov	r3, r3, pull #\pull
-		orr	r3, r3, r4, push #\push
-		mov	r4, r4, pull #\pull
-		orr	r4, r4, r5, push #\push
-		mov	r5, r5, pull #\pull
-		orr	r5, r5, r6, push #\push
-		mov	r6, r6, pull #\pull
-		orr	r6, r6, r7, push #\push
-		mov	r7, r7, pull #\pull
-		orr	r7, r7, r8, push #\push
-		mov	r8, r8, pull #\pull
-		orr	r8, r8, r9, push #\push
-		mov	r9, r9, pull #\pull
-		orr	r9, r9, ip, push #\push
+		mov	sl, r2, pull #\pull
+		orr	r2, ip, r2, push #\push
+		mov	ip, r3, pull #\pull
+		orr	r3, sl, r3, push #\push
+		mov	sl, r4, pull #\pull
+		orr	r4, ip, r4, push #\push
+		mov	ip, r5, pull #\pull
+		orr	r5, sl, r5, push #\push
+		mov	sl, r6, pull #\pull
+		orr	r6, ip, r6, push #\push
+		mov	ip, r7, pull #\pull
+		orr	r7, sl, r7, push #\push
+		mov	sl, r8, pull #\pull
+		orr	r8, ip, r8, push #\push
+		mov	ip, r9, pull #\pull
+		orr	r9, sl, r9, push #\push
 		str8w	r0, r2, r3, r4, r5, r6, r7, r8, r9, abort=19f
 		bge	12b
 	PLD(	cmn	lr, #96			)
 	PLD(	bge	13b			)
 
-		ldmfd	sp!, {r5 - r9}
+		ldmfd	sp!, {r5 - sl}
 
 14:		ands	r3, lr, #28
 		beq	16f
 
-15:		mov	r2, ip, pull #\pull
-		ldr1w	r1, ip, abort=21f
+15:		ldr1w	r1, r2, abort=21f
 		subs	r3, r3, #4
-		orr	r2, r2, ip, push #\push
-		str1w	r0, r2, abort=21f
+		orr	r4, ip, r2, push #\push
+		mov	ip, r2, pull #\pull
+		str1w	r0, r4, abort=21f
 		bgt	15b
 	CALGN(	cmp	lr, #0			)
 	CALGN(	bge	11b			)
@@ -255,7 +254,7 @@
  */
 
 	.macro	copy_abort_preamble
-19:	ldmfd	sp!, {r5 - r9}
+19:	ldmfd	sp!, {r5 - sl}
 	b	21f
 20:	ldmfd	sp!, {r5 - r8}
 21:
diff --git a/arch/arm/lib/copy_to_user.S b/arch/arm/lib/copy_to_user.S
index d066df686e..a83bc04365 100644
--- a/arch/arm/lib/copy_to_user.S
+++ b/arch/arm/lib/copy_to_user.S
@@ -44,10 +44,6 @@
 	W(ldr) \reg, [\ptr], #4
 	.endm
 
-	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
-	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
-	.endm
-
 	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
 	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
 	.endm
diff --git a/arch/arm/lib/memcpy.S b/arch/arm/lib/memcpy.S
index a9b9e2287a..adbccc6e2d 100644
--- a/arch/arm/lib/memcpy.S
+++ b/arch/arm/lib/memcpy.S
@@ -20,10 +20,6 @@
 	W(ldr) \reg, [\ptr], #4
 	.endm
 
-	.macro ldr4w ptr reg1 reg2 reg3 reg4 abort
-	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4}
-	.endm
-
 	.macro ldr8w ptr reg1 reg2 reg3 reg4 reg5 reg6 reg7 reg8 abort
 	ldmia \ptr!, {\reg1, \reg2, \reg3, \reg4, \reg5, \reg6, \reg7, \reg8}
 	.endm
-- 
1.7.9.rc2




More information about the linux-arm-kernel mailing list