[PATCH] [RFC] arm: fix return value of assembly version of memset()
Ivan Djelic
ivan.djelic at parrot.com
Fri Jan 25 16:49:33 EST 2013
The current ARM-optimized memset code does not return the value of
its first argument, as required of standard implementations.
Recent GCC versions (e.g. GCC-4.7.2) perform optimizations based on
assumptions about the implementation of memset and similar functions.
As a result, the following function:
void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
{
	memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
	waiter->magic = waiter;
	INIT_LIST_HEAD(&waiter->list);
}
is compiled as:
800554d0 <debug_mutex_lock_common>:
800554d0:	e92d4008	push	{r3, lr}
800554d4:	e1a00001	mov	r0, r1
800554d8:	e3a02010	mov	r2, #16	; 0x10
800554dc:	e3a01011	mov	r1, #17	; 0x11
800554e0:	eb04426e	bl	80165ea0 <memset>
800554e4:	e1a03000	mov	r3, r0
800554e8:	e583000c	str	r0, [r3, #12]
800554ec:	e5830000	str	r0, [r3]
800554f0:	e5830004	str	r0, [r3, #4]
800554f4:	e8bd8008	pop	{r3, pc}
Note the use of r0 after the call returns: the compiler assumes memset
has returned the value of pointer 'waiter' in register r0, as the C
standard guarantees, and reuses it both as a stored value and (via the
copy in r3) as the base address of the subsequent stores. Since the ARM
assembly implementation does not actually preserve r0, the generated
code stores through a corrupted pointer, producing register/memory
corruption.
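In C terms, GCC has effectively rewritten the function as follows (an
illustration of the assumption it makes, not actual kernel code):

void debug_mutex_lock_common(struct mutex *lock, struct mutex_waiter *waiter)
{
	/* GCC reuses the (assumed) return value of memset as 'waiter',
	 * freeing a register across the call: */
	waiter = memset(waiter, MUTEX_DEBUG_INIT, sizeof(*waiter));
	waiter->magic = waiter;
	INIT_LIST_HEAD(&waiter->list);
}

This rewrite is correct for any conforming memset, which must return
its first argument; it misbehaves here only because our assembly
version returns something else.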
Actually, memset is declared as a macro in arch/arm/include/asm/string.h:
#define memset(p,v,n)						\
	({							\
		void *__p = (p); size_t __n = n;		\
		if ((__n) != 0) {				\
			if (__builtin_constant_p((v)) && (v) == 0) \
				__memzero((__p),(__n));		\
			else					\
				memset((__p),(v),(__n));	\
		}						\
		(__p);						\
	})
Note the final statement '(__p);', which gives the whole expression an
explicit value. It does not really help, however, because:
1. GCC still performs its optimization, probably treating '(__p);' as
redundant since a standard memset already returns the pointer value.
2. GCC also emits direct inlined calls to memset (to initialize stack
variables in function prologues, etc.), potentially making the same
assumption about the memset return value.
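For reference, the return-value contract GCC relies on can be checked
from user space against the C library (illustration only; this does not
exercise the kernel's assembly memset):

#include <assert.h>
#include <string.h>

int main(void)
{
	char buf[16];

	/* C99 7.21.6.1: memset returns the value of its first argument,
	 * so any conforming implementation must pass this check. */
	assert(memset(buf, 0x11, sizeof(buf)) == buf);
	return 0;
}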
For lack of a simpler solution, this patch fixes the return value of memset.
Could you please review and comment, or suggest better alternatives?
Thanks,
--
Ivan
The patch adds a single 'mov' instruction and merges the extra load and
store of r8 into existing load/store-multiple instructions.
For ease of review, here is a breakdown of the patch into 4 simple steps:
Step 1
======
Perform the following substitutions:
ip -> r8, then
r0 -> ip,
and insert 'mov ip, r0' as the first statement of the function.
At this point, we have a memset() implementation returning the proper result,
but corrupting r8 on some paths (the ones that were using ip).
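In sketch form (abridged from the diff below), the entry and exit of
the function now look like this:

ENTRY(memset)
	mov	ip, r0		@ keep r0 intact for the return value
	ands	r3, ip, #3	@ was: ands r3, r0, #3
	...
	mov	pc, lr		@ r0 still holds the original pointer
ENDPROC(memset)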
Step 2
======
Make sure r8 is saved and restored when (! CALGN(1)+0) == 1:
save r8:

-	str	lr, [sp, #-4]!
+	stmfd	sp!, {r8, lr}

and restore r8 on both exit paths:

-	ldmeqfd	sp!, {pc}		@ Now <64 bytes to go.
+	ldmeqfd	sp!, {r8, pc}		@ Now <64 bytes to go.
(...)
 	tst	r2, #16
 	stmneia	ip!, {r1, r3, r8, lr}
-	ldr	lr, [sp], #4
+	ldmfd	sp!, {r8, lr}
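The extra save/restore is needed because, under the AAPCS, ip (r12) is
a caller-saved scratch register whereas r8 is callee-saved and must
reach our caller unchanged:

	stmfd	sp!, {r8, lr}	@ prologue: preserve callee-saved r8 with lr
	...
	ldmfd	sp!, {r8, lr}	@ epilogue: restore r8 (execution continues at label 4)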
Step 3
======
Make sure r8 is saved and restored when (! CALGN(1)+0) == 0:
save r8:

-	stmfd	sp!, {r4-r7, lr}
+	stmfd	sp!, {r4-r8, lr}

and restore r8 on both exit paths:

 	bgt	3b
-	ldmeqfd	sp!, {r4-r7, pc}
+	ldmeqfd	sp!, {r4-r8, pc}
(...)
 	tst	r2, #16
 	stmneia	ip!, {r4-r7}
-	ldmfd	sp!, {r4-r7, lr}
+	ldmfd	sp!, {r4-r8, lr}
Step 4
======
Rewrite register list "r4-r7, r8" as "r4-r8".
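Both spellings name the same register set (register lists assemble to a
bitmask), so this step does not change the generated code; for example:

	stmfd	sp!, {r4-r7, r8, lr}	@ assembles identically to...
	stmfd	sp!, {r4-r8, lr}	@ ...this more compact form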
Signed-off-by: Ivan Djelic <ivan.djelic at parrot.com>
---
arch/arm/lib/memset.S | 85 +++++++++++++++++++++++++------------------------
1 file changed, 44 insertions(+), 41 deletions(-)
diff --git a/arch/arm/lib/memset.S b/arch/arm/lib/memset.S
index 650d592..eafd5ee 100644
--- a/arch/arm/lib/memset.S
+++ b/arch/arm/lib/memset.S
@@ -19,9 +19,9 @@
 1:	subs	r2, r2, #4		@ 1 do we have enough
 	blt	5f			@ 1 bytes to align with?
 	cmp	r3, #2			@ 1
-	strltb	r1, [r0], #1		@ 1
-	strleb	r1, [r0], #1		@ 1
-	strb	r1, [r0], #1		@ 1
+	strltb	r1, [ip], #1		@ 1
+	strleb	r1, [ip], #1		@ 1
+	strb	r1, [ip], #1		@ 1
 	add	r2, r2, r3		@ 1 (r2 = r2 - (4 - r3))
 /*
  * The pointer is now aligned and the length is adjusted. Try doing the
@@ -29,10 +29,14 @@
  */
 ENTRY(memset)
-	ands	r3, r0, #3	@ 1 unaligned?
+/*
+ * Preserve the contents of r0 for the return value.
+ */
+	mov	ip, r0
+	ands	r3, ip, #3	@ 1 unaligned?
 	bne	1b		@ 1
 /*
- * we know that the pointer in r0 is aligned to a word boundary.
+ * we know that the pointer in ip is aligned to a word boundary.
  */
 	orr	r1, r1, r1, lsl #8
 	orr	r1, r1, r1, lsl #16
@@ -43,29 +47,28 @@ ENTRY(memset)
 #if ! CALGN(1)+0
 /*
- * We need an extra register for this loop - save the return address and
- * use the LR
+ * We need 2 extra registers for this loop - use r8 and the LR
  */
-	str	lr, [sp, #-4]!
-	mov	ip, r1
+	stmfd	sp!, {r8, lr}
+	mov	r8, r1
 	mov	lr, r1
 2:	subs	r2, r2, #64
-	stmgeia	r0!, {r1, r3, ip, lr}	@ 64 bytes at a time.
-	stmgeia	r0!, {r1, r3, ip, lr}
-	stmgeia	r0!, {r1, r3, ip, lr}
-	stmgeia	r0!, {r1, r3, ip, lr}
+	stmgeia	ip!, {r1, r3, r8, lr}	@ 64 bytes at a time.
+	stmgeia	ip!, {r1, r3, r8, lr}
+	stmgeia	ip!, {r1, r3, r8, lr}
+	stmgeia	ip!, {r1, r3, r8, lr}
 	bgt	2b
-	ldmeqfd	sp!, {pc}		@ Now <64 bytes to go.
+	ldmeqfd	sp!, {r8, pc}		@ Now <64 bytes to go.
 /*
  * No need to correct the count; we're only testing bits from now on
  */
 	tst	r2, #32
-	stmneia	r0!, {r1, r3, ip, lr}
-	stmneia	r0!, {r1, r3, ip, lr}
+	stmneia	ip!, {r1, r3, r8, lr}
+	stmneia	ip!, {r1, r3, r8, lr}
 	tst	r2, #16
-	stmneia	r0!, {r1, r3, ip, lr}
-	ldr	lr, [sp], #4
+	stmneia	ip!, {r1, r3, r8, lr}
+	ldmfd	sp!, {r8, lr}
 #else
@@ -74,54 +77,54 @@ ENTRY(memset)
  * whole cache lines at once.
  */
-	stmfd	sp!, {r4-r7, lr}
+	stmfd	sp!, {r4-r8, lr}
 	mov	r4, r1
 	mov	r5, r1
 	mov	r6, r1
 	mov	r7, r1
-	mov	ip, r1
+	mov	r8, r1
 	mov	lr, r1
 	cmp	r2, #96
-	tstgt	r0, #31
+	tstgt	ip, #31
 	ble	3f
-	and	ip, r0, #31
-	rsb	ip, ip, #32
-	sub	r2, r2, ip
-	movs	ip, ip, lsl #(32 - 4)
-	stmcsia	r0!, {r4, r5, r6, r7}
-	stmmiia	r0!, {r4, r5}
-	tst	ip, #(1 << 30)
-	mov	ip, r1
-	strne	r1, [r0], #4
+	and	r8, ip, #31
+	rsb	r8, r8, #32
+	sub	r2, r2, r8
+	movs	r8, r8, lsl #(32 - 4)
+	stmcsia	ip!, {r4, r5, r6, r7}
+	stmmiia	ip!, {r4, r5}
+	tst	r8, #(1 << 30)
+	mov	r8, r1
+	strne	r1, [ip], #4
 3:	subs	r2, r2, #64
-	stmgeia	r0!, {r1, r3-r7, ip, lr}
-	stmgeia	r0!, {r1, r3-r7, ip, lr}
+	stmgeia	ip!, {r1, r3-r8, lr}
+	stmgeia	ip!, {r1, r3-r8, lr}
 	bgt	3b
-	ldmeqfd	sp!, {r4-r7, pc}
+	ldmeqfd	sp!, {r4-r8, pc}
 	tst	r2, #32
-	stmneia	r0!, {r1, r3-r7, ip, lr}
+	stmneia	ip!, {r1, r3-r8, lr}
 	tst	r2, #16
-	stmneia	r0!, {r4-r7}
-	ldmfd	sp!, {r4-r7, lr}
+	stmneia	ip!, {r4-r7}
+	ldmfd	sp!, {r4-r8, lr}
 #endif
 4:	tst	r2, #8
-	stmneia	r0!, {r1, r3}
+	stmneia	ip!, {r1, r3}
 	tst	r2, #4
-	strne	r1, [r0], #4
+	strne	r1, [ip], #4
 /*
  * When we get here, we've got less than 4 bytes to zero. We
  * may have an unaligned pointer as well.
  */
 5:	tst	r2, #2
-	strneb	r1, [r0], #1
-	strneb	r1, [r0], #1
+	strneb	r1, [ip], #1
+	strneb	r1, [ip], #1
 	tst	r2, #1
-	strneb	r1, [r0], #1
+	strneb	r1, [ip], #1
 	mov	pc, lr
 ENDPROC(memset)
--
1.7.10.4