updates for be8 patch series
Victor Kamensky
victor.kamensky at linaro.org
Tue Jul 23 21:06:05 EDT 2013
Hi Ben,
Let me split the atomic64 issue off from the other topics so we have a focused
discussion here.
Please see the atomic64 test case below, and please try it in your setup. I've
run it on a Pandaboard ES with my set of patches, but I believe the result
should be the same in your case.
<snip>
>> Index: linux-linaro-tracking/arch/arm/include/asm/atomic.h
>> ===================================================================
>> --- linux-linaro-tracking.orig/arch/arm/include/asm/atomic.h
>> +++ linux-linaro-tracking/arch/arm/include/asm/atomic.h
>> @@ -301,8 +301,13 @@ static inline void atomic64_add(u64 i, a
>>
>> __asm__ __volatile__("@ atomic64_add\n"
>> "1: ldrexd %0, %H0, [%3]\n"
>> +#ifndef CONFIG_CPU_BIG_ENDIAN /* little endian */
>> " adds %0, %0, %4\n"
>> " adc %H0, %H0, %H4\n"
>> +#else
>> +" adds %H0, %H0, %H4\n"
>> +" adc %0, %0, %4\n"
>> +#endif
>> " strexd %1, %0, %H0, [%3]\n"
>> " teq %1, #0\n"
>> " bne 1b"
>> @@ -320,8 +325,13 @@ static inline u64 atomic64_add_return(u6
>>
>> __asm__ __volatile__("@ atomic64_add_return\n"
>> "1: ldrexd %0, %H0, [%3]\n"
>> +#ifndef CONFIG_CPU_BIG_ENDIAN /* little endian */
>> " adds %0, %0, %4\n"
>> " adc %H0, %H0, %H4\n"
>> +#else
>> +" adds %H0, %H0, %H4\n"
>> +" adc %0, %0, %4\n"
>> +#endif
>> " strexd %1, %0, %H0, [%3]\n"
>> " teq %1, #0\n"
>> " bne 1b"
>> @@ -341,8 +351,13 @@ static inline void atomic64_sub(u64 i, a
>>
>> __asm__ __volatile__("@ atomic64_sub\n"
>> "1: ldrexd %0, %H0, [%3]\n"
>> +#ifndef CONFIG_CPU_BIG_ENDIAN /* little endian */
>> " subs %0, %0, %4\n"
>> " sbc %H0, %H0, %H4\n"
>> +#else
>> +" subs %H0, %H0, %H4\n"
>> +" sbc %0, %0, %4\n"
>> +#endif
>> " strexd %1, %0, %H0, [%3]\n"
>> " teq %1, #0\n"
>> " bne 1b"
>> @@ -360,8 +375,13 @@ static inline u64 atomic64_sub_return(u6
>>
>> __asm__ __volatile__("@ atomic64_sub_return\n"
>> "1: ldrexd %0, %H0, [%3]\n"
>> +#ifndef CONFIG_CPU_BIG_ENDIAN /* little endian */
>> " subs %0, %0, %4\n"
>> " sbc %H0, %H0, %H4\n"
>> +#else
>> +" subs %H0, %H0, %H4\n"
>> +" sbc %0, %0, %4\n"
>> +#endif
>> " strexd %1, %0, %H0, [%3]\n"
>> " teq %1, #0\n"
>> " bne 1b"
>> @@ -428,9 +448,15 @@ static inline u64 atomic64_dec_if_positi
>>
>> __asm__ __volatile__("@ atomic64_dec_if_positive\n"
>> "1: ldrexd %0, %H0, [%3]\n"
>> +#ifndef CONFIG_CPU_BIG_ENDIAN /* little endian */
>> " subs %0, %0, #1\n"
>> " sbc %H0, %H0, #0\n"
>> " teq %H0, #0\n"
>> +#else
>> +" subs %H0, %H0, #1\n"
>> +" sbc %0, %0, #0\n"
>> +" teq %0, #0\n"
>> +#endif
>> " bmi 2f\n"
>> " strexd %1, %0, %H0, [%3]\n"
>> " teq %1, #0\n"
>> @@ -459,8 +485,13 @@ static inline int atomic64_add_unless(at
>> " teqeq %H0, %H5\n"
>> " moveq %1, #0\n"
>> " beq 2f\n"
>> +#ifndef CONFIG_CPU_BIG_ENDIAN /* little endian */
>> " adds %0, %0, %6\n"
>> " adc %H0, %H0, %H6\n"
>> +#else
>> +" adds %H0, %H0, %H6\n"
>> +" adc %0, %0, %6\n"
>> +#endif
>> " strexd %2, %0, %H0, [%4]\n"
>> " teq %2, #0\n"
>> " bne 1b\n"
>
>
> I still believe that you're doing the wrong thing here
> and that these can be left well enough alone. Please give
> a concrete piece of code that can show they're actually
> doing the wrong thing.
Disable CONFIG_GENERIC_ATOMIC64
-------------------------------
Make sure that CONFIG_GENERIC_ATOMIC64 is not set. Otherwise atomic64_add from
lib/atomic64.c will be used; that implementation works correctly, but it does
not exercise the atomic64_XXX inline functions from arch/arm/include/asm/atomic.h.
With my current kernel (close to 3.10) I was not able to disable it and still
get a live kernel, so to illustrate the issue I will use my own copy of the
inline atomic64_add function. It exhibits exactly the same problem that I tried
to report.
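If in doubt about which atomic64 implementation a particular build picked up,
a build-time check along these lines (just a minimal sketch) will break the
compile when the generic library version is in use:

    #include <linux/atomic.h>

    #ifdef CONFIG_GENERIC_ATOMIC64
    #error "atomic64_*() comes from lib/atomic64.c here, not the asm/atomic.h inlines"
    #endif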
Compiler used
-------------
gcc-linaro-arm-linux-gnueabihf-4.7-2013.01-20130125_linux
Test module source
------------------
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/atomic.h>
static inline void my_atomic64_add_original(u64 i, atomic64_t *v)
{
u64 result;
unsigned long tmp;
__asm__ __volatile__("@ atomic64_add\n"
"1: ldrexd %0, %H0, [%3]\n"
" adds %0, %0, %4\n"
" adc %H0, %H0, %H4\n"
" strexd %1, %0, %H0, [%3]\n"
" teq %1, #0\n"
" bne 1b"
: "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
: "r" (&v->counter), "r" (i)
: "cc");
}
static inline void my_atomic64_add_fixed(u64 i, atomic64_t *v)
{
u64 result;
unsigned long tmp;
__asm__ __volatile__("@ atomic64_add\n"
"1: ldrexd %0, %H0, [%3]\n"
#ifndef CONFIG_CPU_BIG_ENDIAN /* little endian */
" adds %0, %0, %4\n"
" adc %H0, %H0, %H4\n"
#else
" adds %H0, %H0, %H4\n"
" adc %0, %0, %4\n"
#endif
" strexd %1, %0, %H0, [%3]\n"
" teq %1, #0\n"
" bne 1b"
: "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
: "r" (&v->counter), "r" (i)
: "cc");
}
atomic64_t a = {0};
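/* every fourth addition of INCREMENT overflows the low 32 bits of the counter */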
#define INCREMENT 0x40000000
void atomic_update_original (void)
{
my_atomic64_add_original(INCREMENT, &a);
}
void atomic_update_fixed (void)
{
my_atomic64_add_fixed(INCREMENT, &a);
}
void test_atomic64_original(void)
{
int i;
printk("original atomic64_add\n");
atomic64_set(&a, 0);
printk("a = %llx\n", a.counter);
for (i = 0; i < 10; i++) {
atomic_update_original();
printk("i = %2d: a = %llx\n", i, a.counter);
}
}
void test_atomic64_fixed(void)
{
int i;
printk("fixed atomic64_add\n");
atomic64_set(&a, 0);
printk("a = %llx\n", a.counter);
for (i = 0; i < 10; i++) {
atomic_update_fixed();
printk("i = %2d: a = %llx\n", i, a.counter);
}
}
int
init_module (void)
{
test_atomic64_original();
test_atomic64_fixed();
return 0;
}
void
cleanup_module(void)
{
}
MODULE_LICENSE("GPL");
Test run
--------
Compile and run the module, and observe that with the original code the counter
is not incremented correctly.
root at genericarmv7ab:~# insmod atomic64.ko
[ 73.041107] original atomic64_add
[ 73.044647] a = 0
[ 73.046691] i = 0: a = 40000000
[ 73.050659] i = 1: a = 80000000
[ 73.054199] i = 2: a = c0000000
[ 73.057983] i = 3: a = 0
[ 73.060852] i = 4: a = 40000000
[ 73.064361] i = 5: a = 80000000
[ 73.067932] i = 6: a = c0000000
[ 73.071441] i = 7: a = 0
[ 73.074310] i = 8: a = 40000000
[ 73.077850] i = 9: a = 80000000
[ 73.081390] fixed atomic64_add
[ 73.084625] a = 0
[ 73.086669] i = 0: a = 40000000
[ 73.090209] i = 1: a = 80000000
[ 73.093749] i = 2: a = c0000000
[ 73.097290] i = 3: a = 100000000
[ 73.100891] i = 4: a = 140000000
[ 73.104522] i = 5: a = 180000000
[ 73.108154] i = 6: a = 1c0000000
[ 73.111755] i = 7: a = 200000000
[ 73.115386] i = 8: a = 240000000
[ 73.119018] i = 9: a = 280000000
Disassemble of original code
----------------------------
(gdb) disassemble atomic_update_original
Dump of assembler code for function atomic_update_original:
0x00000000 <+0>: push {r4} ; (str r4, [sp, #-4]!)
0x00000004 <+4>: mov r3, #1073741824 ; 0x40000000
0x00000008 <+8>: ldr r12, [pc, #32] ; 0x30
<atomic_update_original+48>
0x0000000c <+12>: mov r2, #0
0x00000010 <+16>: ldrexd r0, [r12]
0x00000014 <+20>: adds r0, r0, r2
0x00000018 <+24>: adc r1, r1, r3
0x0000001c <+28>: strexd r4, r0, [r12]
0x00000020 <+32>: teq r4, #0
0x00000024 <+36>: bne 0x10 <atomic_update_original+16>
0x00000028 <+40>: ldmfd sp!, {r4}
0x0000002c <+44>: bx lr
0x00000030 <+48>: andeq r0, r0, r0
End of assembler dump.
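To spell out why that sequence misbehaves on big endian: after ldrexd, r0 holds
the most significant word and r1 the least significant one, so the adds produces
the carry from the high half and the adc consumes it in the low half; the carry
out of the low word is simply lost. Below is a rough userspace model of that
arithmetic (purely illustrative, the function name is mine):

    #include <stdio.h>
    #include <stdint.h>

    /* Models the original asm as executed on a big-endian kernel:
     * the carry is produced by the HIGH-word add and consumed by the
     * LOW-word add, and the carry out of the low word is discarded. */
    static uint64_t broken_be_add(uint64_t v, uint64_t i)
    {
        uint32_t v_hi = v >> 32, v_lo = (uint32_t)v;
        uint32_t i_hi = i >> 32, i_lo = (uint32_t)i;

        uint64_t hi_sum = (uint64_t)v_hi + i_hi;           /* adds r0, r0, r2 */
        uint32_t carry  = (uint32_t)(hi_sum >> 32);
        uint64_t lo_sum = (uint64_t)v_lo + i_lo + carry;   /* adc  r1, r1, r3 */

        return ((uint64_t)(uint32_t)hi_sum << 32) | (uint32_t)lo_sum;
    }

    int main(void)
    {
        uint64_t a = 0;
        int i;

        for (i = 0; i < 4; i++) {
            a = broken_be_add(a, 0x40000000);
            printf("i = %d: a = %llx\n", i, (unsigned long long)a);
        }
        /* prints ..., i = 3: a = 0, matching the module output above */
        return 0;
    }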
Disassemble of fixed code
-------------------------
(gdb) disassemble atomic_update_fixed
Dump of assembler code for function atomic_update_fixed:
0x00000034 <+0>: push {r4} ; (str r4, [sp, #-4]!)
0x00000038 <+4>: mov r3, #1073741824 ; 0x40000000
0x0000003c <+8>: ldr r12, [pc, #32] ; 0x64 <atomic_update_fixed+48>
0x00000040 <+12>: mov r2, #0
0x00000044 <+16>: ldrexd r0, [r12]
0x00000048 <+20>: adds r1, r1, r3
0x0000004c <+24>: adc r0, r0, r2
0x00000050 <+28>: strexd r4, r0, [r12]
0x00000054 <+32>: teq r4, #0
0x00000058 <+36>: bne 0x44 <atomic_update_fixed+16>
0x0000005c <+40>: ldmfd sp!, {r4}
0x00000060 <+44>: bx lr
0x00000064 <+48>: andeq r0, r0, r0
End of assembler dump.
Fixed code explanation
----------------------
- $r2 holds the 4 most significant bytes of the increment (0)
- $r3 holds the 4 least significant bytes of the increment (0x40000000)
- Load the 'long long' at the address in $r12 into $r0 and $r1:
  $r0 gets the 4 most significant bytes (i.e. the 4 bytes at $r12)
  $r1 gets the 4 least significant bytes (i.e. the 4 bytes at $r12+4)
  The loads are big endian, so $r0 and $r1 are read with the proper byte swap
  0x00000044 <+16>: ldrexd r0, [r12]
- Add $r3 to $r1 and store the result in $r1, possibly setting the carry flag;
  this adds the 4 least significant bytes of the 'long long'
  0x00000048 <+20>: adds r1, r1, r3
- Add the most significant parts of the increment and the counter, with carry
  0x0000004c <+24>: adc r0, r0, r2
- Store the result back:
  $r0 goes to the address in $r12
  $r1 goes to $r12+4
  0x00000050 <+28>: strexd r4, r0, [r12]
Ldrd/strd
---------
The issue boils down to the fact that the ldrexd/ldrd and strexd/strd
instructions in big endian mode byte-swap only within each 4-byte word; they do
not swap the register numbers themselves. I.e. ldrexd rN, <addr> puts the
(swapped) 4 bytes at address <addr> into rN, and the (swapped) 4 bytes at
<addr+4> into rN+1.
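A small userspace illustration of that memory layout (a hypothetical example,
just to show which word ldrexd would hand to which register):

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
        union {
            uint64_t v;
            uint32_t w[2];   /* w[0] sits at the lower address, i.e. it is the
                              * word ldrexd would load into rN; w[1] is the
                              * word that would land in rN+1 */
        } u = { .v = 0x0000000140000000ULL };

        /* big endian:    w[0] = 00000001 (MSW), w[1] = 40000000 (LSW)
         * little endian: w[0] = 40000000 (LSW), w[1] = 00000001 (MSW) */
        printf("w[0] = %08x, w[1] = %08x\n", u.w[0], u.w[1]);
        return 0;
    }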
Compiler?
---------
One may wonder whether we have a compiler bug here. And after all, what is the
meaning of %0 vs %H0? I asked my local gcc expert for help. Here is what I got.
Please look at
https://github.com/mirrors/gcc/blob/master/gcc/config/arm/arm.c
and search for 'Q', 'R' and 'H'.
According to that code, '%HN' always names the reg+1 operand, regardless of
whether the system is big or little endian. 'Q' is different: it names the
register holding the least significant word, and 'R' the one holding the most
significant word, taking endianness into account.
Better fix
----------
In fact I now believe the correct fix should use 'Q' for the least significant
4 bytes, 'R' for the most significant 4 bytes, and keep 'H' only in the
load/store instructions.
It should look something like this:
static inline void my_atomic64_add_fixed1(u64 i, atomic64_t *v)
{
u64 result;
unsigned long tmp;
__asm__ __volatile__("@ atomic64_add\n"
"1: ldrexd %0, %H0, [%3]\n"
" adds %Q0, %Q0, %Q4\n"
" adc %R0, %R0, %R4\n"
" strexd %1, %0, %H0, [%3]\n"
" teq %1, #0\n"
" bne 1b"
: "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
: "r" (&v->counter), "r" (i)
: "cc");
}
It is better than my original fix because it does not need conditional
compilation. I've tested it in the module; it works with both LE and BE kernel
variants.
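For completeness, the same operand modifiers should carry over to the other
operations in the same way; here is a sketch of the subtract variant, written
along the same lines but not run through the module test above:
static inline void my_atomic64_sub_fixed1(u64 i, atomic64_t *v)
{
u64 result;
unsigned long tmp;
__asm__ __volatile__("@ atomic64_sub\n"
"1: ldrexd %0, %H0, [%3]\n"
" subs %Q0, %Q0, %Q4\n"
" sbc %R0, %R0, %R4\n"
" strexd %1, %0, %H0, [%3]\n"
" teq %1, #0\n"
" bne 1b"
: "=&r" (result), "=&r" (tmp), "+Qo" (v->counter)
: "r" (&v->counter), "r" (i)
: "cc");
}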
Thanks,
Victor