gcc miscompiles csum_tcpudp_magic() on ARMv5
Russell King - ARM Linux
linux at arm.linux.org.uk
Thu Dec 12 11:47:48 EST 2013
On Thu, Dec 12, 2013 at 05:04:26PM +0100, Willy Tarreau wrote:
> On Thu, Dec 12, 2013 at 03:41:10PM +0000, Russell King - ARM Linux wrote:
> > One of the issues here is that internally, GCC tracks the "machine mode"
> > of a register, where "mode" basically means the type of register it is.
> > In this case, the register will be in "half integer" mode when it is
> > passed into the assembler, and gcc does _not_ promote it.
> >
> > We can see this in the RTL:
> >
> > (insn 11 10 12 3 ../t.c:9 (set (reg:SI 143 [ sum ])
> >         (asm_operands:SI ("mov %0, %1") ("=&r") 0 [
> >                 (subreg:HI (reg:SI 148) 0)
> >             ]
> >             [
> >                 (asm_input:HI ("r") (null):0)
> >             ]
> >             [] ../t.c:12)) -1 (nil))
> >
> > Note the "HI" for the input register 148. What this means is that the
> > compiler "knows" that it's supposed to be a 16-bit quantity, has
> > recorded that fact, and _will_ truncate it down to 16 bits when it needs
> > to be promoted.
> >
> > Since the asm() only asks for a "register", that's what we get - a
> > register which _happens_ to be in HI mode containing the value. However,
> > there's no way to tell that from the assembly code itself (GCC doesn't
> > have the facility to do conditional assembly generation depending on
> > the argument type passed into it.)
>
> OK so that means that it's just like having true 16-bit registers except
> that it's just a temporary representation and not a feature of the CPU.
> The compiler has the right to expect the asm instructions to only use
> the lower 16 bits and has no way to verify that.
Quite.
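To make the hazard concrete, here's a minimal sketch (my own illustration,
not the kernel code) of an asm() that reads all 32 bits of an operand the
compiler is only tracking as a 16-bit HImode value:

	/* Hypothetical example: "len" is HImode, so only bits 0..15 of the
	 * register handed to the asm are guaranteed to hold the value; the
	 * upper 16 bits may contain anything.  The asm nevertheless uses
	 * the whole register, which is exactly the mismatch described in
	 * the quote above. */
	static inline unsigned int add_len(unsigned int sum, unsigned short len)
	{
		__asm__("add %0, %1, %2"	/* consumes all 32 bits of %2 */
			: "=r" (sum)
			: "r" (sum), "r" (len));	/* only 16 bits defined */
		return sum;
	}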
> Then changing the type of the function argument would probably be safer!
Actually, I think we can do a bit better with this code. We really don't
need much of this messing around here; we can combine some of these steps.
We have:

	16-bit protocol in host endian
	16-bit length in host endian

and we need to combine them into a 32-bit checksum which is subsequently
folded down to 16 bits by adding the top and bottom halves.
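For reference, the fold step is roughly the following in plain C - a sketch
of mine, not the kernel's csum_fold() (which additionally complements the
result):

	#include <stdint.h>

	/* 32-to-16-bit fold: add the top and bottom halves, then absorb
	 * the carry that the addition itself may generate. */
	static uint16_t fold32(uint32_t sum)
	{
		sum = (sum >> 16) + (sum & 0xffff);
		sum += sum >> 16;	/* the add above can carry into bit 16 */
		return (uint16_t)sum;
	}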
Now, what we can do is this:

1. Construct a combined 32-bit protocol and length:

	unsigned lenprot = len | proto << 16;

2. Pass this into the assembly thusly:
	__asm__(
	"adds %0, %1, %2 @ csum_tcpudp_nofold \n\t"
	"adcs %0, %0, %3 \n\t"
#ifdef __ARMEB__
	"adcs %0, %0, %4 \n\t"
#else
	"adcs %0, %0, %4, ror #8 \n\t"
#endif
	"adc %0, %0, #0"
	: "=&r"(sum)
	: "r" (sum), "r" (daddr), "r" (saddr), "r" (lenprot)
	: "cc");
with no swabbing at this stage. Well, where do we get the endian
conversion? See that "ror #8" - that's a 32-bit rotate by 8 bits. As
these are two 16-bit quantities, we end up with this:
original:

	31..24	23..16	15..8	7..0
	pro_h	pro_l	len_h	len_l

accumulated:

	31..24	23..16	15..8	7..0
	len_l	pro_h	pro_l	len_h

And now when we fold it down to 16 bits:

	15..8	7..0
	len_l	pro_h
	pro_l	len_h
There we go - the low and high bytes end up correctly placed: each byte
lands in the byte lane it would occupy in htons(len) + htons(proto), and
for a one's-complement sum that is all that matters. Now, the advantage
of having "len" and "proto" combined by the compiler is that the compiler
itself can make the decision about how to combine these two 16-bit HI-mode
quantities in the most efficient way.
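As a quick sanity check of the rotate trick (my own little-endian test
program, not part of the patch), folding the rotated combined value gives
the same result as folding htons(len) + htons(proto) computed separately:

	#include <stdint.h>
	#include <stdio.h>
	#include <arpa/inet.h>

	static uint16_t fold32(uint32_t sum)
	{
		sum = (sum & 0xffff) + (sum >> 16);
		sum = (sum & 0xffff) + (sum >> 16);	/* absorb any carry */
		return (uint16_t)sum;
	}

	int main(void)
	{
		uint16_t len = 0x1234, proto = 0x0011;	/* arbitrary values */
		uint32_t lenprot = len | (uint32_t)proto << 16;
		uint32_t rotated = (lenprot >> 8) | (lenprot << 24);	/* ror #8 */

		/* Both lines print 4512 on a little-endian host. */
		printf("%04x\n", (unsigned)fold32(rotated));
		printf("%04x\n", (unsigned)fold32((uint32_t)htons(len) + htons(proto)));
		return 0;
	}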
Here are some examples from udp.c - I have another optimisation in here
too (the constant-zero "sum" special case visible in the patch below)
which makes this code sequence slightly shorter. Some intervening
instructions have been omitted.
	mov r6, r6, asl #16 @ tmp204, len,
	mov r6, r6, lsr #16 @ tmp203, tmp204,
	orr r6, r6, #1114112 @ tmp205, tmp203,
@ 92 "/home/rmk/git/linux-rmk/arch/arm/include/asm/checksum.h" 1
	adds r3, r8, r7 @ csum_tcpudp_nofold0 @ sum, dst, src
	adcs r3, r3, r6, ror #8 @ sum, tmp205
	adc r3, r3, #0 @ sum

	mov r6, r6, asl #16 @ tmp220, len,
	mov r6, r6, lsr #16 @ tmp219, tmp220,
	orr r6, r6, #1114112 @ tmp221, tmp219,
@ 104 "/home/rmk/git/linux-rmk/arch/arm/include/asm/checksum.h" 1
	adds r3, r0, r8 @ csum_tcpudp_nofold @ sum,, dst
	adcs r3, r3, r7 @ sum, src
	adcs r3, r3, r6, ror #8 @ sum, tmp221
	adc r3, r3, #0 @ sum
Note this one sourcing an 8-bit protocol value.
	ldrb r3, [r7, #269] @ zero_extendqisi2 @ tmp321, sk_5->sk_protocol
	orr sl, sl, r3, asl #16 @, tmp324, D.48540, tmp321,
@ 104 "/home/rmk/git/linux-rmk/arch/arm/include/asm/checksum.h" 1
	adds r3, r0, r2 @ csum_tcpudp_nofold @ sum, csum, fl4_19(D)->daddr
	adcs r3, r3, r1 @ sum, fl4_19(D)->saddr
	adcs r3, r3, sl, ror #8 @ sum, tmp324
	adc r3, r3, #0 @ sum

	ldrh lr, [r4, #76] @ tmp503, skb_10(D)->len
	orr lr, lr, r7, asl #16 @, tmp505, tmp503, proto,
@ 104 "/home/rmk/git/linux-rmk/arch/arm/include/asm/checksum.h" 1
	adds r1, sl, r0 @ csum_tcpudp_nofold @ sum, skb_10(D)->D.22802.csum, iph_354->daddr
	adcs r1, r1, ip @ sum, iph_354->saddr
	adcs r1, r1, lr, ror #8 @ sum, tmp505
	adc r1, r1, #0 @ sum

	ldrh r0, [r4, #76] @ tmp529, skb_10(D)->len
	orr r0, r0, r7, asl #16 @, tmp531, tmp529, proto,
@ 92 "/home/rmk/git/linux-rmk/arch/arm/include/asm/checksum.h" 1
	adds r3, r2, r1 @ csum_tcpudp_nofold0 @ sum, iph_354->daddr, iph_354->saddr
	adcs r3, r3, r0, ror #8 @ sum, tmp531
	adc r3, r3, #0 @ sum
This one is a little more complicated because it uses skb->len - we
can see the narrowing cast being performed:
	ldr r0, [r4, #76] @ skb_1->len, skb_1->len
	rsb r0, r9, r0 @ tmp335, D.53494, skb_1->len
	mov r0, r0, asl #16 @ tmp337, tmp335,
	mov r0, r0, lsr #16 @ tmp336, tmp337,
	orr r0, r0, #1114112 @ tmp338, tmp336,
@ 92 "/home/rmk/git/linux-rmk/arch/arm/include/asm/checksum.h" 1
	adds r3, r2, r1 @ csum_tcpudp_nofold0 @ sum, iph_252->daddr, iph_252->saddr
	adcs r3, r3, r0, ror #8 @ sum, tmp338
	adc r3, r3, #0 @ sum
Looking at whether it's better to shift len or proto into the upper 16 bits
reveals no real advantage to either: we end up with the same overall number
of instructions with the UDP and TCP code combined, and individually there's
only a one-instruction difference between them.
So here's a patch for Maxime to try - I've not run it yet but it seems to
do the right thing by code inspection.
diff --git a/arch/arm/include/asm/checksum.h b/arch/arm/include/asm/checksum.h
index 6dcc16430868..523315115478 100644
--- a/arch/arm/include/asm/checksum.h
+++ b/arch/arm/include/asm/checksum.h
@@ -87,19 +87,34 @@ static inline __wsum
 csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len,
 		   unsigned short proto, __wsum sum)
 {
-	__asm__(
-	"adds %0, %1, %2 @ csum_tcpudp_nofold \n\
-	adcs %0, %0, %3 \n"
+	u32 lenprot = len | proto << 16;
+
+	if (__builtin_constant_p(sum) && sum == 0) {
+		__asm__(
+		"adds %0, %1, %2 @ csum_tcpudp_nofold0 \n\t"
 #ifdef __ARMEB__
-	"adcs %0, %0, %4 \n"
+		"adcs %0, %0, %3 \n\t"
 #else
-	"adcs %0, %0, %4, lsl #8 \n"
+		"adcs %0, %0, %3, ror #8 \n\t"
 #endif
-	"adcs %0, %0, %5 \n\
-	adc %0, %0, #0"
-	: "=&r"(sum)
-	: "r" (sum), "r" (daddr), "r" (saddr), "r" (len), "Ir" (htons(proto))
-	: "cc");
+		"adc %0, %0, #0"
+		: "=&r" (sum)
+		: "r" (daddr), "r" (saddr), "r" (lenprot)
+		: "cc");
+	} else {
+		__asm__(
+		"adds %0, %1, %2 @ csum_tcpudp_nofold \n\t"
+		"adcs %0, %0, %3 \n\t"
+#ifdef __ARMEB__
+		"adcs %0, %0, %4 \n\t"
+#else
+		"adcs %0, %0, %4, ror #8 \n\t"
+#endif
+		"adc %0, %0, #0"
+		: "=&r"(sum)
+		: "r" (sum), "r" (daddr), "r" (saddr), "r" (lenprot)
+		: "cc");
+	}
 	return sum;
 }
 /*
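To tie this back to the function named in the subject: csum_tcpudp_magic()
is just the complemented fold of the 32-bit csum_tcpudp_nofold() result
(that is how the generic checksum code expresses it). A standalone plain-C
sketch of that relationship - my own illustration, not the ARM
implementation:

	#include <stdint.h>

	/* fold32() as sketched earlier, repeated so this stands alone. */
	static uint16_t fold32(uint32_t sum)
	{
		sum = (sum & 0xffff) + (sum >> 16);
		sum = (sum & 0xffff) + (sum >> 16);
		return (uint16_t)sum;
	}

	/* The "magic" variant is the one's complement of the folded "nofold"
	 * running sum; nofold32 stands in for whatever accumulated the
	 * 32-bit pseudo-header sum (e.g. the asm in the patch above). */
	static uint16_t csum_magic_from_nofold(uint32_t nofold32)
	{
		return (uint16_t)~fold32(nofold32);
	}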