[PATCH] arm64: support do_csum with neon
Robin Murphy
robin.murphy at arm.com
Fri Jan 12 04:22:50 PST 2018
On 12/01/18 11:53, Chen Zhou wrote:
> On arm64 little-endian CPUs such as the Cortex-A57, the
> NEON-based implementation improves performance by
> about 70% when len is greater than 512.
Um, I don't see the kernel-mode NEON infrastructure being used anywhere
here. Blindly destroying someone else's register context is never going
to end well, regardless of how fast you can do it...
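For reference, the sort of wrapper I'd expect to see looks roughly like
the sketch below (untested; do_csum_neon() stands in for your asm
routine and do_csum_generic() for the generic C version -- both names
are made up here, and the 512-byte cutoff just echoes your own numbers):

    #include <linux/types.h>
    #include <asm/neon.h>
    #include <asm/simd.h>

    /* hypothetical helpers: the NEON asm routine and the C fallback */
    unsigned int do_csum_neon(const unsigned char *buff, size_t len);
    unsigned int do_csum_generic(const unsigned char *buff, size_t len);

    unsigned int do_csum(const unsigned char *buff, size_t len)
    {
            unsigned int sum;

            /*
             * NEON may be unavailable (e.g. in hardirq context, or when
             * the FPSIMD registers are already in kernel use), and isn't
             * worth the save/restore overhead for short buffers anyway.
             */
            if (!may_use_simd() || len < 512)
                    return do_csum_generic(buff, len);

            /* save and restore the task's FPSIMD/NEON register context */
            kernel_neon_begin();
            sum = do_csum_neon(buff, len);
            kernel_neon_end();

            return sum;
    }

Note that even with this in place the scalar path still has to exist,
since may_use_simd() can legitimately refuse at runtime.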
Robin.
> Signed-off-by: Chen Zhou <chenzhou10 at huawei.com>
> ---
> arch/arm64/include/asm/checksum.h | 3 +
> arch/arm64/lib/Makefile | 1 +
> arch/arm64/lib/do_csum.S | 177 ++++++++++++++++++++++++++++++++++++++
> 3 files changed, 181 insertions(+)
> create mode 100644 arch/arm64/lib/do_csum.S
>
> diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h
> index 09f6533..e300782 100644
> --- a/arch/arm64/include/asm/checksum.h
> +++ b/arch/arm64/include/asm/checksum.h
> @@ -26,6 +26,9 @@ static inline __sum16 csum_fold(__wsum csum)
> }
> #define csum_fold csum_fold
>
> +#define do_csum do_csum
> +extern unsigned int do_csum(const unsigned char *, size_t);
> +
> static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
> {
> __uint128_t tmp;
> diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
> index 1a811ec..5b6aa34 100644
> --- a/arch/arm64/lib/Makefile
> +++ b/arch/arm64/lib/Makefile
> @@ -3,6 +3,7 @@ lib-y := bitops.o clear_user.o delay.o copy_from_user.o \
> clear_page.o memchr.o memcpy.o memmove.o memset.o \
> memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \
> strchr.o strrchr.o
> +lib-y += do_csum.o
>
> # Tell the compiler to treat all general purpose registers as
> # callee-saved, which allows for efficient runtime patching of the bl
> diff --git a/arch/arm64/lib/do_csum.S b/arch/arm64/lib/do_csum.S
> new file mode 100644
> index 0000000..8e7b486
> --- /dev/null
> +++ b/arch/arm64/lib/do_csum.S
> @@ -0,0 +1,177 @@
> +/*
> + * Optimized version of the standard do_csum() function
> + *
> + * Parameters:
> + * x0 - address of buffer to checksum (const unsigned char *)
> + * x1 - length of the buffer (int)
> + * Returns:
> + * x0 - the return checksum of the buffer
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/assembler.h>
> +
> +ENTRY(do_csum)
> + ldr x13, =0xffff
> + eor x4, x4, x4
> + eor x5, x5, x5
> + eor v0.16b, v0.16b, v0.16b
> +
> + //len is zero or negative
> + and x6, x1, #0x80000000
> + cmp x6, #0
> + b.gt out
> + cbz w1, out
> +
> + tst x0, #1
> + b.eq addr_not_odd
> +
> + //addr is odd
> + mov x4, #1
> + ldrb w6, [x0], #1 //load a single byte
> +#ifdef __AARCH64EB__
> + and x6, x6, #0xff
> +#else
> + lsl x6, x6, #8
> + and x6, x6, x13
> +#endif
> + add x5, x5, x6
> + sub x1, x1, #1
> +
> +addr_not_odd:
> + cmp x1, #32
> + b.lt len_4
> + cmp x1, #192
> + b.ge len_than_192
> + b do_loop_16
> +
> +len_than_192:
> + ldp q1, q0, [x0], #32
> + ldp q3, q2, [x0], #32
> + ldp q5, q4, [x0], #32
> + sub x1, x1, #96
> +
> +do_loop_96:
> + ldp q7, q6, [x0], #32
> + ldp q9, q8, [x0], #32
> + ldp q11, q10, [x0], #32
> +
> + uaddl v12.4s, v0.4h, v6.4h
> + uaddl2 v13.4s, v0.8h, v6.8h
> +
> + uaddl v14.4s, v1.4h, v7.4h
> + uaddl2 v15.4s, v1.8h, v7.8h
> +
> + uaddl v16.4s, v2.4h, v8.4h
> + uaddl2 v17.4s, v2.8h, v8.8h
> +
> + uaddl v18.4s, v3.4h, v9.4h
> + uaddl2 v19.4s, v3.8h, v9.8h
> +
> + uaddl v20.4s, v4.4h, v10.4h
> + uaddl2 v21.4s, v4.8h, v10.8h
> + uaddl v22.4s, v5.4h, v11.4h
> + uaddl2 v23.4s, v5.8h, v11.8h
> +
> + add v0.4s, v12.4s, v13.4s
> + add v1.4s, v14.4s, v15.4s
> + add v2.4s, v16.4s, v17.4s
> + add v3.4s, v18.4s, v19.4s
> + add v4.4s, v20.4s, v21.4s
> + add v5.4s, v22.4s, v23.4s
> +
> + sub x1, x1, #96
> + cmp x1, #96
> + b.ge do_loop_96
> +
> + add v0.4s, v0.4s, v1.4s
> + add v2.4s, v2.4s, v3.4s
> + add v4.4s, v4.4s, v5.4s
> + add v0.4s, v0.4s, v2.4s
> + add v0.4s, v0.4s, v4.4s //get result
> +
> + cmp x1, #16
> + b.lt get_64
> +
> +do_loop_16:
> + ldr q6, [x0], #16
> + uaddl v24.4s, v0.4h, v6.4h
> + uaddl2 v25.4s, v0.8h, v6.8h
> + add v0.4s, v24.4s, v25.4s
> + sub x1, x1, #16
> + cmp x1, #16
> + b.ge do_loop_16
> +
> +get_64:
> + mov x6, v0.d[0]
> + adds x5, x5, x6 //add low half, set carry
> + mov x6, v0.d[1]
> + adcs x5, x5, x6 //add high half plus carry
> + adc x5, x5, xzr //fold the final carry back in
> +
> +len_4:
> + cmp x1, #4
> + b.lt len_2
> +
> + sub x1, x1, #4
> + ldr w6, [x0], #4
> + and x6, x6, #0xffffffff
> + add x5, x5, x6
> + b len_4
> +
> +len_2:
> + cmp x1, #2
> + b.lt len_1
> + sub x1, x1, #2
> + ldrh w6, [x0], #2
> + and x6, x6, x13
> + add x5, x5, x6
> +
> +len_1:
> + cmp x1, #1
> + b.lt fold_32
> + ldrb w6, [x0], #1 //load the trailing byte
> +#ifdef __AARCH64EB__
> + lsl x6, x6, #8
> + and x6, x6, x13
> +#else
> + and x6, x6, #0xff
> +#endif
> + add x5, x5, x6
> +
> +fold_32:
> + and x9, x5, x13 //[15:0]
> + and x10, x13, x5, lsr #16 //[31:16]
> + and x11, x13, x5, lsr #32 //[47:32]
> + and x12, x13, x5, lsr #48 //[63:48]
> +
> + add x9, x9, x10
> + add x11, x11, x12
> +
> + add x9, x9, x11
> +
> + and x10, x9, x13
> + and x11, x13, x9, lsr #16
> +
> + add x5, x10, x11
> +
> + and x9, x5, x13 //add carry
> + and x10, x13, x5, lsr #16
> + add x5, x9, x10
> +
> + cbz x4, out //addr isn't odd
> +
> + lsr x6, x5, #8
> + and x6, x6, #0xff
> + and x7, x5, #0xff
> + lsl x7, x7, #8
> +
> + orr x5, x6, x7
> +
> +out:
> + mov x0, x5
> + ret
> +ENDPROC(do_csum)
>