[PATCH] arm64: support do_csum with neon
chenzhou
chenzhou10 at huawei.com
Mon Jan 15 17:07:01 PST 2018
Hi Robin,
-----Original Message-----
From: Robin Murphy [mailto:robin.murphy at arm.com]
Sent: Friday, January 12, 2018 8:23 PM
To: chenzhou; catalin.marinas at arm.com; will.deacon at arm.com
Cc: linux-arm-kernel at lists.infradead.org
Subject: Re: [PATCH] arm64: support do_csum with neon
On 12/01/18 11:53, Chen Zhou wrote:
> On little-endian arm64 cores such as the Cortex-A57, the NEON-based
> implementation improves performance by about 70% when len is greater
> than 512.
Um, I don't see the kernel-mode NEON infrastructure being used anywhere here. Blindly destroying someone else's register context is never going to end well, regardless of how fast you can do it...
Robin.
Thank you very much for your review. You're right: I didn't consider whether the
system supports NEON, and I didn't put kernel_neon_begin()/kernel_neon_end()
calls around the NEON code. I will fix this up later.
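
Roughly, I imagine the rework looking something like the sketch below. This is only
an illustration, not the final patch: do_csum_neon() is a placeholder name for the
renamed assembly routine and do_csum_generic() stands in for the plain C fallback.
The wrapper only takes the NEON path when kernel-mode SIMD may actually be used:

#include <asm/neon.h>		/* kernel_neon_begin(), kernel_neon_end() */
#include <asm/simd.h>		/* may_use_simd() */

/* placeholder prototypes, not part of the posted patch */
unsigned int do_csum_neon(const unsigned char *buff, int len);
unsigned int do_csum_generic(const unsigned char *buff, int len);

unsigned int do_csum(const unsigned char *buff, int len)
{
	unsigned int sum;

	/* short buffers, or contexts where the FPSIMD state must not be touched */
	if (len < 512 || !may_use_simd())
		return do_csum_generic(buff, len);

	kernel_neon_begin();
	sum = do_csum_neon(buff, len);
	kernel_neon_end();

	return sum;
}
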
Thanks
Chen Zhou
> Signed-off-by: Chen Zhou <chenzhou10 at huawei.com>
> ---
> arch/arm64/include/asm/checksum.h | 3 +
> arch/arm64/lib/Makefile | 1 +
> arch/arm64/lib/do_csum.S | 177 ++++++++++++++++++++++++++++++++++++++
> 3 files changed, 181 insertions(+)
> create mode 100644 arch/arm64/lib/do_csum.S
>
> diff --git a/arch/arm64/include/asm/checksum.h b/arch/arm64/include/asm/checksum.h
> index 09f6533..e300782 100644
> --- a/arch/arm64/include/asm/checksum.h
> +++ b/arch/arm64/include/asm/checksum.h
> @@ -26,6 +26,9 @@ static inline __sum16 csum_fold(__wsum csum)
> }
> #define csum_fold csum_fold
>
> +#define do_csum do_csum
> +extern unsigned int do_csum(const unsigned char *, size_t);
> +
> static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
> {
> __uint128_t tmp;
> diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
> index 1a811ec..5b6aa34 100644
> --- a/arch/arm64/lib/Makefile
> +++ b/arch/arm64/lib/Makefile
> @@ -3,6 +3,7 @@ lib-y := bitops.o clear_user.o delay.o copy_from_user.o \
> clear_page.o memchr.o memcpy.o memmove.o memset.o \
> memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \
> strchr.o strrchr.o
> +lib-y += do_csum.o
>
> # Tell the compiler to treat all general purpose registers as
> # callee-saved, which allows for efficient runtime patching of the bl
> diff --git a/arch/arm64/lib/do_csum.S b/arch/arm64/lib/do_csum.S
> new file mode 100644
> index 0000000..8e7b486
> --- /dev/null
> +++ b/arch/arm64/lib/do_csum.S
> @@ -0,0 +1,177 @@
> +/*
> + * Optimized version of the standard do_csum() function
> + *
> + * Parameters:
> + * x0 - address of buffer to checksum (const unsigned char *)
> + * x1 - length of the buffer (int)
> + * Returns:
> + * x0 - the return checksum of the buffer
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/assembler.h>
> +
> +ENTRY(do_csum)
> + ldr x13, =0xffff
> + eor x4, x4, x4
> + eor x5, x5, x5
> + eor v0.16b, v0.16b, v0.16b
> +
> + //len is zero or negative
> + and x6, x1, #0x80000000
> + cmp x6, #0
> + b.gt out
> + cbz w1, out
> +
> + tst x0, #1
> + b.eq addr_not_odd
> +
> + //addr is odd
> + mov x4, #1
> + ldrb w6, [x0], #1
> +#ifdef __AARCH64EB__
> + and x6, x6, #0xff
> +#else
> + lsl x6, x6, #8
> + and x6, x6, x13
> +#endif
> + add x5, x5, x6
> + sub x1, x1, #1
> +
> +addr_not_odd:
> + cmp x1, #32
> + b.lt len_4
> + cmp x1, #192
> + b.ge len_than_192
> + b do_loop_16
> +
> +len_than_192:
> + ldp q1, q0, [x0], #32
> + ldp q3, q2, [x0], #32
> + ldp q5, q4, [x0], #32
> + sub x1, x1, #96
> +
> +do_loop_96:
> + ldp q7, q6, [x0], #32
> + ldp q9, q8, [x0], #32
> + ldp q11, q10, [x0], #32
> +
> + uaddl v12.4s, v0.4h, v6.4h
> + uaddl2 v13.4s, v0.8h, v6.8h
> +
> + uaddl v14.4s, v1.4h, v7.4h
> + uaddl2 v15.4s, v1.8h, v7.8h
> +
> + uaddl v16.4s, v2.4h, v8.4h
> + uaddl2 v17.4s, v2.8h, v8.8h
> +
> + uaddl v18.4s, v3.4h, v9.4h
> + uaddl2 v19.4s, v3.8h, v9.8h
> +
> + uaddl v20.4s, v4.4h, v10.4h
> + uaddl2 v21.4s, v4.8h, v10.8h
> + uaddl v22.4s, v5.4h, v11.4h
> + uaddl2 v23.4s, v5.8h, v11.8h
> +
> + add v0.4s, v12.4s, v13.4s
> + add v1.4s, v14.4s, v15.4s
> + add v2.4s, v16.4s, v17.4s
> + add v3.4s, v18.4s, v19.4s
> + add v4.4s, v20.4s, v21.4s
> + add v5.4s, v22.4s, v23.4s
> +
> + sub x1, x1, #96
> + cmp x1, #96
> + b.ge do_loop_96
> +
> + add v0.4s, v0.4s, v1.4s
> + add v2.4s, v2.4s, v3.4s
> + add v4.4s, v4.4s, v5.4s
> + add v0.4s, v0.4s, v2.4s
> + add v0.4s, v0.4s, v4.4s //get result
> +
> + cmp x1, #16
> + b.lt get_64
> +
> +do_loop_16:
> + ldr q6, [x0], #16
> + uaddl v24.4s, v0.4h, v6.4h
> + uaddl2 v25.4s, v0.8h, v6.8h
> + add v0.4s, v24.4s, v25.4s
> + sub x1, x1, #16
> + cmp x1, #16
> + b.ge do_loop_16
> +
> +get_64:
> + mov x6, v0.d[0]
> + add x5, x5, x6
> + mov x6, v0.d[1]
> +
> + add x5, x5, x6
> + cmp x5, x6
> + b.hs len_4 //no carry from the add (unsigned compare)
> + add x5, x5, #1
> +
> +len_4:
> + cmp x1, #4
> + b.lt len_2
> +
> + sub x1, x1, #4
> + ldr w6, [x0], #4
> + and x6, x6, #0xffffffff
> + add x5, x5, x6
> + b len_4
> +
> +len_2:
> + cmp x1, #2
> + b.lt len_1
> + sub x1, x1, #2
> + ldrh w6, [x0], #2
> + and x6, x6, x13
> + add x5, x5, x6
> +
> +len_1:
> + cmp x1, #1
> + b.lt fold_32
> + ldrb w6, [x0], #1
> +#ifdef __AARCH64EB__
> + lsl x6, x6, #8
> + and x6, x6, x13
> +#else
> + and x6, x6, #0xff
> +#endif
> + add x5, x5, x6
> +
> +fold_32:
> + and x9, x5, x13 //[15:0]
> + and x10, x13, x5, lsr #16 //[31:16]
> + and x11, x13, x5, lsr #32 //[47:32]
> + and x12, x13, x5, lsr #48 //[63:48]
> +
> + add x9, x9, x10
> + add x11, x11, x12
> +
> + add x9, x9, x11
> +
> + and x10, x9, x13
> + and x11, x13, x9, lsr #16
> +
> + add x5, x10, x11
> +
> + and x9, x5, x13 //add carry
> + and x10, x13, x5, lsr #16
> + add x5, x9, x10
> +
> + cbz x4, out //addr isn't odd
> +
> + lsr x6, x5, #8
> + and x6, x6, #0xff
> + and x7, x5, #0xff
> + lsl x7, x7, #8
> +
> + orr x5, x6, x7
> +
> +out:
> + mov x0, x5
> + ret
> +ENDPROC(do_csum)
>
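
For reference, the fold_32 sequence above is just the usual fold of the 64-bit
accumulator down to 16 bits with the carries wrapped around; in C it corresponds
roughly to the helper below (illustration only, fold_to_16() is not an existing
kernel function):

static unsigned int fold_to_16(unsigned long sum)
{
	/* add the four 16-bit chunks of the 64-bit accumulator */
	sum = (sum & 0xffff) + ((sum >> 16) & 0xffff) +
	      ((sum >> 32) & 0xffff) + ((sum >> 48) & 0xffff);
	/* two more folds absorb the remaining carries into bits [15:0] */
	sum = (sum & 0xffff) + (sum >> 16);
	sum = (sum & 0xffff) + (sum >> 16);
	return sum;
}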