[PATCH] arm64: support do_csum with neon
Dave Martin
Dave.Martin at arm.com
Tue Jan 23 03:05:54 PST 2018
On Tue, Jan 16, 2018 at 01:07:01AM +0000, chenzhou wrote:
> Hi Robin,
>
> -----Original Message-----
> From: Robin Murphy [mailto:robin.murphy at arm.com]
> Sent: Friday, January 12, 2018 8:23 PM
> To: chenzhou; catalin.marinas at arm.com; will.deacon at arm.com
> Cc: linux-arm-kernel at lists.infradead.org
> Subject: Re: [PATCH] arm64: support do_csum with neon
>
> On 12/01/18 11:53, Chen Zhou wrote:
> > On arm64 little endian such as the Cortex-A57, the neon based
> > implementation performance increases by about 70% when len is greater
> > than 512.
>
> Um, I don't see the kernel-mode NEON infrastructure being used anywhere here. Blindly destroying someone else's register context is never going to end well, regardless of how fast you can do it...
>
> Robin.
>
> Thank you very much for your review. You're right: I didn't consider whether the
> system supports NEON, and I didn't put kernel_neon_begin() and kernel_neon_end()
> calls around the NEON code. I will fix this up later.
Can do_csum() be called from any context?
Kernel-mode NEON cannot be used from all contexts, and cannot be nested:
* thread context: yes, if hardware supports it (elf_hwcap & HWCAP_ASIMD)
* softirq context: yes, if hardware supports it AND kernel-mode NEON is
not currently in use in the interrupted thread:
check (elf_hwcap & HWCAP_ASIMD) && may_use_simd().
* other contexts (irq, nmi, etc.): no;
  may_use_simd() returns false in these cases.
You will likely need to write some wrapper C code that selects between the
NEON-optimised and C implementations, and does the appropriate runtime
checks.
See arch/arm64/crypto/sha256-glue.c for an example.
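Roughly, and purely as an untested sketch (do_csum_neon() and do_csum_generic()
below are placeholder names for your asm routine and a scalar fallback, and the
512-byte cut-over just mirrors the figure in your commit message), the glue
could look something like this:

#include <linux/types.h>
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>

unsigned int do_csum_neon(const unsigned char *buf, size_t len);    /* NEON asm */
unsigned int do_csum_generic(const unsigned char *buf, size_t len); /* scalar C */

unsigned int do_csum(const unsigned char *buf, size_t len)
{
	unsigned int sum;

	/*
	 * Fall back to the scalar code unless the CPU has Advanced SIMD,
	 * kernel-mode NEON may be used in the current context, and the
	 * buffer is long enough for NEON to be a win.
	 */
	if (!(elf_hwcap & HWCAP_ASIMD) || !may_use_simd() || len < 512)
		return do_csum_generic(buf, len);

	kernel_neon_begin();
	sum = do_csum_neon(buf, len);
	kernel_neon_end();

	return sum;
}

The important part is that kernel_neon_begin()/kernel_neon_end() bracket the
NEON code and that may_use_simd() gates it, so the asm never runs in a context
where it could clobber someone else's FPSIMD state.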
Feel free to ask if you're confused, and Cc me and/or Ard Biesheuvel on
the patches.
Possibly we should write some documentation on kernel_mode_neon()...
Cheers
---Dave
>
> Thanks
> Chen Zhou
>
> > Signed-off-by: Chen Zhou <chenzhou10 at huawei.com>
> > ---
> > arch/arm64/include/asm/checksum.h | 3 +
> > arch/arm64/lib/Makefile | 1 +
> > arch/arm64/lib/do_csum.S | 177 ++++++++++++++++++++++++++++++++++++++
> > 3 files changed, 181 insertions(+)
> > create mode 100644 arch/arm64/lib/do_csum.S
> >
> > diff --git a/arch/arm64/include/asm/checksum.h
> > b/arch/arm64/include/asm/checksum.h
> > index 09f6533..e300782 100644
> > --- a/arch/arm64/include/asm/checksum.h
> > +++ b/arch/arm64/include/asm/checksum.h
> > @@ -26,6 +26,9 @@ static inline __sum16 csum_fold(__wsum csum)
> > }
> > #define csum_fold csum_fold
> >
> > +#define do_csum do_csum
> > +extern unsigned int do_csum(const unsigned char *, size_t);
> > +
> > static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
> > {
> > __uint128_t tmp;
> > diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index
> > 1a811ec..5b6aa34 100644
> > --- a/arch/arm64/lib/Makefile
> > +++ b/arch/arm64/lib/Makefile
> > @@ -3,6 +3,7 @@ lib-y := bitops.o clear_user.o delay.o copy_from_user.o \
> > clear_page.o memchr.o memcpy.o memmove.o memset.o \
> > memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \
> > strchr.o strrchr.o
> > +lib-y += do_csum.o
> >
> > # Tell the compiler to treat all general purpose registers as
> > # callee-saved, which allows for efficient runtime patching of the
> > bl diff --git a/arch/arm64/lib/do_csum.S b/arch/arm64/lib/do_csum.S
> > new file mode 100644 index 0000000..8e7b486
> > --- /dev/null
> > +++ b/arch/arm64/lib/do_csum.S
> > @@ -0,0 +1,177 @@
> > +/*
> > + * Optimized version of the standard do_csum() function
> > + *
> > + * Parameters:
> > + * x0 - address of buffer to checksum (const unsigned char *)
> > + * x1 - length of the buffer (int)
> > + * Returns:
> > + * x0 - the return checksum of the buffer
> > + */
> > +
> > +#include <linux/linkage.h>
> > +#include <asm/assembler.h>
> > +
> > +ENTRY(do_csum)
> > + ldr x13, =0xffff
> > + eor x4, x4, x4
> > + eor x5, x5, x5
> > + eor v0.16b, v0.16b, v0.16b
> > +
> > + //len is zero or negative
> > + and x6, x1, #0x80000000
> > + cmp x6, #0
> > + b.gt out
> > + cbz w1, out
> > +
> > + tst x0, #1
> > + b.eq addr_not_odd
> > +
> > + //addr is odd
> > + mov x4, #1
> > + ldr x6, [x0], #1
> > +#ifdef __AARCH64EB__
> > + and x6, x6, #0xff
> > +#else
> > + lsl x6, x6, #8
> > + and x6, x6, x13
> > +#endif
> > + add x5, x5, x6
> > + sub x1, x1, #1
> > +
> > +addr_not_odd:
> > + cmp x1, #32
> > + b.lt len_4
> > + cmp x1, #192
> > + b.ge len_than_192
> > + b do_loop_16
> > +
> > +len_than_192:
> > + ldp q1, q0, [x0], #32
> > + ldp q3, q2, [x0], #32
> > + ldp q5, q4, [x0], #32
> > + sub x1, x1, #96
> > +
> > +do_loop_96:
> > + ldp q7, q6, [x0], #32
> > + ldp q9, q8, [x0], #32
> > + ldp q11, q10, [x0], #32
> > +
> > + uaddl v12.4s, v0.4h, v6.4h
> > + uaddl2 v13.4s, v0.8h, v6.8h
> > +
> > + uaddl v14.4s, v1.4h, v7.4h
> > + uaddl2 v15.4s, v1.8h, v7.8h
> > +
> > + uaddl v16.4s, v2.4h, v8.4h
> > + uaddl2 v17.4s, v2.8h, v8.8h
> > +
> > + uaddl v18.4s, v3.4h, v9.4h
> > + uaddl2 v19.4s, v3.8h, v9.8h
> > +
> > + uaddl v20.4s, v4.4h, v10.4h
> > + uaddl2 v21.4s, v4.8h, v10.8h
> > + uaddl v22.4s, v5.4h, v11.4h
> > + uaddl2 v23.4s, v5.8h, v11.8h
> > +
> > + add v0.4s, v12.4s, v13.4s
> > + add v1.4s, v14.4s, v15.4s
> > + add v2.4s, v16.4s, v17.4s
> > + add v3.4s, v18.4s, v19.4s
> > + add v4.4s, v20.4s, v21.4s
> > + add v5.4s, v22.4s, v23.4s
> > +
> > + sub x1, x1, #96
> > + cmp x1, #96
> > + b.ge do_loop_96
> > +
> > + add v0.4s, v0.4s, v1.4s
> > + add v2.4s, v2.4s, v3.4s
> > + add v4.4s, v4.4s, v5.4s
> > + add v0.4s, v0.4s, v2.4s
> > + add v0.4s, v0.4s, v4.4s //get result
> > +
> > + cmp x1, #16
> > + b.lt get_64
> > +
> > +do_loop_16:
> > + ldr q6, [x0], #16
> > + uaddl v24.4s, v0.4h, v6.4h
> > + uaddl2 v25.4s, v0.8h, v6.8h
> > + add v0.4s, v24.4s, v25.4s
> > + sub x1, x1, #16
> > + cmp x1, #16
> > + b.ge do_loop_16
> > +
> > +get_64:
> > + mov x6, v0.d[0]
> > + add x5, x5, x6
> > + mov x6, v0.d[1]
> > +
> > + add x5, x5, x6
> > + cmp x5, x6
> > + b.ge len_4
> > + add x5, x5, #1
> > +
> > +len_4:
> > + cmp x1, #4
> > + b.lt len_2
> > +
> > + sub x1, x1, #4
> > + ldr w6, [x0], #4
> > + and x6, x6, #0xffffffff
> > + add x5, x5, x6
> > + b len_4
> > +
> > +len_2:
> > + cmp x1, #2
> > + b.lt len_1
> > + sub x1, x1, #2
> > + ldrh w6, [x0], #2
> > + and x6, x6, x13
> > + add x5, x5, x6
> > +
> > +len_1:
> > + cmp x1, #1
> > + b.lt fold_32
> > + ldr x6, [x0], #1
> > +#ifdef __AARCH64EB__
> > + lsl x6, x6, #8
> > + and x6, x6, x13
> > +#else
> > + and x6, x6, #0xff
> > +#endif
> > + add x5, x5, x6
> > +
> > +fold_32:
> > + and x9, x5, x13 //[15:0]
> > + and x10, x13, x5, lsr #16 //[31:16]
> > + and x11, x13, x5, lsr #32 //[47:32]
> > + and x12, x13, x5, lsr #48 //[63:48]
> > +
> > + add x9, x9, x10
> > + add x11, x11, x12
> > +
> > + add x9, x9, x11
> > +
> > + and x10, x9, x13
> > + and x11, x13, x9, lsr #16
> > +
> > + add x5, x10, x11
> > +
> > + and x9, x5, x13 //add carry
> > + and x10, x13, x5, lsr #16
> > + add x5, x9, x10
> > +
> > + cbz x4, out //addr isn't odd
> > +
> > + lsr x6, x5, #8
> > + and x6, x6, #0xff
> > + and x7, x5, #0xff
> > + lsl x7, x7, #8
> > +
> > + orr x5, x6, x7
> > +
> > +out:
> > + mov x0, x5
> > + ret
> > +ENDPROC(do_csum)
> >
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel