[PATCH] arm64: support do_csum with neon
Dave Martin
Dave.Martin at arm.com
Tue Jan 23 03:05:54 PST 2018
On Tue, Jan 16, 2018 at 01:07:01AM +0000, chenzhou wrote:
> Hi Robin,
>
> -----Original Message-----
> From: Robin Murphy [mailto:robin.murphy at arm.com]
> Sent: Friday, January 12, 2018 8:23 PM
> To: chenzhou; catalin.marinas at arm.com; will.deacon at arm.com
> Cc: linux-arm-kernel at lists.infradead.org
> Subject: Re: [PATCH] arm64: support do_csum with neon
>
> On 12/01/18 11:53, Chen Zhou wrote:
> > On arm64 little endian such as the Cortex-A57, the neon based
> > implementation performance increases by about 70% when len is greater
> > than 512.
>
> Um, I don't see the kernel-mode NEON infrastructure being used anywhere here. Blindly destroying someone else's register context is never going to end well, regardless of how fast you can do it...
>
> Robin.
>
> Thank you very much for your review. You're right: I didn't consider whether the
> system supports NEON, and I didn't put kernel_neon_begin() and kernel_neon_end()
> calls around the NEON code. I will fix this up later.
Can do_csum() be called from any context?
Kernel-mode NEON cannot be used from all contexts, and cannot be nested:
* thread context: yes, if hardware supports it (elf_hwcap & HWCAP_ASIMD)
* softirq context: yes, if hardware supports it AND kernel-mode NEON is
not currently in use in the interrupted thread:
check (elf_hwcap & HWCAP_ASIMD) && may_use_simd().
* other contexts (irq, nmi, etc.): no;
  may_use_simd() returns false in these cases.
You will likely need to write some wrapper C code that selects between the
NEON-optimised and C implementations, and does the appropriate runtime
checks.
See arch/arm64/crypto/sha256-glue.c for an example.
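Roughly, and purely as an untested sketch (do_csum_neon() and do_csum_generic()
below are placeholder names for your asm routine and a scalar fallback, and the
512-byte cut-over just mirrors the figure in your commit message), the glue
could look something like this:

#include <linux/types.h>
#include <asm/hwcap.h>
#include <asm/neon.h>
#include <asm/simd.h>

unsigned int do_csum_neon(const unsigned char *buf, size_t len);    /* NEON asm */
unsigned int do_csum_generic(const unsigned char *buf, size_t len); /* scalar C */

unsigned int do_csum(const unsigned char *buf, size_t len)
{
	unsigned int sum;

	/*
	 * Fall back to the scalar code unless the CPU has Advanced SIMD,
	 * kernel-mode NEON may be used in the current context, and the
	 * buffer is long enough for NEON to be a win.
	 */
	if (!(elf_hwcap & HWCAP_ASIMD) || !may_use_simd() || len < 512)
		return do_csum_generic(buf, len);

	kernel_neon_begin();
	sum = do_csum_neon(buf, len);
	kernel_neon_end();

	return sum;
}

The important part is that kernel_neon_begin()/kernel_neon_end() bracket the
NEON code and that may_use_simd() gates it, so the asm never runs in a context
where it could clobber someone else's FPSIMD state.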
Feel free to ask if you're confused, and Cc me and/or Ard Biesheuvel on
the patches.
Possibly we should write some documentation on kernel_mode_neon()...
Cheers
---Dave
>
> Thanks
> Chen Zhou
>
> > Signed-off-by: Chen Zhou <chenzhou10 at huawei.com>
> > ---
> > arch/arm64/include/asm/checksum.h | 3 +
> > arch/arm64/lib/Makefile | 1 +
> > arch/arm64/lib/do_csum.S | 177 ++++++++++++++++++++++++++++++++++++++
> > 3 files changed, 181 insertions(+)
> > create mode 100644 arch/arm64/lib/do_csum.S
> >
> > diff --git a/arch/arm64/include/asm/checksum.h
> > b/arch/arm64/include/asm/checksum.h
> > index 09f6533..e300782 100644
> > --- a/arch/arm64/include/asm/checksum.h
> > +++ b/arch/arm64/include/asm/checksum.h
> > @@ -26,6 +26,9 @@ static inline __sum16 csum_fold(__wsum csum)
> > }
> > #define csum_fold csum_fold
> >
> > +#define do_csum do_csum
> > +extern unsigned int do_csum(const unsigned char *, size_t);
> > +
> > static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl)
> > {
> > __uint128_t tmp;
> > diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile index
> > 1a811ec..5b6aa34 100644
> > --- a/arch/arm64/lib/Makefile
> > +++ b/arch/arm64/lib/Makefile
> > @@ -3,6 +3,7 @@ lib-y := bitops.o clear_user.o delay.o copy_from_user.o \
> > clear_page.o memchr.o memcpy.o memmove.o memset.o \
> > memcmp.o strcmp.o strncmp.o strlen.o strnlen.o \
> > strchr.o strrchr.o
> > +lib-y += do_csum.o
> >
> > # Tell the compiler to treat all general purpose registers as
> > # callee-saved, which allows for efficient runtime patching of the
> > bl diff --git a/arch/arm64/lib/do_csum.S b/arch/arm64/lib/do_csum.S
> > new file mode 100644 index 0000000..8e7b486
> > --- /dev/null
> > +++ b/arch/arm64/lib/do_csum.S
> > @@ -0,0 +1,177 @@
> > +/*
> > + * Optimized version of the standard do_csum() function
> > + *
> > + * Parameters:
> > + * x0 - address of buffer to checksum (const unsigned char *)
> > + * x1 - length of the buffer (int)
> > + * Returns:
> > + * x0 - the return checksum of the buffer
> > + */
> > +
> > +#include <linux/linkage.h>
> > +#include <asm/assembler.h>
> > +
> > +ENTRY(do_csum)
> > + ldr x13, =0xffff
> > + eor x4, x4, x4
> > + eor x5, x5, x5
> > + eor v0.16b, v0.16b, v0.16b
> > +
> > + //len is zero or negative
> > + and x6, x1, #0x80000000
> > + cmp x6, #0
> > + b.gt out
> > + cbz w1, out
> > +
> > + tst x0, #1
> > + b.eq addr_not_odd
> > +
> > + //addr is odd
> > + mov x4, #1
> > + ldr x6, [x0], #1
> > +#ifdef __AARCH64EB__
> > + and x6, x6, #0xff
> > +#else
> > + lsl x6, x6, #8
> > + and x6, x6, x13
> > +#endif
> > + add x5, x5, x6
> > + sub x1, x1, #1
> > +
> > +addr_not_odd:
> > + cmp x1, #32
> > + b.lt len_4
> > + cmp x1, #192
> > + b.ge len_than_192
> > + b do_loop_16
> > +
> > +len_than_192:
> > + ldp q1, q0, [x0], #32
> > + ldp q3, q2, [x0], #32
> > + ldp q5, q4, [x0], #32
> > + sub x1, x1, #96
> > +
> > +do_loop_96:
> > + ldp q7, q6, [x0], #32
> > + ldp q9, q8, [x0], #32
> > + ldp q11, q10, [x0], #32
> > +
> > + uaddl v12.4s, v0.4h, v6.4h
> > + uaddl2 v13.4s, v0.8h, v6.8h
> > +
> > + uaddl v14.4s, v1.4h, v7.4h
> > + uaddl2 v15.4s, v1.8h, v7.8h
> > +
> > + uaddl v16.4s, v2.4h, v8.4h
> > + uaddl2 v17.4s, v2.8h, v8.8h
> > +
> > + uaddl v18.4s, v3.4h, v9.4h
> > + uaddl2 v19.4s, v3.8h, v9.8h
> > +
> > + uaddl v20.4s, v4.4h, v10.4h
> > + uaddl2 v21.4s, v4.8h, v10.8h
> > + uaddl v22.4s, v5.4h, v11.4h
> > + uaddl2 v23.4s, v5.8h, v11.8h
> > +
> > + add v0.4s, v12.4s, v13.4s
> > + add v1.4s, v14.4s, v15.4s
> > + add v2.4s, v16.4s, v17.4s
> > + add v3.4s, v18.4s, v19.4s
> > + add v4.4s, v20.4s, v21.4s
> > + add v5.4s, v22.4s, v23.4s
> > +
> > + sub x1, x1, #96
> > + cmp x1, #96
> > + b.ge do_loop_96
> > +
> > + add v0.4s, v0.4s, v1.4s
> > + add v2.4s, v2.4s, v3.4s
> > + add v4.4s, v4.4s, v5.4s
> > + add v0.4s, v0.4s, v2.4s
> > + add v0.4s, v0.4s, v4.4s //get result
> > +
> > + cmp x1, #16
> > + b.lt get_64
> > +
> > +do_loop_16:
> > + ldr q6, [x0], #16
> > + uaddl v24.4s, v0.4h, v6.4h
> > + uaddl2 v25.4s, v0.8h, v6.8h
> > + add v0.4s, v24.4s, v25.4s
> > + sub x1, x1, #16
> > + cmp x1, #16
> > + b.ge do_loop_16
> > +
> > +get_64:
> > + mov x6, v0.d[0]
> > + add x5, x5, x6
> > + mov x6, v0.d[1]
> > +
> > + add x5, x5, x6
> > + cmp x5, x6
> > + b.ge len_4
> > + add x5, x5, #1
> > +
> > +len_4:
> > + cmp x1, #4
> > + b.lt len_2
> > +
> > + sub x1, x1, #4
> > + ldr w6, [x0], #4
> > + and x6, x6, #0xffffffff
> > + add x5, x5, x6
> > + b len_4
> > +
> > +len_2:
> > + cmp x1, #2
> > + b.lt len_1
> > + sub x1, x1, #2
> > + ldrh w6, [x0], #2
> > + and x6, x6, x13
> > + add x5, x5, x6
> > +
> > +len_1:
> > + cmp x1, #1
> > + b.lt fold_32
> > + ldr x6, [x0], #1
> > +#ifdef __AARCH64EB__
> > + lsl x6, x6, #8
> > + and x6, x6, x13
> > +#else
> > + and x6, x6, #0xff
> > +#endif
> > + add x5, x5, x6
> > +
> > +fold_32:
> > + and x9, x5, x13 //[15:0]
> > + and x10, x13, x5, lsr #16 //[31:16]
> > + and x11, x13, x5, lsr #32 //[47:32]
> > + and x12, x13, x5, lsr #48 //[63:48]
> > +
> > + add x9, x9, x10
> > + add x11, x11, x12
> > +
> > + add x9, x9, x11
> > +
> > + and x10, x9, x13
> > + and x11, x13, x9, lsr #16
> > +
> > + add x5, x10, x11
> > +
> > + and x9, x5, x13 //add carry
> > + and x10, x13, x5, lsr #16
> > + add x5, x9, x10
> > +
> > + cbz x4, out //addr isn't odd
> > +
> > + lsr x6, x5, #8
> > + and x6, x6, #0xff
> > + and x7, x5, #0xff
> > + lsl x7, x7, #8
> > +
> > + orr x5, x6, x7
> > +
> > +out:
> > + mov x0, x5
> > + ret
> > +ENDPROC(do_csum)
> >
> _______________________________________________
> linux-arm-kernel mailing list
> linux-arm-kernel at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/linux-arm-kernel