[PATCHv2] arm: crypto: Add optimized SHA-256/224
Jean-Christophe PLAGNIOL-VILLARD
plagnioj at jcrosoft.com
Tue Mar 24 05:27:02 PDT 2015
On 13:50 Mon 23 Mar , Sami Tolvanen wrote:
> Add Andy Polyakov's optimized assembly and NEON implementations for
> SHA-256/224.
>
> The sha256-armv4.pl script for generating the assembly code is from
> OpenSSL commit 2ecd32a1f8f0643ae7b38f59bbaf9f0d6ef326fe.
>
> Compared to sha256-generic these implementations have the following
> tcrypt speed improvements on Motorola Nexus 6 (Snapdragon 805):
>
> bs b/u sha256-neon sha256-asm
> 16 16 x1.32 x1.19
> 64 16 x1.27 x1.15
> 64 64 x1.36 x1.20
> 256 16 x1.22 x1.11
> 256 64 x1.36 x1.19
> 256 256 x1.59 x1.23
> 1024 16 x1.21 x1.10
> 1024 256 x1.65 x1.23
> 1024 1024 x1.76 x1.25
> 2048 16 x1.21 x1.10
> 2048 256 x1.66 x1.23
> 2048 1024 x1.78 x1.25
> 2048 2048 x1.79 x1.25
> 4096 16 x1.20 x1.09
> 4096 256 x1.66 x1.23
> 4096 1024 x1.79 x1.26
> 4096 4096 x1.82 x1.26
> 8192 16 x1.20 x1.09
> 8192 256 x1.67 x1.23
> 8192 1024 x1.80 x1.26
> 8192 4096 x1.85 x1.28
> 8192 8192 x1.85 x1.27
>
> Where bs refers to block size and b/u to bytes per update.
>
> Signed-off-by: Sami Tolvanen <samitolvanen at google.com>
> Cc: Andy Polyakov <appro at openssl.org>
>
> ---
> Changes since v1:
> Rebased to Herbert's cryptodev tree
> Include sha256-armv4.pl and use it to generate sha256-core.S
> Add integer-only assembly version as sha256-asm
> Add support for SHA-224 to the glue code
> Change priority for sha256/224-ce to 300
>
> ---
> arch/arm/crypto/Kconfig | 7
> arch/arm/crypto/Makefile | 8
> arch/arm/crypto/sha2-ce-glue.c | 4
> arch/arm/crypto/sha256-armv4.pl | 713 ++++++
> arch/arm/crypto/sha256-core.S_shipped | 2775 ++++++++++++++++++++++++
> arch/arm/crypto/sha256_glue.c | 246 ++
> arch/arm/crypto/sha256_glue.h | 23
> arch/arm/crypto/sha256_neon_glue.c | 172 +
> 8 files changed, 3945 insertions(+), 3 deletions(-)
>
> diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
> index d63f319..458729d 100644
> --- a/arch/arm/crypto/Kconfig
> +++ b/arch/arm/crypto/Kconfig
> @@ -46,6 +46,13 @@ config CRYPTO_SHA2_ARM_CE
> SHA-256 secure hash standard (DFIPS 180-2) implemented
> using special ARMv8 Crypto Extensions.
>
> +config CRYPTO_SHA256_ARM
> + tristate "SHA-224/256 digest algorithm (ARM-asm and NEON)"
> + select CRYPTO_HASH
> + help
> + SHA-256 secure hash standard (DFIPS 180-2) implemented
> + using optimized ARM assembler and NEON, when available.
> +
> config CRYPTO_SHA512_ARM_NEON
> tristate "SHA384 and SHA512 digest algorithm (ARM NEON)"
> depends on KERNEL_MODE_NEON
> diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
> index 9a273bd..ef46e89 100644
> --- a/arch/arm/crypto/Makefile
> +++ b/arch/arm/crypto/Makefile
> @@ -7,6 +7,7 @@ obj-$(CONFIG_CRYPTO_AES_ARM_BS) += aes-arm-bs.o
> obj-$(CONFIG_CRYPTO_AES_ARM_CE) += aes-arm-ce.o
> obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
> obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
> +obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
> obj-$(CONFIG_CRYPTO_SHA512_ARM_NEON) += sha512-arm-neon.o
> obj-$(CONFIG_CRYPTO_SHA1_ARM_CE) += sha1-arm-ce.o
> obj-$(CONFIG_CRYPTO_SHA2_ARM_CE) += sha2-arm-ce.o
> @@ -16,6 +17,8 @@ aes-arm-y := aes-armv4.o aes_glue.o
> aes-arm-bs-y := aesbs-core.o aesbs-glue.o
> sha1-arm-y := sha1-armv4-large.o sha1_glue.o
> sha1-arm-neon-y := sha1-armv7-neon.o sha1_neon_glue.o
> +sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
> +sha256-arm-y := sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
> sha512-arm-neon-y := sha512-armv7-neon.o sha512_neon_glue.o
> sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o
> sha2-arm-ce-y := sha2-ce-core.o sha2-ce-glue.o
> @@ -28,4 +31,7 @@ quiet_cmd_perl = PERL $@
> $(src)/aesbs-core.S_shipped: $(src)/bsaes-armv7.pl
> $(call cmd,perl)
>
> -.PRECIOUS: $(obj)/aesbs-core.S
> +$(src)/sha256-core.S_shipped: $(src)/sha256-armv4.pl
> + $(call cmd,perl)
> +
> +.PRECIOUS: $(obj)/aesbs-core.S $(obj)/sha256-core.S
> diff --git a/arch/arm/crypto/sha2-ce-glue.c b/arch/arm/crypto/sha2-ce-glue.c
> index 9ffe8ad..0449eca 100644
> --- a/arch/arm/crypto/sha2-ce-glue.c
> +++ b/arch/arm/crypto/sha2-ce-glue.c
> @@ -163,7 +163,7 @@ static struct shash_alg algs[] = { {
> .base = {
> .cra_name = "sha224",
> .cra_driver_name = "sha224-ce",
> - .cra_priority = 200,
> + .cra_priority = 300,
> .cra_flags = CRYPTO_ALG_TYPE_SHASH,
> .cra_blocksize = SHA256_BLOCK_SIZE,
> .cra_module = THIS_MODULE,
> @@ -180,7 +180,7 @@ static struct shash_alg algs[] = { {
> .base = {
> .cra_name = "sha256",
> .cra_driver_name = "sha256-ce",
> - .cra_priority = 200,
> + .cra_priority = 300,
> .cra_flags = CRYPTO_ALG_TYPE_SHASH,
> .cra_blocksize = SHA256_BLOCK_SIZE,
> .cra_module = THIS_MODULE,
> diff --git a/arch/arm/crypto/sha256-armv4.pl b/arch/arm/crypto/sha256-armv4.pl
> new file mode 100644
> index 0000000..4fee74d
> --- /dev/null
> +++ b/arch/arm/crypto/sha256-armv4.pl
> @@ -0,0 +1,713 @@
> +#!/usr/bin/env perl
> +
> +# ====================================================================
> +# Written by Andy Polyakov <appro at openssl.org> for the OpenSSL
> +# project. The module is, however, dual licensed under OpenSSL and
> +# CRYPTOGAMS licenses depending on where you obtain it. For further
> +# details see http://www.openssl.org/~appro/cryptogams/.
> +#
> +# Permission to use under GPL terms is granted.
> +# ====================================================================
> +
> +# SHA256 block procedure for ARMv4. May 2007.
> +
> +# Performance is ~2x better than gcc 3.4 generated code and in "abso-
> +# lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per
> +# byte [on single-issue Xscale PXA250 core].
> +
> +# July 2010.
> +#
> +# Rescheduling for dual-issue pipeline resulted in 22% improvement on
> +# Cortex A8 core and ~20 cycles per processed byte.
> +
> +# February 2011.
> +#
> +# Profiler-assisted and platform-specific optimization resulted in 16%
> +# improvement on Cortex A8 core and ~15.4 cycles per processed byte.
> +
> +# September 2013.
> +#
> +# Add NEON implementation. On Cortex A8 it was measured to process one
> +# byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon
> +# S4 does it in 12.5 cycles too, but it's 50% faster than integer-only
> +# code (meaning that latter performs sub-optimally, nothing was done
> +# about it).
> +
> +# May 2014.
> +#
> +# Add ARMv8 code path performing at 2.0 cpb on Apple A7.
> +
> +while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
> +open STDOUT,">$output";
> +
> +$ctx="r0"; $t0="r0";
> +$inp="r1"; $t4="r1";
> +$len="r2"; $t1="r2";
> +$T1="r3"; $t3="r3";
> +$A="r4";
> +$B="r5";
> +$C="r6";
> +$D="r7";
> +$E="r8";
> +$F="r9";
> +$G="r10";
> +$H="r11";
> + at V=($A,$B,$C,$D,$E,$F,$G,$H);
> +$t2="r12";
> +$Ktbl="r14";
> +
> + at Sigma0=( 2,13,22);
> + at Sigma1=( 6,11,25);
> + at sigma0=( 7,18, 3);
> + at sigma1=(17,19,10);
> +
> +sub BODY_00_15 {
> +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
> +
> +$code.=<<___ if ($i<16);
> +#if __ARM_ARCH__>=7
> + @ ldr $t1,[$inp],#4 @ $i
> +# if $i==15
> + str $inp,[sp,#17*4] @ make room for $t4
> +# endif
> + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
> + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
> + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
> + rev $t1,$t1
> +#else
> + @ ldrb $t1,[$inp,#3] @ $i
> + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
> + ldrb $t2,[$inp,#2]
> + ldrb $t0,[$inp,#1]
> + orr $t1,$t1,$t2,lsl#8
> + ldrb $t2,[$inp],#4
> + orr $t1,$t1,$t0,lsl#16
> +# if $i==15
> + str $inp,[sp,#17*4] @ make room for $t4
> +# endif
> + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]`
> + orr $t1,$t1,$t2,lsl#24
> + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
> +#endif
> +___
> +$code.=<<___;
> + ldr $t2,[$Ktbl],#4 @ *K256++
> + add $h,$h,$t1 @ h+=X[i]
> + str $t1,[sp,#`$i%16`*4]
> + eor $t1,$f,$g
> + add $h,$h,$t0,ror#$Sigma1[0] @ h+=Sigma1(e)
> + and $t1,$t1,$e
> + add $h,$h,$t2 @ h+=K256[i]
> + eor $t1,$t1,$g @ Ch(e,f,g)
> + eor $t0,$a,$a,ror#`$Sigma0[1]-$Sigma0[0]`
> + add $h,$h,$t1 @ h+=Ch(e,f,g)
> +#if $i==31
> + and $t2,$t2,#0xff
> + cmp $t2,#0xf2 @ done?
> +#endif
> +#if $i<15
> +# if __ARM_ARCH__>=7
> + ldr $t1,[$inp],#4 @ prefetch
> +# else
> + ldrb $t1,[$inp,#3]
> +# endif
> + eor $t2,$a,$b @ a^b, b^c in next round
> +#else
> + ldr $t1,[sp,#`($i+2)%16`*4] @ from future BODY_16_xx
> + eor $t2,$a,$b @ a^b, b^c in next round
> + ldr $t4,[sp,#`($i+15)%16`*4] @ from future BODY_16_xx
> +#endif
> + eor $t0,$t0,$a,ror#`$Sigma0[2]-$Sigma0[0]` @ Sigma0(a)
> + and $t3,$t3,$t2 @ (b^c)&=(a^b)
> + add $d,$d,$h @ d+=h
> + eor $t3,$t3,$b @ Maj(a,b,c)
> + add $h,$h,$t0,ror#$Sigma0[0] @ h+=Sigma0(a)
> + @ add $h,$h,$t3 @ h+=Maj(a,b,c)
> +___
> + ($t2,$t3)=($t3,$t2);
> +}
> +
> +sub BODY_16_XX {
> +my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
> +
> +$code.=<<___;
> + @ ldr $t1,[sp,#`($i+1)%16`*4] @ $i
> + @ ldr $t4,[sp,#`($i+14)%16`*4]
> + mov $t0,$t1,ror#$sigma0[0]
> + add $a,$a,$t2 @ h+=Maj(a,b,c) from the past
> + mov $t2,$t4,ror#$sigma1[0]
> + eor $t0,$t0,$t1,ror#$sigma0[1]
> + eor $t2,$t2,$t4,ror#$sigma1[1]
> + eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
> + ldr $t1,[sp,#`($i+0)%16`*4]
> + eor $t2,$t2,$t4,lsr#$sigma1[2] @ sigma1(X[i+14])
> + ldr $t4,[sp,#`($i+9)%16`*4]
> +
> + add $t2,$t2,$t0
> + eor $t0,$e,$e,ror#`$Sigma1[1]-$Sigma1[0]` @ from BODY_00_15
> + add $t1,$t1,$t2
> + eor $t0,$t0,$e,ror#`$Sigma1[2]-$Sigma1[0]` @ Sigma1(e)
> + add $t1,$t1,$t4 @ X[i]
> +___
> + &BODY_00_15(@_);
> +}
> +
> +$code=<<___;
> +#ifndef __KERNEL__
> +# include "arm_arch.h"
> +#else
> +# define __ARM_ARCH__ __LINUX_ARM_ARCH__
> +# define __ARM_MAX_ARCH__ 7
I'm not sure this will work for kernels built for older ARM architectures.
> +#endif
> +
> +.text
> +#if __ARM_ARCH__<7
> +.code 32
> +#else
> +.syntax unified
> +# ifdef __thumb2__
> +.thumb
> +# else
> +.code 32
> +# endif
> +#endif
> +
> +.type K256,%object
> +.align 5
> +K256:
> +.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
> +.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
> +.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
> +.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
> +.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
> +.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
> +.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
> +.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
> +.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
> +.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
> +.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
> +.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
> +.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
> +.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
> +.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
> +.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
> +.size K256,.-K256
> +.word 0 @ terminator
> +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
> +.LOPENSSL_armcap:
> +.word OPENSSL_armcap_P-sha256_block_data_order
> +#endif
> +.align 5
> +
> +.global sha256_block_data_order
> +.type sha256_block_data_order,%function
> +sha256_block_data_order:
> +#if __ARM_ARCH__<7
> + sub r3,pc,#8 @ sha256_block_data_order
> +#else
> + adr r3,sha256_block_data_order
> +#endif
> +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__)
> + ldr r12,.LOPENSSL_armcap
> + ldr r12,[r3,r12] @ OPENSSL_armcap_P
> + tst r12,#ARMV8_SHA256
> + bne .LARMv8
> + tst r12,#ARMV7_NEON
> + bne .LNEON
> +#endif
> + add $len,$inp,$len,lsl#6 @ len to point at the end of inp
> + stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
> + ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
> + sub $Ktbl,r3,#256+32 @ K256
> + sub sp,sp,#16*4 @ alloca(X[16])
> +.Loop:
> +# if __ARM_ARCH__>=7
> + ldr $t1,[$inp],#4
> +# else
> + ldrb $t1,[$inp,#3]
> +# endif
> + eor $t3,$B,$C @ magic
> + eor $t2,$t2,$t2
> +___
> +for($i=0;$i<16;$i++) { &BODY_00_15($i, at V); unshift(@V,pop(@V)); }
> +$code.=".Lrounds_16_xx:\n";
> +for (;$i<32;$i++) { &BODY_16_XX($i, at V); unshift(@V,pop(@V)); }
> +$code.=<<___;
> +#if __ARM_ARCH__>=7
> + ite eq @ Thumb2 thing, sanity check in ARM
> +#endif
> + ldreq $t3,[sp,#16*4] @ pull ctx
> + bne .Lrounds_16_xx
> +
> + add $A,$A,$t2 @ h+=Maj(a,b,c) from the past
> + ldr $t0,[$t3,#0]
> + ldr $t1,[$t3,#4]
> + ldr $t2,[$t3,#8]
> + add $A,$A,$t0
> + ldr $t0,[$t3,#12]
> + add $B,$B,$t1
> + ldr $t1,[$t3,#16]
> + add $C,$C,$t2
> + ldr $t2,[$t3,#20]
> + add $D,$D,$t0
> + ldr $t0,[$t3,#24]
> + add $E,$E,$t1
> + ldr $t1,[$t3,#28]
> + add $F,$F,$t2
> + ldr $inp,[sp,#17*4] @ pull inp
> + ldr $t2,[sp,#18*4] @ pull inp+len
> + add $G,$G,$t0
> + add $H,$H,$t1
> + stmia $t3,{$A,$B,$C,$D,$E,$F,$G,$H}
> + cmp $inp,$t2
> + sub $Ktbl,$Ktbl,#256 @ rewind Ktbl
> + bne .Loop
> +
> + add sp,sp,#`16+3`*4 @ destroy frame
> +#if __ARM_ARCH__>=5
> + ldmia sp!,{r4-r11,pc}
> +#else
> + ldmia sp!,{r4-r11,lr}
> + tst lr,#1
> + moveq pc,lr @ be binary compatible with V4, yet
> + bx lr @ interoperable with Thumb ISA:-)
> +#endif
> +.size sha256_block_data_order,.-sha256_block_data_order
> +___
> +######################################################################
> +# NEON stuff
> +#
> +{{{
> +my @X=map("q$_",(0..3));
> +my ($T0,$T1,$T2,$T3,$T4,$T5)=("q8","q9","q10","q11","d24","d25");
> +my $Xfer=$t4;
> +my $j=0;
> +
> +sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
> +sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
> +
> +sub AUTOLOAD() # thunk [simplified] x86-style perlasm
> +{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./;
> + my $arg = pop;
> + $arg = "#$arg" if ($arg*1 eq $arg);
> + $code .= "\t$opcode\t".join(',', at _,$arg)."\n";
> +}
> +
> +sub Xupdate()
> +{ use integer;
> + my $body = shift;
> + my @insns = (&$body,&$body,&$body,&$body);
> + my ($a,$b,$c,$d,$e,$f,$g,$h);
> +
> + &vext_8 ($T0, at X[0], at X[1],4); # X[1..4]
> + eval(shift(@insns));
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vext_8 ($T1, at X[2], at X[3],4); # X[9..12]
> + eval(shift(@insns));
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vshr_u32 ($T2,$T0,$sigma0[0]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vadd_i32 (@X[0], at X[0],$T1); # X[0..3] += X[9..12]
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vshr_u32 ($T1,$T0,$sigma0[2]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vsli_32 ($T2,$T0,32-$sigma0[0]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vshr_u32 ($T3,$T0,$sigma0[1]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &veor ($T1,$T1,$T2);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vsli_32 ($T3,$T0,32-$sigma0[1]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[0]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &veor ($T1,$T1,$T3); # sigma0(X[1..4])
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[0]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vshr_u32 ($T5,&Dhi(@X[3]),$sigma1[2]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vadd_i32 (@X[0], at X[0],$T1); # X[0..3] += sigma0(X[1..4])
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &veor ($T5,$T5,$T4);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vshr_u32 ($T4,&Dhi(@X[3]),$sigma1[1]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vsli_32 ($T4,&Dhi(@X[3]),32-$sigma1[1]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &veor ($T5,$T5,$T4); # sigma1(X[14..15])
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vadd_i32 (&Dlo(@X[0]),&Dlo(@X[0]),$T5);# X[0..1] += sigma1(X[14..15])
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[0]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[0]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vshr_u32 ($T5,&Dlo(@X[0]),$sigma1[2]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &veor ($T5,$T5,$T4);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vshr_u32 ($T4,&Dlo(@X[0]),$sigma1[1]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vld1_32 ("{$T0}","[$Ktbl,:128]!");
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vsli_32 ($T4,&Dlo(@X[0]),32-$sigma1[1]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &veor ($T5,$T5,$T4); # sigma1(X[16..17])
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vadd_i32 (&Dhi(@X[0]),&Dhi(@X[0]),$T5);# X[2..3] += sigma1(X[16..17])
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vadd_i32 ($T0,$T0, at X[0]);
> + while($#insns>=2) { eval(shift(@insns)); }
> + &vst1_32 ("{$T0}","[$Xfer,:128]!");
> + eval(shift(@insns));
> + eval(shift(@insns));
> +
> + push(@X,shift(@X)); # "rotate" X[]
> +}
> +
> +sub Xpreload()
> +{ use integer;
> + my $body = shift;
> + my @insns = (&$body,&$body,&$body,&$body);
> + my ($a,$b,$c,$d,$e,$f,$g,$h);
> +
> + eval(shift(@insns));
> + eval(shift(@insns));
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vld1_32 ("{$T0}","[$Ktbl,:128]!");
> + eval(shift(@insns));
> + eval(shift(@insns));
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vrev32_8 (@X[0], at X[0]);
> + eval(shift(@insns));
> + eval(shift(@insns));
> + eval(shift(@insns));
> + eval(shift(@insns));
> + &vadd_i32 ($T0,$T0, at X[0]);
> + foreach (@insns) { eval; } # remaining instructions
> + &vst1_32 ("{$T0}","[$Xfer,:128]!");
> +
> + push(@X,shift(@X)); # "rotate" X[]
> +}
> +
> +sub body_00_15 () {
> + (
> + '($a,$b,$c,$d,$e,$f,$g,$h)=@V;'.
> + '&add ($h,$h,$t1)', # h+=X[i]+K[i]
> + '&eor ($t1,$f,$g)',
> + '&eor ($t0,$e,$e,"ror#".($Sigma1[1]-$Sigma1[0]))',
> + '&add ($a,$a,$t2)', # h+=Maj(a,b,c) from the past
> + '&and ($t1,$t1,$e)',
> + '&eor ($t2,$t0,$e,"ror#".($Sigma1[2]-$Sigma1[0]))', # Sigma1(e)
> + '&eor ($t0,$a,$a,"ror#".($Sigma0[1]-$Sigma0[0]))',
> + '&eor ($t1,$t1,$g)', # Ch(e,f,g)
> + '&add ($h,$h,$t2,"ror#$Sigma1[0]")', # h+=Sigma1(e)
> + '&eor ($t2,$a,$b)', # a^b, b^c in next round
> + '&eor ($t0,$t0,$a,"ror#".($Sigma0[2]-$Sigma0[0]))', # Sigma0(a)
> + '&add ($h,$h,$t1)', # h+=Ch(e,f,g)
> + '&ldr ($t1,sprintf "[sp,#%d]",4*(($j+1)&15)) if (($j&15)!=15);'.
> + '&ldr ($t1,"[$Ktbl]") if ($j==15);'.
> + '&ldr ($t1,"[sp,#64]") if ($j==31)',
> + '&and ($t3,$t3,$t2)', # (b^c)&=(a^b)
> + '&add ($d,$d,$h)', # d+=h
> + '&add ($h,$h,$t0,"ror#$Sigma0[0]");'. # h+=Sigma0(a)
> + '&eor ($t3,$t3,$b)', # Maj(a,b,c)
> + '$j++; unshift(@V,pop(@V)); ($t2,$t3)=($t3,$t2);'
> + )
> +}
> +
> +$code.=<<___;
> +#if __ARM_MAX_ARCH__>=7
this will be compiled on ARMv4, but gcc will not allow it;
we need to drop the NEON code for older, non-ARMv7 builds.
Best Regards,
J.
More information about the linux-arm-kernel
mailing list