[PATCH resend 15/18] crypto: arm/ghash - add NEON accelerated fallback for vmull.p64

Mon Jul 24 03:28:17 PDT 2017

Implement a NEON fallback for systems that do support NEON but have
no support for the optional 64x64->128 polynomial multiplication
instruction that is part of the ARMv8 Crypto Extensions. It is based
on the paper "Fast Software Polynomial Multiplication on ARM Processors
Using the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
Ricardo Dahab (https://hal.inria.fr/hal-01506572)

On a 32-bit guest executing under KVM on a Cortex-A57, the new code is
not only 4x faster than the generic table based GHASH driver, it is also
time invariant. (Note that the existing vmull.p64 code is 16x faster on
this core).

Signed-off-by: Ard Biesheuvel <ard.biesheuvel at linaro.org>
---
 arch/arm/crypto/Kconfig         |   5 +-
 arch/arm/crypto/ghash-ce-core.S | 234 ++++++++++++++++----
 arch/arm/crypto/ghash-ce-glue.c |  24 +-
 3 files changed, 215 insertions(+), 48 deletions(-)

diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
index b9adedcc5b2e..ec72752d5668 100644
--- a/arch/arm/crypto/Kconfig
+++ b/arch/arm/crypto/Kconfig
@@ -94,14 +94,15 @@ config CRYPTO_AES_ARM_CE
 	  ARMv8 Crypto Extensions
 
 config CRYPTO_GHASH_ARM_CE
-	tristate "PMULL-accelerated GHASH using ARMv8 Crypto Extensions"
+	tristate "PMULL-accelerated GHASH using NEON/ARMv8 Crypto Extensions"
 	depends on KERNEL_MODE_NEON
 	select CRYPTO_HASH
 	select CRYPTO_CRYPTD
 	help
 	  Use an implementation of GHASH (used by the GCM AEAD chaining mode)
 	  that uses the 64x64 to 128 bit polynomial multiplication (vmull.p64)
-	  that is part of the ARMv8 Crypto Extensions
+	  that is part of the ARMv8 Crypto Extensions, or a slower variant that
+	  uses the vmull.p8 instruction that is part of the basic NEON ISA.
 
 config CRYPTO_CRCT10DIF_ARM_CE
 	tristate "CRCT10DIF digest algorithm using PMULL instructions"
diff --git a/arch/arm/crypto/ghash-ce-core.S b/arch/arm/crypto/ghash-ce-core.S
index f6ab8bcc9efe..2f78c10b1881 100644
--- a/arch/arm/crypto/ghash-ce-core.S
+++ b/arch/arm/crypto/ghash-ce-core.S
@@ -1,7 +1,7 @@
 /*
- * Accelerated GHASH implementation with ARMv8 vmull.p64 instructions.
+ * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
  *
- * Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel at linaro.org>
+ * Copyright (C) 2015 - 2017 Linaro Ltd. <ard.biesheuvel at linaro.org>
  *
  * This program is free software; you can redistribute it and/or modify it
  * under the terms of the GNU General Public License version 2 as published
@@ -12,40 +12,162 @@
 #include <asm/assembler.h>
 
 	SHASH		.req	q0
-	SHASH2		.req	q1
-	T1		.req	q2
-	T2		.req	q3
-	MASK		.req	q4
-	XL		.req	q5
-	XM		.req	q6
-	XH		.req	q7
-	IN1		.req	q7
+	T1		.req	q1
+	XL		.req	q2
+	XM		.req	q3
+	XH		.req	q4
+	IN1		.req	q4
 
 	SHASH_L		.req	d0
 	SHASH_H		.req	d1
-	SHASH2_L	.req	d2
-	T1_L		.req	d4
-	MASK_L		.req	d8
-	XL_L		.req	d10
-	XL_H		.req	d11
-	XM_L		.req	d12
-	XM_H		.req	d13
-	XH_L		.req	d14
+	T1_L		.req	d2
+	T1_H		.req	d3
+	XL_L		.req	d4
+	XL_H		.req	d5
+	XM_L		.req	d6
+	XM_H		.req	d7
+	XH_L		.req	d8
+
+	t0l		.req	d10
+	t0h		.req	d11
+	t1l		.req	d12
+	t1h		.req	d13
+	t2l		.req	d14
+	t2h		.req	d15
+	t3l		.req	d16
+	t3h		.req	d17
+	t4l		.req	d18
+	t4h		.req	d19
+
+	t0q		.req	q5
+	t1q		.req	q6
+	t2q		.req	q7
+	t3q		.req	q8
+	t4q		.req	q9
+	T2		.req	q9
+
+	s1l		.req	d20
+	s1h		.req	d21
+	s2l		.req	d22
+	s2h		.req	d23
+	s3l		.req	d24
+	s3h		.req	d25
+	s4l		.req	d26
+	s4h		.req	d27
+
+	MASK		.req	d28
+	SHASH2_p8	.req	d28
+
+	k16		.req	d29
+	k32		.req	d30
+	k48		.req	d31
+	SHASH2_p64	.req	d31
 
 	.text
 	.fpu		crypto-neon-fp-armv8
 
+	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
+	vmull.p64	\rd, \rn, \rm
+	.endm
+
 	/*
-	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
-	 *			   struct ghash_key const *k, const char *head)
+	 * This implementation of 64x64 -> 128 bit polynomial multiplication
+	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
+	 * "Fast Software Polynomial Multiplication on ARM Processors Using
+	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
+	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
+	 *
+	 * It has been slightly tweaked for in-order performance, and to allow
+	 * 'rq' to overlap with 'ad' or 'bd'.
 	 */
-ENTRY(pmull_ghash_update)
-	vld1.64		{SHASH}, [r3]
+	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
+	vext.8		t0l, \ad, \ad, #1	@ A1
+	.ifc		\b1, t4l
+	vext.8		t4l, \bd, \bd, #1	@ B1
+	.endif
+	vmull.p8	t0q, t0l, \bd		@ F = A1*B
+	vext.8		t1l, \ad, \ad, #2	@ A2
+	vmull.p8	t4q, \ad, \b1		@ E = A*B1
+	.ifc		\b2, t3l
+	vext.8		t3l, \bd, \bd, #2	@ B2
+	.endif
+	vmull.p8	t1q, t1l, \bd		@ H = A2*B
+	vext.8		t2l, \ad, \ad, #3	@ A3
+	vmull.p8	t3q, \ad, \b2		@ G = A*B2
+	veor		t0q, t0q, t4q		@ L = E + F
+	.ifc		\b3, t4l
+	vext.8		t4l, \bd, \bd, #3	@ B3
+	.endif
+	vmull.p8	t2q, t2l, \bd		@ J = A3*B
+	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
+	veor		t1q, t1q, t3q		@ M = G + H
+	.ifc		\b4, t3l
+	vext.8		t3l, \bd, \bd, #4	@ B4
+	.endif
+	vmull.p8	t4q, \ad, \b3		@ I = A*B3
+	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
+	vmull.p8	t3q, \ad, \b4		@ K = A*B4
+	vand		t0h, t0h, k48
+	vand		t1h, t1h, k32
+	veor		t2q, t2q, t4q		@ N = I + J
+	veor		t0l, t0l, t0h
+	veor		t1l, t1l, t1h
+	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
+	vand		t2h, t2h, k16
+	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
+	vmov.i64	t3h, #0
+	vext.8		t0q, t0q, t0q, #15
+	veor		t2l, t2l, t2h
+	vext.8		t1q, t1q, t1q, #14
+	vmull.p8	\rq, \ad, \bd		@ D = A*B
+	vext.8		t2q, t2q, t2q, #13
+	vext.8		t3q, t3q, t3q, #12
+	veor		t0q, t0q, t1q
+	veor		t2q, t2q, t3q
+	veor		\rq, \rq, t0q
+	veor		\rq, \rq, t2q
+	.endm
+
+	//
+	// PMULL (64x64->128) based reduction for CPUs that can do
+	// it in a single instruction.
+	//
+	.macro		__pmull_reduce_p64
+	vmull.p64	T1, XL_L, MASK
+
+	veor		XH_L, XH_L, XM_H
+	vext.8		T1, T1, T1, #8
+	veor		XL_H, XL_H, XM_L
+	veor		T1, T1, XL
+
+	vmull.p64	XL, T1_H, MASK
+	.endm
+
+	//
+	// Alternative reduction for CPUs that lack support for the
+	// 64x64->128 PMULL instruction
+	//
+	.macro		__pmull_reduce_p8
+	veor		XL_H, XL_H, XM_L
+	veor		XH_L, XH_L, XM_H
+
+	vshl.i64	T1, XL, #57
+	vshl.i64	T2, XL, #62
+	veor		T1, T1, T2
+	vshl.i64	T2, XL, #63
+	veor		T1, T1, T2
+	veor		XL_H, XL_H, T1_L
+	veor		XH_L, XH_L, T1_H
+
+	vshr.u64	T1, XL, #1
+	veor		XH, XH, XL
+	veor		XL, XL, T1
+	vshr.u64	T1, T1, #6
+	vshr.u64	XL, XL, #1
+	.endm
+
+	.macro		ghash_update, pn
 	vld1.64		{XL}, [r1]
-	vmov.i8		MASK, #0xe1
-	vext.8		SHASH2, SHASH, SHASH, #8
-	vshl.u64	MASK, MASK, #57
-	veor		SHASH2, SHASH2, SHASH
 
 	/* do the head block first, if supplied */
 	ldr		ip, [sp]
@@ -62,33 +184,59 @@ ENTRY(pmull_ghash_update)
 #ifndef CONFIG_CPU_BIG_ENDIAN
 	vrev64.8	T1, T1
 #endif
-	vext.8		T2, XL, XL, #8
 	vext.8		IN1, T1, T1, #8
-	veor		T1, T1, T2
+	veor		T1_L, T1_L, XL_H
 	veor		XL, XL, IN1
 
-	vmull.p64	XH, SHASH_H, XL_H		@ a1 * b1
+	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
 	veor		T1, T1, XL
-	vmull.p64	XL, SHASH_L, XL_L		@ a0 * b0
-	vmull.p64	XM, SHASH2_L, T1_L		@ (a1 + a0)(b1 + b0)
+	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
+	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)
 
-	vext.8		T1, XL, XH, #8
-	veor		T2, XL, XH
+	veor		T1, XL, XH
 	veor		XM, XM, T1
-	veor		XM, XM, T2
-	vmull.p64	T2, XL_L, MASK_L
 
-	vmov		XH_L, XM_H
-	vmov		XM_H, XL_L
+	__pmull_reduce_\pn
 
-	veor		XL, XM, T2
-	vext.8		T2, XL, XL, #8
-	vmull.p64	XL, XL_L, MASK_L
-	veor		T2, T2, XH
-	veor		XL, XL, T2
+	veor		T1, T1, XH
+	veor		XL, XL, T1
 
 	bne		0b
 
 	vst1.64		{XL}, [r1]
 	bx		lr
-ENDPROC(pmull_ghash_update)
+	.endm
+
+	/*
+	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
+	 *			   struct ghash_key const *k, const char *head)
+	 */
+ENTRY(pmull_ghash_update_p64)
+	vld1.64		{SHASH}, [r3]
+	veor		SHASH2_p64, SHASH_L, SHASH_H
+
+	vmov.i8		MASK, #0xe1
+	vshl.u64	MASK, MASK, #57
+
+	ghash_update	p64
+ENDPROC(pmull_ghash_update_p64)
+
+ENTRY(pmull_ghash_update_p8)
+	vld1.64		{SHASH}, [r3]
+	veor		SHASH2_p8, SHASH_L, SHASH_H
+
+	vext.8		s1l, SHASH_L, SHASH_L, #1
+	vext.8		s2l, SHASH_L, SHASH_L, #2
+	vext.8		s3l, SHASH_L, SHASH_L, #3
+	vext.8		s4l, SHASH_L, SHASH_L, #4
+	vext.8		s1h, SHASH_H, SHASH_H, #1
+	vext.8		s2h, SHASH_H, SHASH_H, #2
+	vext.8		s3h, SHASH_H, SHASH_H, #3
+	vext.8		s4h, SHASH_H, SHASH_H, #4
+
+	vmov.i64	k16, #0xffff
+	vmov.i64	k32, #0xffffffff
+	vmov.i64	k48, #0xffffffffffff
+
+	ghash_update	p8
+ENDPROC(pmull_ghash_update_p8)
diff --git a/arch/arm/crypto/ghash-ce-glue.c b/arch/arm/crypto/ghash-ce-glue.c
index 6bac8bea9f1e..d9bb52cae2ac 100644
--- a/arch/arm/crypto/ghash-ce-glue.c
+++ b/arch/arm/crypto/ghash-ce-glue.c
@@ -22,6 +22,7 @@
 MODULE_DESCRIPTION("GHASH secure hash using ARMv8 Crypto Extensions");
 MODULE_AUTHOR("Ard Biesheuvel <ard.biesheuvel at linaro.org>");
 MODULE_LICENSE("GPL v2");
+MODULE_ALIAS_CRYPTO("ghash");
 
 #define GHASH_BLOCK_SIZE	16
 #define GHASH_DIGEST_SIZE	16
@@ -41,8 +42,17 @@ struct ghash_async_ctx {
 	struct cryptd_ahash *cryptd_tfm;
 };
 
-asmlinkage void pmull_ghash_update(int blocks, u64 dg[], const char *src,
-				   struct ghash_key const *k, const char *head);
+asmlinkage void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
+				       struct ghash_key const *k,
+				       const char *head);
+
+asmlinkage void pmull_ghash_update_p8(int blocks, u64 dg[], const char *src,
+				      struct ghash_key const *k,
+				      const char *head);
+
+static void (*pmull_ghash_update)(int blocks, u64 dg[], const char *src,
+				  struct ghash_key const *k,
+				  const char *head);
 
 static int ghash_init(struct shash_desc *desc)
 {
@@ -312,6 +322,14 @@ static int __init ghash_ce_mod_init(void)
 {
 	int err;
 
+	if (!(elf_hwcap & HWCAP_NEON))
+		return -ENODEV;
+
+	if (elf_hwcap2 & HWCAP2_PMULL)
+		pmull_ghash_update = pmull_ghash_update_p64;
+	else
+		pmull_ghash_update = pmull_ghash_update_p8;
+
 	err = crypto_register_shash(&ghash_alg);
 	if (err)
 		return err;
@@ -332,5 +350,5 @@ static void __exit ghash_ce_mod_exit(void)
 	crypto_unregister_shash(&ghash_alg);
 }
 
-module_cpu_feature_match(PMULL, ghash_ce_mod_init);
+module_init(ghash_ce_mod_init);
 module_exit(ghash_ce_mod_exit);
-- 
2.9.3