[PATCH] arm64/lib: add optimized implementation of sha_transform

Ard Biesheuvel ard.biesheuvel at linaro.org
Fri Mar 14 11:02:33 EDT 2014


This implementation keeps the 64 bytes of workspace in registers rather than
on the stack, eliminating most of the loads and stores and reducing the
instruction count by about 25%. The sixteen 32-bit schedule words are packed
two to a 64-bit register (x8-x15), so each schedule update step produces two
words at once.

Signed-off-by: Ard Biesheuvel <ard.biesheuvel at linaro.org>
---
Hello all,

Unfortunately, there are no performance numbers that I am allowed to share, so
if anyone else (with access to actual, representative hardware) would care to
have a go, I would be very grateful.

This can be done by building the tcrypt.ko module (CONFIG_CRYPTO_TEST=m) and
inserting it with 'mode=303' as a parameter (note that the insmod always
fails, but it writes its test output to the kernel log). Also note that the
sha_transform() function is part of the kernel proper, so just rebuilding the
sha1_generic module is not sufficient; the kernel itself needs to be rebuilt
with this patch applied.

Cheers,


 arch/arm64/kernel/arm64ksyms.c |   3 +
 arch/arm64/lib/Makefile        |   2 +-
 arch/arm64/lib/sha1.S          | 264 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 268 insertions(+), 1 deletion(-)
 create mode 100644 arch/arm64/lib/sha1.S

diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index 338b568cd8ae..1f5693fb5d93 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -56,3 +56,6 @@ EXPORT_SYMBOL(clear_bit);
 EXPORT_SYMBOL(test_and_clear_bit);
 EXPORT_SYMBOL(change_bit);
 EXPORT_SYMBOL(test_and_change_bit);
+
+	/* SHA-1 implementation under lib/ */
+EXPORT_SYMBOL(sha_transform);
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 328ce1a99daa..ea093ebb9a9a 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,4 +1,4 @@
 lib-y		:= bitops.o clear_user.o delay.o copy_from_user.o	\
 		   copy_to_user.o copy_in_user.o copy_page.o		\
 		   clear_page.o memchr.o memcpy.o memmove.o memset.o	\
-		   strchr.o strrchr.o
+		   strchr.o strrchr.o sha1.o
diff --git a/arch/arm64/lib/sha1.S b/arch/arm64/lib/sha1.S
new file mode 100644
index 000000000000..877b8d70e992
--- /dev/null
+++ b/arch/arm64/lib/sha1.S
@@ -0,0 +1,264 @@
+/*
+ * linux/arch/arm64/lib/sha1.S
+ *
+ * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel at linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+	.text
+
+	k		.req	w1
+
+	res		.req	w2
+	xres		.req	x2
+
+	wA		.req	w3
+	wB		.req	w4
+	wC		.req	w5
+	wD		.req	w6
+	wE		.req	w7
+
+	tmp		.req	w16
+	xtmp		.req	x16
+
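+	/* f(b, c, d) for rounds 0-19: d ^ (b & (c ^ d)) == (b & c) | (~b & d) */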
+	.macro		sha1_choose, out, b, c, d
+	eor		\out, \c, \d
+	and		\out, \out, \b
+	eor		\out, \out, \d
+	.endm
+
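+	/* f(b, c, d) for rounds 20-39 and 60-79: b ^ c ^ d */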
+	.macro		sha1_parity, out, b, c, d
+	eor		\out, \b, \c
+	eor		\out, \out, \d
+	.endm
+
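+	/* rounds 40-59: (b & c) + ((b ^ c) & d) == majority(b, c, d) */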
+	.macro		sha1_majority, out, b, c, d
+	eor		tmp, \b, \c
+	and		\out, \b, \c
+	and		tmp, tmp, \d
+	add		\out, \out, tmp
+	.endm
+
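+	/* update two schedule words at once: */
+	/* W[t] = rol32(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1) */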
+	.macro		mix_state, st0, st1, st4, st6, st7
+	extr		xtmp, \st7, \st6, #32
+	eor		\st0, \st0, \st1
+	eor		xtmp, xtmp, \st4
+	eor		xtmp, xtmp, \st0
+	ror		res, tmp, #(32 - 1)
+	lsr		xtmp, xtmp, #32
+	ror		tmp, tmp, #(32 - 1)
+	orr		\st0, xres, xtmp, lsl #32
+	.endm
+
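+	/* e += rol32(a, 5) + f(b, c, d) + K + W[t]; b = rol32(b, 30) */
+	/* l/h selects the low/high word of the packed W register */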
+	.macro		sha1_round, func, r, h, a, b, c, d, e
+	sha1_\func	res, \b, \c, \d
+	add		res, res, \e
+	ror		\e, \a, #(32 - 5)
+	.ifc		\h, h
+	add		xres, xres, x\r, lsr #32
+	.else
+	add		res, res, w\r
+	.endif
+	add		\e, \e, k
+	ror		\b, \b, #2
+	add		\e, \e, res
+	.endm
+
+	/*
+	 * void sha_transform(__u32 *digest, const char *data, __u32 *array)
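+	 * ('array' is not dereferenced; the schedule is kept in x8-x15)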
+	 */
+ENTRY(sha_transform)
+	/* load input into state array */
+	ldp		x8, x9, [x1]
+	ldp		x10, x11, [x1, #16]
+	ldp		x12, x13, [x1, #32]
+	ldp		x14, x15, [x1, #48]
+
+	/* load digest input */
+	ldr		wA, [x0]
+	ldp		wB, wC, [x0, #4]
+	ldp		wD, wE, [x0, #12]
+
+	/* endian-reverse the input on LE builds */
+CPU_LE( rev32		x8, x8		)
+CPU_LE( rev32		x9, x9		)
+CPU_LE( rev32		x10, x10	)
+CPU_LE( rev32		x11, x11	)
+CPU_LE( rev32		x12, x12	)
+CPU_LE( rev32		x13, x13	)
+CPU_LE( rev32		x14, x14	)
+CPU_LE( rev32		x15, x15	)
+
+	/* round 1 */
+	ldr		k, =0x5a827999
+	sha1_round	choose,  8, l, wA, wB, wC, wD, wE
+	sha1_round	choose,  8, h, wE, wA, wB, wC, wD
+	sha1_round	choose,  9, l, wD, wE, wA, wB, wC
+	sha1_round	choose,  9, h, wC, wD, wE, wA, wB
+	sha1_round	choose, 10, l, wB, wC, wD, wE, wA
+	sha1_round	choose, 10, h, wA, wB, wC, wD, wE
+	sha1_round	choose, 11, l, wE, wA, wB, wC, wD
+	sha1_round	choose, 11, h, wD, wE, wA, wB, wC
+	sha1_round	choose, 12, l, wC, wD, wE, wA, wB
+	sha1_round	choose, 12, h, wB, wC, wD, wE, wA
+	sha1_round	choose, 13, l, wA, wB, wC, wD, wE
+	sha1_round	choose, 13, h, wE, wA, wB, wC, wD
+	sha1_round	choose, 14, l, wD, wE, wA, wB, wC
+	sha1_round	choose, 14, h, wC, wD, wE, wA, wB
+	sha1_round	choose, 15, l, wB, wC, wD, wE, wA
+	sha1_round	choose, 15, h, wA, wB, wC, wD, wE
+
+	mix_state	x8, x9, x12, x14, x15
+	sha1_round	choose,  8, l, wE, wA, wB, wC, wD
+	sha1_round	choose,  8, h, wD, wE, wA, wB, wC
+	mix_state	x9, x10, x13, x15, x8
+	sha1_round	choose,  9, l, wC, wD, wE, wA, wB
+	sha1_round	choose,  9, h, wB, wC, wD, wE, wA
+
+	/* round 2 */
+	ldr		k, =0x6ed9eba1
+	mix_state	x10, x11, x14, x8, x9
+	sha1_round	parity, 10, l, wA, wB, wC, wD, wE
+	sha1_round	parity, 10, h, wE, wA, wB, wC, wD
+	mix_state	x11, x12, x15, x9, x10
+	sha1_round	parity, 11, l, wD, wE, wA, wB, wC
+	sha1_round	parity, 11, h, wC, wD, wE, wA, wB
+	mix_state	x12, x13, x8, x10, x11
+	sha1_round	parity, 12, l, wB, wC, wD, wE, wA
+	sha1_round	parity, 12, h, wA, wB, wC, wD, wE
+	mix_state	x13, x14, x9, x11, x12
+	sha1_round	parity, 13, l, wE, wA, wB, wC, wD
+	sha1_round	parity, 13, h, wD, wE, wA, wB, wC
+	mix_state	x14, x15, x10, x12, x13
+	sha1_round	parity, 14, l, wC, wD, wE, wA, wB
+	sha1_round	parity, 14, h, wB, wC, wD, wE, wA
+	mix_state	x15, x8, x11, x13, x14
+	sha1_round	parity, 15, l, wA, wB, wC, wD, wE
+	sha1_round	parity, 15, h, wE, wA, wB, wC, wD
+	mix_state	x8, x9, x12, x14, x15
+	sha1_round	parity,  8, l, wD, wE, wA, wB, wC
+	sha1_round	parity,  8, h, wC, wD, wE, wA, wB
+	mix_state	x9, x10, x13, x15, x8
+	sha1_round	parity,  9, l, wB, wC, wD, wE, wA
+	sha1_round	parity,  9, h, wA, wB, wC, wD, wE
+	mix_state	x10, x11, x14, x8, x9
+	sha1_round	parity, 10, l, wE, wA, wB, wC, wD
+	sha1_round	parity, 10, h, wD, wE, wA, wB, wC
+	mix_state	x11, x12, x15, x9, x10
+	sha1_round	parity, 11, l, wC, wD, wE, wA, wB
+	sha1_round	parity, 11, h, wB, wC, wD, wE, wA
+
+	/* round 3 */
+	ldr		k, =0x8f1bbcdc
+	mix_state	x12, x13, x8, x10, x11
+	sha1_round	majority, 12, l, wA, wB, wC, wD, wE
+	sha1_round	majority, 12, h, wE, wA, wB, wC, wD
+	mix_state	x13, x14, x9, x11, x12
+	sha1_round	majority, 13, l, wD, wE, wA, wB, wC
+	sha1_round	majority, 13, h, wC, wD, wE, wA, wB
+	mix_state	x14, x15, x10, x12, x13
+	sha1_round	majority, 14, l, wB, wC, wD, wE, wA
+	sha1_round	majority, 14, h, wA, wB, wC, wD, wE
+	mix_state	x15, x8, x11, x13, x14
+	sha1_round	majority, 15, l, wE, wA, wB, wC, wD
+	sha1_round	majority, 15, h, wD, wE, wA, wB, wC
+	mix_state	x8, x9, x12, x14, x15
+	sha1_round	majority,  8, l, wC, wD, wE, wA, wB
+	sha1_round	majority,  8, h, wB, wC, wD, wE, wA
+	mix_state	x9, x10, x13, x15, x8
+	sha1_round	majority,  9, l, wA, wB, wC, wD, wE
+	sha1_round	majority,  9, h, wE, wA, wB, wC, wD
+	mix_state	x10, x11, x14, x8, x9
+	sha1_round	majority, 10, l, wD, wE, wA, wB, wC
+	sha1_round	majority, 10, h, wC, wD, wE, wA, wB
+	mix_state	x11, x12, x15, x9, x10
+	sha1_round	majority, 11, l, wB, wC, wD, wE, wA
+	sha1_round	majority, 11, h, wA, wB, wC, wD, wE
+	mix_state	x12, x13, x8, x10, x11
+	sha1_round	majority, 12, l, wE, wA, wB, wC, wD
+	sha1_round	majority, 12, h, wD, wE, wA, wB, wC
+	mix_state	x13, x14, x9, x11, x12
+	sha1_round	majority, 13, l, wC, wD, wE, wA, wB
+	sha1_round	majority, 13, h, wB, wC, wD, wE, wA
+
+	/* round 4 */
+	ldr		k, =0xca62c1d6
+	mix_state	x14, x15, x10, x12, x13
+	sha1_round	parity, 14, l, wA, wB, wC, wD, wE
+	sha1_round	parity, 14, h, wE, wA, wB, wC, wD
+	mix_state	x15, x8, x11, x13, x14
+	sha1_round	parity, 15, l, wD, wE, wA, wB, wC
+	sha1_round	parity, 15, h, wC, wD, wE, wA, wB
+	mix_state	x8, x9, x12, x14, x15
+	sha1_round	parity,  8, l, wB, wC, wD, wE, wA
+	sha1_round	parity,  8, h, wA, wB, wC, wD, wE
+	mix_state	x9, x10, x13, x15, x8
+	sha1_round	parity,  9, l, wE, wA, wB, wC, wD
+	sha1_round	parity,  9, h, wD, wE, wA, wB, wC
+	mix_state	x10, x11, x14, x8, x9
+	sha1_round	parity, 10, l, wC, wD, wE, wA, wB
+	sha1_round	parity, 10, h, wB, wC, wD, wE, wA
+	mix_state	x11, x12, x15, x9, x10
+	sha1_round	parity, 11, l, wA, wB, wC, wD, wE
+	sha1_round	parity, 11, h, wE, wA, wB, wC, wD
+	mix_state	x12, x13, x8, x10, x11
+	sha1_round	parity, 12, l, wD, wE, wA, wB, wC
+	sha1_round	parity, 12, h, wC, wD, wE, wA, wB
+	mix_state	x13, x14, x9, x11, x12
+	sha1_round	parity, 13, l, wB, wC, wD, wE, wA
+	sha1_round	parity, 13, h, wA, wB, wC, wD, wE
+	mix_state	x14, x15, x10, x12, x13
+	sha1_round	parity, 14, l, wE, wA, wB, wC, wD
+	sha1_round	parity, 14, h, wD, wE, wA, wB, wC
+	mix_state	x15, x8, x11, x13, x14
+
+	/* reload digest input */
+	ldr		w8, [x0]
+	ldp		w9, w10, [x0, #4]
+	ldp		w11, w12, [x0, #12]
+
+	sha1_round	parity, 15, l, wC, wD, wE, wA, wB
+	sha1_round	parity, 15, h, wB, wC, wD, wE, wA
+
+	/* add this round's output to digest */
+	add		wA, wA, w8
+	add		wB, wB, w9
+	add		wC, wC, w10
+	add		wD, wD, w11
+	add		wE, wE, w12
+
+	/* store digest */
+	str		wA, [x0]
+	stp		wB, wC, [x0, #4]
+	stp		wD, wE, [x0, #12]
+	ret
+ENDPROC(sha_transform)
+
+	/*
+	 * void sha_init(__u32 *buf)
+	 */
+ENTRY(sha_init)
+	ldr	w1, =0x67452301
+	ldr	w2, =0xefcdab89
+	ldr	w3, =0x98badcfe
+	ldr	w4, =0x10325476
+	ldr	w5, =0xc3d2e1f0
+	str	w1, [x0]
+	stp	w2, w3, [x0, #4]
+	stp	w4, w5, [x0, #12]
+	ret
+ENDPROC(sha_init)
-- 
1.8.3.2