[PATCH] arm64/lib: add optimized implementation of sha_transform
Ard Biesheuvel
ard.biesheuvel at linaro.org
Fri Mar 14 11:02:33 EDT 2014
This implementation keeps the 64 bytes of workspace in registers rather than
on the stack, eliminating most of the loads and stores, and reducing the
instruction count by about 25%.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel at linaro.org>
---
Hello all,
Unfortunately, I have no performance numbers that I am allowed to share, so if
anyone else (with access to actual, representative hardware) would care to have
a go, I would be very grateful.
This can be done by building the tcrypt.ko module (CONFIG_CRYPTO_TEST=m) and
inserting it with 'mode=303' as a parameter. Note that the insmod always fails
(by design), but the test output is written to the kernel log; see the example
below. Also note that the sha_transform() function will be part of the kernel
proper, so rebuilding just the sha1_generic module is not sufficient.
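For instance, assuming the module ends up in the crypto/ directory of your
build tree ('mode=303' selects the tcrypt SHA-1 speed test):

  # insmod crypto/tcrypt.ko mode=303
  # dmesg | tail          (the speed test results appear in the kernel log)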
Cheers,
arch/arm64/kernel/arm64ksyms.c | 3 +
arch/arm64/lib/Makefile | 2 +-
arch/arm64/lib/sha1.S | 256 +++++++++++++++++++++++++++++++++++++++++
3 files changed, 260 insertions(+), 1 deletion(-)
create mode 100644 arch/arm64/lib/sha1.S
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index 338b568cd8ae..1f5693fb5d93 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -56,3 +56,6 @@ EXPORT_SYMBOL(clear_bit);
EXPORT_SYMBOL(test_and_clear_bit);
EXPORT_SYMBOL(change_bit);
EXPORT_SYMBOL(test_and_change_bit);
+
+ /* SHA-1 implementation under lib/ */
+EXPORT_SYMBOL(sha_transform);
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 328ce1a99daa..ea093ebb9a9a 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,4 +1,4 @@
lib-y := bitops.o clear_user.o delay.o copy_from_user.o \
copy_to_user.o copy_in_user.o copy_page.o \
clear_page.o memchr.o memcpy.o memmove.o memset.o \
- strchr.o strrchr.o
+ strchr.o strrchr.o sha1.o
diff --git a/arch/arm64/lib/sha1.S b/arch/arm64/lib/sha1.S
new file mode 100644
index 000000000000..877b8d70e992
--- /dev/null
+++ b/arch/arm64/lib/sha1.S
@@ -0,0 +1,256 @@
+/*
+ * linux/arch/arm64/lib/sha1.S
+ *
+ * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel at linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+
+ k .req w1
+
+ res .req w2
+ xres .req x2
+
+ wA .req w3
+ wB .req w4
+ wC .req w5
+ wD .req w6
+ wE .req w7
+
+ tmp .req w16
+ xtmp .req x16
+
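+ /* Ch(b, c, d) = (b & c) | (~b & d), computed as ((c ^ d) & b) ^ d */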
+ .macro sha1_choose, out, b, c, d
+ eor \out, \c, \d
+ and \out, \out, \b
+ eor \out, \out, \d
+ .endm
+
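+ /* Parity(b, c, d) = b ^ c ^ d */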
+ .macro sha1_parity, out, b, c, d
+ eor \out, \b, \c
+ eor \out, \out, \d
+ .endm
+
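+ /*
+ * Maj(b, c, d) = (b & c) | ((b ^ c) & d); the two terms never have set
+ * bits in common, so the final add is equivalent to a bitwise or
+ */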
+ .macro sha1_majority, out, b, c, d
+ eor tmp, \b, \c
+ and \out, \b, \c
+ and tmp, tmp, \d
+ add \out, \out, tmp
+ .endm
+
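+ /*
+ * Advance the schedule by two words at once: each 64-bit register holds
+ * two adjacent schedule words, and both halves are updated according to
+ * W[i] = rol32(W[i-3] ^ W[i-8] ^ W[i-14] ^ W[i-16], 1)
+ */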
+ .macro mix_state, st0, st1, st4, st6, st7
+ extr xtmp, \st7, \st6, #32
+ eor \st0, \st0, \st1
+ eor xtmp, xtmp, \st4
+ eor xtmp, xtmp, \st0
+ ror res, tmp, #(32 - 1)
+ lsr xtmp, xtmp, #32
+ ror tmp, tmp, #(32 - 1)
+ orr \st0, xres, xtmp, lsl #32
+ .endm
+
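+ /*
+ * Perform one round: e += rol32(a, 5) + f(b, c, d) + K + W[i], followed
+ * by b = ror32(b, 2). The 'l'/'h' argument selects the low or high half
+ * of the 64-bit register holding the schedule word pair W[i], W[i+1]
+ */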
+ .macro sha1_round, func, r, h, a, b, c, d, e
+ sha1_\func res, \b, \c, \d
+ add res, res, \e
+ ror \e, \a, #(32 - 5)
+ .ifc \h, h
+ add xres, xres, x\r, lsr #32
+ .else
+ add res, res, w\r
+ .endif
+ add \e, \e, k
+ ror \b, \b, #2
+ add \e, \e, res
+ .endm
+
+ /*
+ * void sha_transform(__u32 *digest, const char *data, __u32 *array)
+ */
+ENTRY(sha_transform)
+ /* load input into state array */
+ ldp x8, x9, [x1]
+ ldp x10, x11, [x1, #16]
+ ldp x12, x13, [x1, #32]
+ ldp x14, x15, [x1, #48]
+
+ /* load digest input */
+ ldr wA, [x0]
+ ldp wB, wC, [x0, #4]
+ ldp wD, wE, [x0, #12]
+
+ /* endian-reverse the input on LE builds */
+CPU_LE( rev32 x8, x8 )
+CPU_LE( rev32 x9, x9 )
+CPU_LE( rev32 x10, x10 )
+CPU_LE( rev32 x11, x11 )
+CPU_LE( rev32 x12, x12 )
+CPU_LE( rev32 x13, x13 )
+CPU_LE( rev32 x14, x14 )
+CPU_LE( rev32 x15, x15 )
+
+ /* round 1 */
+ ldr k, =0x5a827999
+ sha1_round choose, 8, l, wA, wB, wC, wD, wE
+ sha1_round choose, 8, h, wE, wA, wB, wC, wD
+ sha1_round choose, 9, l, wD, wE, wA, wB, wC
+ sha1_round choose, 9, h, wC, wD, wE, wA, wB
+ sha1_round choose, 10, l, wB, wC, wD, wE, wA
+ sha1_round choose, 10, h, wA, wB, wC, wD, wE
+ sha1_round choose, 11, l, wE, wA, wB, wC, wD
+ sha1_round choose, 11, h, wD, wE, wA, wB, wC
+ sha1_round choose, 12, l, wC, wD, wE, wA, wB
+ sha1_round choose, 12, h, wB, wC, wD, wE, wA
+ sha1_round choose, 13, l, wA, wB, wC, wD, wE
+ sha1_round choose, 13, h, wE, wA, wB, wC, wD
+ sha1_round choose, 14, l, wD, wE, wA, wB, wC
+ sha1_round choose, 14, h, wC, wD, wE, wA, wB
+ sha1_round choose, 15, l, wB, wC, wD, wE, wA
+ sha1_round choose, 15, h, wA, wB, wC, wD, wE
+
+ mix_state x8, x9, x12, x14, x15
+ sha1_round choose, 8, l, wE, wA, wB, wC, wD
+ sha1_round choose, 8, h, wD, wE, wA, wB, wC
+ mix_state x9, x10, x13, x15, x8
+ sha1_round choose, 9, l, wC, wD, wE, wA, wB
+ sha1_round choose, 9, h, wB, wC, wD, wE, wA
+
+ /* round 2 */
+ ldr k, =0x6ed9eba1
+ mix_state x10, x11, x14, x8, x9
+ sha1_round parity, 10, l, wA, wB, wC, wD, wE
+ sha1_round parity, 10, h, wE, wA, wB, wC, wD
+ mix_state x11, x12, x15, x9, x10
+ sha1_round parity, 11, l, wD, wE, wA, wB, wC
+ sha1_round parity, 11, h, wC, wD, wE, wA, wB
+ mix_state x12, x13, x8, x10, x11
+ sha1_round parity, 12, l, wB, wC, wD, wE, wA
+ sha1_round parity, 12, h, wA, wB, wC, wD, wE
+ mix_state x13, x14, x9, x11, x12
+ sha1_round parity, 13, l, wE, wA, wB, wC, wD
+ sha1_round parity, 13, h, wD, wE, wA, wB, wC
+ mix_state x14, x15, x10, x12, x13
+ sha1_round parity, 14, l, wC, wD, wE, wA, wB
+ sha1_round parity, 14, h, wB, wC, wD, wE, wA
+ mix_state x15, x8, x11, x13, x14
+ sha1_round parity, 15, l, wA, wB, wC, wD, wE
+ sha1_round parity, 15, h, wE, wA, wB, wC, wD
+ mix_state x8, x9, x12, x14, x15
+ sha1_round parity, 8, l, wD, wE, wA, wB, wC
+ sha1_round parity, 8, h, wC, wD, wE, wA, wB
+ mix_state x9, x10, x13, x15, x8
+ sha1_round parity, 9, l, wB, wC, wD, wE, wA
+ sha1_round parity, 9, h, wA, wB, wC, wD, wE
+ mix_state x10, x11, x14, x8, x9
+ sha1_round parity, 10, l, wE, wA, wB, wC, wD
+ sha1_round parity, 10, h, wD, wE, wA, wB, wC
+ mix_state x11, x12, x15, x9, x10
+ sha1_round parity, 11, l, wC, wD, wE, wA, wB
+ sha1_round parity, 11, h, wB, wC, wD, wE, wA
+
+ /* round 3 */
+ ldr k, =0x8f1bbcdc
+ mix_state x12, x13, x8, x10, x11
+ sha1_round majority, 12, l, wA, wB, wC, wD, wE
+ sha1_round majority, 12, h, wE, wA, wB, wC, wD
+ mix_state x13, x14, x9, x11, x12
+ sha1_round majority, 13, l, wD, wE, wA, wB, wC
+ sha1_round majority, 13, h, wC, wD, wE, wA, wB
+ mix_state x14, x15, x10, x12, x13
+ sha1_round majority, 14, l, wB, wC, wD, wE, wA
+ sha1_round majority, 14, h, wA, wB, wC, wD, wE
+ mix_state x15, x8, x11, x13, x14
+ sha1_round majority, 15, l, wE, wA, wB, wC, wD
+ sha1_round majority, 15, h, wD, wE, wA, wB, wC
+ mix_state x8, x9, x12, x14, x15
+ sha1_round majority, 8, l, wC, wD, wE, wA, wB
+ sha1_round majority, 8, h, wB, wC, wD, wE, wA
+ mix_state x9, x10, x13, x15, x8
+ sha1_round majority, 9, l, wA, wB, wC, wD, wE
+ sha1_round majority, 9, h, wE, wA, wB, wC, wD
+ mix_state x10, x11, x14, x8, x9
+ sha1_round majority, 10, l, wD, wE, wA, wB, wC
+ sha1_round majority, 10, h, wC, wD, wE, wA, wB
+ mix_state x11, x12, x15, x9, x10
+ sha1_round majority, 11, l, wB, wC, wD, wE, wA
+ sha1_round majority, 11, h, wA, wB, wC, wD, wE
+ mix_state x12, x13, x8, x10, x11
+ sha1_round majority, 12, l, wE, wA, wB, wC, wD
+ sha1_round majority, 12, h, wD, wE, wA, wB, wC
+ mix_state x13, x14, x9, x11, x12
+ sha1_round majority, 13, l, wC, wD, wE, wA, wB
+ sha1_round majority, 13, h, wB, wC, wD, wE, wA
+
+ /* round 4 */
+ ldr k, =0xca62c1d6
+ mix_state x14, x15, x10, x12, x13
+ sha1_round parity, 14, l, wA, wB, wC, wD, wE
+ sha1_round parity, 14, h, wE, wA, wB, wC, wD
+ mix_state x15, x8, x11, x13, x14
+ sha1_round parity, 15, l, wD, wE, wA, wB, wC
+ sha1_round parity, 15, h, wC, wD, wE, wA, wB
+ mix_state x8, x9, x12, x14, x15
+ sha1_round parity, 8, l, wB, wC, wD, wE, wA
+ sha1_round parity, 8, h, wA, wB, wC, wD, wE
+ mix_state x9, x10, x13, x15, x8
+ sha1_round parity, 9, l, wE, wA, wB, wC, wD
+ sha1_round parity, 9, h, wD, wE, wA, wB, wC
+ mix_state x10, x11, x14, x8, x9
+ sha1_round parity, 10, l, wC, wD, wE, wA, wB
+ sha1_round parity, 10, h, wB, wC, wD, wE, wA
+ mix_state x11, x12, x15, x9, x10
+ sha1_round parity, 11, l, wA, wB, wC, wD, wE
+ sha1_round parity, 11, h, wE, wA, wB, wC, wD
+ mix_state x12, x13, x8, x10, x11
+ sha1_round parity, 12, l, wD, wE, wA, wB, wC
+ sha1_round parity, 12, h, wC, wD, wE, wA, wB
+ mix_state x13, x14, x9, x11, x12
+ sha1_round parity, 13, l, wB, wC, wD, wE, wA
+ sha1_round parity, 13, h, wA, wB, wC, wD, wE
+ mix_state x14, x15, x10, x12, x13
+ sha1_round parity, 14, l, wE, wA, wB, wC, wD
+ sha1_round parity, 14, h, wD, wE, wA, wB, wC
+ mix_state x15, x8, x11, x13, x14
+
+ /* reload digest input */
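+ /* (x8-x12 no longer hold live schedule words, so w8-w12 are free) */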
+ ldr w8, [x0]
+ ldp w9, w10, [x0, #4]
+ ldp w11, w12, [x0, #12]
+
+ sha1_round parity, 15, l, wC, wD, wE, wA, wB
+ sha1_round parity, 15, h, wB, wC, wD, wE, wA
+
+ /* add this round's output to digest */
+ add wA, wA, w8
+ add wB, wB, w9
+ add wC, wC, w10
+ add wD, wD, w11
+ add wE, wE, w12
+
+ /* store digest */
+ str wA, [x0]
+ stp wB, wC, [x0, #4]
+ stp wD, wE, [x0, #12]
+ ret
+ENDPROC(sha_transform)
+
+ /*
+ * void sha_init(__u32 *buf)
+ */
+ENTRY(sha_init)
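+ /* the five initial hash values defined by the SHA-1 spec */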
+ ldr w1, =0x67452301
+ ldr w2, =0xefcdab89
+ ldr w3, =0x98badcfe
+ ldr w4, =0x10325476
+ ldr w5, =0xc3d2e1f0
+ str w1, [x0]
+ stp w2, w3, [x0, #4]
+ stp w4, w5, [x0, #12]
+ ret
+ENDPROC(sha_init)
--
1.8.3.2