[PATCH] arm64/lib: add optimized implementation of sha_transform
Ard Biesheuvel
ard.biesheuvel@linaro.org
Mon Mar 17 11:55:51 EDT 2014
This implementation keeps the 64 bytes of workspace in registers rather than
on the stack, eliminating most of the loads and stores, and reducing the
instruction count by about 25%.
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
---
@Catalin: I assumed x18 has no special significance in the kernel, so I am
using it as a temp register without preserving it. Is this correct?
Changes since v1:
- as suggested in feedback I received off list, it makes sense to schedule
more carefully for an in-order pipeline (A53?), so the rounds are now
2-way interleaved and combined with the schedule updates
- use named constants rather than bare numbers
- use ldnp for loading the input (non-temporal hint)
arch/arm64/kernel/arm64ksyms.c | 3 +
arch/arm64/lib/Makefile | 2 +-
arch/arm64/lib/sha1.S | 277 +++++++++++++++++++++++++++++++++++++++++
3 files changed, 281 insertions(+), 1 deletion(-)
create mode 100644 arch/arm64/lib/sha1.S
diff --git a/arch/arm64/kernel/arm64ksyms.c b/arch/arm64/kernel/arm64ksyms.c
index 338b568cd8ae..1f5693fb5d93 100644
--- a/arch/arm64/kernel/arm64ksyms.c
+++ b/arch/arm64/kernel/arm64ksyms.c
@@ -56,3 +56,6 @@ EXPORT_SYMBOL(clear_bit);
EXPORT_SYMBOL(test_and_clear_bit);
EXPORT_SYMBOL(change_bit);
EXPORT_SYMBOL(test_and_change_bit);
+
+ /* SHA-1 implementation under lib/ */
+EXPORT_SYMBOL(sha_transform);
diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
index 328ce1a99daa..ea093ebb9a9a 100644
--- a/arch/arm64/lib/Makefile
+++ b/arch/arm64/lib/Makefile
@@ -1,4 +1,4 @@
lib-y := bitops.o clear_user.o delay.o copy_from_user.o \
copy_to_user.o copy_in_user.o copy_page.o \
clear_page.o memchr.o memcpy.o memmove.o memset.o \
- strchr.o strrchr.o
+ strchr.o strrchr.o sha1.o
diff --git a/arch/arm64/lib/sha1.S b/arch/arm64/lib/sha1.S
new file mode 100644
index 000000000000..5c472f32f917
--- /dev/null
+++ b/arch/arm64/lib/sha1.S
@@ -0,0 +1,277 @@
+/*
+ * linux/arch/arm64/lib/sha1.S
+ *
+ * Copyright (C) 2014 Linaro Ltd <ard.biesheuvel@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+ .text
+ .altmacro
+
+ wA .req w2
+ wB .req w3
+ wC .req w4
+ wD .req w5
+ wE .req w6
+
+ k .req w7
+
+ t0 .req w16
+ t1 .req w17
+ t2 .req w18
+ t3 .req w1
+
+ xt0 .req x16
+ xt1 .req x17
+ xt2 .req x18
+ xt3 .req x1
+
+ .macro load_k_hi, reg, rc
+ .ifnb rc
+ movz \reg, #:abs_g1:\rc
+ .endif
+ .endm
+
+ .macro load_k_lo, reg, rc
+ .ifnb rc
+ movk \reg, #:abs_g0_nc:\rc
+ .endif
+ .endm
+
+ .macro inp_2rounds, in, a, b, c, d, e, rc
+ eor t0, \c, \d
+ .irp in2, %(in | 1)
+ .ifne in ^ in2
+ ldnp x\in, x\in2, [x1, #8 * (\in - 8)]
+ .endif
+ .endr
+ load_k_hi k, \rc
+ and t0, t0, \b
+ load_k_lo k, \rc
+ ror \b, \b, #2
+ eor t0, t0, \d
+ eor t1, \b, \c
+CPU_LE( rev32 x\in, x\in )
+ add t0, t0, \e
+ ror \e, \a, #(32 - 5)
+ and t1, t1, \a
+ add \e, \e, k
+ add t0, t0, w\in
+ eor t1, t1, \c
+ add \e, \e, t0
+ add t1, t1, \d
+ ror \d, \e, #(32 - 5)
+ add xt1, xt1, x\in, lsr #32
+ add \d, \d, k
+ ror \a, \a, #2
+ add \d, \d, t1
+ .endm
+
+ .macro cho_2rounds, a, b, c, d, e, st0, st1, st4, st6, st7
+ extr xt2, x\st7, x\st6, #32
+ eor t0, \c, \d
+ eor x\st0, x\st0, x\st1
+ and t0, t0, \b
+ eor xt2, xt2, x\st4
+ ror \b, \b, #2
+ eor xt2, xt2, x\st0
+ eor t0, t0, \d
+ eor t1, \b, \c
+ ror t3, t2, #(32 - 1)
+ add t0, t0, \e
+ lsr xt2, xt2, #32
+ and t1, t1, \a
+ ror t2, t2, #(32 - 1)
+ ror \e, \a, #(32 - 5)
+ eor t1, t1, \c
+ add \e, \e, k
+ add t0, t0, t3
+ ror \a, \a, #2
+ add \e, \e, t0
+ add t1, t1, \d
+ ror \d, \e, #(32 - 5)
+ add t1, t1, t2
+ add \d, \d, k
+ orr x\st0, xt3, xt2, lsl #32
+ add \d, \d, t1
+ .endm
+
+ .macro par_2rounds, a, b, c, d, e, st0, st1, st4, st6, st7, rc
+ extr xt2, x\st7, x\st6, #32
+ load_k_hi k, \rc
+ eor x\st0, x\st0, x\st1
+ eor t0, \b, \c
+ load_k_lo k, \rc
+ eor xt2, xt2, x\st4
+ ror \b, \b, #2
+ eor xt2, xt2, x\st0
+ eor t0, t0, \d
+ ror t3, t2, #(32 - 1)
+ eor t1, \a, \b
+ lsr xt2, xt2, #32
+ add t0, t0, \e
+ ror t2, t2, #(32 - 1)
+ ror \e, \a, #(32 - 5)
+ eor t1, t1, \c
+ add \e, \e, k
+ add t0, t0, t3
+ ror \a, \a, #2
+ add \e, \e, t0
+ add t1, t1, \d
+ ror \d, \e, #(32 - 5)
+ add t1, t1, t2
+ add \d, \d, k
+ orr x\st0, xt3, xt2, lsl #32
+ add \d, \d, t1
+ .endm
+
+ .macro maj_2rounds, a, b, c, d, e, st0, st1, st4, st6, st7, rc
+ extr xt2, x\st7, x\st6, #32
+ load_k_hi k, \rc
+ eor t1, \b, \c
+ eor x\st0, x\st0, x\st1
+ and t0, \b, \c
+ load_k_lo k, \rc
+ eor xt2, xt2, x\st4
+ ror \b, \b, #2
+ and t1, t1, \d
+ eor t3, \a, \b
+ add t0, t0, t1
+ and t1, \a, \b
+ and t3, t3, \c
+ eor xt2, xt2, x\st0
+ add t1, t1, t3
+ ror t3, t2, #(32 - 1)
+ lsr xt2, xt2, #32
+ add t0, t0, \e
+ ror \e, \a, #(32 - 5)
+ ror t2, t2, #(32 - 1)
+ add \e, \e, k
+ add t0, t0, t3
+ ror \a, \a, #2
+ add \e, \e, t0
+ add t1, t1, \d
+ ror \d, \e, #(32 - 5)
+ add t1, t1, t2
+ add \d, \d, k
+ orr x\st0, xt3, xt2, lsl #32
+ add \d, \d, t1
+ .endm
+
+ .macro mix_2rounds, in, a, b, c, d, e, f, rc
+ st1 = (in + 1) % 8 + 8
+ st4 = (in + 4) % 8 + 8
+ st6 = (in + 6) % 8 + 8
+ st7 = (in + 7) % 8 + 8
+ \f\()_2rounds \a, \b, \c, \d, \e, \in, %st1, %st4, %st6, %st7, \rc
+ .endm
+
+ /*
+ * The SHA-1 round constants
+ */
+ .set sha_rcon1, 0x5a827999
+ .set sha_rcon2, 0x6ed9eba1
+ .set sha_rcon3, 0x8f1bbcdc
+ .set sha_rcon4, 0xca62c1d6
+
+ /*
+ * void sha_transform(__u32 *digest, const char *data, __u32 *array)
+ */
+ENTRY(sha_transform)
+ /* load digest input */
+ ldp wC, wD, [x0, #8]
+ ldp wA, wB, [x0]
+ ldr wE, [x0, #16]
+
+ inp_2rounds 8, wA, wB, wC, wD, wE, sha_rcon1
+ inp_2rounds 9, wD, wE, wA, wB, wC
+ inp_2rounds 10, wB, wC, wD, wE, wA
+ inp_2rounds 11, wE, wA, wB, wC, wD
+ inp_2rounds 12, wC, wD, wE, wA, wB
+ inp_2rounds 13, wA, wB, wC, wD, wE
+ inp_2rounds 14, wD, wE, wA, wB, wC
+ inp_2rounds 15, wB, wC, wD, wE, wA
+ mix_2rounds 8, wE, wA, wB, wC, wD, cho
+ mix_2rounds 9, wC, wD, wE, wA, wB, cho
+
+ mix_2rounds 10, wA, wB, wC, wD, wE, par, sha_rcon2
+ mix_2rounds 11, wD, wE, wA, wB, wC, par
+ mix_2rounds 12, wB, wC, wD, wE, wA, par
+ mix_2rounds 13, wE, wA, wB, wC, wD, par
+ mix_2rounds 14, wC, wD, wE, wA, wB, par
+ mix_2rounds 15, wA, wB, wC, wD, wE, par
+ mix_2rounds 8, wD, wE, wA, wB, wC, par
+ mix_2rounds 9, wB, wC, wD, wE, wA, par
+ mix_2rounds 10, wE, wA, wB, wC, wD, par
+ mix_2rounds 11, wC, wD, wE, wA, wB, par
+
+ mix_2rounds 12, wA, wB, wC, wD, wE, maj, sha_rcon3
+ mix_2rounds 13, wD, wE, wA, wB, wC, maj
+ mix_2rounds 14, wB, wC, wD, wE, wA, maj
+ mix_2rounds 15, wE, wA, wB, wC, wD, maj
+ mix_2rounds 8, wC, wD, wE, wA, wB, maj
+ mix_2rounds 9, wA, wB, wC, wD, wE, maj
+ mix_2rounds 10, wD, wE, wA, wB, wC, maj
+ mix_2rounds 11, wB, wC, wD, wE, wA, maj
+ mix_2rounds 12, wE, wA, wB, wC, wD, maj
+ mix_2rounds 13, wC, wD, wE, wA, wB, maj
+
+ mix_2rounds 14, wA, wB, wC, wD, wE, par, sha_rcon4
+ mix_2rounds 15, wD, wE, wA, wB, wC, par
+ mix_2rounds 8, wB, wC, wD, wE, wA, par
+ mix_2rounds 9, wE, wA, wB, wC, wD, par
+ mix_2rounds 10, wC, wD, wE, wA, wB, par
+ mix_2rounds 11, wA, wB, wC, wD, wE, par
+ mix_2rounds 12, wD, wE, wA, wB, wC, par
+ mix_2rounds 13, wB, wC, wD, wE, wA, par
+ mix_2rounds 14, wE, wA, wB, wC, wD, par
+ mix_2rounds 15, wC, wD, wE, wA, wB, par
+
+ /* reload digest input */
+ ldr w8, [x0]
+ ldp w9, w10, [x0, #4]
+ ldp w11, w12, [x0, #12]
+
+ /* add this block's output to digest */
+ add wA, wA, w8
+ add wB, wB, w9
+ add wC, wC, w10
+ add wD, wD, w11
+ add wE, wE, w12
+
+ /* store digest */
+ str wA, [x0]
+ stp wB, wC, [x0, #4]
+ stp wD, wE, [x0, #12]
+ ret
+ENDPROC(sha_transform)
+
+ /*
+ * The SHA-1 digest initial values
+ */
+.Lsha_init:
+ .word 0x67452301
+ .word 0xefcdab89
+ .word 0x98badcfe
+ .word 0x10325476
+ .word 0xc3d2e1f0
+
+ /*
+ * void sha_init(__u32 *buf)
+ */
+ENTRY(sha_init)
+ adr xt0, .Lsha_init
+ ldr wA, [xt0]
+ ldp wB, wC, [xt0, #4]
+ ldp wD, wE, [xt0, #12]
+ str wA, [x0]
+ stp wB, wC, [x0, #4]
+ stp wD, wE, [x0, #12]
+ ret
+ENDPROC(sha_init)
--
1.8.3.2
More information about the linux-arm-kernel
mailing list