[PATCH v3 10/14] crypto: arm/blake2s - add ARM scalar optimized BLAKE2s
Ard Biesheuvel
ardb at kernel.org
Wed Dec 23 04:08:50 EST 2020
On Wed, 23 Dec 2020 at 09:12, Eric Biggers <ebiggers at kernel.org> wrote:
>
> From: Eric Biggers <ebiggers at google.com>
>
> Add an ARM scalar optimized implementation of BLAKE2s.
>
> NEON isn't very useful for BLAKE2s because the BLAKE2s block size is too
> small for NEON to help. Each NEON instruction would depend on the
> previous one, resulting in poor performance.
>
> With scalar instructions, on the other hand, we can take advantage of
> ARM's "free" rotations (as I did in chacha-scalar-core.S) to get an
> implementation that runs much faster than the C implementation.
>
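For readers following along: the "free" rotations are ARM's flexible second
operand, which lets 'add' and 'eor' rotate one input at no extra cost. As a
minimal C model (this is the standard BLAKE2s G step, not code from the
patch), each ror32() below maps onto an implicit "ror #n" operand instead of
a separate instruction; ror32() is the helper from <linux/bitops.h>:

        #include <linux/bitops.h>
        #include <linux/types.h>

        static void blake2s_g(u32 *a, u32 *b, u32 *c, u32 *d, u32 m0, u32 m1)
        {
                *a += *b + m0;
                *d = ror32(*d ^ *a, 16);
                *c += *d;
                *b = ror32(*b ^ *c, 12);
                *a += *b + m1;
                *d = ror32(*d ^ *a, 8);
                *c += *d;
                *b = ror32(*b ^ *c, 7);
        }
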
> Performance results on Cortex-A7 in cycles per byte using the shash API:
>
> 4096-byte messages:
> blake2s-256-arm: 18.8
> blake2s-256-generic: 26.0
>
> 500-byte messages:
> blake2s-256-arm: 20.3
> blake2s-256-generic: 27.9
>
> 100-byte messages:
> blake2s-256-arm: 29.7
> blake2s-256-generic: 39.2
>
> 32-byte messages:
> blake2s-256-arm: 50.6
> blake2s-256-generic: 66.2
>
> Except on very short messages, this is still slower than the NEON
> implementation of BLAKE2b which I've written; that one runs at 14.0, 16.4,
> 25.8, and 76.1 cpb on 4096-, 500-, 100-, and 32-byte messages, respectively.
> However, optimized BLAKE2s is useful for cases where BLAKE2s is used
> instead of BLAKE2b, such as WireGuard.
>
> This implementation is added in the form of a new module, blake2s-arm.ko,
> which is analogous to blake2s-x86_64.ko in that it provides
> blake2s_compress_arch() for use by the library API and optionally
> registers the algorithms with the shash API.
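As a side note for context, callers of the library interface (e.g. WireGuard)
pick this up transparently. A quick sketch of that usage, assuming the usual
blake2s() one-shot helper from <crypto/blake2s.h>:

        #include <crypto/blake2s.h>

        /* Unkeyed one-shot hash via the library API; with this module
         * enabled, the compression work lands in blake2s_compress_arch(). */
        static void example_hash(const u8 *data, size_t len,
                                 u8 digest[BLAKE2S_256_HASH_SIZE])
        {
                blake2s(digest, data, NULL, BLAKE2S_256_HASH_SIZE, len, 0);
        }
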
>
> Acked-by: Ard Biesheuvel <ardb at kernel.org>
> Signed-off-by: Eric Biggers <ebiggers at google.com>
Tested-by: Ard Biesheuvel <ardb at kernel.org>
> ---
> arch/arm/crypto/Kconfig | 9 ++
> arch/arm/crypto/Makefile | 2 +
> arch/arm/crypto/blake2s-core.S | 285 +++++++++++++++++++++++++++++++++
> arch/arm/crypto/blake2s-glue.c | 78 +++++++++
> 4 files changed, 374 insertions(+)
> create mode 100644 arch/arm/crypto/blake2s-core.S
> create mode 100644 arch/arm/crypto/blake2s-glue.c
>
> diff --git a/arch/arm/crypto/Kconfig b/arch/arm/crypto/Kconfig
> index c9bf2df85cb90..281c829c12d0b 100644
> --- a/arch/arm/crypto/Kconfig
> +++ b/arch/arm/crypto/Kconfig
> @@ -62,6 +62,15 @@ config CRYPTO_SHA512_ARM
> SHA-512 secure hash standard (DFIPS 180-2) implemented
> using optimized ARM assembler and NEON, when available.
>
> +config CRYPTO_BLAKE2S_ARM
> + tristate "BLAKE2s digest algorithm (ARM)"
> + select CRYPTO_ARCH_HAVE_LIB_BLAKE2S
> + help
> + BLAKE2s digest algorithm optimized with ARM scalar instructions. This
> + is faster than the generic implementations of BLAKE2s and BLAKE2b, but
> + slower than the NEON implementation of BLAKE2b. (There is no NEON
> + implementation of BLAKE2s, since NEON doesn't really help with it.)
> +
> config CRYPTO_AES_ARM
> tristate "Scalar AES cipher for ARM"
> select CRYPTO_ALGAPI
> diff --git a/arch/arm/crypto/Makefile b/arch/arm/crypto/Makefile
> index b745c17d356fe..5ad1e985a718b 100644
> --- a/arch/arm/crypto/Makefile
> +++ b/arch/arm/crypto/Makefile
> @@ -9,6 +9,7 @@ obj-$(CONFIG_CRYPTO_SHA1_ARM) += sha1-arm.o
> obj-$(CONFIG_CRYPTO_SHA1_ARM_NEON) += sha1-arm-neon.o
> obj-$(CONFIG_CRYPTO_SHA256_ARM) += sha256-arm.o
> obj-$(CONFIG_CRYPTO_SHA512_ARM) += sha512-arm.o
> +obj-$(CONFIG_CRYPTO_BLAKE2S_ARM) += blake2s-arm.o
> obj-$(CONFIG_CRYPTO_CHACHA20_NEON) += chacha-neon.o
> obj-$(CONFIG_CRYPTO_POLY1305_ARM) += poly1305-arm.o
> obj-$(CONFIG_CRYPTO_NHPOLY1305_NEON) += nhpoly1305-neon.o
> @@ -29,6 +30,7 @@ sha256-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha256_neon_glue.o
> sha256-arm-y := sha256-core.o sha256_glue.o $(sha256-arm-neon-y)
> sha512-arm-neon-$(CONFIG_KERNEL_MODE_NEON) := sha512-neon-glue.o
> sha512-arm-y := sha512-core.o sha512-glue.o $(sha512-arm-neon-y)
> +blake2s-arm-y := blake2s-core.o blake2s-glue.o
> sha1-arm-ce-y := sha1-ce-core.o sha1-ce-glue.o
> sha2-arm-ce-y := sha2-ce-core.o sha2-ce-glue.o
> aes-arm-ce-y := aes-ce-core.o aes-ce-glue.o
> diff --git a/arch/arm/crypto/blake2s-core.S b/arch/arm/crypto/blake2s-core.S
> new file mode 100644
> index 0000000000000..bed897e9a181a
> --- /dev/null
> +++ b/arch/arm/crypto/blake2s-core.S
> @@ -0,0 +1,285 @@
> +/* SPDX-License-Identifier: GPL-2.0-or-later */
> +/*
> + * BLAKE2s digest algorithm, ARM scalar implementation
> + *
> + * Copyright 2020 Google LLC
> + *
> + * Author: Eric Biggers <ebiggers at google.com>
> + */
> +
> +#include <linux/linkage.h>
> +
> + // Registers used to hold message words temporarily. There aren't
> + // enough ARM registers to hold the whole message block, so we have to
> + // load the words on-demand.
> + M_0 .req r12
> + M_1 .req r14
> +
> +// The BLAKE2s initialization vector
> +.Lblake2s_IV:
> + .word 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A
> + .word 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19
> +
> +.macro __ldrd a, b, src, offset
> +#if __LINUX_ARM_ARCH__ >= 6
> + ldrd \a, \b, [\src, #\offset]
> +#else
> + ldr \a, [\src, #\offset]
> + ldr \b, [\src, #\offset + 4]
> +#endif
> +.endm
> +
> +.macro __strd a, b, dst, offset
> +#if __LINUX_ARM_ARCH__ >= 6
> + strd \a, \b, [\dst, #\offset]
> +#else
> + str \a, [\dst, #\offset]
> + str \b, [\dst, #\offset + 4]
> +#endif
> +.endm
> +
> +// Execute a quarter-round of BLAKE2s by mixing two columns or two diagonals.
> +// (a0, b0, c0, d0) and (a1, b1, c1, d1) give the registers containing the two
> +// columns/diagonals. s0-s1 are the word offsets to the message words the first
> +// column/diagonal needs, and likewise s2-s3 for the second column/diagonal.
> +// M_0 and M_1 are free to use, and the message block can be found at sp + 32.
> +//
> +// Note that to save instructions, the rotations don't happen when the
> +// pseudocode says they should, but rather they are delayed until the values are
> +// used. See the comment above _blake2s_round().
> +.macro _blake2s_quarterround a0, b0, c0, d0, a1, b1, c1, d1, s0, s1, s2, s3
> +
> + ldr M_0, [sp, #32 + 4 * \s0]
> + ldr M_1, [sp, #32 + 4 * \s2]
> +
> + // a += b + m[blake2s_sigma[r][2*i + 0]];
> + add \a0, \a0, \b0, ror #brot
> + add \a1, \a1, \b1, ror #brot
> + add \a0, \a0, M_0
> + add \a1, \a1, M_1
> +
> + // d = ror32(d ^ a, 16);
> + eor \d0, \a0, \d0, ror #drot
> + eor \d1, \a1, \d1, ror #drot
> +
> + // c += d;
> + add \c0, \c0, \d0, ror #16
> + add \c1, \c1, \d1, ror #16
> +
> + // b = ror32(b ^ c, 12);
> + eor \b0, \c0, \b0, ror #brot
> + eor \b1, \c1, \b1, ror #brot
> +
> + ldr M_0, [sp, #32 + 4 * \s1]
> + ldr M_1, [sp, #32 + 4 * \s3]
> +
> + // a += b + m[blake2s_sigma[r][2*i + 1]];
> + add \a0, \a0, \b0, ror #12
> + add \a1, \a1, \b1, ror #12
> + add \a0, \a0, M_0
> + add \a1, \a1, M_1
> +
> + // d = ror32(d ^ a, 8);
> + eor \d0, \a0, \d0, ror #16
> + eor \d1, \a1, \d1, ror #16
> +
> + // c += d;
> + add \c0, \c0, \d0, ror #8
> + add \c1, \c1, \d1, ror #8
> +
> + // b = ror32(b ^ c, 7);
> + eor \b0, \c0, \b0, ror #12
> + eor \b1, \c1, \b1, ror #12
> +.endm
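To make the delayed-rotation bookkeeping easier to follow, here is a C model
of one quarterround (my sketch, not code from the patch): b and d enter
rotated left by brot/drot bits relative to their true values, and every use
applies ror32() just in time, exactly like the implicit "ror #n" operands
above.

        #include <linux/bitops.h>
        #include <linux/types.h>

        static void g_delayed(u32 *a, u32 *b, u32 *c, u32 *d,
                              u32 m0, u32 m1, int brot, int drot)
        {
                *a += ror32(*b, brot) + m0;  /* add a, a, b, ror #brot */
                *d = *a ^ ror32(*d, drot);   /* pending drot is now 16 */
                *c += ror32(*d, 16);
                *b = *c ^ ror32(*b, brot);   /* pending brot is now 12 */
                *a += ror32(*b, 12) + m1;
                *d = *a ^ ror32(*d, 16);     /* pending drot is now 8 */
                *c += ror32(*d, 8);
                *b = *c ^ ror32(*b, 12);     /* pending brot is now 7 */
        }

The quarterround exits with (brot, drot) == (7, 8), matching the .set
directives in _blake2s_round below.
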
> +
> +// Execute one round of BLAKE2s by updating the state matrix v[0..15]. v[0..9]
> +// are in r0..r9. The stack pointer points to 8 bytes of scratch space for
> +// spilling v[8..9], then to v[10..15], then to the message block. r10-r12 and
> +// r14 are free to use. The macro arguments s0-s15 give the order in which the
> +// message words are used in this round.
> +//
> +// All rotates are performed using the implicit rotate operand accepted by the
> +// 'add' and 'eor' instructions. This is faster than using explicit rotate
> +// instructions. To make this work, we allow the values in the second and last
> +// rows of the BLAKE2s state matrix (rows 'b' and 'd') to temporarily have the
> +// wrong rotation amount. The rotation amount is then fixed up just in time
> +// when the values are used. 'brot' is the number of bits the values in row 'b'
> +// need to be rotated right to arrive at the correct values, and 'drot'
> +// similarly for row 'd'. (brot, drot) start out as (0, 0) but we make it such
> +// that they end up as (7, 8) after every round.
> +.macro _blake2s_round s0, s1, s2, s3, s4, s5, s6, s7, \
> + s8, s9, s10, s11, s12, s13, s14, s15
> +
> + // Mix first two columns:
> + // (v[0], v[4], v[8], v[12]) and (v[1], v[5], v[9], v[13]).
> + __ldrd r10, r11, sp, 16 // load v[12] and v[13]
> + _blake2s_quarterround r0, r4, r8, r10, r1, r5, r9, r11, \
> + \s0, \s1, \s2, \s3
> + __strd r8, r9, sp, 0
> + __strd r10, r11, sp, 16
> +
> + // Mix second two columns:
> + // (v[2], v[6], v[10], v[14]) and (v[3], v[7], v[11], v[15]).
> + __ldrd r8, r9, sp, 8 // load v[10] and v[11]
> + __ldrd r10, r11, sp, 24 // load v[14] and v[15]
> + _blake2s_quarterround r2, r6, r8, r10, r3, r7, r9, r11, \
> + \s4, \s5, \s6, \s7
> + str r10, [sp, #24] // store v[14]
> + // v[10], v[11], and v[15] are used below, so no need to store them yet.
> +
> + .set brot, 7
> + .set drot, 8
> +
> + // Mix first two diagonals:
> + // (v[0], v[5], v[10], v[15]) and (v[1], v[6], v[11], v[12]).
> + ldr r10, [sp, #16] // load v[12]
> + _blake2s_quarterround r0, r5, r8, r11, r1, r6, r9, r10, \
> + \s8, \s9, \s10, \s11
> + __strd r8, r9, sp, 8
> + str r11, [sp, #28]
> + str r10, [sp, #16]
> +
> + // Mix second two diagonals:
> + // (v[2], v[7], v[8], v[13]) and (v[3], v[4], v[9], v[14]).
> + __ldrd r8, r9, sp, 0 // load v[8] and v[9]
> + __ldrd r10, r11, sp, 20 // load v[13] and v[14]
> + _blake2s_quarterround r2, r7, r8, r10, r3, r4, r9, r11, \
> + \s12, \s13, \s14, \s15
> + __strd r10, r11, sp, 20
> +.endm
> +
> +//
> +// void blake2s_compress_arch(struct blake2s_state *state,
> +// const u8 *block, size_t nblocks, u32 inc);
> +//
> +// Only the first three fields of struct blake2s_state are used:
> +// u32 h[8]; (inout)
> +// u32 t[2]; (inout)
> +// u32 f[2]; (in)
> +//
> + .align 5
> +ENTRY(blake2s_compress_arch)
> + push {r0-r2,r4-r11,lr} // keep this an even number
> +
> +.Lnext_block:
> + // r0 is 'state'
> + // r1 is 'block'
> + // r3 is 'inc'
> +
> + // Load and increment the counter t[0..1].
> + __ldrd r10, r11, r0, 32
> + adds r10, r10, r3
> + adc r11, r11, #0
> + __strd r10, r11, r0, 32
> +
> + // _blake2s_round is very short on registers, so copy the message block
> + // to the stack to save a register during the rounds. This also has the
> + // advantage that misalignment only needs to be dealt with in one place.
> + sub sp, sp, #64
> + mov r12, sp
> + tst r1, #3
> + bne .Lcopy_block_misaligned
> + ldmia r1!, {r2-r9}
> + stmia r12!, {r2-r9}
> + ldmia r1!, {r2-r9}
> + stmia r12, {r2-r9}
> +.Lcopy_block_done:
> + str r1, [sp, #68] // Update message pointer
> +
> + // Calculate v[8..15]. Push v[10..15] onto the stack, and leave space
> + // for spilling v[8..9]. Leave v[8..9] in r8-r9.
> + mov r14, r0 // r14 = state
> + adr r12, .Lblake2s_IV
> + ldmia r12!, {r8-r9} // load IV[0..1]
> + __ldrd r0, r1, r14, 40 // load f[0..1]
> + ldm r12, {r2-r7} // load IV[2..7]
> + eor r4, r4, r10 // v[12] = IV[4] ^ t[0]
> + eor r5, r5, r11 // v[13] = IV[5] ^ t[1]
> + eor r6, r6, r0 // v[14] = IV[6] ^ f[0]
> + eor r7, r7, r1 // v[15] = IV[7] ^ f[1]
> + push {r2-r7} // push v[10..15]
> + sub sp, sp, #8 // leave space for v[8..9]
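For anyone tracing the offsets: if I'm reading this right, the stack frame
during the rounds is

        sp +  0..7      scratch space for spilling v[8..9]
        sp +  8..31     v[10..15]
        sp + 32..95     aligned copy of the 64-byte message block
        sp + 96...      saved r0 ('state'), r1, r2, then r4-r11 and lr

which is consistent with the 'sp + 32' remark in the quarterround comment
and with the 'ldr r14, [sp, #96]' that reloads the state pointer below.
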
> +
> + // Load h[0..7] == v[0..7].
> + ldm r14, {r0-r7}
> +
> + // Execute the rounds. Each round is provided the order in which it
> + // needs to use the message words.
> + .set brot, 0
> + .set drot, 0
> + _blake2s_round 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
> + _blake2s_round 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
> + _blake2s_round 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4
> + _blake2s_round 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8
> + _blake2s_round 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13
> + _blake2s_round 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9
> + _blake2s_round 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11
> + _blake2s_round 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10
> + _blake2s_round 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5
> + _blake2s_round 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0
> +
> + // Fold the final state matrix into the hash chaining value:
> + //
> + // for (i = 0; i < 8; i++)
> + // h[i] ^= v[i] ^ v[i + 8];
> + //
> + ldr r14, [sp, #96] // r14 = &h[0]
> + add sp, sp, #8 // v[8..9] are already loaded.
> + pop {r10-r11} // load v[10..11]
> + eor r0, r0, r8
> + eor r1, r1, r9
> + eor r2, r2, r10
> + eor r3, r3, r11
> + ldm r14, {r8-r11} // load h[0..3]
> + eor r0, r0, r8
> + eor r1, r1, r9
> + eor r2, r2, r10
> + eor r3, r3, r11
> + stmia r14!, {r0-r3} // store new h[0..3]
> + ldm r14, {r0-r3} // load old h[4..7]
> + pop {r8-r11} // load v[12..15]
> + eor r0, r0, r4, ror #brot
> + eor r1, r1, r5, ror #brot
> + eor r2, r2, r6, ror #brot
> + eor r3, r3, r7, ror #brot
> + eor r0, r0, r8, ror #drot
> + eor r1, r1, r9, ror #drot
> + eor r2, r2, r10, ror #drot
> + eor r3, r3, r11, ror #drot
> + add sp, sp, #64 // skip copy of message block
> + stm r14, {r0-r3} // store new h[4..7]
> +
> + // Advance to the next block, if there is one. Note that if there are
> + // multiple blocks, then 'inc' (the counter increment amount) must be
> + // 64. So we can simply set it to 64 without re-loading it.
> + ldm sp, {r0, r1, r2} // load (state, block, nblocks)
> + mov r3, #64 // set 'inc'
> + subs r2, r2, #1 // nblocks--
> + str r2, [sp, #8]
> + bne .Lnext_block // nblocks != 0?
> +
> + pop {r0-r2,r4-r11,pc}
> +
> + // The next message block (pointed to by r1) isn't 4-byte aligned, so it
> + // can't be loaded using ldmia. Copy it to the stack buffer (pointed to
> + // by r12) using an alternative method. r2-r9 are free to use.
> +.Lcopy_block_misaligned:
> + mov r2, #64
> +1:
> +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
> + ldr r3, [r1], #4
> +#else
> + ldrb r3, [r1, #0]
> + ldrb r4, [r1, #1]
> + ldrb r5, [r1, #2]
> + ldrb r6, [r1, #3]
> + add r1, r1, #4
> + orr r3, r3, r4, lsl #8
> + orr r3, r3, r5, lsl #16
> + orr r3, r3, r6, lsl #24
> +#endif
> + subs r2, r2, #4
> + str r3, [r12], #4
> + bne 1b
> + b .Lcopy_block_done
> +ENDPROC(blake2s_compress_arch)
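The misaligned path builds each word from four byte loads, i.e. an open-coded
little-endian load. A C equivalent of that fallback (a sketch, not code from
the patch) would be roughly:

        #include <asm/unaligned.h>
        #include <linux/types.h>

        /* Copy the 64-byte message block into the aligned stack buffer one
         * little-endian 32-bit word at a time. */
        static void copy_block_misaligned(u32 dst[16], const u8 *src)
        {
                int i;

                for (i = 0; i < 16; i++)
                        dst[i] = get_unaligned_le32(src + 4 * i);
        }

with the CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS branch corresponding to a
plain unaligned ldr.
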
> diff --git a/arch/arm/crypto/blake2s-glue.c b/arch/arm/crypto/blake2s-glue.c
> new file mode 100644
> index 0000000000000..f2cc1e5fc9ec1
> --- /dev/null
> +++ b/arch/arm/crypto/blake2s-glue.c
> @@ -0,0 +1,78 @@
> +// SPDX-License-Identifier: GPL-2.0-or-later
> +/*
> + * BLAKE2s digest algorithm, ARM scalar implementation
> + *
> + * Copyright 2020 Google LLC
> + */
> +
> +#include <crypto/internal/blake2s.h>
> +#include <crypto/internal/hash.h>
> +
> +#include <linux/module.h>
> +
> +/* defined in blake2s-core.S */
> +EXPORT_SYMBOL(blake2s_compress_arch);
> +
> +static int crypto_blake2s_update_arm(struct shash_desc *desc,
> + const u8 *in, unsigned int inlen)
> +{
> + return crypto_blake2s_update(desc, in, inlen, blake2s_compress_arch);
> +}
> +
> +static int crypto_blake2s_final_arm(struct shash_desc *desc, u8 *out)
> +{
> + return crypto_blake2s_final(desc, out, blake2s_compress_arch);
> +}
> +
> +#define BLAKE2S_ALG(name, driver_name, digest_size) \
> + { \
> + .base.cra_name = name, \
> + .base.cra_driver_name = driver_name, \
> + .base.cra_priority = 200, \
> + .base.cra_flags = CRYPTO_ALG_OPTIONAL_KEY, \
> + .base.cra_blocksize = BLAKE2S_BLOCK_SIZE, \
> + .base.cra_ctxsize = sizeof(struct blake2s_tfm_ctx), \
> + .base.cra_module = THIS_MODULE, \
> + .digestsize = digest_size, \
> + .setkey = crypto_blake2s_setkey, \
> + .init = crypto_blake2s_init, \
> + .update = crypto_blake2s_update_arm, \
> + .final = crypto_blake2s_final_arm, \
> + .descsize = sizeof(struct blake2s_state), \
> + }
> +
> +static struct shash_alg blake2s_arm_algs[] = {
> + BLAKE2S_ALG("blake2s-128", "blake2s-128-arm", BLAKE2S_128_HASH_SIZE),
> + BLAKE2S_ALG("blake2s-160", "blake2s-160-arm", BLAKE2S_160_HASH_SIZE),
> + BLAKE2S_ALG("blake2s-224", "blake2s-224-arm", BLAKE2S_224_HASH_SIZE),
> + BLAKE2S_ALG("blake2s-256", "blake2s-256-arm", BLAKE2S_256_HASH_SIZE),
> +};
> +
> +static int __init blake2s_arm_mod_init(void)
> +{
> + return IS_REACHABLE(CONFIG_CRYPTO_HASH) ?
> + crypto_register_shashes(blake2s_arm_algs,
> + ARRAY_SIZE(blake2s_arm_algs)) : 0;
> +}
> +
> +static void __exit blake2s_arm_mod_exit(void)
> +{
> + if (IS_REACHABLE(CONFIG_CRYPTO_HASH))
> + crypto_unregister_shashes(blake2s_arm_algs,
> + ARRAY_SIZE(blake2s_arm_algs));
> +}
> +
> +module_init(blake2s_arm_mod_init);
> +module_exit(blake2s_arm_mod_exit);
> +
> +MODULE_DESCRIPTION("BLAKE2s digest algorithm, ARM scalar implementation");
> +MODULE_LICENSE("GPL");
> +MODULE_AUTHOR("Eric Biggers <ebiggers at google.com>");
> +MODULE_ALIAS_CRYPTO("blake2s-128");
> +MODULE_ALIAS_CRYPTO("blake2s-128-arm");
> +MODULE_ALIAS_CRYPTO("blake2s-160");
> +MODULE_ALIAS_CRYPTO("blake2s-160-arm");
> +MODULE_ALIAS_CRYPTO("blake2s-224");
> +MODULE_ALIAS_CRYPTO("blake2s-224-arm");
> +MODULE_ALIAS_CRYPTO("blake2s-256");
> +MODULE_ALIAS_CRYPTO("blake2s-256-arm");
> --
> 2.29.2
>