[PATCH] crypto/arm64: aes-ce-cipher - move assembler code to .S file
Ard Biesheuvel
ard.biesheuvel at linaro.org
Wed Nov 22 02:12:17 PST 2017
On 22 November 2017 at 10:05, Alex Matveev <alxmtvv at gmail.com> wrote:
> This is better than my simple fix, thank you.
>
> Out of curiosity, why doesn't NEON code use barrier() to prevent
> reordering?
>
Because barrier() affects ordering of memory accesses, not register accesses.
> On Tue, 21 Nov 2017 13:40:17 +0000
> Ard Biesheuvel <ard.biesheuvel at linaro.org> wrote:
>
>> Most crypto drivers involving kernel mode NEON take care to put the
>> code that actually touches the NEON register file in a separate
>> compilation unit, to prevent the compiler from reordering code that
>> preserves or restores the NEON context with code that may corrupt it.
>> This is necessary because we currently have no way to express the
>> restrictions imposed upon use of the NEON in kernel mode in a way
>> that the compiler understands.
>>
>> However, in the case of aes-ce-cipher, it did not seem unreasonable to
>> deviate from this rule, given how it does not seem possible for the
>> compiler to reorder cross object function calls with asm blocks whose
>> in- and output constraints reflect that it reads from and writes to
>> memory.
>>
>> Now that LTO is being proposed for the arm64 kernel, it is time to
>> revisit this. Link time optimization may replace the function
>> calls to kernel_neon_begin() and kernel_neon_end() with instantiations
>> of the IR that makes up their implementations, allowing further
>> reordering with the asm block.
>>
>> So let's clean this up, and move the asm() blocks into a separate .S
>> file.
>>
>> Signed-off-by: Ard Biesheuvel <ard.biesheuvel at linaro.org>
>> ---
>> arch/arm64/crypto/Makefile                          |   2 +-
>> arch/arm64/crypto/aes-ce-core.S                     |  87 ++++++++++++++++
>> .../crypto/{aes-ce-cipher.c => aes-ce-glue.c}       | 115 +++------------------
>> 3 files changed, 100 insertions(+), 104 deletions(-)
>> create mode 100644 arch/arm64/crypto/aes-ce-core.S
>> rename arch/arm64/crypto/{aes-ce-cipher.c => aes-ce-glue.c} (62%)
>>
>> diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
>> index b5edc5918c28..f5e8295fd756 100644
>> --- a/arch/arm64/crypto/Makefile
>> +++ b/arch/arm64/crypto/Makefile
>> @@ -24,7 +24,7 @@ obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o
>> crc32-ce-y:= crc32-ce-core.o crc32-ce-glue.o
>>
>> obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
>> -CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
>> +aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o
>>
>> obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o
>> aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o
>> diff --git a/arch/arm64/crypto/aes-ce-core.S b/arch/arm64/crypto/aes-ce-core.S
>> new file mode 100644
>> index 000000000000..8efdfdade393
>> --- /dev/null
>> +++ b/arch/arm64/crypto/aes-ce-core.S
>> @@ -0,0 +1,87 @@
>> +/*
>> + * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel at linaro.org>
>> + *
>> + * This program is free software; you can redistribute it and/or modify
>> + * it under the terms of the GNU General Public License version 2 as
>> + * published by the Free Software Foundation.
>> + */
>> +
>> +#include <linux/linkage.h>
>> +#include <asm/assembler.h>
>> +
>> + .arch armv8-a+crypto
>> +
>> +ENTRY(__aes_ce_encrypt)
>> + sub w3, w3, #2
>> + ld1 {v0.16b}, [x2]
>> + ld1 {v1.4s}, [x0], #16
>> + cmp w3, #10
>> + bmi 0f
>> + bne 3f
>> + mov v3.16b, v1.16b
>> + b 2f
>> +0: mov v2.16b, v1.16b
>> + ld1 {v3.4s}, [x0], #16
>> +1: aese v0.16b, v2.16b
>> + aesmc v0.16b, v0.16b
>> +2: ld1 {v1.4s}, [x0], #16
>> + aese v0.16b, v3.16b
>> + aesmc v0.16b, v0.16b
>> +3: ld1 {v2.4s}, [x0], #16
>> + subs w3, w3, #3
>> + aese v0.16b, v1.16b
>> + aesmc v0.16b, v0.16b
>> + ld1 {v3.4s}, [x0], #16
>> + bpl 1b
>> + aese v0.16b, v2.16b
>> + eor v0.16b, v0.16b, v3.16b
>> + st1 {v0.16b}, [x1]
>> + ret
>> +ENDPROC(__aes_ce_encrypt)
>> +
>> +ENTRY(__aes_ce_decrypt)
>> + sub w3, w3, #2
>> + ld1 {v0.16b}, [x2]
>> + ld1 {v1.4s}, [x0], #16
>> + cmp w3, #10
>> + bmi 0f
>> + bne 3f
>> + mov v3.16b, v1.16b
>> + b 2f
>> +0: mov v2.16b, v1.16b
>> + ld1 {v3.4s}, [x0], #16
>> +1: aesd v0.16b, v2.16b
>> + aesimc v0.16b, v0.16b
>> +2: ld1 {v1.4s}, [x0], #16
>> + aesd v0.16b, v3.16b
>> + aesimc v0.16b, v0.16b
>> +3: ld1 {v2.4s}, [x0], #16
>> + subs w3, w3, #3
>> + aesd v0.16b, v1.16b
>> + aesimc v0.16b, v0.16b
>> + ld1 {v3.4s}, [x0], #16
>> + bpl 1b
>> + aesd v0.16b, v2.16b
>> + eor v0.16b, v0.16b, v3.16b
>> + st1 {v0.16b}, [x1]
>> + ret
>> +ENDPROC(__aes_ce_decrypt)
>> +
>> +/*
>> + * __aes_ce_sub() - use the aese instruction to perform the AES sbox
>> + * substitution on each byte in 'input'
>> + */
>> +ENTRY(__aes_ce_sub)
>> + dup v1.4s, w0
>> + movi v0.16b, #0
>> + aese v0.16b, v1.16b
>> + umov w0, v0.s[0]
>> + ret
>> +ENDPROC(__aes_ce_sub)
>> +
>> +ENTRY(__aes_ce_invert)
>> + ld1 {v0.4s}, [x1]
>> + aesimc v1.16b, v0.16b
>> + st1 {v1.4s}, [x0]
>> + ret
>> +ENDPROC(__aes_ce_invert)
>> diff --git a/arch/arm64/crypto/aes-ce-cipher.c b/arch/arm64/crypto/aes-ce-glue.c
>> similarity index 62%
>> rename from arch/arm64/crypto/aes-ce-cipher.c
>> rename to arch/arm64/crypto/aes-ce-glue.c
>> index 6a75cd75ed11..e6b3227bbf57 100644
>> --- a/arch/arm64/crypto/aes-ce-cipher.c
>> +++ b/arch/arm64/crypto/aes-ce-glue.c
>> @@ -29,6 +29,13 @@ struct aes_block {
>> u8 b[AES_BLOCK_SIZE];
>> };
>>
>> +asmlinkage void __aes_ce_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
>> +asmlinkage void __aes_ce_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
>> +
>> +asmlinkage u32 __aes_ce_sub(u32 l);
>> +asmlinkage void __aes_ce_invert(struct aes_block *out,
>> + const struct aes_block *in);
>> +
>> static int num_rounds(struct crypto_aes_ctx *ctx)
>> {
>> /*
>> @@ -44,10 +51,6 @@ static int num_rounds(struct crypto_aes_ctx *ctx)
>> static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8
>> const src[]) {
>> struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
>> - struct aes_block *out = (struct aes_block *)dst;
>> - struct aes_block const *in = (struct aes_block *)src;
>> - void *dummy0;
>> - int dummy1;
>>
>> if (!may_use_simd()) {
>> __aes_arm64_encrypt(ctx->key_enc, dst, src, num_rounds(ctx));
>> @@ -55,49 +58,13 @@ static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
>> }
>>
>> kernel_neon_begin();
>> -
>> - __asm__(" ld1 {v0.16b},
>> %[in] ;"
>> - " ld1 {v1.4s}, [%[key]],
>> #16 ;"
>> - " cmp %w[rounds],
>> #10 ;"
>> - " bmi
>> 0f ;"
>> - " bne
>> 3f ;"
>> - " mov v3.16b,
>> v1.16b ;"
>> - " b
>> 2f ;"
>> - "0: mov v2.16b,
>> v1.16b ;"
>> - " ld1 {v3.4s}, [%[key]],
>> #16 ;"
>> - "1: aese v0.16b,
>> v2.16b ;"
>> - " aesmc v0.16b,
>> v0.16b ;"
>> - "2: ld1 {v1.4s}, [%[key]],
>> #16 ;"
>> - " aese v0.16b,
>> v3.16b ;"
>> - " aesmc v0.16b,
>> v0.16b ;"
>> - "3: ld1 {v2.4s}, [%[key]],
>> #16 ;"
>> - " subs %w[rounds], %w[rounds],
>> #3 ;"
>> - " aese v0.16b,
>> v1.16b ;"
>> - " aesmc v0.16b,
>> v0.16b ;"
>> - " ld1 {v3.4s}, [%[key]],
>> #16 ;"
>> - " bpl
>> 1b ;"
>> - " aese v0.16b,
>> v2.16b ;"
>> - " eor v0.16b, v0.16b,
>> v3.16b ;"
>> - " st1 {v0.16b},
>> %[out] ;" -
>> - : [out] "=Q"(*out),
>> - [key] "=r"(dummy0),
>> - [rounds] "=r"(dummy1)
>> - : [in] "Q"(*in),
>> - "1"(ctx->key_enc),
>> - "2"(num_rounds(ctx) - 2)
>> - : "cc");
>> -
>> + __aes_ce_encrypt(ctx->key_enc, dst, src, num_rounds(ctx));
>> kernel_neon_end();
>> }
>>
>> static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8
>> const src[]) {
>> struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
>> - struct aes_block *out = (struct aes_block *)dst;
>> - struct aes_block const *in = (struct aes_block *)src;
>> - void *dummy0;
>> - int dummy1;
>>
>> if (!may_use_simd()) {
>> __aes_arm64_decrypt(ctx->key_dec, dst, src, num_rounds(ctx));
>> @@ -105,62 +72,10 @@ static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
>> }
>>
>> kernel_neon_begin();
>> -
>> - __asm__(" ld1 {v0.16b},
>> %[in] ;"
>> - " ld1 {v1.4s}, [%[key]],
>> #16 ;"
>> - " cmp %w[rounds],
>> #10 ;"
>> - " bmi
>> 0f ;"
>> - " bne
>> 3f ;"
>> - " mov v3.16b,
>> v1.16b ;"
>> - " b
>> 2f ;"
>> - "0: mov v2.16b,
>> v1.16b ;"
>> - " ld1 {v3.4s}, [%[key]],
>> #16 ;"
>> - "1: aesd v0.16b,
>> v2.16b ;"
>> - " aesimc v0.16b,
>> v0.16b ;"
>> - "2: ld1 {v1.4s}, [%[key]],
>> #16 ;"
>> - " aesd v0.16b,
>> v3.16b ;"
>> - " aesimc v0.16b,
>> v0.16b ;"
>> - "3: ld1 {v2.4s}, [%[key]],
>> #16 ;"
>> - " subs %w[rounds], %w[rounds],
>> #3 ;"
>> - " aesd v0.16b,
>> v1.16b ;"
>> - " aesimc v0.16b,
>> v0.16b ;"
>> - " ld1 {v3.4s}, [%[key]],
>> #16 ;"
>> - " bpl
>> 1b ;"
>> - " aesd v0.16b,
>> v2.16b ;"
>> - " eor v0.16b, v0.16b,
>> v3.16b ;"
>> - " st1 {v0.16b},
>> %[out] ;" -
>> - : [out] "=Q"(*out),
>> - [key] "=r"(dummy0),
>> - [rounds] "=r"(dummy1)
>> - : [in] "Q"(*in),
>> - "1"(ctx->key_dec),
>> - "2"(num_rounds(ctx) - 2)
>> - : "cc");
>> -
>> + __aes_ce_decrypt(ctx->key_dec, dst, src, num_rounds(ctx));
>> kernel_neon_end();
>> }
>>
>> -/*
>> - * aes_sub() - use the aese instruction to perform the AES sbox
>> substitution
>> - * on each byte in 'input'
>> - */
>> -static u32 aes_sub(u32 input)
>> -{
>> - u32 ret;
>> -
>> - __asm__("dup v1.4s, %w[in] ;"
>> - "movi v0.16b, #0 ;"
>> - "aese v0.16b, v1.16b ;"
>> - "umov %w[out], v0.4s[0] ;"
>> -
>> - : [out] "=r"(ret)
>> - : [in] "r"(input)
>> - : "v0","v1");
>> -
>> - return ret;
>> -}
>> -
>> int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
>> unsigned int key_len)
>> {
>> @@ -189,7 +104,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
>> u32 *rki = ctx->key_enc + (i * kwords);
>> u32 *rko = rki + kwords;
>>
>> - rko[0] = ror32(aes_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0];
>> + rko[0] = ror32(__aes_ce_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0];
>> rko[1] = rko[0] ^ rki[1];
>> rko[2] = rko[1] ^ rki[2];
>> rko[3] = rko[2] ^ rki[3];
>> @@ -202,7 +117,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
>> } else if (key_len == AES_KEYSIZE_256) {
>> if (i >= 6)
>> break;
>> - rko[4] = aes_sub(rko[3]) ^ rki[4];
>> + rko[4] = __aes_ce_sub(rko[3]) ^ rki[4];
>> rko[5] = rko[4] ^ rki[5];
>> rko[6] = rko[5] ^ rki[6];
>> rko[7] = rko[6] ^ rki[7];
>> @@ -221,13 +136,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
>> key_dec[0] = key_enc[j];
>> for (i = 1, j--; j > 0; i++, j--)
>> - __asm__("ld1 {v0.4s}, %[in] ;"
>> - "aesimc v1.16b,
>> v0.16b ;"
>> - "st1 {v1.4s}, %[out] ;"
>> -
>> - : [out] "=Q"(key_dec[i])
>> - : [in] "Q"(key_enc[j])
>> - : "v0","v1");
>> + __aes_ce_invert(key_dec + i, key_enc + j);
>> key_dec[i] = key_enc[0];
>>
>> kernel_neon_end();
>
> Regards,
> Alex
More information about the linux-arm-kernel
mailing list