[PATCH] crypto/arm64: aes-ce-cipher - move assembler code to .S file

Ard Biesheuvel ard.biesheuvel at linaro.org
Wed Nov 22 02:12:17 PST 2017


On 22 November 2017 at 10:05, Alex Matveev <alxmtvv at gmail.com> wrote:
> This is better than my simple fix, thank you.
>
> Out of curiosity, why doesn't NEON code use barrier() to prevent
> reordering?
>

Because barrier() affects ordering of memory accesses, not register accesses.

> On Tue, 21 Nov 2017 13:40:17 +0000
> Ard Biesheuvel <ard.biesheuvel at linaro.org> wrote:
>
>> Most crypto drivers involving kernel mode NEON take care to put the
>> code that actually touches the NEON register file in a separate
>> compilation unit, to prevent the compiler from reordering code that
>> preserves or restores the NEON context with code that may corrupt it.
>> This is necessary because we currently have no way to express the
>> restrictions imposed upon use of the NEON in kernel mode in a way
>> that the compiler understands.
>>
>> However, in the case of aes-ce-cipher, it did not seem unreasonable to
>> deviate from this rule, given how it does not seem possible for the
>> compiler to reorder cross object function calls with asm blocks whose
>> in- and output constraints reflect that it reads from and writes to
>> memory.
>>
>> Now that LTO is being proposed for the arm64 kernel, it is time to
>> revisit this. The link time optimization may replace the function
>> calls to kernel_neon_begin() and kernel_neon_end() with instantiations
>> of the IR that make up its implementation, allowing further reordering
>> with the asm block.
>>
>> So let's clean this up, and move the asm() blocks into a separate .S
>> file.
>>
>> Signed-off-by: Ard Biesheuvel <ard.biesheuvel at linaro.org>
>> ---
>>  arch/arm64/crypto/Makefile                         |   2 +-
>>  arch/arm64/crypto/aes-ce-core.S                    |  87
>> ++++++++++++++++ .../crypto/{aes-ce-cipher.c => aes-ce-glue.c}      |
>> 115 +++------------------ 3 files changed, 100 insertions(+), 104
>> deletions(-) create mode 100644 arch/arm64/crypto/aes-ce-core.S
>>  rename arch/arm64/crypto/{aes-ce-cipher.c => aes-ce-glue.c} (62%)
>>
>> diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
>> index b5edc5918c28..f5e8295fd756 100644
>> --- a/arch/arm64/crypto/Makefile
>> +++ b/arch/arm64/crypto/Makefile
>> @@ -24,7 +24,7 @@ obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o
>>  crc32-ce-y:= crc32-ce-core.o crc32-ce-glue.o
>>
>>  obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
>> -CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
>> +aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o
>>
>>  obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o
>>  aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o
>> diff --git a/arch/arm64/crypto/aes-ce-core.S
>> b/arch/arm64/crypto/aes-ce-core.S new file mode 100644
>> index 000000000000..8efdfdade393
>> --- /dev/null
>> +++ b/arch/arm64/crypto/aes-ce-core.S
>> @@ -0,0 +1,87 @@
>> +/*
>> + * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel at linaro.org>
>> + *
>> + * This program is free software; you can redistribute it and/or
>> modify
>> + * it under the terms of the GNU General Public License version 2 as
>> + * published by the Free Software Foundation.
>> + */
>> +
>> +#include <linux/linkage.h>
>> +#include <asm/assembler.h>
>> +
>> +     .arch           armv8-a+crypto
>> +
>> +ENTRY(__aes_ce_encrypt)
>> +     sub             w3, w3, #2
>> +     ld1             {v0.16b}, [x2]
>> +     ld1             {v1.4s}, [x0], #16
>> +     cmp             w3, #10
>> +     bmi             0f
>> +     bne             3f
>> +     mov             v3.16b, v1.16b
>> +     b               2f
>> +0:   mov             v2.16b, v1.16b
>> +     ld1             {v3.4s}, [x0], #16
>> +1:   aese            v0.16b, v2.16b
>> +     aesmc           v0.16b, v0.16b
>> +2:   ld1             {v1.4s}, [x0], #16
>> +     aese            v0.16b, v3.16b
>> +     aesmc           v0.16b, v0.16b
>> +3:   ld1             {v2.4s}, [x0], #16
>> +     subs            w3, w3, #3
>> +     aese            v0.16b, v1.16b
>> +     aesmc           v0.16b, v0.16b
>> +     ld1             {v3.4s}, [x0], #16
>> +     bpl             1b
>> +     aese            v0.16b, v2.16b
>> +     eor             v0.16b, v0.16b, v3.16b
>> +     st1             {v0.16b}, [x1]
>> +     ret
>> +ENDPROC(__aes_ce_encrypt)
>> +
>> +ENTRY(__aes_ce_decrypt)
>> +     sub             w3, w3, #2
>> +     ld1             {v0.16b}, [x2]
>> +     ld1             {v1.4s}, [x0], #16
>> +     cmp             w3, #10
>> +     bmi             0f
>> +     bne             3f
>> +     mov             v3.16b, v1.16b
>> +     b               2f
>> +0:   mov             v2.16b, v1.16b
>> +     ld1             {v3.4s}, [x0], #16
>> +1:   aesd            v0.16b, v2.16b
>> +     aesimc          v0.16b, v0.16b
>> +2:   ld1             {v1.4s}, [x0], #16
>> +     aesd            v0.16b, v3.16b
>> +     aesimc          v0.16b, v0.16b
>> +3:   ld1             {v2.4s}, [x0], #16
>> +     subs            w3, w3, #3
>> +     aesd            v0.16b, v1.16b
>> +     aesimc          v0.16b, v0.16b
>> +     ld1             {v3.4s}, [x0], #16
>> +     bpl             1b
>> +     aesd            v0.16b, v2.16b
>> +     eor             v0.16b, v0.16b, v3.16b
>> +     st1             {v0.16b}, [x1]
>> +     ret
>> +ENDPROC(__aes_ce_decrypt)
>> +
>> +/*
>> + * __aes_ce_sub() - use the aese instruction to perform the AES sbox
>> + *                  substitution on each byte in 'input'
>> + */
>> +ENTRY(__aes_ce_sub)
>> +     dup             v1.4s, w0
>> +     movi            v0.16b, #0
>> +     aese            v0.16b, v1.16b
>> +     umov            w0, v0.s[0]
>> +     ret
>> +ENDPROC(__aes_ce_sub)
>> +
>> +ENTRY(__aes_ce_invert)
>> +     ld1             {v0.4s}, [x1]
>> +     aesimc          v1.16b, v0.16b
>> +     st1             {v1.4s}, [x0]
>> +     ret
>> +ENDPROC(__aes_ce_invert)
>> diff --git a/arch/arm64/crypto/aes-ce-cipher.c
>> b/arch/arm64/crypto/aes-ce-glue.c similarity index 62%
>> rename from arch/arm64/crypto/aes-ce-cipher.c
>> rename to arch/arm64/crypto/aes-ce-glue.c
>> index 6a75cd75ed11..e6b3227bbf57 100644
>> --- a/arch/arm64/crypto/aes-ce-cipher.c
>> +++ b/arch/arm64/crypto/aes-ce-glue.c
>> @@ -29,6 +29,13 @@ struct aes_block {
>>       u8 b[AES_BLOCK_SIZE];
>>  };
>>
>> +asmlinkage void __aes_ce_encrypt(u32 *rk, u8 *out, const u8 *in, int
>> rounds); +asmlinkage void __aes_ce_decrypt(u32 *rk, u8 *out, const u8
>> *in, int rounds); +
>> +asmlinkage u32 __aes_ce_sub(u32 l);
>> +asmlinkage void __aes_ce_invert(struct aes_block *out,
>> +                             const struct aes_block *in);
>> +
>>  static int num_rounds(struct crypto_aes_ctx *ctx)
>>  {
>>       /*
>> @@ -44,10 +51,6 @@ static int num_rounds(struct crypto_aes_ctx *ctx)
>>  static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8
>> const src[]) {
>>       struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
>> -     struct aes_block *out = (struct aes_block *)dst;
>> -     struct aes_block const *in = (struct aes_block *)src;
>> -     void *dummy0;
>> -     int dummy1;
>>
>>       if (!may_use_simd()) {
>>               __aes_arm64_encrypt(ctx->key_enc, dst, src,
>> num_rounds(ctx)); @@ -55,49 +58,13 @@ static void
>> aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) }
>>
>>       kernel_neon_begin();
>> -
>> -     __asm__("       ld1     {v0.16b},
>> %[in]                 ;"
>> -             "       ld1     {v1.4s}, [%[key]],
>> #16           ;"
>> -             "       cmp     %w[rounds],
>> #10                   ;"
>> -             "       bmi
>> 0f                            ;"
>> -             "       bne
>> 3f                            ;"
>> -             "       mov     v3.16b,
>> v1.16b                        ;"
>> -             "       b
>> 2f                            ;"
>> -             "0:     mov     v2.16b,
>> v1.16b                        ;"
>> -             "       ld1     {v3.4s}, [%[key]],
>> #16           ;"
>> -             "1:     aese    v0.16b,
>> v2.16b                        ;"
>> -             "       aesmc   v0.16b,
>> v0.16b                        ;"
>> -             "2:     ld1     {v1.4s}, [%[key]],
>> #16           ;"
>> -             "       aese    v0.16b,
>> v3.16b                        ;"
>> -             "       aesmc   v0.16b,
>> v0.16b                        ;"
>> -             "3:     ld1     {v2.4s}, [%[key]],
>> #16           ;"
>> -             "       subs    %w[rounds], %w[rounds],
>> #3    ;"
>> -             "       aese    v0.16b,
>> v1.16b                        ;"
>> -             "       aesmc   v0.16b,
>> v0.16b                        ;"
>> -             "       ld1     {v3.4s}, [%[key]],
>> #16           ;"
>> -             "       bpl
>> 1b                            ;"
>> -             "       aese    v0.16b,
>> v2.16b                        ;"
>> -             "       eor     v0.16b, v0.16b,
>> v3.16b                ;"
>> -             "       st1     {v0.16b},
>> %[out]                ;" -
>> -     :       [out]           "=Q"(*out),
>> -             [key]           "=r"(dummy0),
>> -             [rounds]        "=r"(dummy1)
>> -     :       [in]            "Q"(*in),
>> -                             "1"(ctx->key_enc),
>> -                             "2"(num_rounds(ctx) - 2)
>> -     :       "cc");
>> -
>> +     __aes_ce_encrypt(ctx->key_enc, dst, src, num_rounds(ctx));
>>       kernel_neon_end();
>>  }
>>
>>  static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8
>> const src[]) {
>>       struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
>> -     struct aes_block *out = (struct aes_block *)dst;
>> -     struct aes_block const *in = (struct aes_block *)src;
>> -     void *dummy0;
>> -     int dummy1;
>>
>>       if (!may_use_simd()) {
>>               __aes_arm64_decrypt(ctx->key_dec, dst, src,
>> num_rounds(ctx)); @@ -105,62 +72,10 @@ static void
>> aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[]) }
>>
>>       kernel_neon_begin();
>> -
>> -     __asm__("       ld1     {v0.16b},
>> %[in]                 ;"
>> -             "       ld1     {v1.4s}, [%[key]],
>> #16           ;"
>> -             "       cmp     %w[rounds],
>> #10                   ;"
>> -             "       bmi
>> 0f                            ;"
>> -             "       bne
>> 3f                            ;"
>> -             "       mov     v3.16b,
>> v1.16b                        ;"
>> -             "       b
>> 2f                            ;"
>> -             "0:     mov     v2.16b,
>> v1.16b                        ;"
>> -             "       ld1     {v3.4s}, [%[key]],
>> #16           ;"
>> -             "1:     aesd    v0.16b,
>> v2.16b                        ;"
>> -             "       aesimc  v0.16b,
>> v0.16b                        ;"
>> -             "2:     ld1     {v1.4s}, [%[key]],
>> #16           ;"
>> -             "       aesd    v0.16b,
>> v3.16b                        ;"
>> -             "       aesimc  v0.16b,
>> v0.16b                        ;"
>> -             "3:     ld1     {v2.4s}, [%[key]],
>> #16           ;"
>> -             "       subs    %w[rounds], %w[rounds],
>> #3    ;"
>> -             "       aesd    v0.16b,
>> v1.16b                        ;"
>> -             "       aesimc  v0.16b,
>> v0.16b                        ;"
>> -             "       ld1     {v3.4s}, [%[key]],
>> #16           ;"
>> -             "       bpl
>> 1b                            ;"
>> -             "       aesd    v0.16b,
>> v2.16b                        ;"
>> -             "       eor     v0.16b, v0.16b,
>> v3.16b                ;"
>> -             "       st1     {v0.16b},
>> %[out]                ;" -
>> -     :       [out]           "=Q"(*out),
>> -             [key]           "=r"(dummy0),
>> -             [rounds]        "=r"(dummy1)
>> -     :       [in]            "Q"(*in),
>> -                             "1"(ctx->key_dec),
>> -                             "2"(num_rounds(ctx) - 2)
>> -     :       "cc");
>> -
>> +     __aes_ce_decrypt(ctx->key_dec, dst, src, num_rounds(ctx));
>>       kernel_neon_end();
>>  }
>>
>> -/*
>> - * aes_sub() - use the aese instruction to perform the AES sbox
>> substitution
>> - *             on each byte in 'input'
>> - */
>> -static u32 aes_sub(u32 input)
>> -{
>> -     u32 ret;
>> -
>> -     __asm__("dup    v1.4s, %w[in]           ;"
>> -             "movi   v0.16b, #0              ;"
>> -             "aese   v0.16b, v1.16b          ;"
>> -             "umov   %w[out], v0.4s[0]       ;"
>> -
>> -     :       [out]   "=r"(ret)
>> -     :       [in]    "r"(input)
>> -     :               "v0","v1");
>> -
>> -     return ret;
>> -}
>> -
>>  int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
>>                    unsigned int key_len)
>>  {
>> @@ -189,7 +104,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx,
>> const u8 *in_key, u32 *rki = ctx->key_enc + (i * kwords);
>>               u32 *rko = rki + kwords;
>>
>> -             rko[0] = ror32(aes_sub(rki[kwords - 1]), 8) ^
>> rcon[i] ^ rki[0];
>> +             rko[0] = ror32(__aes_ce_sub(rki[kwords - 1]), 8) ^
>> rcon[i] ^ rki[0]; rko[1] = rko[0] ^ rki[1];
>>               rko[2] = rko[1] ^ rki[2];
>>               rko[3] = rko[2] ^ rki[3];
>> @@ -202,7 +117,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx,
>> const u8 *in_key, } else if (key_len == AES_KEYSIZE_256) {
>>                       if (i >= 6)
>>                               break;
>> -                     rko[4] = aes_sub(rko[3]) ^ rki[4];
>> +                     rko[4] = __aes_ce_sub(rko[3]) ^ rki[4];
>>                       rko[5] = rko[4] ^ rki[5];
>>                       rko[6] = rko[5] ^ rki[6];
>>                       rko[7] = rko[6] ^ rki[7];
>> @@ -221,13 +136,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx,
>> const u8 *in_key,
>>       key_dec[0] = key_enc[j];
>>       for (i = 1, j--; j > 0; i++, j--)
>> -             __asm__("ld1    {v0.4s}, %[in]          ;"
>> -                     "aesimc v1.16b,
>> v0.16b                ;"
>> -                     "st1    {v1.4s}, %[out] ;"
>> -
>> -             :       [out]   "=Q"(key_dec[i])
>> -             :       [in]    "Q"(key_enc[j])
>> -             :               "v0","v1");
>> +             __aes_ce_invert(key_dec + i, key_enc + j);
>>       key_dec[i] = key_enc[0];
>>
>>       kernel_neon_end();
>
> Regards,
> Alex



More information about the linux-arm-kernel mailing list