[PATCH] crypto/arm64: aes-ce-cipher - move assembler code to .S file

Nick Desaulniers ndesaulniers at google.com
Tue Nov 21 09:32:24 PST 2017


Checked for transcription errors, calling convention, and removal of
temporary locals. LGTM.

Reviewed-By: Nick Desaulniers <ndesaulniers at google.com>

On Tue, Nov 21, 2017 at 5:40 AM, Ard Biesheuvel
<ard.biesheuvel at linaro.org> wrote:
> Most crypto drivers involving kernel mode NEON take care to put the code
> that actually touches the NEON register file in a separate compilation
> unit, to prevent the compiler from reordering code that preserves or
> restores the NEON context with code that may corrupt it. This is
> necessary because we currently have no way to express the restrictions
> imposed upon use of the NEON in kernel mode in a way that the compiler
> understands.
>
> However, in the case of aes-ce-cipher, it did not seem unreasonable to
> deviate from this rule, given how it does not seem possible for the
> compiler to reorder cross object function calls with asm blocks whose
> in- and output constraints reflect that it reads from and writes to
> memory.
>
> Now that LTO is being proposed for the arm64 kernel, it is time to
> revisit this. The link time optimization may replace the function
> calls to kernel_neon_begin() and kernel_neon_end() with instantiations
> of the IR that make up its implementation, allowing further reordering
> with the asm block.
>
> So let's clean this up, and move the asm() blocks into a separate .S
> file.
>
> Signed-off-by: Ard Biesheuvel <ard.biesheuvel at linaro.org>
> ---
>  arch/arm64/crypto/Makefile                         |   2 +-
>  arch/arm64/crypto/aes-ce-core.S                    |  87 ++++++++++++++++
>  .../crypto/{aes-ce-cipher.c => aes-ce-glue.c}      | 115 +++------------------
>  3 files changed, 100 insertions(+), 104 deletions(-)
>  create mode 100644 arch/arm64/crypto/aes-ce-core.S
>  rename arch/arm64/crypto/{aes-ce-cipher.c => aes-ce-glue.c} (62%)
>
> diff --git a/arch/arm64/crypto/Makefile b/arch/arm64/crypto/Makefile
> index b5edc5918c28..f5e8295fd756 100644
> --- a/arch/arm64/crypto/Makefile
> +++ b/arch/arm64/crypto/Makefile
> @@ -24,7 +24,7 @@ obj-$(CONFIG_CRYPTO_CRC32_ARM64_CE) += crc32-ce.o
>  crc32-ce-y:= crc32-ce-core.o crc32-ce-glue.o
>
>  obj-$(CONFIG_CRYPTO_AES_ARM64_CE) += aes-ce-cipher.o
> -CFLAGS_aes-ce-cipher.o += -march=armv8-a+crypto
> +aes-ce-cipher-y := aes-ce-core.o aes-ce-glue.o
>
>  obj-$(CONFIG_CRYPTO_AES_ARM64_CE_CCM) += aes-ce-ccm.o
>  aes-ce-ccm-y := aes-ce-ccm-glue.o aes-ce-ccm-core.o
> diff --git a/arch/arm64/crypto/aes-ce-core.S b/arch/arm64/crypto/aes-ce-core.S
> new file mode 100644
> index 000000000000..8efdfdade393
> --- /dev/null
> +++ b/arch/arm64/crypto/aes-ce-core.S
> @@ -0,0 +1,87 @@
> +/*
> + * Copyright (C) 2013 - 2017 Linaro Ltd <ard.biesheuvel at linaro.org>
> + *
> + * This program is free software; you can redistribute it and/or modify
> + * it under the terms of the GNU General Public License version 2 as
> + * published by the Free Software Foundation.
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/assembler.h>
> +
> +       .arch           armv8-a+crypto
> +
> +ENTRY(__aes_ce_encrypt)
> +       sub             w3, w3, #2
> +       ld1             {v0.16b}, [x2]
> +       ld1             {v1.4s}, [x0], #16
> +       cmp             w3, #10
> +       bmi             0f
> +       bne             3f
> +       mov             v3.16b, v1.16b
> +       b               2f
> +0:     mov             v2.16b, v1.16b
> +       ld1             {v3.4s}, [x0], #16
> +1:     aese            v0.16b, v2.16b
> +       aesmc           v0.16b, v0.16b
> +2:     ld1             {v1.4s}, [x0], #16
> +       aese            v0.16b, v3.16b
> +       aesmc           v0.16b, v0.16b
> +3:     ld1             {v2.4s}, [x0], #16
> +       subs            w3, w3, #3
> +       aese            v0.16b, v1.16b
> +       aesmc           v0.16b, v0.16b
> +       ld1             {v3.4s}, [x0], #16
> +       bpl             1b
> +       aese            v0.16b, v2.16b
> +       eor             v0.16b, v0.16b, v3.16b
> +       st1             {v0.16b}, [x1]
> +       ret
> +ENDPROC(__aes_ce_encrypt)
> +
> +ENTRY(__aes_ce_decrypt)
> +       sub             w3, w3, #2
> +       ld1             {v0.16b}, [x2]
> +       ld1             {v1.4s}, [x0], #16
> +       cmp             w3, #10
> +       bmi             0f
> +       bne             3f
> +       mov             v3.16b, v1.16b
> +       b               2f
> +0:     mov             v2.16b, v1.16b
> +       ld1             {v3.4s}, [x0], #16
> +1:     aesd            v0.16b, v2.16b
> +       aesimc          v0.16b, v0.16b
> +2:     ld1             {v1.4s}, [x0], #16
> +       aesd            v0.16b, v3.16b
> +       aesimc          v0.16b, v0.16b
> +3:     ld1             {v2.4s}, [x0], #16
> +       subs            w3, w3, #3
> +       aesd            v0.16b, v1.16b
> +       aesimc          v0.16b, v0.16b
> +       ld1             {v3.4s}, [x0], #16
> +       bpl             1b
> +       aesd            v0.16b, v2.16b
> +       eor             v0.16b, v0.16b, v3.16b
> +       st1             {v0.16b}, [x1]
> +       ret
> +ENDPROC(__aes_ce_decrypt)
> +
> +/*
> + * __aes_ce_sub() - use the aese instruction to perform the AES sbox
> + *                  substitution on each byte in 'input'
> + */
> +ENTRY(__aes_ce_sub)
> +       dup             v1.4s, w0
> +       movi            v0.16b, #0
> +       aese            v0.16b, v1.16b
> +       umov            w0, v0.s[0]
> +       ret
> +ENDPROC(__aes_ce_sub)
> +
> +ENTRY(__aes_ce_invert)
> +       ld1             {v0.4s}, [x1]
> +       aesimc          v1.16b, v0.16b
> +       st1             {v1.4s}, [x0]
> +       ret
> +ENDPROC(__aes_ce_invert)
> diff --git a/arch/arm64/crypto/aes-ce-cipher.c b/arch/arm64/crypto/aes-ce-glue.c
> similarity index 62%
> rename from arch/arm64/crypto/aes-ce-cipher.c
> rename to arch/arm64/crypto/aes-ce-glue.c
> index 6a75cd75ed11..e6b3227bbf57 100644
> --- a/arch/arm64/crypto/aes-ce-cipher.c
> +++ b/arch/arm64/crypto/aes-ce-glue.c
> @@ -29,6 +29,13 @@ struct aes_block {
>         u8 b[AES_BLOCK_SIZE];
>  };
>
> +asmlinkage void __aes_ce_encrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
> +asmlinkage void __aes_ce_decrypt(u32 *rk, u8 *out, const u8 *in, int rounds);
> +
> +asmlinkage u32 __aes_ce_sub(u32 l);
> +asmlinkage void __aes_ce_invert(struct aes_block *out,
> +                               const struct aes_block *in);
> +
>  static int num_rounds(struct crypto_aes_ctx *ctx)
>  {
>         /*
> @@ -44,10 +51,6 @@ static int num_rounds(struct crypto_aes_ctx *ctx)
>  static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
>  {
>         struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
> -       struct aes_block *out = (struct aes_block *)dst;
> -       struct aes_block const *in = (struct aes_block *)src;
> -       void *dummy0;
> -       int dummy1;
>
>         if (!may_use_simd()) {
>                 __aes_arm64_encrypt(ctx->key_enc, dst, src, num_rounds(ctx));
> @@ -55,49 +58,13 @@ static void aes_cipher_encrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
>         }
>
>         kernel_neon_begin();
> -
> -       __asm__("       ld1     {v0.16b}, %[in]                 ;"
> -               "       ld1     {v1.4s}, [%[key]], #16          ;"
> -               "       cmp     %w[rounds], #10                 ;"
> -               "       bmi     0f                              ;"
> -               "       bne     3f                              ;"
> -               "       mov     v3.16b, v1.16b                  ;"
> -               "       b       2f                              ;"
> -               "0:     mov     v2.16b, v1.16b                  ;"
> -               "       ld1     {v3.4s}, [%[key]], #16          ;"
> -               "1:     aese    v0.16b, v2.16b                  ;"
> -               "       aesmc   v0.16b, v0.16b                  ;"
> -               "2:     ld1     {v1.4s}, [%[key]], #16          ;"
> -               "       aese    v0.16b, v3.16b                  ;"
> -               "       aesmc   v0.16b, v0.16b                  ;"
> -               "3:     ld1     {v2.4s}, [%[key]], #16          ;"
> -               "       subs    %w[rounds], %w[rounds], #3      ;"
> -               "       aese    v0.16b, v1.16b                  ;"
> -               "       aesmc   v0.16b, v0.16b                  ;"
> -               "       ld1     {v3.4s}, [%[key]], #16          ;"
> -               "       bpl     1b                              ;"
> -               "       aese    v0.16b, v2.16b                  ;"
> -               "       eor     v0.16b, v0.16b, v3.16b          ;"
> -               "       st1     {v0.16b}, %[out]                ;"
> -
> -       :       [out]           "=Q"(*out),
> -               [key]           "=r"(dummy0),
> -               [rounds]        "=r"(dummy1)
> -       :       [in]            "Q"(*in),
> -                               "1"(ctx->key_enc),
> -                               "2"(num_rounds(ctx) - 2)
> -       :       "cc");
> -
> +       __aes_ce_encrypt(ctx->key_enc, dst, src, num_rounds(ctx));
>         kernel_neon_end();
>  }
>
>  static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
>  {
>         struct crypto_aes_ctx *ctx = crypto_tfm_ctx(tfm);
> -       struct aes_block *out = (struct aes_block *)dst;
> -       struct aes_block const *in = (struct aes_block *)src;
> -       void *dummy0;
> -       int dummy1;
>
>         if (!may_use_simd()) {
>                 __aes_arm64_decrypt(ctx->key_dec, dst, src, num_rounds(ctx));
> @@ -105,62 +72,10 @@ static void aes_cipher_decrypt(struct crypto_tfm *tfm, u8 dst[], u8 const src[])
>         }
>
>         kernel_neon_begin();
> -
> -       __asm__("       ld1     {v0.16b}, %[in]                 ;"
> -               "       ld1     {v1.4s}, [%[key]], #16          ;"
> -               "       cmp     %w[rounds], #10                 ;"
> -               "       bmi     0f                              ;"
> -               "       bne     3f                              ;"
> -               "       mov     v3.16b, v1.16b                  ;"
> -               "       b       2f                              ;"
> -               "0:     mov     v2.16b, v1.16b                  ;"
> -               "       ld1     {v3.4s}, [%[key]], #16          ;"
> -               "1:     aesd    v0.16b, v2.16b                  ;"
> -               "       aesimc  v0.16b, v0.16b                  ;"
> -               "2:     ld1     {v1.4s}, [%[key]], #16          ;"
> -               "       aesd    v0.16b, v3.16b                  ;"
> -               "       aesimc  v0.16b, v0.16b                  ;"
> -               "3:     ld1     {v2.4s}, [%[key]], #16          ;"
> -               "       subs    %w[rounds], %w[rounds], #3      ;"
> -               "       aesd    v0.16b, v1.16b                  ;"
> -               "       aesimc  v0.16b, v0.16b                  ;"
> -               "       ld1     {v3.4s}, [%[key]], #16          ;"
> -               "       bpl     1b                              ;"
> -               "       aesd    v0.16b, v2.16b                  ;"
> -               "       eor     v0.16b, v0.16b, v3.16b          ;"
> -               "       st1     {v0.16b}, %[out]                ;"
> -
> -       :       [out]           "=Q"(*out),
> -               [key]           "=r"(dummy0),
> -               [rounds]        "=r"(dummy1)
> -       :       [in]            "Q"(*in),
> -                               "1"(ctx->key_dec),
> -                               "2"(num_rounds(ctx) - 2)
> -       :       "cc");
> -
> +       __aes_ce_decrypt(ctx->key_dec, dst, src, num_rounds(ctx));
>         kernel_neon_end();
>  }
>
> -/*
> - * aes_sub() - use the aese instruction to perform the AES sbox substitution
> - *             on each byte in 'input'
> - */
> -static u32 aes_sub(u32 input)
> -{
> -       u32 ret;
> -
> -       __asm__("dup    v1.4s, %w[in]           ;"
> -               "movi   v0.16b, #0              ;"
> -               "aese   v0.16b, v1.16b          ;"
> -               "umov   %w[out], v0.4s[0]       ;"
> -
> -       :       [out]   "=r"(ret)
> -       :       [in]    "r"(input)
> -       :               "v0","v1");
> -
> -       return ret;
> -}
> -
>  int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
>                      unsigned int key_len)
>  {
> @@ -189,7 +104,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
>                 u32 *rki = ctx->key_enc + (i * kwords);
>                 u32 *rko = rki + kwords;
>
> -               rko[0] = ror32(aes_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0];
> +               rko[0] = ror32(__aes_ce_sub(rki[kwords - 1]), 8) ^ rcon[i] ^ rki[0];
>                 rko[1] = rko[0] ^ rki[1];
>                 rko[2] = rko[1] ^ rki[2];
>                 rko[3] = rko[2] ^ rki[3];
> @@ -202,7 +117,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
>                 } else if (key_len == AES_KEYSIZE_256) {
>                         if (i >= 6)
>                                 break;
> -                       rko[4] = aes_sub(rko[3]) ^ rki[4];
> +                       rko[4] = __aes_ce_sub(rko[3]) ^ rki[4];
>                         rko[5] = rko[4] ^ rki[5];
>                         rko[6] = rko[5] ^ rki[6];
>                         rko[7] = rko[6] ^ rki[7];
> @@ -221,13 +136,7 @@ int ce_aes_expandkey(struct crypto_aes_ctx *ctx, const u8 *in_key,
>
>         key_dec[0] = key_enc[j];
>         for (i = 1, j--; j > 0; i++, j--)
> -               __asm__("ld1    {v0.4s}, %[in]          ;"
> -                       "aesimc v1.16b, v0.16b          ;"
> -                       "st1    {v1.4s}, %[out] ;"
> -
> -               :       [out]   "=Q"(key_dec[i])
> -               :       [in]    "Q"(key_enc[j])
> -               :               "v0","v1");
> +               __aes_ce_invert(key_dec + i, key_enc + j);
>         key_dec[i] = key_enc[0];
>
>         kernel_neon_end();
> --
> 2.11.0
>



-- 
Thanks,
~Nick Desaulniers



More information about the linux-arm-kernel mailing list