[PATCH v3 4/5] arm64: crypto: Remove conditional yield logic
Mark Rutland
mark.rutland at arm.com
Mon Nov 27 05:46:25 PST 2023
On Mon, Nov 27, 2023 at 01:23:04PM +0100, Ard Biesheuvel wrote:
> From: Ard Biesheuvel <ardb at kernel.org>
>
> Some classes of crypto algorithms (such as skciphers or aeads) have
> natural yield points, but SIMD-based shashes yield the NEON unit
> manually to avoid causing scheduling blackouts when operating on large
> inputs.
>
> This is no longer necessary now that kernel mode NEON runs with
> preemption enabled, so remove this logic from the crypto assembler code,
> along with the macro that implements the TIF_NEED_RESCHED check.
>
> Signed-off-by: Ard Biesheuvel <ardb at kernel.org>
I definitely want to get rid of all of these voluntary preemption points, but
IIUC we need to keep them for the moment for PREEMPT_NONE and PREEMPT_VOLUNTARY
(and consequently for PREEMPT_DYNAMIC). Once the preemption rework lands, they
should no longer be necessary and can be removed:
https://lore.kernel.org/lkml/20231107215742.363031-1-ankur.a.arora@oracle.com/
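
To make that concrete, here's a rough sketch of the two glue-code shapes in
question, modelled on the sha2-ce-glue.c hunk quoted below (simplified, not a
drop-in for either version):

/*
 * Old shape: the asm routine honours cond_yield and returns the number
 * of unprocessed blocks, so the C caller keeps re-entering the NEON
 * section and pending work gets a chance to run between chunks.
 */
static void sha256_ce_transform_old(struct sha256_ce_state *sctx,
                                    const u8 *src, int blocks)
{
        while (blocks) {
                int rem;

                kernel_neon_begin();
                rem = __sha256_ce_transform(sctx, src, blocks);
                kernel_neon_end();      /* softirqs/preemption may run here */
                src += (blocks - rem) * SHA256_BLOCK_SIZE;
                blocks = rem;
        }
}

/*
 * New shape (this patch): the whole input is handled in one NEON
 * section, which is only OK once kernel mode NEON itself runs with
 * preemption enabled.
 */
static void sha256_ce_transform_new(struct sha256_ce_state *sctx,
                                    const u8 *src, int blocks)
{
        kernel_neon_begin();
        __sha256_ce_transform(sctx, src, blocks);
        kernel_neon_end();
}

Under PREEMPT_NONE/PREEMPT_VOLUNTARY the old loop is what bounds the time
spent inside a single kernel_neon_begin()/kernel_neon_end() section on large
inputs, which is why I'd like to keep it until the preemption rework is in.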
Thanks,
Mark.
> ---
> arch/arm64/crypto/aes-glue.c | 21 +++++---------
> arch/arm64/crypto/aes-modes.S | 2 --
> arch/arm64/crypto/sha1-ce-core.S | 6 ++--
> arch/arm64/crypto/sha1-ce-glue.c | 19 ++++---------
> arch/arm64/crypto/sha2-ce-core.S | 6 ++--
> arch/arm64/crypto/sha2-ce-glue.c | 19 ++++---------
> arch/arm64/crypto/sha3-ce-core.S | 6 ++--
> arch/arm64/crypto/sha3-ce-glue.c | 14 ++++------
> arch/arm64/crypto/sha512-ce-core.S | 8 ++----
> arch/arm64/crypto/sha512-ce-glue.c | 16 ++++-------
> arch/arm64/include/asm/assembler.h | 29 --------------------
> arch/arm64/kernel/asm-offsets.c | 4 ---
> 12 files changed, 38 insertions(+), 112 deletions(-)
>
> diff --git a/arch/arm64/crypto/aes-glue.c b/arch/arm64/crypto/aes-glue.c
> index 162787c7aa86..c42c903b7d60 100644
> --- a/arch/arm64/crypto/aes-glue.c
> +++ b/arch/arm64/crypto/aes-glue.c
> @@ -109,9 +109,9 @@ asmlinkage void aes_essiv_cbc_decrypt(u8 out[], u8 const in[], u32 const rk1[],
> int rounds, int blocks, u8 iv[],
> u32 const rk2[]);
>
> -asmlinkage int aes_mac_update(u8 const in[], u32 const rk[], int rounds,
> - int blocks, u8 dg[], int enc_before,
> - int enc_after);
> +asmlinkage void aes_mac_update(u8 const in[], u32 const rk[], int rounds,
> + int blocks, u8 dg[], int enc_before,
> + int enc_after);
>
> struct crypto_aes_xts_ctx {
> struct crypto_aes_ctx key1;
> @@ -880,17 +880,10 @@ static void mac_do_update(struct crypto_aes_ctx *ctx, u8 const in[], int blocks,
> int rounds = 6 + ctx->key_length / 4;
>
> if (crypto_simd_usable()) {
> - int rem;
> -
> - do {
> - kernel_neon_begin();
> - rem = aes_mac_update(in, ctx->key_enc, rounds, blocks,
> - dg, enc_before, enc_after);
> - kernel_neon_end();
> - in += (blocks - rem) * AES_BLOCK_SIZE;
> - blocks = rem;
> - enc_before = 0;
> - } while (blocks);
> + kernel_neon_begin();
> + aes_mac_update(in, ctx->key_enc, rounds, blocks, dg,
> + enc_before, enc_after);
> + kernel_neon_end();
> } else {
> if (enc_before)
> aes_encrypt(ctx, dg, dg);
> diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
> index 0e834a2c062c..4d68853d0caf 100644
> --- a/arch/arm64/crypto/aes-modes.S
> +++ b/arch/arm64/crypto/aes-modes.S
> @@ -842,7 +842,6 @@ AES_FUNC_START(aes_mac_update)
> cbz w5, .Lmacout
> encrypt_block v0, w2, x1, x7, w8
> st1 {v0.16b}, [x4] /* return dg */
> - cond_yield .Lmacout, x7, x8
> b .Lmacloop4x
> .Lmac1x:
> add w3, w3, #4
> @@ -861,6 +860,5 @@ AES_FUNC_START(aes_mac_update)
>
> .Lmacout:
> st1 {v0.16b}, [x4] /* return dg */
> - mov w0, w3
> ret
> AES_FUNC_END(aes_mac_update)
> diff --git a/arch/arm64/crypto/sha1-ce-core.S b/arch/arm64/crypto/sha1-ce-core.S
> index 9b1f2d82a6fe..9e37bc09c3a5 100644
> --- a/arch/arm64/crypto/sha1-ce-core.S
> +++ b/arch/arm64/crypto/sha1-ce-core.S
> @@ -62,8 +62,8 @@
> .endm
>
> /*
> - * int __sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
> - * int blocks)
> + * void __sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
> + * int blocks)
> */
> SYM_FUNC_START(__sha1_ce_transform)
> /* load round constants */
> @@ -121,7 +121,6 @@ CPU_LE( rev32 v11.16b, v11.16b )
> add dgav.4s, dgav.4s, dg0v.4s
>
> cbz w2, 2f
> - cond_yield 3f, x5, x6
> b 0b
>
> /*
> @@ -145,6 +144,5 @@ CPU_LE( rev32 v11.16b, v11.16b )
> /* store new state */
> 3: st1 {dgav.4s}, [x0]
> str dgb, [x0, #16]
> - mov w0, w2
> ret
> SYM_FUNC_END(__sha1_ce_transform)
> diff --git a/arch/arm64/crypto/sha1-ce-glue.c b/arch/arm64/crypto/sha1-ce-glue.c
> index 1dd93e1fcb39..c1c5c5cb104b 100644
> --- a/arch/arm64/crypto/sha1-ce-glue.c
> +++ b/arch/arm64/crypto/sha1-ce-glue.c
> @@ -29,23 +29,16 @@ struct sha1_ce_state {
> extern const u32 sha1_ce_offsetof_count;
> extern const u32 sha1_ce_offsetof_finalize;
>
> -asmlinkage int __sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
> - int blocks);
> +asmlinkage void __sha1_ce_transform(struct sha1_ce_state *sst, u8 const *src,
> + int blocks);
>
> static void sha1_ce_transform(struct sha1_state *sst, u8 const *src,
> int blocks)
> {
> - while (blocks) {
> - int rem;
> -
> - kernel_neon_begin();
> - rem = __sha1_ce_transform(container_of(sst,
> - struct sha1_ce_state,
> - sst), src, blocks);
> - kernel_neon_end();
> - src += (blocks - rem) * SHA1_BLOCK_SIZE;
> - blocks = rem;
> - }
> + kernel_neon_begin();
> + __sha1_ce_transform(container_of(sst, struct sha1_ce_state, sst), src,
> + blocks);
> + kernel_neon_end();
> }
>
> const u32 sha1_ce_offsetof_count = offsetof(struct sha1_ce_state, sst.count);
> diff --git a/arch/arm64/crypto/sha2-ce-core.S b/arch/arm64/crypto/sha2-ce-core.S
> index fce84d88ddb2..112d772b29db 100644
> --- a/arch/arm64/crypto/sha2-ce-core.S
> +++ b/arch/arm64/crypto/sha2-ce-core.S
> @@ -71,8 +71,8 @@
> .word 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2
>
> /*
> - * int __sha256_ce_transform(struct sha256_ce_state *sst, u8 const *src,
> - * int blocks)
> + * void __sha256_ce_transform(struct sha256_ce_state *sst, u8 const *src,
> + * int blocks)
> */
> .text
> SYM_FUNC_START(__sha256_ce_transform)
> @@ -129,7 +129,6 @@ CPU_LE( rev32 v19.16b, v19.16b )
>
> /* handled all input blocks? */
> cbz w2, 2f
> - cond_yield 3f, x5, x6
> b 0b
>
> /*
> @@ -152,6 +151,5 @@ CPU_LE( rev32 v19.16b, v19.16b )
>
> /* store new state */
> 3: st1 {dgav.4s, dgbv.4s}, [x0]
> - mov w0, w2
> ret
> SYM_FUNC_END(__sha256_ce_transform)
> diff --git a/arch/arm64/crypto/sha2-ce-glue.c b/arch/arm64/crypto/sha2-ce-glue.c
> index 0a44d2e7ee1f..f785a66a1de4 100644
> --- a/arch/arm64/crypto/sha2-ce-glue.c
> +++ b/arch/arm64/crypto/sha2-ce-glue.c
> @@ -30,23 +30,16 @@ struct sha256_ce_state {
> extern const u32 sha256_ce_offsetof_count;
> extern const u32 sha256_ce_offsetof_finalize;
>
> -asmlinkage int __sha256_ce_transform(struct sha256_ce_state *sst, u8 const *src,
> - int blocks);
> +asmlinkage void __sha256_ce_transform(struct sha256_ce_state *sst, u8 const *src,
> + int blocks);
>
> static void sha256_ce_transform(struct sha256_state *sst, u8 const *src,
> int blocks)
> {
> - while (blocks) {
> - int rem;
> -
> - kernel_neon_begin();
> - rem = __sha256_ce_transform(container_of(sst,
> - struct sha256_ce_state,
> - sst), src, blocks);
> - kernel_neon_end();
> - src += (blocks - rem) * SHA256_BLOCK_SIZE;
> - blocks = rem;
> - }
> + kernel_neon_begin();
> + __sha256_ce_transform(container_of(sst, struct sha256_ce_state, sst),
> + src, blocks);
> + kernel_neon_end();
> }
>
> const u32 sha256_ce_offsetof_count = offsetof(struct sha256_ce_state,
> diff --git a/arch/arm64/crypto/sha3-ce-core.S b/arch/arm64/crypto/sha3-ce-core.S
> index 9c77313f5a60..db64831ad35d 100644
> --- a/arch/arm64/crypto/sha3-ce-core.S
> +++ b/arch/arm64/crypto/sha3-ce-core.S
> @@ -37,7 +37,7 @@
> .endm
>
> /*
> - * int sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size)
> + * void sha3_ce_transform(u64 *st, const u8 *data, int blocks, int dg_size)
> */
> .text
> SYM_FUNC_START(sha3_ce_transform)
> @@ -184,18 +184,16 @@ SYM_FUNC_START(sha3_ce_transform)
> eor v0.16b, v0.16b, v31.16b
>
> cbnz w8, 3b
> - cond_yield 4f, x8, x9
> cbnz w2, 0b
>
> /* save state */
> -4: st1 { v0.1d- v3.1d}, [x0], #32
> + st1 { v0.1d- v3.1d}, [x0], #32
> st1 { v4.1d- v7.1d}, [x0], #32
> st1 { v8.1d-v11.1d}, [x0], #32
> st1 {v12.1d-v15.1d}, [x0], #32
> st1 {v16.1d-v19.1d}, [x0], #32
> st1 {v20.1d-v23.1d}, [x0], #32
> st1 {v24.1d}, [x0]
> - mov w0, w2
> ret
> SYM_FUNC_END(sha3_ce_transform)
>
> diff --git a/arch/arm64/crypto/sha3-ce-glue.c b/arch/arm64/crypto/sha3-ce-glue.c
> index 250e1377c481..d689cd2bf4cf 100644
> --- a/arch/arm64/crypto/sha3-ce-glue.c
> +++ b/arch/arm64/crypto/sha3-ce-glue.c
> @@ -28,8 +28,8 @@ MODULE_ALIAS_CRYPTO("sha3-256");
> MODULE_ALIAS_CRYPTO("sha3-384");
> MODULE_ALIAS_CRYPTO("sha3-512");
>
> -asmlinkage int sha3_ce_transform(u64 *st, const u8 *data, int blocks,
> - int md_len);
> +asmlinkage void sha3_ce_transform(u64 *st, const u8 *data, int blocks,
> + int md_len);
>
> static int sha3_update(struct shash_desc *desc, const u8 *data,
> unsigned int len)
> @@ -59,15 +59,11 @@ static int sha3_update(struct shash_desc *desc, const u8 *data,
> blocks = len / sctx->rsiz;
> len %= sctx->rsiz;
>
> - while (blocks) {
> - int rem;
> -
> + if (blocks) {
> kernel_neon_begin();
> - rem = sha3_ce_transform(sctx->st, data, blocks,
> - digest_size);
> + sha3_ce_transform(sctx->st, data, blocks, digest_size);
> kernel_neon_end();
> - data += (blocks - rem) * sctx->rsiz;
> - blocks = rem;
> + data += blocks * sctx->rsiz;
> }
> }
>
> diff --git a/arch/arm64/crypto/sha512-ce-core.S b/arch/arm64/crypto/sha512-ce-core.S
> index 91ef68b15fcc..96acc9295230 100644
> --- a/arch/arm64/crypto/sha512-ce-core.S
> +++ b/arch/arm64/crypto/sha512-ce-core.S
> @@ -102,8 +102,8 @@
> .endm
>
> /*
> - * int __sha512_ce_transform(struct sha512_state *sst, u8 const *src,
> - * int blocks)
> + * void __sha512_ce_transform(struct sha512_state *sst, u8 const *src,
> + * int blocks)
> */
> .text
> SYM_FUNC_START(__sha512_ce_transform)
> @@ -195,12 +195,10 @@ CPU_LE( rev64 v19.16b, v19.16b )
> add v10.2d, v10.2d, v2.2d
> add v11.2d, v11.2d, v3.2d
>
> - cond_yield 3f, x4, x5
> /* handled all input blocks? */
> cbnz w2, 0b
>
> /* store new state */
> -3: st1 {v8.2d-v11.2d}, [x0]
> - mov w0, w2
> + st1 {v8.2d-v11.2d}, [x0]
> ret
> SYM_FUNC_END(__sha512_ce_transform)
> diff --git a/arch/arm64/crypto/sha512-ce-glue.c b/arch/arm64/crypto/sha512-ce-glue.c
> index f3431fc62315..70eef74fe031 100644
> --- a/arch/arm64/crypto/sha512-ce-glue.c
> +++ b/arch/arm64/crypto/sha512-ce-glue.c
> @@ -26,23 +26,17 @@ MODULE_LICENSE("GPL v2");
> MODULE_ALIAS_CRYPTO("sha384");
> MODULE_ALIAS_CRYPTO("sha512");
>
> -asmlinkage int __sha512_ce_transform(struct sha512_state *sst, u8 const *src,
> - int blocks);
> +asmlinkage void __sha512_ce_transform(struct sha512_state *sst, u8 const *src,
> + int blocks);
>
> asmlinkage void sha512_block_data_order(u64 *digest, u8 const *src, int blocks);
>
> static void sha512_ce_transform(struct sha512_state *sst, u8 const *src,
> int blocks)
> {
> - while (blocks) {
> - int rem;
> -
> - kernel_neon_begin();
> - rem = __sha512_ce_transform(sst, src, blocks);
> - kernel_neon_end();
> - src += (blocks - rem) * SHA512_BLOCK_SIZE;
> - blocks = rem;
> - }
> + kernel_neon_begin();
> + __sha512_ce_transform(sst, src, blocks);
> + kernel_neon_end();
> }
>
> static void sha512_arm64_transform(struct sha512_state *sst, u8 const *src,
> diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
> index 376a980f2bad..f0da53a0388f 100644
> --- a/arch/arm64/include/asm/assembler.h
> +++ b/arch/arm64/include/asm/assembler.h
> @@ -759,35 +759,6 @@ alternative_endif
> set_sctlr sctlr_el2, \reg
> .endm
>
> - /*
> - * Check whether preempt/bh-disabled asm code should yield as soon as
> - * it is able. This is the case if we are currently running in task
> - * context, and either a softirq is pending, or the TIF_NEED_RESCHED
> - * flag is set and re-enabling preemption a single time would result in
> - * a preempt count of zero. (Note that the TIF_NEED_RESCHED flag is
> - * stored negated in the top word of the thread_info::preempt_count
> - * field)
> - */
> - .macro cond_yield, lbl:req, tmp:req, tmp2:req
> - get_current_task \tmp
> - ldr \tmp, [\tmp, #TSK_TI_PREEMPT]
> - /*
> - * If we are serving a softirq, there is no point in yielding: the
> - * softirq will not be preempted no matter what we do, so we should
> - * run to completion as quickly as we can.
> - */
> - tbnz \tmp, #SOFTIRQ_SHIFT, .Lnoyield_\@
> -#ifdef CONFIG_PREEMPTION
> - sub \tmp, \tmp, #PREEMPT_DISABLE_OFFSET
> - cbz \tmp, \lbl
> -#endif
> - adr_l \tmp, irq_stat + IRQ_CPUSTAT_SOFTIRQ_PENDING
> - get_this_cpu_offset \tmp2
> - ldr w\tmp, [\tmp, \tmp2]
> - cbnz w\tmp, \lbl // yield on pending softirq in task context
> -.Lnoyield_\@:
> - .endm
> -
> /*
> * Branch Target Identifier (BTI)
> */
> diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c
> index 5ff1942b04fc..fb9e9ef9b527 100644
> --- a/arch/arm64/kernel/asm-offsets.c
> +++ b/arch/arm64/kernel/asm-offsets.c
> @@ -116,10 +116,6 @@ int main(void)
> DEFINE(DMA_TO_DEVICE, DMA_TO_DEVICE);
> DEFINE(DMA_FROM_DEVICE, DMA_FROM_DEVICE);
> BLANK();
> - DEFINE(PREEMPT_DISABLE_OFFSET, PREEMPT_DISABLE_OFFSET);
> - DEFINE(SOFTIRQ_SHIFT, SOFTIRQ_SHIFT);
> - DEFINE(IRQ_CPUSTAT_SOFTIRQ_PENDING, offsetof(irq_cpustat_t, __softirq_pending));
> - BLANK();
> DEFINE(CPU_BOOT_TASK, offsetof(struct secondary_data, task));
> BLANK();
> DEFINE(FTR_OVR_VAL_OFFSET, offsetof(struct arm64_ftr_override, val));
> --
> 2.43.0.rc1.413.gea7ed67945-goog
>