[PATCH 6/6] crypto: arm/crct10dif - Implement plain NEON variant
Ard Biesheuvel
ardb+git at google.com
Mon Oct 28 12:02:14 PDT 2024
From: Ard Biesheuvel <ardb at kernel.org>
The CRC-T10DIF algorithm produces a 16-bit CRC, and this is reflected in
the folding coefficients, which are also only 16 bits wide.
This means that the polynomial multiplications involving these
coefficients can be performed using 8-bit long polynomial multiplication
(8x8 -> 16) in only a few steps, and this is an instruction that is part
of the base NEON ISA, which is all most real ARMv7 cores implement. (The
64-bit PMULL instruction is part of the crypto extensions, which are
only implemented by 64-bit cores)
The final reduction is a bit more involved, but we can delegate that to
the generic CRC-T10DIF implementation after folding the entire input
into a 16 byte vector.
This results in a speedup of around 6.6x on Cortex-A72 running in 32-bit
mode.
Signed-off-by: Ard Biesheuvel <ardb at kernel.org>
---
arch/arm/crypto/crct10dif-ce-core.S | 50 ++++++++++++++++++--
arch/arm/crypto/crct10dif-ce-glue.c | 44 +++++++++++++++--
2 files changed, 85 insertions(+), 9 deletions(-)
diff --git a/arch/arm/crypto/crct10dif-ce-core.S b/arch/arm/crypto/crct10dif-ce-core.S
index 6b72167574b2..5e103a9a42dd 100644
--- a/arch/arm/crypto/crct10dif-ce-core.S
+++ b/arch/arm/crypto/crct10dif-ce-core.S
@@ -112,6 +112,34 @@
FOLD_CONST_L .req q10l
FOLD_CONST_H .req q10h
+__pmull16x64_p8:
+ vmull.p8 q13, d23, d24
+ vmull.p8 q14, d23, d25
+ vmull.p8 q15, d22, d24
+ vmull.p8 q12, d22, d25
+
+ veor q14, q14, q15
+ veor d24, d24, d25
+ veor d26, d26, d27
+ veor d28, d28, d29
+ vmov.i32 d25, #0
+ vmov.i32 d29, #0
+ vext.8 q12, q12, q12, #14
+ vext.8 q14, q14, q14, #15
+ veor d24, d24, d26
+ bx lr
+ENDPROC(__pmull16x64_p8)
+
+ .macro pmull16x64_p8, v16, v64
+ vext.8 q11, \v64, \v64, #1
+ vld1.64 {q12}, [r4, :128]
+ vuzp.8 q11, \v64
+ vtbl.8 d24, {\v16\()_L-\v16\()_H}, d24
+ vtbl.8 d25, {\v16\()_L-\v16\()_H}, d25
+ bl __pmull16x64_p8
+ veor \v64, q12, q14
+ .endm
+
.macro pmull16x64_p64, v16, v64
vmull.p64 q11, \v64\()l, \v16\()_L
vmull.p64 \v64, \v64\()h, \v16\()_H
@@ -249,9 +277,9 @@ CPU_LE( vrev64.8 q0, q0 )
vswp q0l, q0h
// q1 = high order part of second chunk: q7 left-shifted by 'len' bytes.
- mov_l r3, .Lbyteshift_table + 16
- sub r3, r3, len
- vld1.8 {q2}, [r3]
+ mov_l r1, .Lbyteshift_table + 16
+ sub r1, r1, len
+ vld1.8 {q2}, [r1]
vtbl.8 q1l, {q7l-q7h}, q2l
vtbl.8 q1h, {q7l-q7h}, q2h
@@ -341,9 +369,20 @@ ENTRY(crc_t10dif_pmull64)
vmov.u16 r0, q0l[0]
bx lr
-
ENDPROC(crc_t10dif_pmull64)
+ENTRY(crc_t10dif_pmull8)
+ push {r4, lr}
+ mov_l r4, .L16x64perm
+
+ crct10dif p8
+
+CPU_LE( vrev64.8 q7, q7 )
+ vswp q7l, q7h
+ vst1.64 {q7}, [r3, :128]
+ pop {r4, pc}
+ENDPROC(crc_t10dif_pmull8)
+
.section ".rodata", "a"
.align 4
@@ -376,3 +415,6 @@ ENDPROC(crc_t10dif_pmull64)
.byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f
.byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
.byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0
+
+.L16x64perm:
+ .quad 0x808080800000000, 0x909090901010101
diff --git a/arch/arm/crypto/crct10dif-ce-glue.c b/arch/arm/crypto/crct10dif-ce-glue.c
index 60aa79c2fcdb..4431e4ce2dbe 100644
--- a/arch/arm/crypto/crct10dif-ce-glue.c
+++ b/arch/arm/crypto/crct10dif-ce-glue.c
@@ -20,6 +20,7 @@
#define CRC_T10DIF_PMULL_CHUNK_SIZE 16U
asmlinkage u16 crc_t10dif_pmull64(u16 init_crc, const u8 *buf, size_t len);
+asmlinkage void crc_t10dif_pmull8(u16 init_crc, const u8 *buf, size_t len, u8 *out);
static int crct10dif_init(struct shash_desc *desc)
{
@@ -45,6 +46,27 @@ static int crct10dif_update_ce(struct shash_desc *desc, const u8 *data,
return 0;
}
+static int crct10dif_update_neon(struct shash_desc *desc, const u8 *data,
+ unsigned int length)
+{
+ u16 *crcp = shash_desc_ctx(desc);
+ u8 buf[16] __aligned(16);
+ u16 crc = *crcp;
+
+ if (length >= CRC_T10DIF_PMULL_CHUNK_SIZE && crypto_simd_usable()) {
+ kernel_neon_begin();
+ crc_t10dif_pmull8(crc, data, length, buf);
+ kernel_neon_end();
+
+ crc = 0;
+ data = buf;
+ length = sizeof(buf);
+ }
+
+ *crcp = crc_t10dif_generic(crc, data, length);
+ return 0;
+}
+
static int crct10dif_final(struct shash_desc *desc, u8 *out)
{
u16 *crc = shash_desc_ctx(desc);
@@ -53,7 +75,19 @@ static int crct10dif_final(struct shash_desc *desc, u8 *out)
return 0;
}
-static struct shash_alg crc_t10dif_alg = {
+static struct shash_alg algs[] = {{
+ .digestsize = CRC_T10DIF_DIGEST_SIZE,
+ .init = crct10dif_init,
+ .update = crct10dif_update_neon,
+ .final = crct10dif_final,
+ .descsize = CRC_T10DIF_DIGEST_SIZE,
+
+ .base.cra_name = "crct10dif",
+ .base.cra_driver_name = "crct10dif-arm-neon",
+ .base.cra_priority = 150,
+ .base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
+ .base.cra_module = THIS_MODULE,
+}, {
.digestsize = CRC_T10DIF_DIGEST_SIZE,
.init = crct10dif_init,
.update = crct10dif_update_ce,
@@ -65,19 +99,19 @@ static struct shash_alg crc_t10dif_alg = {
.base.cra_priority = 200,
.base.cra_blocksize = CRC_T10DIF_BLOCK_SIZE,
.base.cra_module = THIS_MODULE,
-};
+}};
static int __init crc_t10dif_mod_init(void)
{
- if (!(elf_hwcap2 & HWCAP2_PMULL))
+ if (!(elf_hwcap & HWCAP_NEON))
return -ENODEV;
- return crypto_register_shash(&crc_t10dif_alg);
+ return crypto_register_shashes(algs, 1 + !!(elf_hwcap2 & HWCAP2_PMULL));
}
static void __exit crc_t10dif_mod_exit(void)
{
- crypto_unregister_shash(&crc_t10dif_alg);
+ crypto_unregister_shashes(algs, 1 + !!(elf_hwcap2 & HWCAP2_PMULL));
}
module_init(crc_t10dif_mod_init);
--
2.47.0.163.g1226f6d8fa-goog
More information about the linux-arm-kernel
mailing list