[PATCHv3 10/10] x86/crypto: add pclmul acceleration for crc64
Keith Busch
kbusch at kernel.org
Tue Feb 22 08:31:44 PST 2022
The crc64 table lookup method is inefficient, using a significant number
of CPU cycles in the block stack per IO. If available on x86, use a
PCLMULQDQ implementation to accelerate the calculation.
The assembly in this patch was mostly generated by gcc from a C program
that uses the x86 intrinsics library functions, and measures roughly 20x
faster than the table lookup.
Signed-off-by: Keith Busch <kbusch@kernel.org>
---
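Notes on the generated code: the C source gcc compiled is not included in
this patch, but the assembly follows the usual PCLMULQDQ CRC folding
scheme: the buffer is consumed sixteen bytes per iteration with two
carry-less multiplies against the folding constants in .LC0, misaligned
head and tail bytes are handled through the shuffleMasks table, and the
final 128-bit remainder is reduced to 64 bits with the .LC1 constants.
Per the x86-64 calling convention, crc_rocksoft_pcl() receives the running
CRC in %rdi, the buffer pointer in %rsi and the byte count in %rdx. A
rough intrinsics sketch of the per-block folding step follows; fold_block,
fold_buffer and foldk are illustrative names only and are not taken from
the original program:

/*
 * Sketch of the fold performed by the .L7 loop in the assembly below.
 * foldk stands in for the .LC0 constant pair and is NOT the value used
 * by this patch; head/tail handling and the final 64-bit reduction are
 * omitted.  Builds with -mpclmul.
 */
#include <immintrin.h>
#include <stddef.h>
#include <stdint.h>

static inline __m128i fold_block(__m128i acc, __m128i data, __m128i foldk)
{
        /* mix the next 16 input bytes into the 128-bit remainder */
        acc = _mm_xor_si128(acc, data);
        /* fold it back down: (low 64 x k_lo) xor (high 64 x k_hi) */
        return _mm_xor_si128(_mm_clmulepi64_si128(acc, foldk, 0x00),
                             _mm_clmulepi64_si128(acc, foldk, 0x11));
}

/* hypothetical driver loop over a 16-byte multiple of input */
static __m128i fold_buffer(__m128i acc, const uint8_t *buf, size_t len,
                           __m128i foldk)
{
        size_t i;

        for (i = 0; i + 16 <= len; i += 16)
                acc = fold_block(acc,
                                 _mm_loadu_si128((const __m128i *)(buf + i)),
                                 foldk);
        return acc; /* still needs reducing to a 64-bit CRC */
}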
arch/x86/crypto/Makefile | 3 +
arch/x86/crypto/crc64-rocksoft-pcl-asm.S | 215 ++++++++++++++++++++++
arch/x86/crypto/crc64-rocksoft-pcl_glue.c | 117 ++++++++++++
crypto/Kconfig | 11 ++
4 files changed, 346 insertions(+)
create mode 100644 arch/x86/crypto/crc64-rocksoft-pcl-asm.S
create mode 100644 arch/x86/crypto/crc64-rocksoft-pcl_glue.c
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index c3af959648e6..036520c59f0e 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -79,6 +79,9 @@ crc32-pclmul-y := crc32-pclmul_asm.o crc32-pclmul_glue.o
obj-$(CONFIG_CRYPTO_CRCT10DIF_PCLMUL) += crct10dif-pclmul.o
crct10dif-pclmul-y := crct10dif-pcl-asm_64.o crct10dif-pclmul_glue.o
+obj-$(CONFIG_CRYPTO_CRC64_ROCKSOFT_PCLMUL) += crc64-rocksoft-pclmul.o
+crc64-rocksoft-pclmul-y := crc64-rocksoft-pcl-asm.o crc64-rocksoft-pcl_glue.o
+
obj-$(CONFIG_CRYPTO_POLY1305_X86_64) += poly1305-x86_64.o
poly1305-x86_64-y := poly1305-x86_64-cryptogams.o poly1305_glue.o
targets += poly1305-x86_64-cryptogams.S
diff --git a/arch/x86/crypto/crc64-rocksoft-pcl-asm.S b/arch/x86/crypto/crc64-rocksoft-pcl-asm.S
new file mode 100644
index 000000000000..e3b633a776a9
--- /dev/null
+++ b/arch/x86/crypto/crc64-rocksoft-pcl-asm.S
@@ -0,0 +1,215 @@
+########################################################################
+# Implement fast Rocksoft CRC-64 computation with SSE and PCLMULQDQ instructions
+#
+
+#include <linux/linkage.h>
+
+SYM_FUNC_START(crc_rocksoft_pcl)
+ leaq (%rsi,%rdx), %rcx
+ movq %rsi, %r10
+ andl $15, %esi
+ movq %rdi, %xmm3
+ leaq 15(%rcx), %rax
+ andq $-16, %r10
+ pxor %xmm1, %xmm1
+ andq $-16, %rax
+ movdqa %xmm1, %xmm5
+ movq %rax, %r8
+ subq %r10, %rax
+ subq %rcx, %r8
+ movl $16, %ecx
+ movq %rax, %r11
+ movq %rcx, %r9
+ sarq $4, %r11
+ subq %rsi, %r9
+ movdqu shuffleMasks(%r9), %xmm4
+ movdqa %xmm4, %xmm0
+ pblendvb %xmm0, (%r10), %xmm5
+ cmpq $16, %rax
+ je .L12
+ movdqa 16(%r10), %xmm2
+ cmpq $2, %r11
+ je .L13
+ pcmpeqd %xmm1, %xmm1
+ leaq -16(%rsi,%rdx), %rdi
+ leaq 16(%r10), %r9
+ pxor %xmm1, %xmm4
+ movdqa %xmm3, %xmm1
+ pshufb %xmm0, %xmm3
+ pshufb %xmm4, %xmm1
+ movdqa %xmm3, %xmm0
+ movdqa .LC0(%rip), %xmm3
+ pxor %xmm5, %xmm1
+ movdqa %xmm1, %xmm4
+ pclmulqdq $0, %xmm3, %xmm1
+ pclmulqdq $17, %xmm3, %xmm4
+ pxor %xmm4, %xmm1
+ pxor %xmm1, %xmm0
+ cmpq $31, %rdi
+ jbe .L6
+ leaq -32(%rdi), %rax
+ movq %rax, %rsi
+ andq $-16, %rax
+ leaq 32(%r10,%rax), %rcx
+ shrq $4, %rsi
+ movq %r9, %rax
+ .p2align 4,,10
+ .p2align 3
+.L7:
+ pxor %xmm2, %xmm0
+ movq %rax, %rdx
+ addq $16, %rax
+ movdqa %xmm0, %xmm1
+ pclmulqdq $0, %xmm3, %xmm0
+ movdqa 16(%rdx), %xmm2
+ pclmulqdq $17, %xmm3, %xmm1
+ pxor %xmm1, %xmm0
+ cmpq %rcx, %rax
+ jne .L7
+ movq %rsi, %rax
+ addq $1, %rsi
+ negq %rax
+ salq $4, %rsi
+ salq $4, %rax
+ addq %rsi, %r9
+ leaq -16(%rdi,%rax), %rdi
+.L6:
+ pxor %xmm2, %xmm0
+ cmpq $16, %rdi
+ je .L9
+ movl $16, %eax
+ pcmpeqd %xmm2, %xmm2
+ movdqa %xmm0, %xmm7
+ subq %r8, %rax
+ movdqu shuffleMasks(%rax), %xmm4
+ pxor %xmm4, %xmm2
+ pshufb %xmm4, %xmm0
+ movdqa 16(%r9), %xmm4
+ pshufb %xmm2, %xmm7
+ pshufb %xmm2, %xmm4
+ movdqa %xmm7, %xmm1
+ movdqa %xmm4, %xmm2
+ movdqa %xmm7, %xmm4
+ pclmulqdq $0, %xmm3, %xmm1
+ pclmulqdq $17, %xmm3, %xmm4
+ por %xmm2, %xmm0
+ pxor %xmm4, %xmm1
+ pxor %xmm1, %xmm0
+.L9:
+ movdqa %xmm0, %xmm2
+ pclmulqdq $16, %xmm3, %xmm0
+ psrldq $8, %xmm2
+ pxor %xmm2, %xmm0
+.L3:
+ movdqa .LC1(%rip), %xmm2
+ movdqa %xmm0, %xmm1
+ pclmulqdq $0, %xmm2, %xmm1
+ movdqa %xmm1, %xmm3
+ pclmulqdq $16, %xmm2, %xmm1
+ pslldq $8, %xmm3
+ pxor %xmm3, %xmm1
+ pxor %xmm1, %xmm0
+ pextrd $3, %xmm0, %eax
+ salq $32, %rax
+ movq %rax, %rdx
+ pextrd $2, %xmm0, %eax
+ orq %rdx, %rax
+ notq %rax
+ ret
+ .p2align 4,,10
+ .p2align 3
+.L13:
+ subq %r8, %rcx
+ pcmpeqd %xmm1, %xmm1
+ movdqu shuffleMasks(%rcx), %xmm7
+ movdqa %xmm7, %xmm6
+ pxor %xmm1, %xmm6
+ cmpq $7, %rdx
+ ja .L5
+ movdqa %xmm1, %xmm4
+ pshufb %xmm7, %xmm5
+ movdqa %xmm3, %xmm1
+ movdqu shuffleMasks(%rdx), %xmm8
+ pshufb %xmm6, %xmm2
+ pxor %xmm8, %xmm4
+ pxor %xmm5, %xmm2
+ pshufb %xmm8, %xmm3
+ pshufb %xmm4, %xmm1
+ movdqa %xmm3, %xmm0
+ pxor %xmm1, %xmm2
+ pslldq $8, %xmm0
+ movdqa %xmm2, %xmm3
+ pclmulqdq $16, .LC0(%rip), %xmm2
+ psrldq $8, %xmm3
+ pxor %xmm3, %xmm0
+ pxor %xmm2, %xmm0
+ jmp .L3
+ .p2align 4,,10
+ .p2align 3
+.L12:
+ movdqu shuffleMasks(%rdx), %xmm2
+ subq %r8, %rcx
+ movdqa %xmm3, %xmm6
+ pcmpeqd %xmm4, %xmm4
+ movdqa %xmm2, %xmm0
+ pshufb %xmm2, %xmm3
+ movdqu shuffleMasks(%rcx), %xmm2
+ pxor %xmm4, %xmm0
+ pslldq $8, %xmm3
+ pxor %xmm4, %xmm2
+ pshufb %xmm0, %xmm6
+ pshufb %xmm2, %xmm5
+ movdqa %xmm5, %xmm1
+ pxor %xmm6, %xmm1
+ movdqa %xmm1, %xmm0
+ pclmulqdq $16, .LC0(%rip), %xmm1
+ psrldq $8, %xmm0
+ pxor %xmm3, %xmm0
+ pxor %xmm1, %xmm0
+ jmp .L3
+ .p2align 4,,10
+ .p2align 3
+.L5:
+ pxor %xmm1, %xmm4
+ movdqa %xmm3, %xmm1
+ pshufb %xmm0, %xmm3
+ pshufb %xmm4, %xmm1
+ pxor %xmm3, %xmm2
+ movdqa .LC0(%rip), %xmm3
+ pxor %xmm5, %xmm1
+ pshufb %xmm6, %xmm2
+ movdqa %xmm1, %xmm5
+ pshufb %xmm7, %xmm1
+ pshufb %xmm6, %xmm5
+ pxor %xmm2, %xmm1
+ movdqa %xmm5, %xmm4
+ movdqa %xmm5, %xmm0
+ pclmulqdq $17, %xmm3, %xmm0
+ pclmulqdq $0, %xmm3, %xmm4
+ pxor %xmm0, %xmm4
+ pxor %xmm4, %xmm1
+ movdqa %xmm1, %xmm0
+ pclmulqdq $16, %xmm3, %xmm1
+ psrldq $8, %xmm0
+ pxor %xmm1, %xmm0
+ jmp .L3
+SYM_FUNC_END(crc_rocksoft_pcl)
+
+.section .rodata
+.align 32
+.type shuffleMasks, @object
+.size shuffleMasks, 32
+shuffleMasks:
+ .string ""
+ .ascii "\001\002\003\004\005\006\007\b\t\n\013\f\r\016\017\217\216\215"
+ .ascii "\214\213\212\211\210\207\206\205\204\203\202\201\200"
+
+.section .rodata.cst16,"aM",@progbits,16
+.align 16
+.LC0:
+ .quad -1523270018343381984
+ .quad 2443614144669557164
+ .align 16
+.LC1:
+ .quad 2876949357237608311
+ .quad 3808117099328934763
diff --git a/arch/x86/crypto/crc64-rocksoft-pcl_glue.c b/arch/x86/crypto/crc64-rocksoft-pcl_glue.c
new file mode 100644
index 000000000000..996780aa3d93
--- /dev/null
+++ b/arch/x86/crypto/crc64-rocksoft-pcl_glue.c
@@ -0,0 +1,117 @@
+#include <linux/types.h>
+#include <linux/module.h>
+#include <linux/crc64.h>
+#include <crypto/internal/hash.h>
+#include <crypto/internal/simd.h>
+#include <linux/init.h>
+#include <linux/string.h>
+#include <linux/kernel.h>
+#include <asm/cpufeatures.h>
+#include <asm/cpu_device_id.h>
+#include <asm/simd.h>
+
+asmlinkage u64 crc_rocksoft_pcl(u64 init_crc, const u8 *buf, size_t len);
+
+struct chksum_desc_ctx {
+ u64 crc;
+};
+
+static int chksum_init(struct shash_desc *desc)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ ctx->crc = 0;
+
+ return 0;
+}
+
+static int chksum_update(struct shash_desc *desc, const u8 *data,
+ unsigned int length)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ if (length >= 16 && crypto_simd_usable()) {
+ kernel_fpu_begin();
+ ctx->crc = crc_rocksoft_pcl(ctx->crc, data, length);
+ kernel_fpu_end();
+ } else
+ ctx->crc = crc64_rocksoft_generic(ctx->crc, data, length);
+ return 0;
+}
+
+static int chksum_final(struct shash_desc *desc, u8 *out)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ *(u64 *)out = ctx->crc;
+ return 0;
+}
+
+static int __chksum_finup(u64 crc, const u8 *data, unsigned int len, u8 *out)
+{
+ if (len >= 16 && crypto_simd_usable()) {
+ kernel_fpu_begin();
+ *(u64 *)out = crc_rocksoft_pcl(crc, data, len);
+ kernel_fpu_end();
+ } else
+ *(u64 *)out = crc64_rocksoft_generic(crc, data, len);
+ return 0;
+}
+
+static int chksum_finup(struct shash_desc *desc, const u8 *data,
+ unsigned int len, u8 *out)
+{
+ struct chksum_desc_ctx *ctx = shash_desc_ctx(desc);
+
+ return __chksum_finup(ctx->crc, data, len, out);
+}
+
+static int chksum_digest(struct shash_desc *desc, const u8 *data,
+ unsigned int length, u8 *out)
+{
+ return __chksum_finup(0, data, length, out);
+}
+
+static struct shash_alg alg = {
+ .digestsize = 8,
+ .init = chksum_init,
+ .update = chksum_update,
+ .final = chksum_final,
+ .finup = chksum_finup,
+ .digest = chksum_digest,
+ .descsize = sizeof(struct chksum_desc_ctx),
+ .base = {
+ .cra_name = CRC64_ROCKSOFT_STRING,
+ .cra_driver_name = "crc64-rocksoft-pclmul",
+ .cra_priority = 200,
+ .cra_blocksize = 1,
+ .cra_module = THIS_MODULE,
+ }
+};
+
+static const struct x86_cpu_id crc64_rocksoft_cpu_id[] = {
+ X86_MATCH_FEATURE(X86_FEATURE_PCLMULQDQ, NULL),
+ {}
+};
+MODULE_DEVICE_TABLE(x86cpu, crc64_rocksoft_cpu_id);
+
+static int __init crc64_rocksoft_x86_mod_init(void)
+{
+ if (!x86_match_cpu(crc64_rocksoft_cpu_id))
+ return -ENODEV;
+
+ return crypto_register_shash(&alg);
+}
+
+static void __exit crc64_rocksoft_x86_mod_fini(void)
+{
+ crypto_unregister_shash(&alg);
+}
+
+module_init(crc64_rocksoft_x86_mod_init);
+module_exit(crc64_rocksoft_x86_mod_fini);
+
+MODULE_AUTHOR("Keith Busch <kbusch@kernel.org>");
+MODULE_DESCRIPTION("Rocksoft CRC64 calculation accelerated with PCLMULQDQ.");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_CRYPTO("crc64-rocksoft-pclmul");
diff --git a/crypto/Kconfig b/crypto/Kconfig
index e343147b9f8f..d8861138f117 100644
--- a/crypto/Kconfig
+++ b/crypto/Kconfig
@@ -744,6 +744,17 @@ config CRYPTO_CRC64_ROCKSOFT
transform. This allows for faster crc64 transforms to be used
if they are available.
+config CRYPTO_CRC64_ROCKSOFT_PCLMUL
+ tristate "Rocksoft model CRC64 PCLMULQDQ hardware acceleration"
+ depends on X86 && 64BIT && CRC64
+ select CRYPTO_HASH
+ help
+ For x86_64 processors with SSE4.2 and PCLMULQDQ support, the
+ CRC64 computation can be hardware accelerated with the PCLMULQDQ
+ instruction. This option will create the 'crc64-rocksoft-pclmul'
+ module, which is faster at computing the crc64 checksum than
+ the generic table implementation.
+
config CRYPTO_VPMSUM_TESTER
tristate "Powerpc64 vpmsum hardware acceleration tester"
depends on CRYPTO_CRCT10DIF_VPMSUM && CRYPTO_CRC32C_VPMSUM
--
2.25.4
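For callers: the driver registers under the same cra_name as the generic
table implementation (CRC64_ROCKSOFT_STRING) but with cra_priority 200,
which normally wins over the generic driver, so in-kernel users going
through the crypto API pick up the accelerated version automatically once
the module is loaded. A minimal sketch of such a caller, assuming
CRC64_ROCKSOFT_STRING is defined in <linux/crc64.h> as in the rest of this
series; crc64_rocksoft_digest() here is a hypothetical helper, not
something added by this patch:

#include <crypto/hash.h>
#include <linux/crc64.h>
#include <linux/err.h>
#include <linux/types.h>

static int crc64_rocksoft_digest(const u8 *buf, unsigned int len, u64 *out)
{
        struct crypto_shash *tfm;
        int ret;

        /* resolves to the highest-priority registered crc64-rocksoft hash */
        tfm = crypto_alloc_shash(CRC64_ROCKSOFT_STRING, 0, 0);
        if (IS_ERR(tfm))
                return PTR_ERR(tfm);

        {
                SHASH_DESC_ON_STACK(desc, tfm);

                desc->tfm = tfm;
                ret = crypto_shash_digest(desc, buf, len, (u8 *)out);
        }

        crypto_free_shash(tfm);
        return ret;
}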