[PATCH 4/5] xor/arm64: Use shared NEON intrinsics implementation from 32-bit ARM
Ard Biesheuvel
ardb+git at google.com
Fri Mar 27 04:30:52 PDT 2026
From: Ard Biesheuvel <ardb at kernel.org>
Tweak the arm64 code so that the pure NEON intrinsics implementation of
XOR is shared between arm64 and ARM.
Signed-off-by: Ard Biesheuvel <ardb at kernel.org>
---
lib/raid/xor/arm64/xor-neon.c | 170 +-------------------
lib/raid/xor/arm64/xor-neon.h | 3 +
lib/raid/xor/arm64/xor_arch.h | 4 +-
3 files changed, 5 insertions(+), 172 deletions(-)
diff --git a/lib/raid/xor/arm64/xor-neon.c b/lib/raid/xor/arm64/xor-neon.c
index 97ef3cb92496..43fa5236fd41 100644
--- a/lib/raid/xor/arm64/xor-neon.c
+++ b/lib/raid/xor/arm64/xor-neon.c
@@ -1,179 +1,11 @@
// SPDX-License-Identifier: GPL-2.0-only
-/*
- * Authors: Jackie Liu <liuyun01 at kylinos.cn>
- * Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
- */
#include <linux/cache.h>
#include <asm/neon-intrinsics.h>
#include "xor_impl.h"
-#include "xor_arch.h"
#include "xor-neon.h"
-static void __xor_neon_2(unsigned long bytes, unsigned long * __restrict p1,
- const unsigned long * __restrict p2)
-{
- uint64_t *dp1 = (uint64_t *)p1;
- uint64_t *dp2 = (uint64_t *)p2;
-
- register uint64x2_t v0, v1, v2, v3;
- long lines = bytes / (sizeof(uint64x2_t) * 4);
-
- do {
- /* p1 ^= p2 */
- v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
- v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
- v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
- v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));
-
- /* store */
- vst1q_u64(dp1 + 0, v0);
- vst1q_u64(dp1 + 2, v1);
- vst1q_u64(dp1 + 4, v2);
- vst1q_u64(dp1 + 6, v3);
-
- dp1 += 8;
- dp2 += 8;
- } while (--lines > 0);
-}
-
-static void __xor_neon_3(unsigned long bytes, unsigned long * __restrict p1,
- const unsigned long * __restrict p2,
- const unsigned long * __restrict p3)
-{
- uint64_t *dp1 = (uint64_t *)p1;
- uint64_t *dp2 = (uint64_t *)p2;
- uint64_t *dp3 = (uint64_t *)p3;
-
- register uint64x2_t v0, v1, v2, v3;
- long lines = bytes / (sizeof(uint64x2_t) * 4);
-
- do {
- /* p1 ^= p2 */
- v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
- v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
- v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
- v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));
-
- /* p1 ^= p3 */
- v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
- v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
- v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
- v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));
-
- /* store */
- vst1q_u64(dp1 + 0, v0);
- vst1q_u64(dp1 + 2, v1);
- vst1q_u64(dp1 + 4, v2);
- vst1q_u64(dp1 + 6, v3);
-
- dp1 += 8;
- dp2 += 8;
- dp3 += 8;
- } while (--lines > 0);
-}
-
-static void __xor_neon_4(unsigned long bytes, unsigned long * __restrict p1,
- const unsigned long * __restrict p2,
- const unsigned long * __restrict p3,
- const unsigned long * __restrict p4)
-{
- uint64_t *dp1 = (uint64_t *)p1;
- uint64_t *dp2 = (uint64_t *)p2;
- uint64_t *dp3 = (uint64_t *)p3;
- uint64_t *dp4 = (uint64_t *)p4;
-
- register uint64x2_t v0, v1, v2, v3;
- long lines = bytes / (sizeof(uint64x2_t) * 4);
-
- do {
- /* p1 ^= p2 */
- v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
- v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
- v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
- v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));
-
- /* p1 ^= p3 */
- v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
- v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
- v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
- v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));
-
- /* p1 ^= p4 */
- v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
- v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
- v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
- v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
-
- /* store */
- vst1q_u64(dp1 + 0, v0);
- vst1q_u64(dp1 + 2, v1);
- vst1q_u64(dp1 + 4, v2);
- vst1q_u64(dp1 + 6, v3);
-
- dp1 += 8;
- dp2 += 8;
- dp3 += 8;
- dp4 += 8;
- } while (--lines > 0);
-}
-
-static void __xor_neon_5(unsigned long bytes, unsigned long * __restrict p1,
- const unsigned long * __restrict p2,
- const unsigned long * __restrict p3,
- const unsigned long * __restrict p4,
- const unsigned long * __restrict p5)
-{
- uint64_t *dp1 = (uint64_t *)p1;
- uint64_t *dp2 = (uint64_t *)p2;
- uint64_t *dp3 = (uint64_t *)p3;
- uint64_t *dp4 = (uint64_t *)p4;
- uint64_t *dp5 = (uint64_t *)p5;
-
- register uint64x2_t v0, v1, v2, v3;
- long lines = bytes / (sizeof(uint64x2_t) * 4);
-
- do {
- /* p1 ^= p2 */
- v0 = veorq_u64(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0));
- v1 = veorq_u64(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2));
- v2 = veorq_u64(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4));
- v3 = veorq_u64(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6));
-
- /* p1 ^= p3 */
- v0 = veorq_u64(v0, vld1q_u64(dp3 + 0));
- v1 = veorq_u64(v1, vld1q_u64(dp3 + 2));
- v2 = veorq_u64(v2, vld1q_u64(dp3 + 4));
- v3 = veorq_u64(v3, vld1q_u64(dp3 + 6));
-
- /* p1 ^= p4 */
- v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
- v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
- v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
- v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
-
- /* p1 ^= p5 */
- v0 = veorq_u64(v0, vld1q_u64(dp5 + 0));
- v1 = veorq_u64(v1, vld1q_u64(dp5 + 2));
- v2 = veorq_u64(v2, vld1q_u64(dp5 + 4));
- v3 = veorq_u64(v3, vld1q_u64(dp5 + 6));
-
- /* store */
- vst1q_u64(dp1 + 0, v0);
- vst1q_u64(dp1 + 2, v1);
- vst1q_u64(dp1 + 4, v2);
- vst1q_u64(dp1 + 6, v3);
-
- dp1 += 8;
- dp2 += 8;
- dp3 += 8;
- dp4 += 8;
- dp5 += 8;
- } while (--lines > 0);
-}
-
-__DO_XOR_BLOCKS(neon_inner, __xor_neon_2, __xor_neon_3, __xor_neon_4,
- __xor_neon_5);
+#include "../arm/xor-neon.c"
static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
{
diff --git a/lib/raid/xor/arm64/xor-neon.h b/lib/raid/xor/arm64/xor-neon.h
index 514699ba8f5f..d49e7a7f0e14 100644
--- a/lib/raid/xor/arm64/xor-neon.h
+++ b/lib/raid/xor/arm64/xor-neon.h
@@ -1,5 +1,8 @@
/* SPDX-License-Identifier: GPL-2.0-only */
+extern struct xor_block_template xor_block_neon;
+extern struct xor_block_template xor_block_eor3;
+
void xor_gen_neon_inner(void *dest, void **srcs, unsigned int src_cnt,
unsigned int bytes);
void xor_gen_eor3_inner(void *dest, void **srcs, unsigned int src_cnt,
diff --git a/lib/raid/xor/arm64/xor_arch.h b/lib/raid/xor/arm64/xor_arch.h
index 5dbb40319501..7c9d16324c33 100644
--- a/lib/raid/xor/arm64/xor_arch.h
+++ b/lib/raid/xor/arm64/xor_arch.h
@@ -4,9 +4,7 @@
* Copyright (C) 2018,Tianjin KYLIN Information Technology Co., Ltd.
*/
#include <asm/simd.h>
-
-extern struct xor_block_template xor_block_neon;
-extern struct xor_block_template xor_block_eor3;
+#include "xor-neon.h"
static __always_inline void __init arch_xor_init(void)
{
--
2.53.0.1018.g2bb0e51243-goog
More information about the linux-arm-kernel
mailing list