[PATCH 3/6] arm64: lib: Implement optimized memset routine
zhichang.yuan at linaro.org
Wed Dec 11 01:24:39 EST 2013
From: "zhichang.yuan" <zhichang.yuan at linaro.org>
This patch, based on Linaro's Cortex Strings library, improves
the performance of the assembly-optimized memset() routine.
Signed-off-by: Zhichang Yuan <zhichang.yuan at linaro.org>
Signed-off-by: Deepak Saxena <dsaxena at linaro.org>
---
arch/arm64/lib/memset.S | 227 +++++++++++++++++++++++++++++++++++++++++------
1 file changed, 201 insertions(+), 26 deletions(-)
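
Review note: the new routine first broadcasts the fill byte across a
64-bit register, stores with stp in 16- and 64-byte chunks, and uses
DC ZVA when zeroing large buffers. The broadcast step is roughly the
following C (an illustrative sketch only; the helper name is ours and
mirrors the A_l/A_lw register aliases in the patch):

	#include <stdint.h>

	/* Replicate the low byte of val into all eight byte lanes. */
	static uint64_t broadcast_byte(uint32_t val)
	{
		uint64_t fill = val & 0xff;

		fill |= fill << 8;	/* 1 byte  -> 2 bytes */
		fill |= fill << 16;	/* 2 bytes -> 4 bytes */
		fill |= fill << 32;	/* 4 bytes -> 8 bytes */
		return fill;
	}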
diff --git a/arch/arm64/lib/memset.S b/arch/arm64/lib/memset.S
index 87e4a68..90b973e 100644
--- a/arch/arm64/lib/memset.S
+++ b/arch/arm64/lib/memset.S
@@ -1,13 +1,21 @@
/*
* Copyright (C) 2013 ARM Ltd.
+ * Copyright (C) 2013 Linaro.
+ *
+ * This code is based on glibc cortex strings work originally authored by Linaro
+ * and re-licensed under GPLv2 for the Linux kernel. The original code can
+ * be found @
+ *
+ * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
+ * files/head:/src/aarch64/
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * This program is distributed "as is" WITHOUT ANY WARRANTY of any
+ * kind, whether express or implied; without even the implied warranty
+ * of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
@@ -18,7 +26,7 @@
#include <asm/assembler.h>
/*
- * Fill in the buffer with character c (alignment handled by the hardware)
+ * Fill in the buffer with character c
*
* Parameters:
* x0 - buf
@@ -27,27 +35,194 @@
* Returns:
* x0 - buf
*/
+
+/*
+ * By default we assume that the DC ZVA instruction can be used to zero
+ * data blocks more efficiently. In some circumstances this might be
+ * unsafe, for example in an asymmetric multiprocessor environment with
+ * different DC ZVA clear lengths (neither the larger nor the smaller
+ * length is safe to use). The feature can be disabled by defining
+ * DONT_USE_DC.
+ */
+
+#define dstin x0
+#define val w1
+#define count x2
+#define tmp1 x3
+#define tmp1w w3
+#define tmp2 x4
+#define tmp2w w4
+#define zva_len_x x5
+#define zva_len w5
+#define zva_bits_x x6
+
+#define A_l x7
+#define A_lw w7
+#define dst x8
+#define tmp3w w9
+#define tmp3 x9
+
ENTRY(memset)
- mov x4, x0
- and w1, w1, #0xff
- orr w1, w1, w1, lsl #8
- orr w1, w1, w1, lsl #16
- orr x1, x1, x1, lsl #32
- subs x2, x2, #8
- b.mi 2f
-1: str x1, [x4], #8
- subs x2, x2, #8
- b.pl 1b
-2: adds x2, x2, #4
- b.mi 3f
- sub x2, x2, #4
- str w1, [x4], #4
-3: adds x2, x2, #2
- b.mi 4f
- sub x2, x2, #2
- strh w1, [x4], #2
-4: adds x2, x2, #1
- b.mi 5f
- strb w1, [x4]
-5: ret
+ mov dst, dstin /* Preserve return value. */
+ and A_lw, val, #255
+ orr A_lw, A_lw, A_lw, lsl #8
+ orr A_lw, A_lw, A_lw, lsl #16
+ orr A_l, A_l, A_l, lsl #32
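+ /* A_l now holds the fill byte replicated into all eight lanes. */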
+
+ /* First align dst to 16: tmp2 = (-dst) & 15 bytes to the boundary. */
+ neg tmp2, dst
+ ands tmp2, tmp2, #15
+ b.eq .Laligned
+
+ cmp count, #15
+ b.le .Ltail15tiny
+ /*
+ * count is not less than 16, so we can use stp to store 16 bytes at
+ * once. This is more efficient even though the access is unaligned.
+ */
+ stp A_l, A_l, [dst]
+ /* Make dst 16-byte aligned; the overlap is stored again below. */
+ sub count, count, tmp2
+ add dst, dst, tmp2
+
+ /* dst is now 16-byte aligned. */
+.Laligned:
+#ifndef DONT_USE_DC
+ cbz A_l, .Lzero_mem
+#endif
+
+.Ltail_maybe_long:
+ cmp count, #64
+ b.ge .Lnot_short
+.Ltail63:
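+ /* Store up to three 16-byte chunks, according to count & 0x30. */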
+ ands tmp1, count, #0x30
+ b.eq .Ltail15tiny
+ cmp tmp1w, #0x20
+ b.eq 1f
+ b.lt 2f
+ stp A_l, A_l, [dst], #16
+1:
+ stp A_l, A_l, [dst], #16
+2:
+ stp A_l, A_l, [dst], #16
+/*
+ * The following store is unaligned, but it is more efficient than
+ * falling through to .Ltail15tiny. This block could be removed, at a
+ * small performance cost.
+ */
+ ands count, count, #15
+ cbz count, 1f
+ add dst, dst, count
+ stp A_l, A_l, [dst, #-16] /* Repeat some/all of last store. */
+1:
+ ret
+
+.Ltail15tiny:
+ /* Set up to 15 bytes. Does not assume earlier memory
+ being set. */
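+ /*
+ * Descending power-of-two tail; roughly, in C:
+ * if (count & 8) { *(u64 *)dst = A_l; dst += 8; }
+ * if (count & 4) { *(u32 *)dst = A_lw; dst += 4; }
+ * if (count & 2) { *(u16 *)dst = A_lw; dst += 2; }
+ * if (count & 1) { *(u8 *)dst = A_lw; }
+ */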
+ tbz count, #3, 1f
+ str A_l, [dst], #8
+1:
+ tbz count, #2, 1f
+ str A_lw, [dst], #4
+1:
+ tbz count, #1, 1f
+ strh A_lw, [dst], #2
+1:
+ tbz count, #0, 1f
+ strb A_lw, [dst]
+1:
+ ret
+
+ /*
+ * Critical loop. Start at a new cache line boundary. Assuming
+ * 64 bytes per line, this ensures the entire loop is in one line.
+ */
+ .p2align 6
+.Lnot_short: /* count is at least 64 here. */
+ sub dst, dst, #16 /* Pre-bias. */
+ sub count, count, #64
+1:
+ stp A_l, A_l, [dst, #16]
+ stp A_l, A_l, [dst, #32]
+ stp A_l, A_l, [dst, #48]
+ stp A_l, A_l, [dst, #64]!
+ subs count, count, #64
+ b.ge 1b
+ tst count, #0x3f /* 1-63 tail bytes left? */
+ add dst, dst, #16 /* Undo the pre-bias. */
+ b.ne .Ltail63
+.Lexitfunc:
+ ret
+
+#ifndef DONT_USE_DC
+ /*
+ * For zeroing memory, check to see if we can use the ZVA feature to
+ * zero entire 'cache' lines.
+ */
+.Lzero_mem:
+ cmp count, #63
+ b.le .Ltail63
+ /*
+ * For zeroing small amounts of memory, it's not worth setting up
+ * the line-clear code.
+ */
+ cmp count, #128
+ b.lt .Lnot_short /* count is at least 128 bytes from here on. */
+
+ mrs tmp1, dczid_el0
+ tbnz tmp1, #4, .Lnot_short
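+ /*
+ * dczid_el0 bit 4 (DZP, tested above) prohibits DC ZVA; bits [3:0]
+ * hold log2 of the block size in words, so the block size in bytes
+ * is 4 << BS, computed below.
+ */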
+ mov tmp3w, #4
+ and zva_len, tmp1w, #15 /* Safety: other bits reserved. */
+ lsl zva_len, tmp3w, zva_len
+
+ /*
+ * zva_len is a power of two, so (zva_len & 63) != 0 means it is less
+ * than 64 bytes; ZVA is not worthwhile for such small block sizes.
+ */
+ ands tmp3w, zva_len, #63
+ b.ne .Lnot_short
+.Lzero_by_line:
+ /*
+ * Compute how far we need to go to become suitably aligned. We're
+ * already at quad-word alignment.
+ */
+ cmp count, zva_len_x
+ b.lt .Lnot_short /* Not enough to reach alignment. */
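+ /* zva_bits_x = zva_len - 1: offset mask within one ZVA block. */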
+ sub zva_bits_x, zva_len_x, #1
+ neg tmp2, dst
+ ands tmp2, tmp2, zva_bits_x
+ b.eq 1f /* Already aligned. */
+ /* Not aligned; check that there is enough left after alignment. */
+ sub tmp1, count, tmp2
+ /*
+ * Guarantee that the length left for ZVA is at least 64 bytes and at
+ * least one full block, so the loop at 2f cannot run past the end of
+ * the buffer. The ccmp forces an lt result (NZCV=0b1000) when
+ * tmp1 < 64.
+ */
+ cmp tmp1, #64
+ ccmp tmp1, zva_len_x, #8, ge /* NZCV=0b1000 */
+ b.lt .Lnot_short
+ /*
+ * We know that there are at least 64 bytes to zero and that it is
+ * safe to overrun by 64 bytes.
+ */
+ mov count, tmp1
+2:
+ stp A_l, A_l, [dst]
+ stp A_l, A_l, [dst, #16]
+ stp A_l, A_l, [dst, #32]
+ subs tmp2, tmp2, #64
+ stp A_l, A_l, [dst, #48]
+ add dst, dst, #64
+ b.ge 2b
+ /* We've overrun a bit, so adjust dst downwards.*/
+ add dst, dst, tmp2
+1:
+ sub count, count, zva_len_x /* Pre-bias count by one ZVA block. */
+3:
+ dc zva, dst
+ add dst, dst, zva_len_x
+ subs count, count, zva_len_x
+ b.ge 3b
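+ /* count &= zva_len - 1: bytes remaining after the last full block. */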
+ ands count, count, zva_bits_x
+ b.ne .Ltail_maybe_long
+ ret
+#endif /* DONT_USE_DC */
ENDPROC(memset)
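
For reference, the dczid_el0 probe above corresponds to roughly the
following C (a sketch with our own naming, not kernel code; it returns
the ZVA block size in bytes, or 0 when DC ZVA must not be used):

	#include <stdint.h>

	static uint64_t zva_block_bytes(void)
	{
		uint64_t dczid;

		asm volatile("mrs %0, dczid_el0" : "=r" (dczid));
		if (dczid & (1UL << 4))		/* DZP: ZVA prohibited */
			return 0;
		/* Bits [3:0]: log2 of the block size in 4-byte words. */
		return 4UL << (dczid & 0xf);
	}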
--
1.7.9.5