[PATCH 1/4] arm64: optimized copy_to_user assembly code
Feng Kan
fkan at apm.com
Thu Aug 15 14:29:57 EDT 2013
With the optimized copy_to_user code, iperf performance improved by 20%.
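For context, the caller-visible contract is unchanged: __copy_to_user
(normally reached through the copy_to_user() wrapper) still returns the
number of bytes it could not copy, with 0 meaning success. As a minimal
sketch of a typical caller (a hypothetical character-device read
handler, not part of this patch), assuming only <linux/fs.h> and
<linux/uaccess.h>:

    #include <linux/fs.h>       /* struct file, loff_t */
    #include <linux/uaccess.h>  /* copy_to_user() */

    static ssize_t demo_read(struct file *file, char __user *buf,
                             size_t len, loff_t *ppos)
    {
            /* hypothetical kernel-side data to hand out */
            static const char msg[] = "hello from the kernel\n";
            size_t avail = sizeof(msg);

            if (*ppos >= avail)
                    return 0;
            if (len > avail - *ppos)
                    len = avail - *ppos;

            /* copy_to_user() returns the number of bytes NOT copied */
            if (copy_to_user(buf, msg + *ppos, len))
                    return -EFAULT;

            *ppos += len;
            return len;
    }

Every read()-style path that moves bulk data out of the kernel funnels
through this routine, which is why a streaming network benchmark such
as iperf is sensitive to its throughput.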
---
arch/arm64/lib/copy_to_user.S | 249 +++++++++++++++++++++++++++++++++--------
1 files changed, 204 insertions(+), 45 deletions(-)
diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
index a0aeeb9..2a07b13 100644
--- a/arch/arm64/lib/copy_to_user.S
+++ b/arch/arm64/lib/copy_to_user.S
@@ -1,61 +1,220 @@
/*
- * Copyright (C) 2012 ARM Ltd.
+ * Copyright (c) 2013, Applied Micro Circuits Corporation
+ * Copyright (c) 2012-2013, Linaro Limited
*
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
+ * Author: Feng Kan <fkan at apm.com>
+ * Author: Philipp Tomsich <philipp.tomsich at theobroma-systems.com>
*
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
+ * The code is adapted from the memcpy routine by Linaro Limited.
*
- * You should have received a copy of the GNU General Public License
- * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ * This file is dual licensed: you can use it either under the terms of
+ * the GPL, or the BSD license, at your option.
+ *
+ * a) This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of the
+ * License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this library; if not, write to the Free
+ * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
+ * MA 02110-1301 USA
+ *
+ * Alternatively,
+ *
+ * b) Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND
+ * CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
+ * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <linux/linkage.h>
#include <asm/assembler.h>
-/*
- * Copy to user space from a kernel buffer (alignment handled by the hardware)
- *
- * Parameters:
- * x0 - to
- * x1 - from
- * x2 - n
- * Returns:
- * x0 - bytes not copied
- */
-ENTRY(__copy_to_user)
- add x4, x0, x2 // upper user buffer boundary
- subs x2, x2, #8
- b.mi 2f
+#define dstin x0
+#define src x1
+#define count x2
+#define tmp1 x3
+#define tmp1w w3
+#define tmp2 x4
+#define tmp2w w4
+#define tmp3 x5
+#define tmp3w w5
+#define dst x6
+
+#define A_l x7
+#define A_h x8
+#define B_l x9
+#define B_h x10
+#define C_l x11
+#define C_h x12
+#define D_l x13
+#define D_h x14
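+
+/*
+ * A_l/A_h through D_l/D_h carry four 16-byte chunks in flight, so
+ * the main loop below can move 64 bytes per iteration.
+ */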
+
+ /*
+ * Prototype: size_t __copy_to_user (void *dst, const void *src, size_t n)
+ * Returns: x0 - number of bytes not copied (0 on success)
+ */
+ .text
+ .align 2
+ .p2align 6,,63
+ENTRY(__copy_to_user)
+ mov dst, dstin
+ cmp count, #64
+ b.ge .Lcpy_not_short
+ cmp count, #15
+ b.le .Ltail15tiny
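+ /* Copies of 16..63 bytes fall through to the .Ltail63 exit block
+ * below; 64 bytes and up were sent to .Lcpy_not_short, 15 bytes
+ * and under to .Ltail15tiny. */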
+
+ /* Deal with small copies quickly by dropping straight into the
+ * exit block. */
+.Ltail63:
+ /* Copy up to 48 bytes of data. At this point we only need the
+ * bottom 6 bits of count to be accurate. */
+ ands tmp1, count, #0x30
+ b.eq .Ltail15
+ add dst, dst, tmp1
+ add src, src, tmp1
+ cmp tmp1w, #0x20
+ b.eq 1f
+ b.lt 2f
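+ /* dst/src have already been advanced past the tmp1 (16/32/48)
+ * bytes to copy, so entering the stp chain at offset -48/-32/-16
+ * stores exactly tmp1 bytes. */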
+ ldp A_l, A_h, [src, #-48]
+USER (9f, stp A_l, A_h, [dst, #-48])
+1:
+ ldp A_l, A_h, [src, #-32]
+USER (9f, stp A_l, A_h, [dst, #-32])
+2:
+ ldp A_l, A_h, [src, #-16]
+USER (9f, stp A_l, A_h, [dst, #-16])
+
+.Ltail15:
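+ /* At most 15 bytes remain. Finish with one unaligned 16-byte
+ * copy ending exactly at the buffer end; it may re-store a few
+ * bytes already written, which is harmless since the source data
+ * is unchanged. */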
+ ands count, count, #15
+ b.eq 1f
+ add src, src, count
+ ldp A_l, A_h, [src, #-16]
+ add dst, dst, count
+USER (9f, stp A_l, A_h, [dst, #-16])
+1:
+ b .Lsuccess
+
+.Ltail15tiny:
+ /* Copy up to 15 bytes of data. Unlike .Ltail15 above, this path
+ * never reads or writes beyond the requested length. */
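+ /* Each set bit of count[3:0] triggers one move of 8, 4, 2 and
+ * then 1 byte, covering every length from 0 to 15 without a
+ * loop. */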
+ tbz count, #3, 1f
+ ldr tmp1, [src], #8
+USER (9f, str tmp1, [dst], #8)
+1:
+ tbz count, #2, 1f
+ ldr tmp1w, [src], #4
+USER (9f, str tmp1w, [dst], #4)
1:
- ldr x3, [x1], #8
- subs x2, x2, #8
-USER(9f, str x3, [x0], #8 )
- b.pl 1b
-2: adds x2, x2, #4
- b.mi 3f
- ldr w3, [x1], #4
- sub x2, x2, #4
-USER(9f, str w3, [x0], #4 )
-3: adds x2, x2, #2
- b.mi 4f
- ldrh w3, [x1], #2
- sub x2, x2, #2
-USER(9f, strh w3, [x0], #2 )
-4: adds x2, x2, #1
- b.mi 5f
- ldrb w3, [x1]
-USER(9f, strb w3, [x0] )
-5: mov x0, #0
+ tbz count, #1, 1f
+ ldrh tmp1w, [src], #2
+USER (9f, strh tmp1w, [dst], #2)
+1:
+ tbz count, #0, 1f
+ ldrb tmp1w, [src]
+USER (9f, strb tmp1w, [dst])
+1:
+ b .Lsuccess
+
+.Lcpy_not_short:
+ /* We don't much care about the alignment of DST, but we want SRC
+ * to be 128-bit (16 byte) aligned so that the loads never cross a
+ * cache-line boundary (the stores to DST still may). */
+ neg tmp2, src
+ ands tmp2, tmp2, #15 /* Bytes to reach alignment. */
+ b.eq 2f
+ sub count, count, tmp2
+ /* Copy more data than needed; it's faster than jumping
+ * around copying sub-Quadword quantities. We know that
+ * it can't overrun. */
+ ldp A_l, A_h, [src]
+ add src, src, tmp2
+USER (9f, stp A_l, A_h, [dst])
+ add dst, dst, tmp2
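+ /* dst/src advanced by only tmp2 (at most 15), so the next
+ * aligned stores overlap the tail of the 16 bytes just
+ * written. */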
+ /* There may be less than 63 bytes to go now. */
+ cmp count, #63
+ b.le .Ltail63
+2:
+ subs count, count, #128
+ b.ge .Lcpy_body_large
+ /* Less than 128 bytes to copy, so handle 64 here and then jump
+ * to the tail. */
+ ldp A_l, A_h, [src]
+ ldp B_l, B_h, [src, #16]
+ ldp C_l, C_h, [src, #32]
+ ldp D_l, D_h, [src, #48]
+USER (9f, stp A_l, A_h, [dst])
+USER (9f, stp B_l, B_h, [dst, #16])
+USER (9f, stp C_l, C_h, [dst, #32])
+USER (9f, stp D_l, D_h, [dst, #48])
+ tst count, #0x3f
+ add src, src, #64
+ add dst, dst, #64
+ b.ne .Ltail63
+ b .Lsuccess
+
+ /* Critical loop. Start at a new cache line boundary. Assuming
+ * 64 bytes per line this ensures the entire loop is in one line. */
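+ /* The loop is software-pipelined: the store of each 16-byte
+ * chunk is followed immediately by the load of its replacement,
+ * overlapping load latency with the stores. */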
+ .p2align 6
+.Lcpy_body_large:
+ /* There are at least 128 bytes to copy. */
+ ldp A_l, A_h, [src, #0]
+ sub dst, dst, #16 /* Pre-bias. */
+ ldp B_l, B_h, [src, #16]
+ ldp C_l, C_h, [src, #32]
+ ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */
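+ /* Both pointers are now biased down 16 relative to the data
+ * consumed, so the loop body can use fixed #16..#64 offsets and
+ * the writeback at #64 advances each pointer by 64 per pass. */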
+1:
+USER (9f, stp A_l, A_h, [dst, #16])
+ ldp A_l, A_h, [src, #16]
+USER (9f, stp B_l, B_h, [dst, #32])
+ ldp B_l, B_h, [src, #32]
+USER (9f, stp C_l, C_h, [dst, #48])
+ ldp C_l, C_h, [src, #48]
+USER (9f, stp D_l, D_h, [dst, #64]!)
+ ldp D_l, D_h, [src, #64]!
+ subs count, count, #64
+ b.ge 1b
+USER (9f, stp A_l, A_h, [dst, #16])
+USER (9f, stp B_l, B_h, [dst, #32])
+USER (9f, stp C_l, C_h, [dst, #48])
+USER (9f, stp D_l, D_h, [dst, #64])
+ add src, src, #16
+ add dst, dst, #64 + 16
+ tst count, #0x3f
+ b.ne .Ltail63
+.Lsuccess:
+ mov x0, #0 // nothing left to copy
ret
ENDPROC(__copy_to_user)
.section .fixup,"ax"
- .align 2
-9: sub x0, x4, x0 // bytes not copied
+ .align 2
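+ /* Each USER(9f, ...) store above gets an exception-table entry
+ * pointing at this label, so a faulting user access resumes here
+ * instead of oopsing the kernel. */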
+9: mov x0, count // approximates the number of bytes not copied
ret
.previous
--
1.7.6.1