[PATCH 1/3] arm64: Update copy_from_user()
Robin Murphy <robin.murphy@arm.com>
Wed Sep 28 04:58:51 PDT 2022
Replace the old mangled-beyond-hope copy_from_user() routine with a
shiny new one based on our newest memcpy() implementation. When plumbed
into the standalone memcpy benchmark from the Arm Optimized Routines
library, this routine outperforms the old one by up to ~1.7x for small
copies, with comparable gains across a range of microarchitectures,
levelling off once sizes get up to ~2KB and general load/store
throughput starts to dominate.

Some of this is paid for by pushing more complexity into the fixup
handlers, but much of that could be recovered again with cleverer
exception records that can give more information about the original
access to the handler itself. For now though, the label sleds are at
least entertaining.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
---
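A few notes for reviewers, not intended for the commit message:

Return-value contract: as before, a fault must leave the number of bytes
not copied in x0, which is what every fixup label below ultimately
computes. For reference, a minimal C sketch of how callers consume that
value (example_read() and its names are made up for this note, not part
of the patch):

	#include <linux/errno.h>
	#include <linux/uaccess.h>

	static long example_read(void *kbuf, const void __user *ubuf, size_t len)
	{
		/* copy_from_user() returns the number of bytes NOT copied;
		 * on arm64 it bottoms out in __arch_copy_from_user(). */
		unsigned long uncopied = copy_from_user(kbuf, ubuf, len);

		if (uncopied)
			return -EFAULT;
		return len;
	}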
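The "label sleds" in the medium-size fixups just encode how many bytes
beyond dstin are already known to have been stored when the faulting LDTR
was taken: 16 after the first STP pair, 32 after the second, 64 after the
fourth. A rough C paraphrase of fixup_m64/_m32/_m16 (illustration only,
not code proposed for the patch):

	/* done = 16, 32 or 64 depending on which sled entry was taken */
	static unsigned long fixup_medium_model(unsigned long dstin,
						unsigned long dstend,
						unsigned long done)
	{
		unsigned long dst = dstin + done;

		return dstend - dst;	/* bytes not copied */
	}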
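As in the memcpy it derives from, the small and medium paths avoid byte
loops by issuing accesses from both ends of the buffer; e.g. the
16..32-byte case loads 16 bytes from src and 16 bytes from srcend - 16,
then stores them to dstin and dstend - 16. A plain C model of just that
case, for illustration (copy_16_32_model() is made up for this note):

	#include <string.h>

	static void copy_16_32_model(char *dst, const char *src, size_t n)
	{
		char head[16], tail[16];

		memcpy(head, src, 16);			/* LDTR A_l/A_h                 */
		memcpy(tail, src + n - 16, 16);		/* LDTR D_l/D_h                 */
		memcpy(dst, head, 16);			/* STP  A_l, A_h, [dstin]       */
		memcpy(dst + n - 16, tail, 16);		/* STP  D_l, D_h, [dstend, -16] */
	}

The two destination accesses overlap when n < 32, which is harmless
because the overlapping bytes are written with the same data.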
arch/arm64/lib/copy_from_user.S | 274 ++++++++++++++++++++++++++------
1 file changed, 223 insertions(+), 51 deletions(-)
diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
index 34e317907524..a4b9bd73a5a8 100644
--- a/arch/arm64/lib/copy_from_user.S
+++ b/arch/arm64/lib/copy_from_user.S
@@ -1,73 +1,245 @@
/* SPDX-License-Identifier: GPL-2.0-only */
/*
- * Copyright (C) 2012 ARM Ltd.
+ * Copyright (c) 2012-2022, Arm Limited.
*/
#include <linux/linkage.h>
#include <asm/asm-uaccess.h>
#include <asm/assembler.h>
-#include <asm/cache.h>
-/*
- * Copy from user space to a kernel buffer (alignment handled by the hardware)
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, unaligned accesses.
*
- * Parameters:
- * x0 - to
- * x1 - from
- * x2 - n
- * Returns:
- * x0 - bytes not copied
*/
- .macro ldrb1 reg, ptr, val
- user_ldst 9998f, ldtrb, \reg, \ptr, \val
- .endm
+#define L(label) .L ## label
- .macro strb1 reg, ptr, val
- strb \reg, [\ptr], \val
- .endm
+#define dstin x0
+#define src x1
+#define count x2
+#define dst x3
+#define srcend x4
+#define dstend x5
+#define A_l x6
+#define A_lw w6
+#define A_h x7
+#define B_l x8
+#define B_lw w8
+#define B_h x9
+#define C_l x10
+#define C_lw w10
+#define C_h x11
+#define D_l x12
+#define D_h x13
+#define E_l x14
+#define E_h x15
+#define F_l x16
+#define F_h x17
+#define tmp1 x14
- .macro ldrh1 reg, ptr, val
- user_ldst 9997f, ldtrh, \reg, \ptr, \val
- .endm
+/*
+ * Derived from memcpy with various adjustments:
+ *
+ * - memmove parts are removed since user and kernel pointers won't overlap.
+ * - The main loop is scaled down to 48 bytes per iteration since the increase
+ * in load ops changes the balance; little cores barely notice the difference,
+ * so big cores can benefit from keeping the loop relatively short.
+ * - Similarly, preferring source rather than destination alignment works out
+ * better on average.
+ * - The 33-128 byte cases are reworked to better balance the stores with the
+ * doubled-up load ops, and keep a more consistent access pattern.
+ * - The 0-3 byte sequence is replaced with the one borrowed from clear_user,
+ * since LDTRB lacks a register-offset addressing mode.
+ */
- .macro strh1 reg, ptr, val
- strh \reg, [\ptr], \val
- .endm
+#define U_pre(x...) USER(L(fixup_pre), x)
+#define U_dst(x...) USER(L(fixup_dst), x)
+#define U_S1(x...) USER(L(fixup_s1), x)
+#define U_M16(x...) USER(L(fixup_m16), x)
+#define U_M32(x...) USER(L(fixup_m32), x)
+#define U_M64(x...) USER(L(fixup_m64), x)
+#define U_L32(x...) USER(L(fixup_l32), x)
+#define U_L48(x...) USER(L(fixup_l48), x)
+#define U_L64(x...) USER(L(fixup_l64), x)
- .macro ldr1 reg, ptr, val
- user_ldst 9997f, ldtr, \reg, \ptr, \val
- .endm
-
- .macro str1 reg, ptr, val
- str \reg, [\ptr], \val
- .endm
-
- .macro ldp1 reg1, reg2, ptr, val
- user_ldp 9997f, \reg1, \reg2, \ptr, \val
- .endm
-
- .macro stp1 reg1, reg2, ptr, val
- stp \reg1, \reg2, [\ptr], \val
- .endm
-
-end .req x5
-srcin .req x15
SYM_FUNC_START(__arch_copy_from_user)
- add end, x0, x2
- mov srcin, x1
-#include "copy_template.S"
- mov x0, #0 // Nothing to copy
+ add srcend, src, count
+ add dstend, dstin, count
+ cmp count, 128
+ b.hi L(copy_long)
+ cmp count, 32
+ b.hi L(copy32_128)
+
+ /* Small copies: 0..32 bytes. */
+ cmp count, 16
+ b.lo L(copy16)
+U_pre( ldtr A_l, [src])
+U_pre( ldtr A_h, [src, 8])
+U_pre( ldtr D_l, [srcend, -16])
+U_pre( ldtr D_h, [srcend, -8])
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ mov x0, #0
ret
- // Exception fixups
-9997: cmp dst, dstin
- b.ne 9998f
- // Before being absolutely sure we couldn't copy anything, try harder
-USER(9998f, ldtrb tmp1w, [srcin])
- strb tmp1w, [dst], #1
-9998: sub x0, end, dst // bytes not copied
+ /* Copy 8-15 bytes. */
+L(copy16):
+ tbz count, 3, L(copy8)
+U_pre( ldtr A_l, [src])
+U_pre( ldtr A_h, [srcend, -8])
+ str A_l, [dstin]
+ str A_h, [dstend, -8]
+ mov x0, #0
ret
+
+ .p2align 3
+ /* Copy 4-7 bytes. */
+L(copy8):
+ tbz count, 2, L(copy4)
+U_pre( ldtr A_lw, [src])
+U_pre( ldtr B_lw, [srcend, -4])
+ str A_lw, [dstin]
+ str B_lw, [dstend, -4]
+ mov x0, #0
+ ret
+
+ /* Copy 0..3 bytes. */
+L(copy4):
+ tbz count, #1, L(copy1)
+U_pre( ldtrh A_lw, [src])
+ strh A_lw, [dstin]
+L(copy1):
+ tbz count, #0, L(copy0)
+U_S1( ldtrb A_lw, [srcend, -1])
+ strb A_lw, [dstend, -1]
+L(copy0):
+ mov x0, #0
+ ret
+
+ .p2align 4
+ /* Medium copies: 33..128 bytes. */
+L(copy32_128):
+U_pre( ldtr A_l, [src])
+U_pre( ldtr A_h, [src, 8])
+U_pre( ldtr B_l, [src, 16])
+U_pre( ldtr B_h, [src, 24])
+ stp A_l, A_h, [dstin]
+ stp B_l, B_h, [dstin, 16]
+U_M32( ldtr C_l, [srcend, -32])
+U_M32( ldtr C_h, [srcend, -24])
+U_M32( ldtr D_l, [srcend, -16])
+U_M32( ldtr D_h, [srcend, -8])
+ cmp count, 64
+ b.ls L(copy64)
+U_M32( ldtr E_l, [src, 32])
+U_M32( ldtr E_h, [src, 40])
+U_M32( ldtr F_l, [src, 48])
+U_M32( ldtr F_h, [src, 56])
+ stp E_l, E_h, [dstin, 32]
+ stp F_l, F_h, [dstin, 48]
+U_M64( ldtr A_l, [srcend, -64])
+U_M64( ldtr A_h, [srcend, -56])
+U_M64( ldtr B_l, [srcend, -48])
+U_M64( ldtr B_h, [srcend, -40])
+ stp A_l, A_h, [dstend, -64]
+ stp B_l, B_h, [dstend, -48]
+L(copy64):
+ stp C_l, C_h, [dstend, -32]
+ stp D_l, D_h, [dstend, -16]
+ mov x0, #0
+ ret
+
+ .p2align 4
+ /* Copy more than 128 bytes. */
+L(copy_long):
+ /* Copy 16 bytes and then align src to 16-byte alignment. */
+
+U_pre( ldtr D_l, [src])
+U_pre( ldtr D_h, [src, 8])
+ and tmp1, src, 15
+ bic src, src, 15
+ sub dst, dstin, tmp1
+ add count, count, tmp1 /* Count is now 16 too large. */
+U_pre( ldtr A_l, [src, 16])
+U_pre( ldtr A_h, [src, 24])
+ stp D_l, D_h, [dstin]
+U_M16( ldtr B_l, [src, 32])
+U_M16( ldtr B_h, [src, 40])
+U_M16( ldtr C_l, [src, 48])
+U_M16( ldtr C_h, [src, 56])
+ add src, src, #48
+ subs count, count, 96 + 16 /* Test and readjust count. */
+ b.ls L(copy48_from_end)
+
+L(loop48):
+ stp A_l, A_h, [dst, 16]
+U_L32( ldtr A_l, [src, 16])
+U_L32( ldtr A_h, [src, 24])
+ stp B_l, B_h, [dst, 32]
+U_L48( ldtr B_l, [src, 32])
+U_L48( ldtr B_h, [src, 40])
+ stp C_l, C_h, [dst, 48]!
+U_dst( ldtr C_l, [src, 48])
+U_dst( ldtr C_h, [src, 56])
+ add src, src, #48
+ subs count, count, 48
+ b.hi L(loop48)
+
+ /* Write the last iteration and copy 48 bytes from the end. */
+L(copy48_from_end):
+ stp A_l, A_h, [dst, 16]
+U_L32( ldtr A_l, [srcend, -48])
+U_L32( ldtr A_h, [srcend, -40])
+ stp B_l, B_h, [dst, 32]
+U_L48( ldtr B_l, [srcend, -32])
+U_L48( ldtr B_h, [srcend, -24])
+ stp C_l, C_h, [dst, 48]
+U_L64( ldtr C_l, [srcend, -16])
+U_L64( ldtr C_h, [srcend, -8])
+ stp A_l, A_h, [dstend, -48]
+ stp B_l, B_h, [dstend, -32]
+ stp C_l, C_h, [dstend, -16]
+ mov x0, #0
+ ret
+
+ /* Fixups... */
+
+ /*
+ * Fault before anything has been written, but progress may have
+ * been possible; realign dst and retry a single byte to confirm.
+ */
+L(fixup_pre):
+ mov dst, dstin
+U_dst( ldtrb A_lw, [src])
+ strb A_lw, [dst], #1
+L(fixup_dst):
+ sub x0, dstend, dst
+ ret
+
+ /* Small: Fault with 1 byte remaining, regardless of count */
+L(fixup_s1):
+ mov x0, #1
+ ret
+
+ /* Medium: Faults after n bytes beyond dstin have been written */
+L(fixup_m64):
+ add dstin, dstin, #32
+L(fixup_m32):
+ add dstin, dstin, #16
+L(fixup_m16):
+ add dst, dstin, #16
+ b L(fixup_dst)
+
+ /* Large: Faults after n bytes beyond dst have been written */
+L(fixup_l64):
+ add dst, dst, #16
+L(fixup_l48):
+ add dst, dst, #16
+L(fixup_l32):
+ add dst, dst, #32
+ b L(fixup_dst)
+
SYM_FUNC_END(__arch_copy_from_user)
EXPORT_SYMBOL(__arch_copy_from_user)
--
2.36.1.dirty