[PATCH V2] arm64: optimized copy_to_user and copy_from_user assembly code
Steve Capper
steve.capper at linaro.org
Thu Nov 27 03:43:24 PST 2014
On Fri, Aug 08, 2014 at 04:02:20PM -0700, Feng Kan wrote:
> Using the glibc cortex-strings work authored by Linaro as a base to
> create new copy to/from user kernel routines.
>
> Iperf performance increase:
>             -l (size)   1 core result
> Optimized   64B         44-51Mb/s
>             1500B       4.9Gb/s
>             30000B      16.2Gb/s
> Original    64B         34-50.7Mb/s
>             1500B       4.7Gb/s
>             30000B      14.5Gb/s
I ran into some horrible crashes with this patch, which had been applied
to the Ubuntu kernel I was running. Details below:
>
> Signed-off-by: Feng Kan <fkan at apm.com>
> ---
> arch/arm64/lib/copy_from_user.S | 36 +-----
> arch/arm64/lib/copy_template.S | 278 ++++++++++++++++++++++++++++++++++++++++
> arch/arm64/lib/copy_to_user.S | 31 +----
> 3 files changed, 284 insertions(+), 61 deletions(-)
> create mode 100644 arch/arm64/lib/copy_template.S
>
> diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
> index 5e27add..c4c5187 100644
> --- a/arch/arm64/lib/copy_from_user.S
> +++ b/arch/arm64/lib/copy_from_user.S
> @@ -15,7 +15,6 @@
> */
>
> #include <linux/linkage.h>
> -#include <asm/assembler.h>
>
> /*
> * Copy from user space to a kernel buffer (alignment handled by the hardware)
> @@ -28,39 +27,10 @@
> * x0 - bytes not copied
> */
> ENTRY(__copy_from_user)
> - add x4, x1, x2 // upper user buffer boundary
> - subs x2, x2, #8
> - b.mi 2f
> -1:
> -USER(9f, ldr x3, [x1], #8 )
> - subs x2, x2, #8
> - str x3, [x0], #8
> - b.pl 1b
> -2: adds x2, x2, #4
> - b.mi 3f
> -USER(9f, ldr w3, [x1], #4 )
> - sub x2, x2, #4
> - str w3, [x0], #4
> -3: adds x2, x2, #2
> - b.mi 4f
> -USER(9f, ldrh w3, [x1], #2 )
> - sub x2, x2, #2
> - strh w3, [x0], #2
> -4: adds x2, x2, #1
> - b.mi 5f
> -USER(9f, ldrb w3, [x1] )
> - strb w3, [x0]
> -5: mov x0, #0
> - ret
> +#include "copy_template.S"
> ENDPROC(__copy_from_user)
>
> .section .fixup,"ax"
> - .align 2
> -9: sub x2, x4, x1
> - mov x3, x2
> -10: strb wzr, [x0], #1 // zero remaining buffer space
> - subs x3, x3, #1
> - b.ne 10b
> - mov x0, x2 // bytes not copied
> - ret
> + .align 2
> + copy_abort_table
> .previous
> diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
> new file mode 100644
> index 0000000..f2c7003
> --- /dev/null
> +++ b/arch/arm64/lib/copy_template.S
> @@ -0,0 +1,278 @@
> +/*
> + * Copyright (c) 2013, Applied Micro Circuits Corporation
> + * Copyright (c) 2012-2013, Linaro Limited
> + *
> + * Author: Feng Kan <fkan at apm.com>
> + * Author: Philipp Tomsich <philipp.tomsich at theobroma-systems.com>
> + *
> + * The code is adopted from the memcpy routine by Linaro Limited.
> + *
> + * This file is free software: you may copy, redistribute and/or modify it
> + * under the terms of the GNU General Public License as published by the
> + * Free Software Foundation, either version 2 of the License, or (at your
> + * option) any later version.
> + *
> + * This file is distributed in the hope that it will be useful, but
> + * WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + * General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program. If not, see <http://www.gnu.org/licenses/>.
> + *
> + * This file incorporates work covered by the following copyright and
> + * permission notice:
> + *
> + * Redistribution and use in source and binary forms, with or without
> + * modification, are permitted provided that the following conditions are met:
> + * 1 Redistributions of source code must retain the above copyright
> + * notice, this list of conditions and the following disclaimer.
> + * 2 Redistributions in binary form must reproduce the above copyright
> + * notice, this list of conditions and the following disclaimer in the
> + * documentation and/or other materials provided with the distribution.
> + * 3 Neither the name of the Linaro nor the
> + * names of its contributors may be used to endorse or promote products
> + * derived from this software without specific prior written permission.
> + *
> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
> + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
> + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
> + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
> + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
> + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
> + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
> + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
> + */
> +#include <asm/assembler.h>
> +
> +dstin .req x0
> +src .req x1
> +count .req x2
> +tmp1 .req x3
> +tmp1w .req w3
> +tmp2 .req x4
> +tmp2w .req w4
> +tmp3 .req x5
> +tmp3w .req w5
> +dst .req x6
> +
> +A_l .req x7
> +A_h .req x8
> +B_l .req x9
> +B_h .req x10
> +C_l .req x11
> +C_h .req x12
> +D_l .req x13
> +D_h .req x14
> +
> + mov dst, dstin
> + cmp count, #64
> + b.ge .Lcpy_not_short
> + cmp count, #15
> + b.le .Ltail15tiny
> +
> + /*
> + * Deal with small copies quickly by dropping straight into the
> + * exit block.
> + */
> +.Ltail63:
> + /*
> + * Copy up to 48 bytes of data. At this point we only need the
> + * bottom 6 bits of count to be accurate.
> + */
> + ands tmp1, count, #0x30
> + b.eq .Ltail15
> + add dst, dst, tmp1
> + add src, src, tmp1
> + cmp tmp1w, #0x20
> + b.eq 1f
> + b.lt 2f
> + USER(8f, ldp A_l, A_h, [src, #-48])
> + USER(8f, stp A_l, A_h, [dst, #-48])
> +1:
> + USER(8f, ldp A_l, A_h, [src, #-32])
> + USER(8f, stp A_l, A_h, [dst, #-32])
> +2:
> + USER(8f, ldp A_l, A_h, [src, #-16])
> + USER(8f, stp A_l, A_h, [dst, #-16])
> +
> +.Ltail15:
> + ands count, count, #15
> + beq 1f
> + add src, src, count
> + USER(9f, ldp A_l, A_h, [src, #-16])
> + add dst, dst, count
> + USER(9f, stp A_l, A_h, [dst, #-16])
> +1:
> + b .Lsuccess
> +
> +.Ltail15tiny:
> + /*
> + * Copy up to 15 bytes of data. Does not assume additional data
> + * being copied.
> + */
> + tbz count, #3, 1f
> + USER(10f, ldr tmp1, [src], #8)
> + USER(10f, str tmp1, [dst], #8)
> +1:
> + tbz count, #2, 1f
> + USER(10f, ldr tmp1w, [src], #4)
> + USER(10f, str tmp1w, [dst], #4)
> +1:
> + tbz count, #1, 1f
> + USER(10f, ldrh tmp1w, [src], #2)
> + USER(10f, strh tmp1w, [dst], #2)
> +1:
> + tbz count, #0, 1f
> + USER(10f, ldrb tmp1w, [src])
> + USER(10f, strb tmp1w, [dst])
> +1:
> + b .Lsuccess
> +
> +.Lcpy_not_short:
> + /*
> + * We don't much care about the alignment of DST, but we want SRC
> + * to be 128-bit (16 byte) aligned so that we don't cross cache line
> + * boundaries on both loads and stores.
> + */
> + neg tmp2, src
> + ands tmp2, tmp2, #15 /* Bytes to reach alignment. */
> + b.eq 2f
> + sub count, count, tmp2
> + /*
> + * Copy more data than needed; it's faster than jumping
> + * around copying sub-Quadword quantities. We know that
> + * it can't overrun.
> + */
> + USER(11f, ldp A_l, A_h, [src])
> + add src, src, tmp2
> + USER(11f, stp A_l, A_h, [dst])
> + add dst, dst, tmp2
> + /* There may be less than 63 bytes to go now. */
> + cmp count, #63
> + b.le .Ltail63
> +2:
> + subs count, count, #128
> + b.ge .Lcpy_body_large
> + /*
> + * Less than 128 bytes to copy, so handle 64 here and then jump
> + * to the tail.
> + */
> + USER(12f, ldp A_l, A_h, [src])
> + USER(12f, ldp B_l, B_h, [src, #16])
> + USER(12f, ldp C_l, C_h, [src, #32])
> + USER(12f, ldp D_l, D_h, [src, #48])
> + USER(12f, stp A_l, A_h, [dst])
> + USER(12f, stp B_l, B_h, [dst, #16])
> + USER(12f, stp C_l, C_h, [dst, #32])
> + USER(12f, stp D_l, D_h, [dst, #48])
> + tst count, #0x3f
> + add src, src, #64
> + add dst, dst, #64
> + b.ne .Ltail63
> + b .Lsuccess
> +
> + /*
> + * Critical loop. Start at a new cache line boundary. Assuming
> + * 64 bytes per line this ensures the entire loop is in one line.
> + */
> + .p2align 6
> +.Lcpy_body_large:
> + /* There are at least 128 bytes to copy. */
> + USER(12f, ldp A_l, A_h, [src, #0])
> + sub dst, dst, #16 /* Pre-bias. */
> + USER(13f, ldp B_l, B_h, [src, #16])
> + USER(13f, ldp C_l, C_h, [src, #32])
> + USER(13f, ldp D_l, D_h, [src, #48]!) /* src += 64 - Pre-bias. */
> +1:
> + USER(13f, stp A_l, A_h, [dst, #16])
> + USER(13f, ldp A_l, A_h, [src, #16])
> + USER(13f, stp B_l, B_h, [dst, #32])
> + USER(13f, ldp B_l, B_h, [src, #32])
> + USER(13f, stp C_l, C_h, [dst, #48])
> + USER(13f, ldp C_l, C_h, [src, #48])
> + USER(13f, stp D_l, D_h, [dst, #64]!)
> + USER(13f, ldp D_l, D_h, [src, #64]!)
> + subs count, count, #64
> + b.ge 1b
> + USER(14f, stp A_l, A_h, [dst, #16])
> + USER(14f, stp B_l, B_h, [dst, #32])
> + USER(14f, stp C_l, C_h, [dst, #48])
> + USER(14f, stp D_l, D_h, [dst, #64])
> + add src, src, #16
> + add dst, dst, #64 + 16
> + tst count, #0x3f
> + b.ne .Ltail63
> +.Lsuccess:
> + /* Nothing left to copy */
> + mov x0, #0
> + ret
> +
> + .macro copy_abort_table
> +8:
> + /*
> + * Count bytes remain
> + * dst points to (dst + tmp1)
> + */
> + mov x0, count
> + sub dst, dst, tmp1
> + b .Lfinalize
> +9:
> + /*
> + * 16 bytes remain
> + * dst is accurate
> + */
> + mov x0, #16
> + b .Lfinalize
> +10:
> + /*
> + * count is accurate
> + * dst is accurate
> + */
> + mov x0, count
> + b .Lfinalize
> +11:
> + /*
> + *(count + tmp2) bytes remain
> + * dst points to the start of the remaining bytes
> + */
> + add x0, count, tmp2
> + b .Lfinalize
> +12:
> + /*
> + * (count + 128) bytes remain
> + * dst is accurate
> + */
> + add x0, count, #128
> + b .Lfinalize
> +13:
> + /*
> + * (count + 128) bytes remain
> + * dst is pre-biased to (dst + 16)
> + */
> + add x0, count, #128
> + sub dst, dst, #16
> + b .Lfinalize
> +14:
> + /*
> + * count is accurate
> + * dst is pre-biased to (dst + 16)
> + */
> + mov x0, count
> + sub dst, dst, #16
> + /* fall-through */
> +.Lfinalize:
> + /*
> + * Zeroize remaining destination-buffer
> + */
> + mov count, x0
> +20:
> + /* Zero remaining buffer space */
> + strb wzr, [dst], #1
> + subs count, count, #1
> + b.ne 20b
> + ret
> + .endm
> diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
> index a0aeeb9..08787b0 100644
> --- a/arch/arm64/lib/copy_to_user.S
> +++ b/arch/arm64/lib/copy_to_user.S
> @@ -15,7 +15,6 @@
> */
>
> #include <linux/linkage.h>
> -#include <asm/assembler.h>
>
> /*
> * Copy to user space from a kernel buffer (alignment handled by the hardware)
> @@ -28,34 +27,10 @@
> * x0 - bytes not copied
> */
> ENTRY(__copy_to_user)
> - add x4, x0, x2 // upper user buffer boundary
> - subs x2, x2, #8
> - b.mi 2f
> -1:
> - ldr x3, [x1], #8
> - subs x2, x2, #8
> -USER(9f, str x3, [x0], #8 )
> - b.pl 1b
> -2: adds x2, x2, #4
> - b.mi 3f
> - ldr w3, [x1], #4
> - sub x2, x2, #4
> -USER(9f, str w3, [x0], #4 )
> -3: adds x2, x2, #2
> - b.mi 4f
> - ldrh w3, [x1], #2
> - sub x2, x2, #2
> -USER(9f, strh w3, [x0], #2 )
> -4: adds x2, x2, #1
> - b.mi 5f
> - ldrb w3, [x1]
> -USER(9f, strb w3, [x0] )
> -5: mov x0, #0
> - ret
> +#include "copy_template.S"
> ENDPROC(__copy_to_user)
>
> .section .fixup,"ax"
> - .align 2
> -9: sub x0, x4, x0 // bytes not copied
> - ret
> + .align 2
> + copy_abort_table
The exact same fixup code is being used for copy_to_user and
copy_from_user.
For the copy_from_user case we want to zero the rest of the kernel
destination buffer when we hit a page fault reading from user space.
However, for the copy_to_user case we most definitely do not want to
write zeros into the destination buffer when we hit a page fault writing
to user space! I get unhandled page faults here when copy_to_user is
called:
0xffffffc00073c638 <+8920>: strb wzr, [x6],#1
0xffffffc00073c63c <+8924>: subs x2, x2, #0x1
0xffffffc00073c640 <+8928>: b.ne 0xffffffc00073c638 <__hyp_text_end+8920>
0xffffffc00073c644 <+8932>: ret
I would suggest re-working the fixup path and testing both fixup paths
thoroughly by placing the system under memory pressure and confirming
that they are both "hit".
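As one possible direction (a rough, untested sketch, not a drop-in fix),
the abort table could be parameterised so that only the copy_from_user
expansion zero-fills the kernel destination buffer; the macro argument
name below is purely illustrative and only a couple of the abort labels
are shown:

	/*
	 * zero_rest=1 for copy_from_user (dst is a kernel buffer that
	 * must be cleared), zero_rest=0 for copy_to_user (dst is a user
	 * buffer that must be left untouched).
	 */
	.macro copy_abort_table, zero_rest
10:	mov	x0, count		// bytes not copied
	b	.Lfixup_done
12:	add	x0, count, #128		// bytes not copied
	/* fall through */
.Lfixup_done:
	.if	\zero_rest
	mov	count, x0
	cbz	count, 21f		// nothing left to zero
20:	strb	wzr, [dst], #1		// zero remaining kernel buffer space
	subs	count, count, #1
	b.ne	20b
21:
	.endif
	ret
	.endm

Each caller would then expand it in its own .fixup section, e.g.
"copy_abort_table 1" in copy_from_user.S and "copy_abort_table 0" in
copy_to_user.S, since the template is #included separately into each
file and the numeric labels resolve per translation unit.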
Dann, Tim,
Could you please revert this from the Ubuntu kernels, whilst it is
being fixed?
Cheers,
--
Steve