[PATCH V2] arm64: optimized copy_to_user and copy_from_user assembly code
Dann Frazier
dann.frazier at canonical.com
Tue Dec 2 14:59:31 PST 2014
On Thu, Nov 27, 2014 at 4:43 AM, Steve Capper <steve.capper at linaro.org> wrote:
> On Fri, Aug 08, 2014 at 04:02:20PM -0700, Feng Kan wrote:
>> Using the glibc cortex-strings work authored by Linaro as a base,
>> create new copy to/from user kernel routines.
>>
>> Iperf performance increase (1 core):
>>              -l (size)   result
>>   Optimized  64B         44-51 Mb/s
>>              1500B       4.9 Gb/s
>>              30000B      16.2 Gb/s
>>   Original   64B         34-50.7 Mb/s
>>              1500B       4.7 Gb/s
>>              30000B      14.5 Gb/s
>
> I ran into some horrible crashes with this patch; it had been applied
> to the Ubuntu kernel I was running. Details below:
>
>>
>> Signed-off-by: Feng Kan <fkan at apm.com>
>> ---
>> arch/arm64/lib/copy_from_user.S | 36 +-----
>> arch/arm64/lib/copy_template.S | 278 ++++++++++++++++++++++++++++++++++++++++
>> arch/arm64/lib/copy_to_user.S | 31 +----
>> 3 files changed, 284 insertions(+), 61 deletions(-)
>> create mode 100644 arch/arm64/lib/copy_template.S
>>
>> diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S
>> index 5e27add..c4c5187 100644
>> --- a/arch/arm64/lib/copy_from_user.S
>> +++ b/arch/arm64/lib/copy_from_user.S
>> @@ -15,7 +15,6 @@
>> */
>>
>> #include <linux/linkage.h>
>> -#include <asm/assembler.h>
>>
>> /*
>> * Copy from user space to a kernel buffer (alignment handled by the hardware)
>> @@ -28,39 +27,10 @@
>> * x0 - bytes not copied
>> */
>> ENTRY(__copy_from_user)
>> - add x4, x1, x2 // upper user buffer boundary
>> - subs x2, x2, #8
>> - b.mi 2f
>> -1:
>> -USER(9f, ldr x3, [x1], #8 )
>> - subs x2, x2, #8
>> - str x3, [x0], #8
>> - b.pl 1b
>> -2: adds x2, x2, #4
>> - b.mi 3f
>> -USER(9f, ldr w3, [x1], #4 )
>> - sub x2, x2, #4
>> - str w3, [x0], #4
>> -3: adds x2, x2, #2
>> - b.mi 4f
>> -USER(9f, ldrh w3, [x1], #2 )
>> - sub x2, x2, #2
>> - strh w3, [x0], #2
>> -4: adds x2, x2, #1
>> - b.mi 5f
>> -USER(9f, ldrb w3, [x1] )
>> - strb w3, [x0]
>> -5: mov x0, #0
>> - ret
>> +#include "copy_template.S"
>> ENDPROC(__copy_from_user)
>>
>> .section .fixup,"ax"
>> - .align 2
>> -9: sub x2, x4, x1
>> - mov x3, x2
>> -10: strb wzr, [x0], #1 // zero remaining buffer space
>> - subs x3, x3, #1
>> - b.ne 10b
>> - mov x0, x2 // bytes not copied
>> - ret
>> + .align 2
>> + copy_abort_table
>> .previous
>> diff --git a/arch/arm64/lib/copy_template.S b/arch/arm64/lib/copy_template.S
>> new file mode 100644
>> index 0000000..f2c7003
>> --- /dev/null
>> +++ b/arch/arm64/lib/copy_template.S
>> @@ -0,0 +1,278 @@
>> +/*
>> + * Copyright (c) 2013, Applied Micro Circuits Corporation
>> + * Copyright (c) 2012-2013, Linaro Limited
>> + *
>> + * Author: Feng Kan <fkan at apm.com>
>> + * Author: Philipp Tomsich <philipp.tomsich at theobroma-systems.com>
>> + *
>> + * The code is adapted from the memcpy routine by Linaro Limited.
>> + *
>> + * This file is free software: you may copy, redistribute and/or modify it
>> + * under the terms of the GNU General Public License as published by the
>> + * Free Software Foundation, either version 2 of the License, or (at your
>> + * option) any later version.
>> + *
>> + * This file is distributed in the hope that it will be useful, but
>> + * WITHOUT ANY WARRANTY; without even the implied warranty of
>> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
>> + * General Public License for more details.
>> + *
>> + * You should have received a copy of the GNU General Public License
>> + * along with this program. If not, see <http://www.gnu.org/licenses/>.
>> + *
>> + * This file incorporates work covered by the following copyright and
>> + * permission notice:
>> + *
>> + * Redistribution and use in source and binary forms, with or without
>> + * modification, are permitted provided that the following conditions are met:
>> + * 1 Redistributions of source code must retain the above copyright
>> + * notice, this list of conditions and the following disclaimer.
>> + * 2 Redistributions in binary form must reproduce the above copyright
>> + * notice, this list of conditions and the following disclaimer in the
>> + * documentation and/or other materials provided with the distribution.
>> + * 3 Neither the name of the Linaro nor the
>> + * names of its contributors may be used to endorse or promote products
>> + * derived from this software without specific prior written permission.
>> + *
>> + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
>> + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
>> + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
>> + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
>> + * HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
>> + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
>> + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
>> + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
>> + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
>> + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
>> + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
>> + */
>> +#include <asm/assembler.h>
>> +
>> +dstin .req x0
>> +src .req x1
>> +count .req x2
>> +tmp1 .req x3
>> +tmp1w .req w3
>> +tmp2 .req x4
>> +tmp2w .req w4
>> +tmp3 .req x5
>> +tmp3w .req w5
>> +dst .req x6
>> +
>> +A_l .req x7
>> +A_h .req x8
>> +B_l .req x9
>> +B_h .req x10
>> +C_l .req x11
>> +C_h .req x12
>> +D_l .req x13
>> +D_h .req x14
>> +
>> + mov dst, dstin
>> + cmp count, #64
>> + b.ge .Lcpy_not_short
>> + cmp count, #15
>> + b.le .Ltail15tiny
>> +
>> + /*
>> + * Deal with small copies quickly by dropping straight into the
>> + * exit block.
>> + */
>> +.Ltail63:
>> + /*
>> + * Copy up to 48 bytes of data. At this point we only need the
>> + * bottom 6 bits of count to be accurate.
>> + */
>> + ands tmp1, count, #0x30
>> + b.eq .Ltail15
>> + add dst, dst, tmp1
>> + add src, src, tmp1
>> + cmp tmp1w, #0x20
>> + b.eq 1f
>> + b.lt 2f
>> + USER(8f, ldp A_l, A_h, [src, #-48])
>> + USER(8f, stp A_l, A_h, [dst, #-48])
>> +1:
>> + USER(8f, ldp A_l, A_h, [src, #-32])
>> + USER(8f, stp A_l, A_h, [dst, #-32])
>> +2:
>> + USER(8f, ldp A_l, A_h, [src, #-16])
>> + USER(8f, stp A_l, A_h, [dst, #-16])
>> +
>> +.Ltail15:
>> + ands count, count, #15
>> + beq 1f
>> + add src, src, count
>> + USER(9f, ldp A_l, A_h, [src, #-16])
>> + add dst, dst, count
>> + USER(9f, stp A_l, A_h, [dst, #-16])
>> +1:
>> + b .Lsuccess
>> +
>> +.Ltail15tiny:
>> + /*
>> + * Copy up to 15 bytes of data. Does not assume additional data
>> + * being copied.
>> + */
>> + tbz count, #3, 1f
>> + USER(10f, ldr tmp1, [src], #8)
>> + USER(10f, str tmp1, [dst], #8)
>> +1:
>> + tbz count, #2, 1f
>> + USER(10f, ldr tmp1w, [src], #4)
>> + USER(10f, str tmp1w, [dst], #4)
>> +1:
>> + tbz count, #1, 1f
>> + USER(10f, ldrh tmp1w, [src], #2)
>> + USER(10f, strh tmp1w, [dst], #2)
>> +1:
>> + tbz count, #0, 1f
>> + USER(10f, ldrb tmp1w, [src])
>> + USER(10f, strb tmp1w, [dst])
>> +1:
>> + b .Lsuccess
>> +
>> +.Lcpy_not_short:
>> + /*
>> + * We don't much care about the alignment of DST, but we want SRC
>> + * to be 128-bit (16 byte) aligned so that we don't cross cache line
>> + * boundaries on both loads and stores.
>> + */
>> + neg tmp2, src
>> + ands tmp2, tmp2, #15 /* Bytes to reach alignment. */
>> + b.eq 2f
>> + sub count, count, tmp2
>> + /*
>> + * Copy more data than needed; it's faster than jumping
>> + * around copying sub-Quadword quantities. We know that
>> + * it can't overrun.
>> + */
>> + USER(11f, ldp A_l, A_h, [src])
>> + add src, src, tmp2
>> + USER(11f, stp A_l, A_h, [dst])
>> + add dst, dst, tmp2
>> + /* There may be less than 63 bytes to go now. */
>> + cmp count, #63
>> + b.le .Ltail63
>> +2:
>> + subs count, count, #128
>> + b.ge .Lcpy_body_large
>> + /*
>> + * Less than 128 bytes to copy, so handle 64 here and then jump
>> + * to the tail.
>> + */
>> + USER(12f, ldp A_l, A_h, [src])
>> + USER(12f, ldp B_l, B_h, [src, #16])
>> + USER(12f, ldp C_l, C_h, [src, #32])
>> + USER(12f, ldp D_l, D_h, [src, #48])
>> + USER(12f, stp A_l, A_h, [dst])
>> + USER(12f, stp B_l, B_h, [dst, #16])
>> + USER(12f, stp C_l, C_h, [dst, #32])
>> + USER(12f, stp D_l, D_h, [dst, #48])
>> + tst count, #0x3f
>> + add src, src, #64
>> + add dst, dst, #64
>> + b.ne .Ltail63
>> + b .Lsuccess
>> +
>> + /*
>> + * Critical loop. Start at a new cache line boundary. Assuming
>> + * 64 bytes per line this ensures the entire loop is in one line.
>> + */
>> + .p2align 6
>> +.Lcpy_body_large:
>> + /* There are at least 128 bytes to copy. */
>> + USER(12f, ldp A_l, A_h, [src, #0])
>> + sub dst, dst, #16 /* Pre-bias. */
>> + USER(13f, ldp B_l, B_h, [src, #16])
>> + USER(13f, ldp C_l, C_h, [src, #32])
>> + USER(13f, ldp D_l, D_h, [src, #48]!) /* src += 64 - Pre-bias. */
>> +1:
>> + USER(13f, stp A_l, A_h, [dst, #16])
>> + USER(13f, ldp A_l, A_h, [src, #16])
>> + USER(13f, stp B_l, B_h, [dst, #32])
>> + USER(13f, ldp B_l, B_h, [src, #32])
>> + USER(13f, stp C_l, C_h, [dst, #48])
>> + USER(13f, ldp C_l, C_h, [src, #48])
>> + USER(13f, stp D_l, D_h, [dst, #64]!)
>> + USER(13f, ldp D_l, D_h, [src, #64]!)
>> + subs count, count, #64
>> + b.ge 1b
>> + USER(14f, stp A_l, A_h, [dst, #16])
>> + USER(14f, stp B_l, B_h, [dst, #32])
>> + USER(14f, stp C_l, C_h, [dst, #48])
>> + USER(14f, stp D_l, D_h, [dst, #64])
>> + add src, src, #16
>> + add dst, dst, #64 + 16
>> + tst count, #0x3f
>> + b.ne .Ltail63
>> +.Lsuccess:
>> + /* Nothing left to copy */
>> + mov x0, #0
>> + ret
>> +
>> + .macro copy_abort_table
>> +8:
>> + /*
>> + * Count bytes remain
>> + * dst points to (dst + tmp1)
>> + */
>> + mov x0, count
>> + sub dst, dst, tmp1
>> + b .Lfinalize
>> +9:
>> + /*
>> + * 16 bytes remain
>> + * dst is accurate
>> + */
>> + mov x0, #16
>> + b .Lfinalize
>> +10:
>> + /*
>> + * count is accurate
>> + * dst is accurate
>> + */
>> + mov x0, count
>> + b .Lfinalize
>> +11:
>> + /*
>> + * (count + tmp2) bytes remain
>> + * dst points to the start of the remaining bytes
>> + */
>> + add x0, count, tmp2
>> + b .Lfinalize
>> +12:
>> + /*
>> + * (count + 128) bytes remain
>> + * dst is accurate
>> + */
>> + add x0, count, #128
>> + b .Lfinalize
>> +13:
>> + /*
>> + * (count + 128) bytes remain
>> + * dst is pre-biased to (dst + 16)
>> + */
>> + add x0, count, #128
>> + sub dst, dst, #16
>> + b .Lfinalize
>> +14:
>> + /*
>> + * count is accurate
>> + * dst is pre-biased to (dst + 16)
>> + */
>> + mov x0, count
>> + sub dst, dst, #16
>> + /* fall-through */
>> +.Lfinalize:
>> + /*
>> + * Zeroize remaining destination-buffer
>> + */
>> + mov count, x0
>> +20:
>> + /* Zero remaining buffer space */
>> + strb wzr, [dst], #1
>> + subs count, count, #1
>> + b.ne 20b
>> + ret
>> + .endm
>> diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
>> index a0aeeb9..08787b0 100644
>> --- a/arch/arm64/lib/copy_to_user.S
>> +++ b/arch/arm64/lib/copy_to_user.S
>> @@ -15,7 +15,6 @@
>> */
>>
>> #include <linux/linkage.h>
>> -#include <asm/assembler.h>
>>
>> /*
>> * Copy to user space from a kernel buffer (alignment handled by the hardware)
>> @@ -28,34 +27,10 @@
>> * x0 - bytes not copied
>> */
>> ENTRY(__copy_to_user)
>> - add x4, x0, x2 // upper user buffer boundary
>> - subs x2, x2, #8
>> - b.mi 2f
>> -1:
>> - ldr x3, [x1], #8
>> - subs x2, x2, #8
>> -USER(9f, str x3, [x0], #8 )
>> - b.pl 1b
>> -2: adds x2, x2, #4
>> - b.mi 3f
>> - ldr w3, [x1], #4
>> - sub x2, x2, #4
>> -USER(9f, str w3, [x0], #4 )
>> -3: adds x2, x2, #2
>> - b.mi 4f
>> - ldrh w3, [x1], #2
>> - sub x2, x2, #2
>> -USER(9f, strh w3, [x0], #2 )
>> -4: adds x2, x2, #1
>> - b.mi 5f
>> - ldrb w3, [x1]
>> -USER(9f, strb w3, [x0] )
>> -5: mov x0, #0
>> - ret
>> +#include "copy_template.S"
>> ENDPROC(__copy_to_user)
>>
>> .section .fixup,"ax"
>> - .align 2
>> -9: sub x0, x4, x0 // bytes not copied
>> - ret
>> + .align 2
>> + copy_abort_table
>
> The exact same fixup code is being used for copy_to_user and
> copy_from_user.
>
> For the copy_from_user case we want to zero the rest of the kernel
> destination buffer when we hit a pagefault reading from user space.
>
> However, for the copy_to_user case we most definitely don't want to
> write zeros into the destination buffer when we hit a pagefault writing
> to user space! I get unhandled pagefaults here when copy_to_user is
> called:
>
> 0xffffffc00073c638 <+8920>: strb wzr, [x6],#1
> 0xffffffc00073c63c <+8924>: subs x2, x2, #0x1
> 0xffffffc00073c640 <+8928>: b.ne 0xffffffc00073c638 <__hyp_text_end+8920>
> 0xffffffc00073c644 <+8932>: ret
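>
> (Those strb stores are the fixup's zeroing loop writing into the user
> buffer. For reference, USER() on arm64 is roughly:
>
>     #define USER(l, x...)                \
>     9999:   x;                           \
>             .section __ex_table,"a";     \
>             .align   3;                  \
>             .quad    9999b,l;            \
>             .previous
>
> so only the wrapped instruction gets an exception table entry; the
> strb in the fixup itself has none, hence the unhandled fault.)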
>
> I would suggest re-working the fixup code and testing both fixup paths
> thoroughly by placing the system under memory pressure and confirming
> that they are both "hit".
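>
> As an illustration only (an untested sketch, reusing the register
> aliases and numeric labels from this patch), the abort table could take
> its finalize label as a macro argument, so that only the copy_from_user
> instance zeroes the remainder:
>
>	.macro copy_abort_table, finalize
> 8:	mov	x0, count		// bytes not copied
>	sub	dst, dst, tmp1		// dst = first un-copied byte
>	b	\finalize
>	/* ... entries 9-14 as in the patch, each ending in b \finalize ... */
>	.endm
>
> copy_from_user.S would then instantiate it with a zeroing tail (dst is
> a kernel pointer there, so these stores cannot fault):
>
>	copy_abort_table .Lzero_rest
> .Lzero_rest:
>	mov	count, x0
> 20:	strb	wzr, [dst], #1		// zero rest of *kernel* buffer
>	subs	count, count, #1
>	b.ne	20b
>	ret
>
> while copy_to_user.S would return without touching the (faulting) user
> buffer:
>
>	copy_abort_table .Lret_only
> .Lret_only:
>	ret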
>
> Dann, Tim,
> Could you please revert this from the Ubuntu kernels, whilst it is
> being fixed?
Seems wise, thanks for the heads-up. My coworker has started the process here:
https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1398596
-dann
> Cheers,
> --
> Steve