[PATCH 1/4] arm64: optimized copy_to_user assembly code
Catalin Marinas
catalin.marinas at arm.com
Mon Aug 19 13:22:39 EDT 2013
On Thu, Aug 15, 2013 at 07:29:57PM +0100, Feng Kan wrote:
> Using the optimized copy_to_user code, iperf performance improved 20%.
I have some questions first:
Was this code written from scratch or derived from something else? I can
see some Linaro copyright in there.
Since it is dual-licensed, have you replaced the original code entirely?
Is this patch part of a series? I can't seem to find the other 3 patches
(unless I deleted them by mistake).
> diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S
> index a0aeeb9..2a07b13 100644
> --- a/arch/arm64/lib/copy_to_user.S
> +++ b/arch/arm64/lib/copy_to_user.S
> @@ -1,61 +1,220 @@
> /*
> - * Copyright (C) 2012 ARM Ltd.
> + * Copyright (c) 2013, Applied Micro Circuits Corporation
> + * Copyright (c) 2012-2013, Linaro Limited
> *
> - * This program is free software; you can redistribute it and/or modify
> - * it under the terms of the GNU General Public License version 2 as
> - * published by the Free Software Foundation.
> + * Author: Feng Kan <fkan at apm.com>
> + * Author: Philipp Tomsich <philipp.tomsich at theobroma-systems.com>
> *
> - * This program is distributed in the hope that it will be useful,
> - * but WITHOUT ANY WARRANTY; without even the implied warranty of
> - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> - * GNU General Public License for more details.
> + * The code is adopted from the memcpy routine by Linaro Limited.
> *
> - * You should have received a copy of the GNU General Public License
> - * along with this program. If not, see <http://www.gnu.org/licenses/>.
> + * This file is dual licensed: you can use it either under the terms of
> + * the GPL, or the BSD license, at your option.
> + *
> + * a) This library is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License as
> + * published by the Free Software Foundation; either version 2 of the
> + * License, or (at your option) any later version.
> + *
> + * This library is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public
> + * License along with this library; if not, write to the Free
> + * Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
> + * MA 02110-1301 USA
This is raised regularly: please don't include the FSF postal address, as they
may relocate. Just point at the web address.
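For example, the wording being removed from the original header above already
does the right thing:

	You should have received a copy of the GNU General Public License
	along with this program. If not, see <http://www.gnu.org/licenses/>.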
> -/*
> - * Copy to user space from a kernel buffer (alignment handled by the hardware)
> - *
> - * Parameters:
> - * x0 - to
> - * x1 - from
> - * x2 - n
> - * Returns:
> - * x0 - bytes not copied
> - */
What's the point of dropping some of the documentation (not that there was
much of it to begin with)?
> -ENTRY(__copy_to_user)
> - add x4, x0, x2 // upper user buffer boundary
> - subs x2, x2, #8
> - b.mi 2f
> +#define dstin x0
> +#define src x1
> +#define count x2
> +#define tmp1 x3
> +#define tmp1w w3
> +#define tmp2 x4
> +#define tmp2w w4
> +#define tmp3 x5
> +#define tmp3w w5
> +#define dst x6
> +
> +#define A_l x7
> +#define A_h x8
> +#define B_l x9
> +#define B_h x10
> +#define C_l x11
> +#define C_h x12
> +#define D_l x13
> +#define D_h x14
Could we use .req instead of #define?
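Purely as a sketch of what I mean, the aliases could be declared with .req so
that the assembler tracks them (and they can be released with .unreq at the
end of the file):

	dstin	.req	x0
	src	.req	x1
	count	.req	x2
	tmp1	.req	x3
	tmp1w	.req	w3
	dst	.req	x6
	A_l	.req	x7
	A_h	.req	x8
	/* ...and so on for the remaining temporaries and data registers */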
> +
> + /*
> + * Prototype: size_t copy_to_user (void *dst, const void *src, size_t nb)
> + */
> + .text
> + .align 2
> + .p2align 6,,63
Do we need two align directives?
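As far as I can tell, .p2align 6 already guarantees (at least) the 4-byte
alignment that .align 2 asks for on arm64, so a single directive should do,
assuming the max-skip argument isn't actually needed:

	.text
	.p2align 6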
> +ENTRY(__copy_to_user)
> + mov dst, dstin
> + cmp count, #64
> + b.ge .Lcpy_not_short
> + cmp count, #15
> + b.le .Ltail15tiny
> +
> + /* Deal with small copies quickly by dropping straight into the
> + * exit block. */
> +.Ltail63:
> + /* Copy up to 48 bytes of data. At this point we only need the
> + * bottom 6 bits of count to be accurate. */
> + ands tmp1, count, #0x30
> + b.eq .Ltail15
> + add dst, dst, tmp1
> + add src, src, tmp1
> + cmp tmp1w, #0x20
> + b.eq 1f
> + b.lt 2f
> + ldp A_l, A_h, [src, #-48]
> +USER (9f, stp A_l, A_h, [dst, #-48])
> +1:
> + ldp A_l, A_h, [src, #-32]
> +USER (9f, stp A_l, A_h, [dst, #-32])
> +2:
> + ldp A_l, A_h, [src, #-16]
> +USER (9f, stp A_l, A_h, [dst, #-16])
> +
> +.Ltail15:
> + ands count, count, #15
> + beq 1f
> + add src, src, count
> + ldp A_l, A_h, [src, #-16]
> + add dst, dst, count
> +USER (9f, stp A_l, A_h, [dst, #-16])
> +1:
> + b .Lsuccess
> +
> +.Ltail15tiny:
> + /* Copy up to 15 bytes of data. Does not assume additional data
> + being copied. */
> + tbz count, #3, 1f
> + ldr tmp1, [src], #8
> +USER (9f, str tmp1, [dst], #8)
> +1:
> + tbz count, #2, 1f
> + ldr tmp1w, [src], #4
> +USER (9f, str tmp1w, [dst], #4)
> 1:
> - ldr x3, [x1], #8
> - subs x2, x2, #8
> -USER(9f, str x3, [x0], #8 )
> - b.pl 1b
> -2: adds x2, x2, #4
> - b.mi 3f
> - ldr w3, [x1], #4
> - sub x2, x2, #4
> -USER(9f, str w3, [x0], #4 )
> -3: adds x2, x2, #2
> - b.mi 4f
> - ldrh w3, [x1], #2
> - sub x2, x2, #2
> -USER(9f, strh w3, [x0], #2 )
> -4: adds x2, x2, #1
> - b.mi 5f
> - ldrb w3, [x1]
> -USER(9f, strb w3, [x0] )
> -5: mov x0, #0
> + tbz count, #1, 1f
> + ldrh tmp1w, [src], #2
> +USER (9f, strh tmp1w, [dst], #2)
> +1:
> + tbz count, #0, 1f
> + ldrb tmp1w, [src]
> +USER (9f, strb tmp1w, [dst])
> +1:
> + b .Lsuccess
> +
> +.Lcpy_not_short:
> + /* We don't much care about the alignment of DST, but we want SRC
> + * to be 128-bit (16 byte) aligned so that we don't cross cache line
> + * boundaries on both loads and stores. */
> + neg tmp2, src
> + ands tmp2, tmp2, #15 /* Bytes to reach alignment. */
> + b.eq 2f
> + sub count, count, tmp2
> + /* Copy more data than needed; it's faster than jumping
> + * around copying sub-Quadword quantities. We know that
> + * it can't overrun. */
> + ldp A_l, A_h, [src]
> + add src, src, tmp2
> +USER (9f, stp A_l, A_h, [dst])
> + add dst, dst, tmp2
> + /* There may be less than 63 bytes to go now. */
> + cmp count, #63
> + b.le .Ltail63
> +2:
> + subs count, count, #128
> + b.ge .Lcpy_body_large
> + /* Less than 128 bytes to copy, so handle 64 here and then jump
> + * to the tail. */
> + ldp A_l, A_h, [src]
> + ldp B_l, B_h, [src, #16]
> + ldp C_l, C_h, [src, #32]
> + ldp D_l, D_h, [src, #48]
> +USER (9f, stp A_l, A_h, [dst])
> +USER (9f, stp B_l, B_h, [dst, #16])
> +USER (9f, stp C_l, C_h, [dst, #32])
> +USER (9f, stp D_l, D_h, [dst, #48])
> + tst count, #0x3f
> + add src, src, #64
> + add dst, dst, #64
> + b.ne .Ltail63
> + b .Lsuccess
> +
> + /* Critical loop. Start at a new cache line boundary. Assuming
> + * 64 bytes per line this ensures the entire loop is in one line. */
> + .p2align 6
> +.Lcpy_body_large:
> + /* There are at least 128 bytes to copy. */
> + ldp A_l, A_h, [src, #0]
> + sub dst, dst, #16 /* Pre-bias. */
> + ldp B_l, B_h, [src, #16]
> + ldp C_l, C_h, [src, #32]
> + ldp D_l, D_h, [src, #48]! /* src += 64 - Pre-bias. */
> +1:
> +USER (9f, stp A_l, A_h, [dst, #16])
> + ldp A_l, A_h, [src, #16]
> +USER (9f, stp B_l, B_h, [dst, #32])
> + ldp B_l, B_h, [src, #32]
> +USER (9f, stp C_l, C_h, [dst, #48])
> + ldp C_l, C_h, [src, #48]
> +USER (9f, stp D_l, D_h, [dst, #64]!)
> + ldp D_l, D_h, [src, #64]!
> + subs count, count, #64
> + b.ge 1b
> +USER (9f, stp A_l, A_h, [dst, #16])
> +USER (9f, stp B_l, B_h, [dst, #32])
> +USER (9f, stp C_l, C_h, [dst, #48])
> +USER (9f, stp D_l, D_h, [dst, #64])
> + add src, src, #16
> + add dst, dst, #64 + 16
> + tst count, #0x3f
> + b.ne .Ltail63
> +.Lsuccess:
> + mov x0, #0 // nothing left to copy
> ret
> ENDPROC(__copy_to_user)
>
> .section .fixup,"ax"
> - .align 2
> -9: sub x0, x4, x0 // bytes not copied
> + .align 2
The .align change is just whitespace.
> +9: mov x0, count // approximate the number of bytes not copied
Is this an approximation or the real number of bytes not copied?
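For comparison, the old code saved the upper boundary of the user buffer in x4
on entry and computed x4 - x0 in the fixup. A rough sketch of the same idea on
top of this patch, assuming tmp3 is genuinely free for the whole function,
would be:

	ENTRY(__copy_to_user)
		add	tmp3, dstin, count	// end of the user buffer
		mov	dst, dstin
		...
		.section .fixup,"ax"
		.align	2
	9:	sub	x0, tmp3, dst		// bytes not stored yet
		ret

(the pre-biased dst in .Lcpy_body_large would need its own adjustment, so this
is only an illustration of the idea, not a drop-in fix).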
I haven't done a proper review of the algorithm yet, so I'll follow up with
more comments. For the next version there are also a few coding style issues
that need cleaning up (comment style, whitespace).
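For example, multi-line comments should use the usual kernel style, i.e.
something like:

	/* Deal with small copies quickly by dropping straight into the
	 * exit block. */

becoming:

	/*
	 * Deal with small copies quickly by dropping straight into the
	 * exit block.
	 */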
--
Catalin