[PATCH v2 6/8] arm64: Import latest memcpy()/memmove() implementation
Robin Murphy
robin.murphy at arm.com
Tue Jun 8 04:37:01 PDT 2021
Hi Marek,
On 2021-06-08 12:15, Marek Szyprowski wrote:
> Hi Robin,
>
> On 27.05.2021 17:34, Robin Murphy wrote:
>> Import the latest implementation of memcpy(), based on the
>> upstream code of string/aarch64/memcpy.S at commit afd6244 from
>> https://github.com/ARM-software/optimized-routines, and subsuming
>> memmove() in the process.
>>
>> Note that for simplicity Arm have chosen to contribute this code
>> to Linux under GPLv2 rather than the original MIT license.
>>
>> Note also that the needs of the usercopy routines vs. regular memcpy()
>> have now diverged so far that we abandon the shared template idea
>> and the damage which that incurred to the tuning of LDP/STP loops.
>> We'll be back to tackle those routines separately in future.
>>
>> Signed-off-by: Robin Murphy <robin.murphy at arm.com>
>
> This patch landed recently in linux-next as commit 285133040e6c ("arm64:
> Import latest memcpy()/memmove() implementation"). Sadly it causes
> serious issues on the Khadas VIM3 board. Reverting it on top of linux
> next-20210607 (together with 6b8f648959e5 and resolving the conflict in
> the Makefile) fixes the issue. Here is the kernel log:
>
> Unable to handle kernel paging request at virtual address ffff8000136bd204
> Mem abort info:
> ESR = 0x96000061
> EC = 0x25: DABT (current EL), IL = 32 bits
> SET = 0, FnV = 0
> EA = 0, S1PTW = 0
> Data abort info:
> ISV = 0, ISS = 0x00000061
That's an alignment fault, which implies we're accessing something that
isn't Normal memory.
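For reference, that conclusion falls straight out of the reported ESR fields;
a minimal standalone decode (field layout per the Arm ARM, deliberately not
using the kernel's esr.h helpers) is:

	#include <stdio.h>

	int main(void)
	{
		unsigned long esr = 0x96000061;          /* ESR from the oops above */
		unsigned int ec   = (esr >> 26) & 0x3f;  /* 0x25: data abort, current EL */
		unsigned int wnr  = (esr >> 6)  & 0x1;   /* 1: the fault was on a write */
		unsigned int dfsc = esr & 0x3f;          /* 0x21: alignment fault */

		printf("EC=0x%x WnR=%u DFSC=0x%x\n", ec, wnr, dfsc);
		return 0;
	}

DFSC 0x21 is the architectural "alignment fault" code: an unaligned access hit
a mapping whose memory type doesn't tolerate it.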
> CM = 0, WnR = 1
> swapper pgtable: 4k pages, 48-bit VAs, pgdp=0000000009da6000
> [ffff8000136bd204] pgd=10000000f4806003, p4d=10000000f4806003,
> pud=10000000f4805003, pmd=1000000000365003, pte=00680000ffe03713
> Internal error: Oops: 96000061 [#1] PREEMPT SMP
> Modules linked in: brcmfmac brcmutil cfg80211 dw_hdmi_i2s_audio
> meson_gxl hci_uart btqca btbcm bluetooth panfrost ecdh_generic ecc
> snd_soc_meson_axg_sound_card crct10dif_ce snd_soc_meson_card_utils
> rfkill rtc_hym8563 gpu_sched dwmac_generic rc_khadas meson_gxbb_wdt
> meson_ir pwm_meson snd_soc_meson_axg_tdmin snd_soc_meson_g12a_tohdmitx
> rtc_meson_vrtc snd_soc_meson_axg_tdmout snd_soc_meson_axg_frddr
> reset_meson_audio_arb snd_soc_meson_codec_glue axg_audio meson_rng
> sclk_div dwmac_meson8b snd_soc_meson_axg_toddr mdio_mux_meson_g12a
> clk_phase stmmac_platform rng_core snd_soc_meson_axg_fifo meson_dw_hdmi
> stmmac meson_drm meson_canvas dw_hdmi pcs_xpcs display_connector
> snd_soc_meson_axg_tdm_interface nvmem_meson_efuse adc_keys
> snd_soc_meson_axg_tdm_formatter
> CPU: 4 PID: 135 Comm: kworker/4:3 Not tainted 5.13.0-rc5-next-20210607
> #10441
> Hardware name: Khadas VIM3 (DT)
> Workqueue: events request_firmware_work_func
> pstate: 20000005 (nzCv daif -PAN -UAO -TCO BTYPE=--)
> pc : __memcpy+0x2c/0x260
> lr : sg_copy_buffer+0x90/0x118
> ...
> Call trace:
> __memcpy+0x2c/0x260
> sg_copy_to_buffer+0x14/0x20
> meson_mmc_start_cmd+0xf4/0x2c8
> meson_mmc_request+0x4c/0xb8
> __mmc_start_request+0xa4/0x2a8
> mmc_start_request+0x80/0xa8
> mmc_wait_for_req+0x68/0xd8
> mmc_io_rw_extended+0x1d4/0x2e0
> sdio_io_rw_ext_helper+0xb0/0x1e8
> sdio_memcpy_toio+0x20/0x28
> brcmf_sdiod_skbuff_write.isra.18+0x2c/0x68 [brcmfmac]
> brcmf_sdiod_ramrw+0xe0/0x230 [brcmfmac]
> brcmf_sdio_firmware_callback+0xa8/0x7c8 [brcmfmac]
> brcmf_fw_request_done+0x7c/0x100 [brcmfmac]
> request_firmware_work_func+0x4c/0xd8
> process_one_work+0x2a8/0x718
> worker_thread+0x48/0x460
> kthread+0x12c/0x160
> ret_from_fork+0x10/0x18
> Code: 540000c3 a9401c26 a97f348c a9001c06 (a93f34ac)
> ---[ end trace be83fa283dc82415 ]---
>
> I hope that the above log helps fix the issue. IIRC the SDHCI driver
> on the VIM3 board uses internal SRAM for transferring data (instead of DMA),
> so the issue is somehow related to that.
Drivers shouldn't be using memcpy() on iomem mappings, even if they
happen to have got away with it sometimes ;)
Taking a quick look at that driver,
host->bounce_buf = host->regs + SD_EMMC_SRAM_DATA_BUF_OFF;
is completely bogus, as Sparse will readily point out.
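For illustration only (a minimal sketch of the intended pattern, not a tested
fix for that driver): data bound for an ioremap()ed SRAM window should go
through the MMIO copy helpers, which only issue access sizes and alignments
that Device memory can accept:

	#include <linux/io.h>
	#include <linux/types.h>

	/*
	 * Hypothetical helpers, purely to show the accessors involved:
	 * memcpy_toio()/memcpy_fromio() take the __iomem pointer that Sparse
	 * expects and, unlike a plain memcpy(), only perform accesses that
	 * are safe on Device memory.
	 */
	static void sram_write(void __iomem *sram, const void *buf, size_t len)
	{
		memcpy_toio(sram, buf, len);
	}

	static void sram_read(void *buf, const void __iomem *sram, size_t len)
	{
		memcpy_fromio(buf, sram, len);
	}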
Robin.
>> ---
>> arch/arm64/lib/Makefile | 2 +-
>> arch/arm64/lib/memcpy.S | 272 ++++++++++++++++++++++++++++++++-------
>> arch/arm64/lib/memmove.S | 189 ---------------------------
>> 3 files changed, 230 insertions(+), 233 deletions(-)
>> delete mode 100644 arch/arm64/lib/memmove.S
>>
>> diff --git a/arch/arm64/lib/Makefile b/arch/arm64/lib/Makefile
>> index d31e1169d9b8..01c596aa539c 100644
>> --- a/arch/arm64/lib/Makefile
>> +++ b/arch/arm64/lib/Makefile
>> @@ -1,7 +1,7 @@
>> # SPDX-License-Identifier: GPL-2.0
>> lib-y := clear_user.o delay.o copy_from_user.o \
>> copy_to_user.o copy_in_user.o copy_page.o \
>> - clear_page.o csum.o memchr.o memcpy.o memmove.o \
>> + clear_page.o csum.o memchr.o memcpy.o \
>> memset.o memcmp.o strcmp.o strncmp.o strlen.o \
>> strnlen.o strchr.o strrchr.o tishift.o
>>
>> diff --git a/arch/arm64/lib/memcpy.S b/arch/arm64/lib/memcpy.S
>> index dc8d2a216a6e..31073a8304fb 100644
>> --- a/arch/arm64/lib/memcpy.S
>> +++ b/arch/arm64/lib/memcpy.S
>> @@ -1,66 +1,252 @@
>> /* SPDX-License-Identifier: GPL-2.0-only */
>> /*
>> - * Copyright (C) 2013 ARM Ltd.
>> - * Copyright (C) 2013 Linaro.
>> + * Copyright (c) 2012-2020, Arm Limited.
>> *
>> - * This code is based on glibc cortex strings work originally authored by Linaro
>> - * be found @
>> - *
>> - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
>> - * files/head:/src/aarch64/
>> + * Adapted from the original at:
>> + * https://github.com/ARM-software/optimized-routines/blob/master/string/aarch64/memcpy.S
>> */
>>
>> #include <linux/linkage.h>
>> #include <asm/assembler.h>
>> -#include <asm/cache.h>
>>
>> -/*
>> - * Copy a buffer from src to dest (alignment handled by the hardware)
>> +/* Assumptions:
>> + *
>> + * ARMv8-a, AArch64, unaligned accesses.
>> *
>> - * Parameters:
>> - * x0 - dest
>> - * x1 - src
>> - * x2 - n
>> - * Returns:
>> - * x0 - dest
>> */
>> - .macro ldrb1 reg, ptr, val
>> - ldrb \reg, [\ptr], \val
>> - .endm
>>
>> - .macro strb1 reg, ptr, val
>> - strb \reg, [\ptr], \val
>> - .endm
>> +#define L(label) .L ## label
>>
>> - .macro ldrh1 reg, ptr, val
>> - ldrh \reg, [\ptr], \val
>> - .endm
>> +#define dstin x0
>> +#define src x1
>> +#define count x2
>> +#define dst x3
>> +#define srcend x4
>> +#define dstend x5
>> +#define A_l x6
>> +#define A_lw w6
>> +#define A_h x7
>> +#define B_l x8
>> +#define B_lw w8
>> +#define B_h x9
>> +#define C_l x10
>> +#define C_lw w10
>> +#define C_h x11
>> +#define D_l x12
>> +#define D_h x13
>> +#define E_l x14
>> +#define E_h x15
>> +#define F_l x16
>> +#define F_h x17
>> +#define G_l count
>> +#define G_h dst
>> +#define H_l src
>> +#define H_h srcend
>> +#define tmp1 x14
>>
>> - .macro strh1 reg, ptr, val
>> - strh \reg, [\ptr], \val
>> - .endm
>> +/* This implementation handles overlaps and supports both memcpy and memmove
>> + from a single entry point. It uses unaligned accesses and branchless
>> + sequences to keep the code small, simple and improve performance.
>>
>> - .macro ldr1 reg, ptr, val
>> - ldr \reg, [\ptr], \val
>> - .endm
>> + Copies are split into 3 main cases: small copies of up to 32 bytes, medium
>> + copies of up to 128 bytes, and large copies. The overhead of the overlap
>> + check is negligible since it is only required for large copies.
>>
>> - .macro str1 reg, ptr, val
>> - str \reg, [\ptr], \val
>> - .endm
>> -
>> - .macro ldp1 reg1, reg2, ptr, val
>> - ldp \reg1, \reg2, [\ptr], \val
>> - .endm
>> -
>> - .macro stp1 reg1, reg2, ptr, val
>> - stp \reg1, \reg2, [\ptr], \val
>> - .endm
>> + Large copies use a software pipelined loop processing 64 bytes per iteration.
>> + The destination pointer is 16-byte aligned to minimize unaligned accesses.
>> + The loop tail is handled by always copying 64 bytes from the end.
>> +*/
>>
>> +SYM_FUNC_START_ALIAS(__memmove)
>> +SYM_FUNC_START_WEAK_ALIAS_PI(memmove)
>> SYM_FUNC_START_ALIAS(__memcpy)
>> SYM_FUNC_START_WEAK_PI(memcpy)
>> -#include "copy_template.S"
>> + add srcend, src, count
>> + add dstend, dstin, count
>> + cmp count, 128
>> + b.hi L(copy_long)
>> + cmp count, 32
>> + b.hi L(copy32_128)
>> +
>> + /* Small copies: 0..32 bytes. */
>> + cmp count, 16
>> + b.lo L(copy16)
>> + ldp A_l, A_h, [src]
>> + ldp D_l, D_h, [srcend, -16]
>> + stp A_l, A_h, [dstin]
>> + stp D_l, D_h, [dstend, -16]
>> ret
>> +
>> + /* Copy 8-15 bytes. */
>> +L(copy16):
>> + tbz count, 3, L(copy8)
>> + ldr A_l, [src]
>> + ldr A_h, [srcend, -8]
>> + str A_l, [dstin]
>> + str A_h, [dstend, -8]
>> + ret
>> +
>> + .p2align 3
>> + /* Copy 4-7 bytes. */
>> +L(copy8):
>> + tbz count, 2, L(copy4)
>> + ldr A_lw, [src]
>> + ldr B_lw, [srcend, -4]
>> + str A_lw, [dstin]
>> + str B_lw, [dstend, -4]
>> + ret
>> +
>> + /* Copy 0..3 bytes using a branchless sequence. */
>> +L(copy4):
>> + cbz count, L(copy0)
>> + lsr tmp1, count, 1
>> + ldrb A_lw, [src]
>> + ldrb C_lw, [srcend, -1]
>> + ldrb B_lw, [src, tmp1]
>> + strb A_lw, [dstin]
>> + strb B_lw, [dstin, tmp1]
>> + strb C_lw, [dstend, -1]
>> +L(copy0):
>> + ret
>> +
>> + .p2align 4
>> + /* Medium copies: 33..128 bytes. */
>> +L(copy32_128):
>> + ldp A_l, A_h, [src]
>> + ldp B_l, B_h, [src, 16]
>> + ldp C_l, C_h, [srcend, -32]
>> + ldp D_l, D_h, [srcend, -16]
>> + cmp count, 64
>> + b.hi L(copy128)
>> + stp A_l, A_h, [dstin]
>> + stp B_l, B_h, [dstin, 16]
>> + stp C_l, C_h, [dstend, -32]
>> + stp D_l, D_h, [dstend, -16]
>> + ret
>> +
>> + .p2align 4
>> + /* Copy 65..128 bytes. */
>> +L(copy128):
>> + ldp E_l, E_h, [src, 32]
>> + ldp F_l, F_h, [src, 48]
>> + cmp count, 96
>> + b.ls L(copy96)
>> + ldp G_l, G_h, [srcend, -64]
>> + ldp H_l, H_h, [srcend, -48]
>> + stp G_l, G_h, [dstend, -64]
>> + stp H_l, H_h, [dstend, -48]
>> +L(copy96):
>> + stp A_l, A_h, [dstin]
>> + stp B_l, B_h, [dstin, 16]
>> + stp E_l, E_h, [dstin, 32]
>> + stp F_l, F_h, [dstin, 48]
>> + stp C_l, C_h, [dstend, -32]
>> + stp D_l, D_h, [dstend, -16]
>> + ret
>> +
>> + .p2align 4
>> + /* Copy more than 128 bytes. */
>> +L(copy_long):
>> + /* Use backwards copy if there is an overlap. */
>> + sub tmp1, dstin, src
>> + cbz tmp1, L(copy0)
>> + cmp tmp1, count
>> + b.lo L(copy_long_backwards)
>> +
>> + /* Copy 16 bytes and then align dst to 16-byte alignment. */
>> +
>> + ldp D_l, D_h, [src]
>> + and tmp1, dstin, 15
>> + bic dst, dstin, 15
>> + sub src, src, tmp1
>> + add count, count, tmp1 /* Count is now 16 too large. */
>> + ldp A_l, A_h, [src, 16]
>> + stp D_l, D_h, [dstin]
>> + ldp B_l, B_h, [src, 32]
>> + ldp C_l, C_h, [src, 48]
>> + ldp D_l, D_h, [src, 64]!
>> + subs count, count, 128 + 16 /* Test and readjust count. */
>> + b.ls L(copy64_from_end)
>> +
>> +L(loop64):
>> + stp A_l, A_h, [dst, 16]
>> + ldp A_l, A_h, [src, 16]
>> + stp B_l, B_h, [dst, 32]
>> + ldp B_l, B_h, [src, 32]
>> + stp C_l, C_h, [dst, 48]
>> + ldp C_l, C_h, [src, 48]
>> + stp D_l, D_h, [dst, 64]!
>> + ldp D_l, D_h, [src, 64]!
>> + subs count, count, 64
>> + b.hi L(loop64)
>> +
>> + /* Write the last iteration and copy 64 bytes from the end. */
>> +L(copy64_from_end):
>> + ldp E_l, E_h, [srcend, -64]
>> + stp A_l, A_h, [dst, 16]
>> + ldp A_l, A_h, [srcend, -48]
>> + stp B_l, B_h, [dst, 32]
>> + ldp B_l, B_h, [srcend, -32]
>> + stp C_l, C_h, [dst, 48]
>> + ldp C_l, C_h, [srcend, -16]
>> + stp D_l, D_h, [dst, 64]
>> + stp E_l, E_h, [dstend, -64]
>> + stp A_l, A_h, [dstend, -48]
>> + stp B_l, B_h, [dstend, -32]
>> + stp C_l, C_h, [dstend, -16]
>> + ret
>> +
>> + .p2align 4
>> +
>> + /* Large backwards copy for overlapping copies.
>> + Copy 16 bytes and then align dst to 16-byte alignment. */
>> +L(copy_long_backwards):
>> + ldp D_l, D_h, [srcend, -16]
>> + and tmp1, dstend, 15
>> + sub srcend, srcend, tmp1
>> + sub count, count, tmp1
>> + ldp A_l, A_h, [srcend, -16]
>> + stp D_l, D_h, [dstend, -16]
>> + ldp B_l, B_h, [srcend, -32]
>> + ldp C_l, C_h, [srcend, -48]
>> + ldp D_l, D_h, [srcend, -64]!
>> + sub dstend, dstend, tmp1
>> + subs count, count, 128
>> + b.ls L(copy64_from_start)
>> +
>> +L(loop64_backwards):
>> + stp A_l, A_h, [dstend, -16]
>> + ldp A_l, A_h, [srcend, -16]
>> + stp B_l, B_h, [dstend, -32]
>> + ldp B_l, B_h, [srcend, -32]
>> + stp C_l, C_h, [dstend, -48]
>> + ldp C_l, C_h, [srcend, -48]
>> + stp D_l, D_h, [dstend, -64]!
>> + ldp D_l, D_h, [srcend, -64]!
>> + subs count, count, 64
>> + b.hi L(loop64_backwards)
>> +
>> + /* Write the last iteration and copy 64 bytes from the start. */
>> +L(copy64_from_start):
>> + ldp G_l, G_h, [src, 48]
>> + stp A_l, A_h, [dstend, -16]
>> + ldp A_l, A_h, [src, 32]
>> + stp B_l, B_h, [dstend, -32]
>> + ldp B_l, B_h, [src, 16]
>> + stp C_l, C_h, [dstend, -48]
>> + ldp C_l, C_h, [src]
>> + stp D_l, D_h, [dstend, -64]
>> + stp G_l, G_h, [dstin, 48]
>> + stp A_l, A_h, [dstin, 32]
>> + stp B_l, B_h, [dstin, 16]
>> + stp C_l, C_h, [dstin]
>> + ret
>> +
>> SYM_FUNC_END_PI(memcpy)
>> EXPORT_SYMBOL(memcpy)
>> SYM_FUNC_END_ALIAS(__memcpy)
>> EXPORT_SYMBOL(__memcpy)
>> +SYM_FUNC_END_ALIAS_PI(memmove)
>> +EXPORT_SYMBOL(memmove)
>> +SYM_FUNC_END_ALIAS(__memmove)
>> +EXPORT_SYMBOL(__memmove)
>> \ No newline at end of file
>> diff --git a/arch/arm64/lib/memmove.S b/arch/arm64/lib/memmove.S
>> deleted file mode 100644
>> index 1035dce4bdaf..000000000000
>> --- a/arch/arm64/lib/memmove.S
>> +++ /dev/null
>> @@ -1,189 +0,0 @@
>> -/* SPDX-License-Identifier: GPL-2.0-only */
>> -/*
>> - * Copyright (C) 2013 ARM Ltd.
>> - * Copyright (C) 2013 Linaro.
>> - *
>> - * This code is based on glibc cortex strings work originally authored by Linaro
>> - * be found @
>> - *
>> - * http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/
>> - * files/head:/src/aarch64/
>> - */
>> -
>> -#include <linux/linkage.h>
>> -#include <asm/assembler.h>
>> -#include <asm/cache.h>
>> -
>> -/*
>> - * Move a buffer from src to test (alignment handled by the hardware).
>> - * If dest <= src, call memcpy, otherwise copy in reverse order.
>> - *
>> - * Parameters:
>> - * x0 - dest
>> - * x1 - src
>> - * x2 - n
>> - * Returns:
>> - * x0 - dest
>> - */
>> -dstin .req x0
>> -src .req x1
>> -count .req x2
>> -tmp1 .req x3
>> -tmp1w .req w3
>> -tmp2 .req x4
>> -tmp2w .req w4
>> -tmp3 .req x5
>> -tmp3w .req w5
>> -dst .req x6
>> -
>> -A_l .req x7
>> -A_h .req x8
>> -B_l .req x9
>> -B_h .req x10
>> -C_l .req x11
>> -C_h .req x12
>> -D_l .req x13
>> -D_h .req x14
>> -
>> -SYM_FUNC_START_ALIAS(__memmove)
>> -SYM_FUNC_START_WEAK_PI(memmove)
>> - cmp dstin, src
>> - b.lo __memcpy
>> - add tmp1, src, count
>> - cmp dstin, tmp1
>> - b.hs __memcpy /* No overlap. */
>> -
>> - add dst, dstin, count
>> - add src, src, count
>> - cmp count, #16
>> - b.lo .Ltail15 /*probably non-alignment accesses.*/
>> -
>> - ands tmp2, src, #15 /* Bytes to reach alignment. */
>> - b.eq .LSrcAligned
>> - sub count, count, tmp2
>> - /*
>> - * process the aligned offset length to make the src aligned firstly.
>> - * those extra instructions' cost is acceptable. It also make the
>> - * coming accesses are based on aligned address.
>> - */
>> - tbz tmp2, #0, 1f
>> - ldrb tmp1w, [src, #-1]!
>> - strb tmp1w, [dst, #-1]!
>> -1:
>> - tbz tmp2, #1, 2f
>> - ldrh tmp1w, [src, #-2]!
>> - strh tmp1w, [dst, #-2]!
>> -2:
>> - tbz tmp2, #2, 3f
>> - ldr tmp1w, [src, #-4]!
>> - str tmp1w, [dst, #-4]!
>> -3:
>> - tbz tmp2, #3, .LSrcAligned
>> - ldr tmp1, [src, #-8]!
>> - str tmp1, [dst, #-8]!
>> -
>> -.LSrcAligned:
>> - cmp count, #64
>> - b.ge .Lcpy_over64
>> -
>> - /*
>> - * Deal with small copies quickly by dropping straight into the
>> - * exit block.
>> - */
>> -.Ltail63:
>> - /*
>> - * Copy up to 48 bytes of data. At this point we only need the
>> - * bottom 6 bits of count to be accurate.
>> - */
>> - ands tmp1, count, #0x30
>> - b.eq .Ltail15
>> - cmp tmp1w, #0x20
>> - b.eq 1f
>> - b.lt 2f
>> - ldp A_l, A_h, [src, #-16]!
>> - stp A_l, A_h, [dst, #-16]!
>> -1:
>> - ldp A_l, A_h, [src, #-16]!
>> - stp A_l, A_h, [dst, #-16]!
>> -2:
>> - ldp A_l, A_h, [src, #-16]!
>> - stp A_l, A_h, [dst, #-16]!
>> -
>> -.Ltail15:
>> - tbz count, #3, 1f
>> - ldr tmp1, [src, #-8]!
>> - str tmp1, [dst, #-8]!
>> -1:
>> - tbz count, #2, 2f
>> - ldr tmp1w, [src, #-4]!
>> - str tmp1w, [dst, #-4]!
>> -2:
>> - tbz count, #1, 3f
>> - ldrh tmp1w, [src, #-2]!
>> - strh tmp1w, [dst, #-2]!
>> -3:
>> - tbz count, #0, .Lexitfunc
>> - ldrb tmp1w, [src, #-1]
>> - strb tmp1w, [dst, #-1]
>> -
>> -.Lexitfunc:
>> - ret
>> -
>> -.Lcpy_over64:
>> - subs count, count, #128
>> - b.ge .Lcpy_body_large
>> - /*
>> - * Less than 128 bytes to copy, so handle 64 bytes here and then jump
>> - * to the tail.
>> - */
>> - ldp A_l, A_h, [src, #-16]
>> - stp A_l, A_h, [dst, #-16]
>> - ldp B_l, B_h, [src, #-32]
>> - ldp C_l, C_h, [src, #-48]
>> - stp B_l, B_h, [dst, #-32]
>> - stp C_l, C_h, [dst, #-48]
>> - ldp D_l, D_h, [src, #-64]!
>> - stp D_l, D_h, [dst, #-64]!
>> -
>> - tst count, #0x3f
>> - b.ne .Ltail63
>> - ret
>> -
>> - /*
>> - * Critical loop. Start at a new cache line boundary. Assuming
>> - * 64 bytes per line this ensures the entire loop is in one line.
>> - */
>> - .p2align L1_CACHE_SHIFT
>> -.Lcpy_body_large:
>> - /* pre-load 64 bytes data. */
>> - ldp A_l, A_h, [src, #-16]
>> - ldp B_l, B_h, [src, #-32]
>> - ldp C_l, C_h, [src, #-48]
>> - ldp D_l, D_h, [src, #-64]!
>> -1:
>> - /*
>> - * interlace the load of next 64 bytes data block with store of the last
>> - * loaded 64 bytes data.
>> - */
>> - stp A_l, A_h, [dst, #-16]
>> - ldp A_l, A_h, [src, #-16]
>> - stp B_l, B_h, [dst, #-32]
>> - ldp B_l, B_h, [src, #-32]
>> - stp C_l, C_h, [dst, #-48]
>> - ldp C_l, C_h, [src, #-48]
>> - stp D_l, D_h, [dst, #-64]!
>> - ldp D_l, D_h, [src, #-64]!
>> - subs count, count, #64
>> - b.ge 1b
>> - stp A_l, A_h, [dst, #-16]
>> - stp B_l, B_h, [dst, #-32]
>> - stp C_l, C_h, [dst, #-48]
>> - stp D_l, D_h, [dst, #-64]!
>> -
>> - tst count, #0x3f
>> - b.ne .Ltail63
>> - ret
>> -SYM_FUNC_END_PI(memmove)
>> -EXPORT_SYMBOL(memmove)
>> -SYM_FUNC_END_ALIAS(__memmove)
>> -EXPORT_SYMBOL(__memmove)
>
> Best regards
>