[PATCH v2] riscv: lib: optimize memcmp with ld insn
Yipeng Zou
zouyipeng at huawei.com
Mon Sep 5 19:15:12 PDT 2022
在 2022/9/5 19:49, Andrew Jones 写道:
> On Fri, Sep 02, 2022 at 07:00:39PM +0800, Yipeng Zou wrote:
>> Currently memcmp is implemented in C code (lib/string.c), which compares
>> memory byte by byte.
>>
>> This patch uses the ld instruction to compare memory word by word, which
>> improves performance. The test results below show a speedup of several times.
>>
>> Alloc 8,4,1KB buffer to compare, each loop 10k times:
>>
>> Size(B) Min(ns) AVG(ns) //before
>>
>> 8k 40800 46316
>> 4k 26500 32302
>> 1k 15600 17965
>>
>> Size(B) Min(ns) AVG(ns) //after
>>
>> 8k 16100 21281
>> 4k 14200 16446
>> 1k 12400 14316
>>
>> Signed-off-by: Yipeng Zou <zouyipeng at huawei.com>
>> Reviewed-by: Conor Dooley <conor.dooley at microchip.com>
>> ---
>> V2: Patch test data into the commit message,and collect Reviewed-by Tags.
>>
>> arch/riscv/include/asm/string.h | 3 ++
>> arch/riscv/lib/Makefile | 1 +
>> arch/riscv/lib/memcmp.S | 59 +++++++++++++++++++++++++++++++++
>> 3 files changed, 63 insertions(+)
>> create mode 100644 arch/riscv/lib/memcmp.S
>>
>> diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h
>> index 909049366555..3337b43d3803 100644
>> --- a/arch/riscv/include/asm/string.h
>> +++ b/arch/riscv/include/asm/string.h
>> @@ -18,6 +18,9 @@ extern asmlinkage void *__memcpy(void *, const void *, size_t);
>> #define __HAVE_ARCH_MEMMOVE
>> extern asmlinkage void *memmove(void *, const void *, size_t);
>> extern asmlinkage void *__memmove(void *, const void *, size_t);
>> +#define __HAVE_ARCH_MEMCMP
>> +extern int memcmp(const void *, const void *, size_t);
>> +
>> /* For those files which don't want to check by kasan. */
>> #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
>> #define memcpy(dst, src, len) __memcpy(dst, src, len)
>> diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
>> index 25d5c9664e57..70773bf0c471 100644
>> --- a/arch/riscv/lib/Makefile
>> +++ b/arch/riscv/lib/Makefile
>> @@ -3,6 +3,7 @@ lib-y += delay.o
>> lib-y += memcpy.o
>> lib-y += memset.o
>> lib-y += memmove.o
>> +lib-y += memcmp.o
>> lib-$(CONFIG_MMU) += uaccess.o
>> lib-$(CONFIG_64BIT) += tishift.o
>>
>> diff --git a/arch/riscv/lib/memcmp.S b/arch/riscv/lib/memcmp.S
>> new file mode 100644
>> index 000000000000..83af1c433e6f
>> --- /dev/null
>> +++ b/arch/riscv/lib/memcmp.S
>> @@ -0,0 +1,59 @@
>> +/* SPDX-License-Identifier: GPL-2.0-only */
>> +/*
>> + * Copyright (C) 2022 zouyipeng at huawei.com
>> + */
>> +#include <linux/linkage.h>
>> +#include <asm-generic/export.h>
>> +#include <asm/asm.h>
>> +
>> +/* argrments:
> arguments
>
> and please use an opening wing (/*)
ok,
>
>
>> +* a0: addr0
>> +* a1: addr1
>> +* a2: size
>> +*/
>> +#define addr0 a0
>> +#define addr1 a1
>> +#define limit a2
>> +
>> +#define data0 a3
>> +#define data1 a4
>> +#define tmp t3
>> +#define aaddr t4
>> +#define return a0
> I don't see much value in these defines. Why is addr0 better than a0?
> data0 and data1 are just temp registers. Defining 'return' seems really
> unnecessary. And, while a define for "the word end address" does make
> some sense, I can't figure out what the first 'a' "aaddr" is supposed to
> mean such that it conveys "the word end address". In general, if we're
> going to use defines let's not make them as cryptic or even more cryptic
> than the assembler itself :-)
The naming of the registers does need improvement.
I will make it more reasonable in the next version.
>> +
>> +/* load and compare */
>> +.macro LD_CMP op d0 d1 a0 a1 offset
>> + \op \d0, 0(\a0)
>> + \op \d1, 0(\a1)
>> + addi \a0, \a0, \offset
>> + addi \a1, \a1, \offset
>> + sub tmp, \d0, \d1
> I'd prefer not to have a side-effect on tmp. Why not take this output
> register as another input to the macro?
Oh, this definitely needs to be modified here.
>> +.endm
>> +
>> +ENTRY(memcmp)
>> + /* test limit aligend with SZREG */
> is aligned
ok,
>> + andi tmp, limit, SZREG - 1
>> + /* load tail */
>> + add aaddr, addr0, limit
>> + sub aaddr, aaddr, tmp
>> + add limit, addr0, limit
>> +
>> +.LloopWord:
>> + sltu tmp, addr0, aaddr
>> + beqz tmp, .LloopByte
>> +
>> + LD_CMP REG_L data0 data1 addr0 addr1 SZREG
>> + beqz tmp, .LloopWord
>> + j .Lreturn
>> +
>> +.LloopByte:
>> + sltu tmp, addr0, limit
>> + beqz tmp, .Lreturn
>> +
>> + LD_CMP lbu data0 data1 addr0 addr1 1
>> + beqz tmp, .LloopByte
>> +.Lreturn:
>> + mv return, tmp
>> + ret
>> +END(memcmp)
>> +EXPORT_SYMBOL(memcmp);
>> --
>> 2.17.1
>>
> I'd prefer an assembly format where the operands are lined up
>
> op r1, r2
> op r1, r2
>
> Unfortunately I don't see much consistency in how to do this among other
> riscv .S files, though, so I'm not sure what to recommend if anything.
I agree with that; maybe we should get started on this patch.
For all of the above, thanks for the review.
I will send the next version as soon as possible.
>
> Thanks,
> drew
--
Regards,
Yipeng Zou
More information about the linux-riscv
mailing list