[PATCH v4 5/5] RISC-V: add zbb support to string functions
Christoph Müllner
christoph.muellner at vrull.eu
Wed Jan 11 06:27:28 PST 2023
On Wed, Jan 11, 2023 at 1:24 PM Andrew Jones <ajones at ventanamicro.com> wrote:
>
> On Mon, Jan 09, 2023 at 07:17:55PM +0100, Heiko Stuebner wrote:
> > From: Heiko Stuebner <heiko.stuebner at vrull.eu>
> >
> > Add handling for ZBB extension and add support for using it as a
> > variant for optimized string functions.
> >
> > Support for the Zbb-str-variants is limited to the GNU-assembler
> > for now, as LLVM has not yet acquired the functionality to
> > selectively change the arch option in assembler code.
> > This is still under review at
> > https://reviews.llvm.org/D123515
> >
> > Co-developed-by: Christoph Muellner <christoph.muellner at vrull.eu>
> > Signed-off-by: Christoph Muellner <christoph.muellner at vrull.eu>
> > Signed-off-by: Heiko Stuebner <heiko.stuebner at vrull.eu>
> > ---
> > arch/riscv/Kconfig | 24 ++++++
> > arch/riscv/include/asm/errata_list.h | 3 +-
> > arch/riscv/include/asm/hwcap.h | 1 +
> > arch/riscv/include/asm/string.h | 2 +
> > arch/riscv/kernel/cpu.c | 1 +
> > arch/riscv/kernel/cpufeature.c | 18 +++++
> > arch/riscv/lib/strcmp.S | 94 ++++++++++++++++++++++
> > arch/riscv/lib/strlen.S | 114 +++++++++++++++++++++++++++
> > arch/riscv/lib/strncmp.S | 111 ++++++++++++++++++++++++++
> > 9 files changed, 367 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index e2b656043abf..7c814fbf9527 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -416,6 +416,30 @@ config RISCV_ISA_SVPBMT
> >
> > If you don't know what to do here, say Y.
> >
> > +config TOOLCHAIN_HAS_ZBB
> > + bool
> > + default y
> > + depends on !64BIT || $(cc-option,-mabi=lp64 -march=rv64ima_zbb)
> > + depends on !32BIT || $(cc-option,-mabi=ilp32 -march=rv32ima_zbb)
> > + depends on LLD_VERSION >= 150000 || LD_VERSION >= 23900
> > + depends on AS_IS_GNU
> > +
> > +config RISCV_ISA_ZBB
> > + bool "Zbb extension support for bit manipulation instructions"
> > + depends on TOOLCHAIN_HAS_ZBB
> > + depends on !XIP_KERNEL && MMU
> > + select RISCV_ALTERNATIVE
> > + default y
> > + help
> > + Adds support to dynamically detect the presence of the ZBB
> > + extension (basic bit manipulation) and enable its usage.
> > +
> > + The Zbb extension provides instructions to accelerate a number
> > + of bit-specific operations (count bit population, sign extending,
> > + bitrotation, etc).
> > +
> > + If you don't know what to do here, say Y.
> > +
> > config TOOLCHAIN_HAS_ZICBOM
> > bool
> > default y
> > diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h
> > index 4180312d2a70..95e626b7281e 100644
> > --- a/arch/riscv/include/asm/errata_list.h
> > +++ b/arch/riscv/include/asm/errata_list.h
> > @@ -24,7 +24,8 @@
> >
> > #define CPUFEATURE_SVPBMT 0
> > #define CPUFEATURE_ZICBOM 1
> > -#define CPUFEATURE_NUMBER 2
> > +#define CPUFEATURE_ZBB 2
> > +#define CPUFEATURE_NUMBER 3
> >
> > #ifdef __ASSEMBLY__
> >
> > diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> > index 86328e3acb02..b727491fb100 100644
> > --- a/arch/riscv/include/asm/hwcap.h
> > +++ b/arch/riscv/include/asm/hwcap.h
> > @@ -59,6 +59,7 @@ enum riscv_isa_ext_id {
> > RISCV_ISA_EXT_ZIHINTPAUSE,
> > RISCV_ISA_EXT_SSTC,
> > RISCV_ISA_EXT_SVINVAL,
> > + RISCV_ISA_EXT_ZBB,
> > RISCV_ISA_EXT_ID_MAX
> > };
> > static_assert(RISCV_ISA_EXT_ID_MAX <= RISCV_ISA_EXT_MAX);
> > diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h
> > index a96b1fea24fe..17dfc4ab4c80 100644
> > --- a/arch/riscv/include/asm/string.h
> > +++ b/arch/riscv/include/asm/string.h
> > @@ -6,6 +6,8 @@
> > #ifndef _ASM_RISCV_STRING_H
> > #define _ASM_RISCV_STRING_H
> >
> > +#include <asm/alternative-macros.h>
> > +#include <asm/errata_list.h>
> > #include <linux/types.h>
> > #include <linux/linkage.h>
> >
> > diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c
> > index 1b9a5a66e55a..c4d1aa166f8b 100644
> > --- a/arch/riscv/kernel/cpu.c
> > +++ b/arch/riscv/kernel/cpu.c
> > @@ -162,6 +162,7 @@ arch_initcall(riscv_cpuinfo_init);
> > * extensions by an underscore.
> > */
> > static struct riscv_isa_ext_data isa_ext_arr[] = {
> > + __RISCV_ISA_EXT_DATA(zbb, RISCV_ISA_EXT_ZBB),
> > __RISCV_ISA_EXT_DATA(sscofpmf, RISCV_ISA_EXT_SSCOFPMF),
> > __RISCV_ISA_EXT_DATA(sstc, RISCV_ISA_EXT_SSTC),
> > __RISCV_ISA_EXT_DATA(svinval, RISCV_ISA_EXT_SVINVAL),
> > diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> > index 205bbd6b1fce..bf3a791d7110 100644
> > --- a/arch/riscv/kernel/cpufeature.c
> > +++ b/arch/riscv/kernel/cpufeature.c
> > @@ -222,6 +222,7 @@ void __init riscv_fill_hwcap(void)
> > set_bit(nr, this_isa);
> > }
> > } else {
> > + SET_ISA_EXT_MAP("zbb", RISCV_ISA_EXT_ZBB);
> > SET_ISA_EXT_MAP("sscofpmf", RISCV_ISA_EXT_SSCOFPMF);
> > SET_ISA_EXT_MAP("svpbmt", RISCV_ISA_EXT_SVPBMT);
> > SET_ISA_EXT_MAP("zicbom", RISCV_ISA_EXT_ZICBOM);
> > @@ -301,6 +302,20 @@ static bool __init_or_module cpufeature_probe_zicbom(unsigned int stage)
> > return true;
> > }
> >
> > +static bool __init_or_module cpufeature_probe_zbb(unsigned int stage)
> > +{
> > + if (!IS_ENABLED(CONFIG_RISCV_ISA_ZBB))
> > + return false;
> > +
> > + if (stage == RISCV_ALTERNATIVES_EARLY_BOOT)
> > + return false;
> > +
> > + if (!riscv_isa_extension_available(NULL, ZBB))
> > + return false;
> > +
> > + return true;
> > +}
> > +
> > /*
> > * Probe presence of individual extensions.
> > *
> > @@ -318,6 +333,9 @@ static u32 __init_or_module cpufeature_probe(unsigned int stage)
> > if (cpufeature_probe_zicbom(stage))
> > cpu_req_feature |= BIT(CPUFEATURE_ZICBOM);
> >
> > + if (cpufeature_probe_zbb(stage))
> > + cpu_req_feature |= BIT(CPUFEATURE_ZBB);
> > +
> > return cpu_req_feature;
> > }
> >
> > diff --git a/arch/riscv/lib/strcmp.S b/arch/riscv/lib/strcmp.S
> > index 94440fb8390c..5428a8f2eb84 100644
> > --- a/arch/riscv/lib/strcmp.S
> > +++ b/arch/riscv/lib/strcmp.S
> > @@ -3,9 +3,14 @@
> > #include <linux/linkage.h>
> > #include <asm/asm.h>
> > #include <asm-generic/export.h>
> > +#include <asm/alternative-macros.h>
> > +#include <asm/errata_list.h>
> >
> > /* int strcmp(const char *cs, const char *ct) */
> > SYM_FUNC_START(strcmp)
> > +
> > + ALTERNATIVE("nop", "j variant_zbb", 0, CPUFEATURE_ZBB, CONFIG_RISCV_ISA_ZBB)
>
> How about strcmp_zbb and similar for the other functions for the labels?
Fully agree!
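So the entry point would become something like:

    ALTERNATIVE("nop", "j strcmp_zbb", 0, CPUFEATURE_ZBB, CONFIG_RISCV_ISA_ZBB)
    [...]
strcmp_zbb:

...and strlen_zbb / strncmp_zbb accordingly in the other two files.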
>
> > +
> > /*
> > * Returns
> > * a0 - comparison result, value like strcmp
> > @@ -34,4 +39,93 @@ SYM_FUNC_START(strcmp)
> > bnez t1, 1b
> > li a0, 0
> > j 2b
> > +
> > +/*
> > + * Variant of strcmp using the ZBB extension if available
> > + */
> > +#ifdef CONFIG_RISCV_ISA_ZBB
> > +variant_zbb:
> > +#define src1 a0
> > +#define result a0
> > +#define src2 t5
> > +#define data1 t0
> > +#define data2 t1
> > +#define align t2
> > +#define data1_orcb t3
> > +#define m1 t4
>
> nit: It's probably just me, but I'm not a big fan of creating defines
> for registers, particularly when the meaning of the registers' contents
> gets changed, because then the name, at least temporarily, no longer
> accurately describes the content.
Without any strong feelings about this: I think this is common
practice when writing asm code, as it helps with both writing and
understanding the code.
Some examples from other projects where this is done:
https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/aarch64/strcmp.S;h=054ce15ef8273073de4c821e3f415f1f70541554;hb=HEAD#l35
https://github.com/openssl/openssl/blob/master/crypto/aes/asm/aes-x86_64.pl#L58
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/lib/memcmp.S?h=v6.2-rc3#n20
> And, in this case, it looks like the code matches Appendix A, except the
> register and label naming. I'd prefer to have those match too so I could
> "review" with the diff tool.
Good that you did not "review" the code with diff.
If you had, you would not have found the LGPTR issue (below).
>
>
> > +
> > +.option push
> > +.option arch,+zbb
> > +
> > + /*
> > + * Returns
> > + * a0 - comparison result, value like strcmp
> > + *
> > + * Parameters
> > + * a0 - string1
> > + * a1 - string2
> > + *
> > + * Clobbers
> > + * t0, t1, t2, t3, t4, t5
> > + */
> > + mv src2, a1
> > +
> > + or align, src1, src2
> > + li m1, -1
> > + and align, align, SZREG-1
> > + bnez align, 3f
> > +
> > + /* Main loop for aligned string. */
> > + .p2align 3
>
> Why are the starts of the loops aligned to 8 byte boundaries?
Loop alignment seems to be a valid optimization method on other
architectures, and I don't see anything specific to RISC-V that would
make it irrelevant.
AArch64 code aligns loops (and other jump targets) to 8 or 16 bytes.
So this was considered not harmful, but potentially beneficial.
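For reference, the directive in question pads the instruction stream
with nops, so the loop entry lands on an 8-byte boundary:

    .p2align 3    /* align next insn to a 2^3 = 8 byte boundary */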
>
>
> > +1:
> > + REG_L data1, 0(src1)
> > + REG_L data2, 0(src2)
> > + orc.b data1_orcb, data1
> > + bne data1_orcb, m1, 2f
> > + addi src1, src1, SZREG
> > + addi src2, src2, SZREG
> > + beq data1, data2, 1b
> > +
> > + /*
> > + * Words don't match, and no null byte in the first
> > + * word. Get bytes in big-endian order and compare.
> > + */
> > +#ifndef CONFIG_CPU_BIG_ENDIAN
> > + rev8 data1, data1
> > + rev8 data2, data2
> > +#endif
> > +
> > + /* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence. */
> > + sltu result, data1, data2
> > + neg result, result
> > + ori result, result, 1
> > + ret
> > +
> > +2:
> > + /*
> > + * Found a null byte.
> > + * If words don't match, fall back to simple loop.
> > + */
> > + bne data1, data2, 3f
> > +
> > + /* Otherwise, strings are equal. */
> > + li result, 0
> > + ret
> > +
> > + /* Simple loop for misaligned strings. */
> > + .p2align 3
> > +3:
> > + lbu data1, 0(src1)
> > + lbu data2, 0(src2)
> > + addi src1, src1, 1
> > + addi src2, src2, 1
> > + bne data1, data2, 4f
> > + bnez data1, 3b
> > +
> > +4:
> > + sub result, data1, data2
>
> The non-optimized version returns explicitly -1, 0, 1, whereas this
> returns < 0, 0, > 0. Assuming we don't need the explicit -1 / 1 then we
> can change the non-optimized version to do this subtraction for its
> return value too, saving several instructions.
Regarding "we don't need":
"The strcmp() and strncmp() functions return an integer less than, equal
to, or greater than zero if s1 (or the first n bytes thereof) is found,
respectively, to be less than, to match, or be greater than s2."
So, yes, the values -1/+1 are just two of the allowed return values.
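If we go down that route, the tail of the non-optimized version could
roughly look like this (untested sketch, not part of this patch):

1:
    lbu   t0, 0(a0)
    lbu   t1, 0(a1)
    addi  a0, a0, 1
    addi  a1, a1, 1
    bne   t0, t1, 2f
    bnez  t0, 1b
2:
    /* lbu zero-extends, so this yields < 0, 0 or > 0 as required */
    sub   a0, t0, t1
    ret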
>
>
> > + ret
> > +
> > +.option pop
> > +#endif
> > SYM_FUNC_END(strcmp)
> > diff --git a/arch/riscv/lib/strlen.S b/arch/riscv/lib/strlen.S
> > index 09a7aaff26c8..738efb04307d 100644
> > --- a/arch/riscv/lib/strlen.S
> > +++ b/arch/riscv/lib/strlen.S
> > @@ -3,9 +3,14 @@
> > #include <linux/linkage.h>
> > #include <asm/asm.h>
> > #include <asm-generic/export.h>
> > +#include <asm/alternative-macros.h>
> > +#include <asm/errata_list.h>
> >
> > /* int strlen(const char *s) */
> > SYM_FUNC_START(strlen)
> > +
> > + ALTERNATIVE("nop", "j variant_zbb", 0, CPUFEATURE_ZBB, CONFIG_RISCV_ISA_ZBB)
> > +
> > /*
> > * Returns
> > * a0 - string length
> > @@ -25,4 +30,113 @@ SYM_FUNC_START(strlen)
> > 2:
> > addi t1, t1, 1
> > j 1b
> > +
> > +/*
> > + * Variant of strlen using the ZBB extension if available
> > + */
> > +#ifdef CONFIG_RISCV_ISA_ZBB
> > +variant_zbb:
> > +
> > +#define src a0
> > +#define result a0
> > +#define addr t0
> > +#define data t1
> > +#define offset t2
> > +#define offset_bits t2
> > +#define valid_bytes t3
> > +#define m1 t3
> > +
> > +#ifdef CONFIG_CPU_BIG_ENDIAN
> > +# define CZ clz
> > +# define SHIFT sll
> > +#else
> > +# define CZ ctz
> > +# define SHIFT srl
> > +#endif
> > +
> > +.option push
> > +.option arch,+zbb
> > +
> > + /*
> > + * Returns
> > + * a0 - string length
> > + *
> > + * Parameters
> > + * a0 - String to measure
> > + *
> > + * Clobbers
> > + * t0, t1, t2, t3
> > + */
> > +
> > + /* Number of irrelevant bytes in the first word. */
>
> I see that this comment doesn't just apply to the next instruction,
> but to several instructions. It threw me off at first. Again, I
> think I'd prefer we just copy+paste the Appendix, comments and all.
I think this comment only applies to this line.
The loop processes word-aligned chunks and only the first chunk can
be misaligned.
The andi instruction calculates the number of irrelevant bytes in this
first word.
The Appendix just states "// offset" here, so I think this comment is better.
But I don't mind if people want to stick with the comments from the Appendix.
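For example, on rv64 (SZREG == 8) with a hypothetical src of 0x1005:

    andi  offset, src, SZREG-1    /* offset = 0x1005 & 7 = 5 */

i.e. the aligned first word contains 5 irrelevant bytes before the
actual string data.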
>
>
> > + andi offset, src, SZREG-1
> > +
> > + /* Align pointer. */
> > + andi addr, src, -SZREG
> > +
> > + li valid_bytes, SZREG
> > + sub valid_bytes, valid_bytes, offset
> > + slli offset_bits, offset, RISCV_LGPTR
>
> The shift immediate should be 3, even for rv32, since we want
> bits-per-byte, not bytes-per-word. The spec example uses PTRLOG, which
> does imply bytes-per-word to me as well, so that seems like a spec bug.
Correct, that's an issue in the Appendix.
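So, unless I'm missing something, the fix is simply:

    slli  offset_bits, offset, 3    /* bytes -> bits: multiply by 8 */

...which is independent of XLEN, as a byte is 8 bits on rv32 and rv64 alike.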
>
>
> > +
> > + /* Get the first word. */
> > + REG_L data, 0(addr)
> > +
> > + /*
> > + * Shift away the partial data we loaded to remove the irrelevant bytes
> > + * preceding the string with the effect of adding NUL bytes at the
> > + * end of the string.
>
> the end of the string's first word.
>
> > + */
> > + SHIFT data, data, offset_bits
> > +
> > + /* Convert non-NUL into 0xff and NUL into 0x00. */
> > + orc.b data, data
> > +
> > + /* Convert non-NUL into 0x00 and NUL into 0xff. */
> > + not data, data
> > +
> > + /*
> > + * Search for the first set bit (corresponding to a NUL byte in the
> > + * original chunk).
> > + */
> > + CZ data, data
> > +
> > + /*
> > + * The first chunk is special: commpare against the number
>
> compare
>
> > + * of valid bytes in this chunk.
> > + */
> > + srli result, data, 3
> > + bgtu valid_bytes, result, 3f
> > +
> > + /* Prepare for the word comparison loop. */
> > + addi offset, addr, SZREG
> > + li m1, -1
> > +
> > + /*
> > + * Our critical loop is 4 instructions and processes data in
> > + * 4 byte or 8 byte chunks.
> > + */
> > + .p2align 3
> > +1:
> > + REG_L data, SZREG(addr)
> > + addi addr, addr, SZREG
> > + orc.b data, data
> > + beq data, m1, 1b
> > +2:
> > + not data, data
> > + CZ data, data
> > +
> > + /* Get number of processed words. */
> > + sub offset, addr, offset
> > +
> > + /* Add number of characters in the first word. */
> > + add result, result, offset
> > + srli data, data, 3
> > +
> > + /* Add number of characters in the last word. */
> > + add result, result, data
> > +3:
> > + ret
> > +
> > +.option pop
> > +#endif
> > SYM_FUNC_END(strlen)
> > diff --git a/arch/riscv/lib/strncmp.S b/arch/riscv/lib/strncmp.S
> > index 493ab6febcb2..851428b439dc 100644
> > --- a/arch/riscv/lib/strncmp.S
> > +++ b/arch/riscv/lib/strncmp.S
> > @@ -3,9 +3,14 @@
> > #include <linux/linkage.h>
> > #include <asm/asm.h>
> > #include <asm-generic/export.h>
> > +#include <asm/alternative-macros.h>
> > +#include <asm/errata_list.h>
> >
> > /* int strncmp(const char *cs, const char *ct, size_t count) */
> > SYM_FUNC_START(strncmp)
> > +
> > + ALTERNATIVE("nop", "j variant_zbb", 0, CPUFEATURE_ZBB, CONFIG_RISCV_ISA_ZBB)
> > +
> > /*
> > * Returns
> > * a0 - comparison result, value like strncmp
> > @@ -37,4 +42,110 @@ SYM_FUNC_START(strncmp)
> > 4:
> > li a0, 0
> > j 2b
> > +
> > +/*
> > + * Variant of strncmp using the ZBB extension if available
> > + */
> > +#ifdef CONFIG_RISCV_ISA_ZBB
> > +variant_zbb:
> > +
> > +#define src1 a0
> > +#define result a0
> > +#define src2 t6
> > +#define len a2
> > +#define data1 t0
> > +#define data2 t1
> > +#define align t2
> > +#define data1_orcb t3
> > +#define limit t4
> > +#define m1 t5
> > +
> > +.option push
> > +.option arch,+zbb
> > +
> > + /*
> > + * Returns
> > + * a0 - comparison result, like strncmp
> > + *
> > + * Parameters
> > + * a0 - string1
> > + * a1 - string2
> > + * a2 - number of characters to compare
> > + *
> > + * Clobbers
> > + * t0, t1, t2, t3, t4, t5, t6
> > + */
> > + mv src2, a1
Why is this needed (i.e. why is src2 not a1, but t6)?
> > +
> > + or align, src1, src2
> > + li m1, -1
> > + and align, align, SZREG-1
> > + add limit, src1, len
> > + bnez align, 4f
> > +
> > + /* Adjust limit for fast-path. */
> > + addi limit, limit, -SZREG
>
> To avoid unnecessarily comparing the last SZREG bytes one at a time when
> 'len' is SZREG aligned, we should be using the explicitly aligned limit
> here (limit & ~(SZREG - 1)) rather than just subtracting SZREG. Then, down
> in the slow-path we use the original limit to finish off if len wasn't
> aligned. When it was aligned, we'll just immediately branch to the ret.
> (I suspect many times 'len' will be SZREG aligned, since string buffers
> are typically allocated with power-of-two sizes).
Yes, that's a good idea!
So I propose to keep the (original) limit and introduce a new register here:
    andi  fast_limit, limit, -SZREG    // ~(SZREG - 1) == -SZREG
...use that as the loop bound in the fast-path:
    bge   src1, fast_limit, 3f
...and remove the limit restore ("addi limit, limit, SZREG").
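Putting it together, roughly (untested sketch; "fast_limit" is a new
name and could e.g. reuse t2/align, which is dead after the alignment
check):

    add   limit, src1, len
    bnez  align, 3f

    /* Fast-path limit, rounded down to SZREG. */
    andi  fast_limit, limit, -SZREG

    /* Main loop for aligned string. */
    .p2align 3
1:
    bge   src1, fast_limit, 3f
    [...]

    /* Simple loop for misaligned strings / remaining bytes. */
    .p2align 3
3:
    bge   src1, limit, 6f
    [...]

When len is a multiple of SZREG, fast_limit == limit, so the word loop
consumes all n bytes and the byte loop immediately branches to the
final ret.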
>
> > +
> > + /* Main loop for aligned string. */
> > + .p2align 3
> > +1:
> > + bgt src1, limit, 3f
> > + REG_L data1, 0(src1)
> > + REG_L data2, 0(src2)
> > + orc.b data1_orcb, data1
> > + bne data1_orcb, m1, 2f
> > + addi src1, src1, SZREG
> > + addi src2, src2, SZREG
> > + beq data1, data2, 1b
> > +
> > + /*
> > + * Words don't match, and no null byte in the first
> > + * word. Get bytes in big-endian order and compare.
> > + */
> > +#ifndef CONFIG_CPU_BIG_ENDIAN
> > + rev8 data1, data1
> > + rev8 data2, data2
> > +#endif
> > +
> > + /* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence. */
> > + sltu result, data1, data2
> > + neg result, result
> > + ori result, result, 1
> > + ret
> > +
> > +2:
> > + /*
> > + * Found a null byte.
> > + * If words don't match, fall back to simple loop.
> > + */
> > + bne data1, data2, 3f
> > +
> > + /* Otherwise, strings are equal. */
> > + li result, 0
> > + ret
> > +
> > + /* Simple loop for misaligned strings. */
> > +3:
> > + /* Restore limit for slow-path. */
> > + addi limit, limit, SZREG
> > + .p2align 3
> > +4:
> > + bge src1, limit, 6f
> > + lbu data1, 0(src1)
> > + lbu data2, 0(src2)
> > + addi src1, src1, 1
> > + addi src2, src2, 1
> > + bne data1, data2, 5f
> > + bnez data1, 4b
> > +
> > +5:
> > + sub result, data1, data2
> > + ret
> > +
> > +6:
> > + li result, 0
> > + ret
>
> As strcmp and strncmp have so much in common it's tempting to try and
> merge them somehow. Maybe with a handful of macros shared between them?
>
> > +
> > +.option pop
> > +#endif
> > SYM_FUNC_END(strncmp)
> > --
> > 2.35.1
> >
>
> Thanks,
> drew