[PATCH v4 5/5] RISC-V: add zbb support to string functions
Christoph Müllner
christoph.muellner at vrull.eu
Wed Jan 11 06:27:28 PST 2023
On Wed, Jan 11, 2023 at 1:24 PM Andrew Jones <ajones at ventanamicro.com> wrote:
>
> On Mon, Jan 09, 2023 at 07:17:55PM +0100, Heiko Stuebner wrote:
> > From: Heiko Stuebner <heiko.stuebner at vrull.eu>
> >
> > Add handling for ZBB extension and add support for using it as a
> > variant for optimized string functions.
> >
> > Support for the Zbb-str-variants is limited to the GNU-assembler
> > for now, as LLVM has not yet acquired the functionality to
> > selectively change the arch option in assembler code.
> > This is still under review at
> > https://reviews.llvm.org/D123515
> >
> > Co-developed-by: Christoph Muellner <christoph.muellner at vrull.eu>
> > Signed-off-by: Christoph Muellner <christoph.muellner at vrull.eu>
> > Signed-off-by: Heiko Stuebner <heiko.stuebner at vrull.eu>
> > ---
> > arch/riscv/Kconfig | 24 ++++++
> > arch/riscv/include/asm/errata_list.h | 3 +-
> > arch/riscv/include/asm/hwcap.h | 1 +
> > arch/riscv/include/asm/string.h | 2 +
> > arch/riscv/kernel/cpu.c | 1 +
> > arch/riscv/kernel/cpufeature.c | 18 +++++
> > arch/riscv/lib/strcmp.S | 94 ++++++++++++++++++++++
> > arch/riscv/lib/strlen.S | 114 +++++++++++++++++++++++++++
> > arch/riscv/lib/strncmp.S | 111 ++++++++++++++++++++++++++
> > 9 files changed, 367 insertions(+), 1 deletion(-)
> >
> > diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> > index e2b656043abf..7c814fbf9527 100644
> > --- a/arch/riscv/Kconfig
> > +++ b/arch/riscv/Kconfig
> > @@ -416,6 +416,30 @@ config RISCV_ISA_SVPBMT
> >
> > If you don't know what to do here, say Y.
> >
> > +config TOOLCHAIN_HAS_ZBB
> > + bool
> > + default y
> > + depends on !64BIT || $(cc-option,-mabi=lp64 -march=rv64ima_zbb)
> > + depends on !32BIT || $(cc-option,-mabi=ilp32 -march=rv32ima_zbb)
> > + depends on LLD_VERSION >= 150000 || LD_VERSION >= 23900
> > + depends on AS_IS_GNU
> > +
> > +config RISCV_ISA_ZBB
> > + bool "Zbb extension support for bit manipulation instructions"
> > + depends on TOOLCHAIN_HAS_ZBB
> > + depends on !XIP_KERNEL && MMU
> > + select RISCV_ALTERNATIVE
> > + default y
> > + help
> > + Adds support to dynamically detect the presence of the ZBB
> > + extension (basic bit manipulation) and enable its usage.
> > +
> > + The Zbb extension provides instructions to accelerate a number
> > + of bit-specific operations (count bit population, sign extending,
> > + bitrotation, etc).
> > +
> > + If you don't know what to do here, say Y.
> > +
> > config TOOLCHAIN_HAS_ZICBOM
> > bool
> > default y
> > diff --git a/arch/riscv/include/asm/errata_list.h b/arch/riscv/include/asm/errata_list.h
> > index 4180312d2a70..95e626b7281e 100644
> > --- a/arch/riscv/include/asm/errata_list.h
> > +++ b/arch/riscv/include/asm/errata_list.h
> > @@ -24,7 +24,8 @@
> >
> > #define CPUFEATURE_SVPBMT 0
> > #define CPUFEATURE_ZICBOM 1
> > -#define CPUFEATURE_NUMBER 2
> > +#define CPUFEATURE_ZBB 2
> > +#define CPUFEATURE_NUMBER 3
> >
> > #ifdef __ASSEMBLY__
> >
> > diff --git a/arch/riscv/include/asm/hwcap.h b/arch/riscv/include/asm/hwcap.h
> > index 86328e3acb02..b727491fb100 100644
> > --- a/arch/riscv/include/asm/hwcap.h
> > +++ b/arch/riscv/include/asm/hwcap.h
> > @@ -59,6 +59,7 @@ enum riscv_isa_ext_id {
> > RISCV_ISA_EXT_ZIHINTPAUSE,
> > RISCV_ISA_EXT_SSTC,
> > RISCV_ISA_EXT_SVINVAL,
> > + RISCV_ISA_EXT_ZBB,
> > RISCV_ISA_EXT_ID_MAX
> > };
> > static_assert(RISCV_ISA_EXT_ID_MAX <= RISCV_ISA_EXT_MAX);
> > diff --git a/arch/riscv/include/asm/string.h b/arch/riscv/include/asm/string.h
> > index a96b1fea24fe..17dfc4ab4c80 100644
> > --- a/arch/riscv/include/asm/string.h
> > +++ b/arch/riscv/include/asm/string.h
> > @@ -6,6 +6,8 @@
> > #ifndef _ASM_RISCV_STRING_H
> > #define _ASM_RISCV_STRING_H
> >
> > +#include <asm/alternative-macros.h>
> > +#include <asm/errata_list.h>
> > #include <linux/types.h>
> > #include <linux/linkage.h>
> >
> > diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c
> > index 1b9a5a66e55a..c4d1aa166f8b 100644
> > --- a/arch/riscv/kernel/cpu.c
> > +++ b/arch/riscv/kernel/cpu.c
> > @@ -162,6 +162,7 @@ arch_initcall(riscv_cpuinfo_init);
> > * extensions by an underscore.
> > */
> > static struct riscv_isa_ext_data isa_ext_arr[] = {
> > + __RISCV_ISA_EXT_DATA(zbb, RISCV_ISA_EXT_ZBB),
> > __RISCV_ISA_EXT_DATA(sscofpmf, RISCV_ISA_EXT_SSCOFPMF),
> > __RISCV_ISA_EXT_DATA(sstc, RISCV_ISA_EXT_SSTC),
> > __RISCV_ISA_EXT_DATA(svinval, RISCV_ISA_EXT_SVINVAL),
> > diff --git a/arch/riscv/kernel/cpufeature.c b/arch/riscv/kernel/cpufeature.c
> > index 205bbd6b1fce..bf3a791d7110 100644
> > --- a/arch/riscv/kernel/cpufeature.c
> > +++ b/arch/riscv/kernel/cpufeature.c
> > @@ -222,6 +222,7 @@ void __init riscv_fill_hwcap(void)
> > set_bit(nr, this_isa);
> > }
> > } else {
> > + SET_ISA_EXT_MAP("zbb", RISCV_ISA_EXT_ZBB);
> > SET_ISA_EXT_MAP("sscofpmf", RISCV_ISA_EXT_SSCOFPMF);
> > SET_ISA_EXT_MAP("svpbmt", RISCV_ISA_EXT_SVPBMT);
> > SET_ISA_EXT_MAP("zicbom", RISCV_ISA_EXT_ZICBOM);
> > @@ -301,6 +302,20 @@ static bool __init_or_module cpufeature_probe_zicbom(unsigned int stage)
> > return true;
> > }
> >
> > +static bool __init_or_module cpufeature_probe_zbb(unsigned int stage)
> > +{
> > + if (!IS_ENABLED(CONFIG_RISCV_ISA_ZBB))
> > + return false;
> > +
> > + if (stage == RISCV_ALTERNATIVES_EARLY_BOOT)
> > + return false;
> > +
> > + if (!riscv_isa_extension_available(NULL, ZBB))
> > + return false;
> > +
> > + return true;
> > +}
> > +
> > /*
> > * Probe presence of individual extensions.
> > *
> > @@ -318,6 +333,9 @@ static u32 __init_or_module cpufeature_probe(unsigned int stage)
> > if (cpufeature_probe_zicbom(stage))
> > cpu_req_feature |= BIT(CPUFEATURE_ZICBOM);
> >
> > + if (cpufeature_probe_zbb(stage))
> > + cpu_req_feature |= BIT(CPUFEATURE_ZBB);
> > +
> > return cpu_req_feature;
> > }
> >
> > diff --git a/arch/riscv/lib/strcmp.S b/arch/riscv/lib/strcmp.S
> > index 94440fb8390c..5428a8f2eb84 100644
> > --- a/arch/riscv/lib/strcmp.S
> > +++ b/arch/riscv/lib/strcmp.S
> > @@ -3,9 +3,14 @@
> > #include <linux/linkage.h>
> > #include <asm/asm.h>
> > #include <asm-generic/export.h>
> > +#include <asm/alternative-macros.h>
> > +#include <asm/errata_list.h>
> >
> > /* int strcmp(const char *cs, const char *ct) */
> > SYM_FUNC_START(strcmp)
> > +
> > + ALTERNATIVE("nop", "j variant_zbb", 0, CPUFEATURE_ZBB, CONFIG_RISCV_ISA_ZBB)
>
> How about strcmp_zbb and similar for the other functions for the labels?
Fully agree!
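So the entry point would become something like:

    ALTERNATIVE("nop", "j strcmp_zbb", 0, CPUFEATURE_ZBB, CONFIG_RISCV_ISA_ZBB)
    [...]
strcmp_zbb:

...and strlen_zbb / strncmp_zbb accordingly in the other two files.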
>
> > +
> > /*
> > * Returns
> > * a0 - comparison result, value like strcmp
> > @@ -34,4 +39,93 @@ SYM_FUNC_START(strcmp)
> > bnez t1, 1b
> > li a0, 0
> > j 2b
> > +
> > +/*
> > + * Variant of strcmp using the ZBB extension if available
> > + */
> > +#ifdef CONFIG_RISCV_ISA_ZBB
> > +variant_zbb:
> > +#define src1 a0
> > +#define result a0
> > +#define src2 t5
> > +#define data1 t0
> > +#define data2 t1
> > +#define align t2
> > +#define data1_orcb t3
> > +#define m1 t4
>
> nit: It's probably just me, but I'm not a big fan of creating defines
> for registers, particularly when the meaning of the registers' contents
> gets changed, because then the name, at least temporarily, no longer
> accurately describes the content.
Without any strong feelings about this: I think this is common
practice when writing asm code, as it helps with both writing and
understanding the code.
Some examples from other projects where this is done:
https://sourceware.org/git/?p=glibc.git;a=blob;f=sysdeps/aarch64/strcmp.S;h=054ce15ef8273073de4c821e3f415f1f70541554;hb=HEAD#l35
https://github.com/openssl/openssl/blob/master/crypto/aes/asm/aes-x86_64.pl#L58
https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/arm64/lib/memcmp.S?h=v6.2-rc3#n20
> And, in this case, it looks like the code matches Appendix A, except the
> register and label naming. I'd prefer to have those match too so I could
> "review" with the diff tool.
Good that you did not "review" the code with diff.
If you had, you would not have found the LGPTR issue (below).
>
>
> > +
> > +.option push
> > +.option arch,+zbb
> > +
> > + /*
> > + * Returns
> > + * a0 - comparison result, value like strcmp
> > + *
> > + * Parameters
> > + * a0 - string1
> > + * a1 - string2
> > + *
> > + * Clobbers
> > + * t0, t1, t2, t3, t4, t5
> > + */
> > + mv src2, a1
> > +
> > + or align, src1, src2
> > + li m1, -1
> > + and align, align, SZREG-1
> > + bnez align, 3f
> > +
> > + /* Main loop for aligned string. */
> > + .p2align 3
>
> Why are the starts of the loops aligned to 8 byte boundaries?
Loop alignment seems to be a valid optimization method on other
architectures, and I don't see anything specific to RISC-V that would
make it irrelevant.
AArch64 code aligns loops (and other jump targets) to 8 or 16 bytes.
So this was considered not harmful, but potentially beneficial.
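For reference, the directive in question pads the instruction stream
with nops, so the loop entry lands on an 8-byte boundary:

    .p2align 3    /* align next insn to a 2^3 = 8 byte boundary */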
>
>
> > +1:
> > + REG_L data1, 0(src1)
> > + REG_L data2, 0(src2)
> > + orc.b data1_orcb, data1
> > + bne data1_orcb, m1, 2f
> > + addi src1, src1, SZREG
> > + addi src2, src2, SZREG
> > + beq data1, data2, 1b
> > +
> > + /*
> > + * Words don't match, and no null byte in the first
> > + * word. Get bytes in big-endian order and compare.
> > + */
> > +#ifndef CONFIG_CPU_BIG_ENDIAN
> > + rev8 data1, data1
> > + rev8 data2, data2
> > +#endif
> > +
> > + /* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence. */
> > + sltu result, data1, data2
> > + neg result, result
> > + ori result, result, 1
> > + ret
> > +
> > +2:
> > + /*
> > + * Found a null byte.
> > + * If words don't match, fall back to simple loop.
> > + */
> > + bne data1, data2, 3f
> > +
> > + /* Otherwise, strings are equal. */
> > + li result, 0
> > + ret
> > +
> > + /* Simple loop for misaligned strings. */
> > + .p2align 3
> > +3:
> > + lbu data1, 0(src1)
> > + lbu data2, 0(src2)
> > + addi src1, src1, 1
> > + addi src2, src2, 1
> > + bne data1, data2, 4f
> > + bnez data1, 3b
> > +
> > +4:
> > + sub result, data1, data2
>
> The non-optimized version returns explicitly -1, 0, 1, whereas this
> returns < 0, 0, > 0. Assuming we don't need the explicit -1 / 1 then we
> can change the non-optimized version to do this subtraction for its
> return value too, saving several instructions.
Regarding "we don't need":
"The strcmp() and strncmp() functions return an integer less than, equal
to, or greater than zero if s1 (or the first n bytes thereof) is found,
respectively, to be less than, to match, or be greater than s2."
So, yes, the values -1/+1 are just two of the allowed return values.
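If we go down that route, the tail of the non-optimized version could
roughly look like this (untested sketch, not part of this patch):

1:
    lbu   t0, 0(a0)
    lbu   t1, 0(a1)
    addi  a0, a0, 1
    addi  a1, a1, 1
    bne   t0, t1, 2f
    bnez  t0, 1b
2:
    /* lbu zero-extends, so this yields < 0, 0 or > 0 as required */
    sub   a0, t0, t1
    ret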
>
>
> > + ret
> > +
> > +.option pop
> > +#endif
> > SYM_FUNC_END(strcmp)
> > diff --git a/arch/riscv/lib/strlen.S b/arch/riscv/lib/strlen.S
> > index 09a7aaff26c8..738efb04307d 100644
> > --- a/arch/riscv/lib/strlen.S
> > +++ b/arch/riscv/lib/strlen.S
> > @@ -3,9 +3,14 @@
> > #include <linux/linkage.h>
> > #include <asm/asm.h>
> > #include <asm-generic/export.h>
> > +#include <asm/alternative-macros.h>
> > +#include <asm/errata_list.h>
> >
> > /* int strlen(const char *s) */
> > SYM_FUNC_START(strlen)
> > +
> > + ALTERNATIVE("nop", "j variant_zbb", 0, CPUFEATURE_ZBB, CONFIG_RISCV_ISA_ZBB)
> > +
> > /*
> > * Returns
> > * a0 - string length
> > @@ -25,4 +30,113 @@ SYM_FUNC_START(strlen)
> > 2:
> > addi t1, t1, 1
> > j 1b
> > +
> > +/*
> > + * Variant of strlen using the ZBB extension if available
> > + */
> > +#ifdef CONFIG_RISCV_ISA_ZBB
> > +variant_zbb:
> > +
> > +#define src a0
> > +#define result a0
> > +#define addr t0
> > +#define data t1
> > +#define offset t2
> > +#define offset_bits t2
> > +#define valid_bytes t3
> > +#define m1 t3
> > +
> > +#ifdef CONFIG_CPU_BIG_ENDIAN
> > +# define CZ clz
> > +# define SHIFT sll
> > +#else
> > +# define CZ ctz
> > +# define SHIFT srl
> > +#endif
> > +
> > +.option push
> > +.option arch,+zbb
> > +
> > + /*
> > + * Returns
> > + * a0 - string length
> > + *
> > + * Parameters
> > + * a0 - String to measure
> > + *
> > + * Clobbers
> > + * t0, t1, t2, t3
> > + */
> > +
> > + /* Number of irrelevant bytes in the first word. */
>
> I see that this comment doesn't just apply to the next instruction,
> but to several instructions. It threw me off at first. Again, I
> think I'd prefer we just copy+paste the Appendix, comments and all.
I think this comment only applies to this line.
The loop processes word-aligned chunks and only the first chunk can
be misaligned.
The andi instruction calculates the number of irrelevant bytes in this
first word.
The Appendix just states "// offset" here, so I think this comment is better.
But I don't mind if people want to stick with the comments from the Appendix.
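For example, on rv64 (SZREG == 8) with a hypothetical src of 0x1005:

    andi  offset, src, SZREG-1    /* offset = 0x1005 & 7 = 5 */

i.e. the aligned first word contains 5 irrelevant bytes before the
actual string data.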
>
>
> > + andi offset, src, SZREG-1
> > +
> > + /* Align pointer. */
> > + andi addr, src, -SZREG
> > +
> > + li valid_bytes, SZREG
> > + sub valid_bytes, valid_bytes, offset
> > + slli offset_bits, offset, RISCV_LGPTR
>
> The shift immediate should be 3, even for rv32, since we want
> bits-per-byte, not bytes-per-word. The spec example uses PTRLOG, which
> does imply bytes-per-word to me as well, so that seems like a spec bug.
Correct, that's an issue in the Appendix.
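So, unless I'm missing something, the fix is simply:

    slli  offset_bits, offset, 3    /* bytes -> bits: multiply by 8 */

...which is independent of XLEN, as a byte is 8 bits on rv32 and rv64 alike.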
>
>
> > +
> > + /* Get the first word. */
> > + REG_L data, 0(addr)
> > +
> > + /*
> > + * Shift away the partial data we loaded to remove the irrelevant bytes
> > + * preceding the string with the effect of adding NUL bytes at the
> > + * end of the string.
>
> the end of the string's first word.
>
> > + */
> > + SHIFT data, data, offset_bits
> > +
> > + /* Convert non-NUL into 0xff and NUL into 0x00. */
> > + orc.b data, data
> > +
> > + /* Convert non-NUL into 0x00 and NUL into 0xff. */
> > + not data, data
> > +
> > + /*
> > + * Search for the first set bit (corresponding to a NUL byte in the
> > + * original chunk).
> > + */
> > + CZ data, data
> > +
> > + /*
> > + * The first chunk is special: commpare against the number
>
> compare
>
> > + * of valid bytes in this chunk.
> > + */
> > + srli result, data, 3
> > + bgtu valid_bytes, result, 3f
> > +
> > + /* Prepare for the word comparison loop. */
> > + addi offset, addr, SZREG
> > + li m1, -1
> > +
> > + /*
> > + * Our critical loop is 4 instructions and processes data in
> > + * 4 byte or 8 byte chunks.
> > + */
> > + .p2align 3
> > +1:
> > + REG_L data, SZREG(addr)
> > + addi addr, addr, SZREG
> > + orc.b data, data
> > + beq data, m1, 1b
> > +2:
> > + not data, data
> > + CZ data, data
> > +
> > + /* Get number of processed words. */
> > + sub offset, addr, offset
> > +
> > + /* Add number of characters in the first word. */
> > + add result, result, offset
> > + srli data, data, 3
> > +
> > + /* Add number of characters in the last word. */
> > + add result, result, data
> > +3:
> > + ret
> > +
> > +.option pop
> > +#endif
> > SYM_FUNC_END(strlen)
> > diff --git a/arch/riscv/lib/strncmp.S b/arch/riscv/lib/strncmp.S
> > index 493ab6febcb2..851428b439dc 100644
> > --- a/arch/riscv/lib/strncmp.S
> > +++ b/arch/riscv/lib/strncmp.S
> > @@ -3,9 +3,14 @@
> > #include <linux/linkage.h>
> > #include <asm/asm.h>
> > #include <asm-generic/export.h>
> > +#include <asm/alternative-macros.h>
> > +#include <asm/errata_list.h>
> >
> > /* int strncmp(const char *cs, const char *ct, size_t count) */
> > SYM_FUNC_START(strncmp)
> > +
> > + ALTERNATIVE("nop", "j variant_zbb", 0, CPUFEATURE_ZBB, CONFIG_RISCV_ISA_ZBB)
> > +
> > /*
> > * Returns
> > * a0 - comparison result, value like strncmp
> > @@ -37,4 +42,110 @@ SYM_FUNC_START(strncmp)
> > 4:
> > li a0, 0
> > j 2b
> > +
> > +/*
> > + * Variant of strncmp using the ZBB extension if available
> > + */
> > +#ifdef CONFIG_RISCV_ISA_ZBB
> > +variant_zbb:
> > +
> > +#define src1 a0
> > +#define result a0
> > +#define src2 t6
> > +#define len a2
> > +#define data1 t0
> > +#define data2 t1
> > +#define align t2
> > +#define data1_orcb t3
> > +#define limit t4
> > +#define m1 t5
> > +
> > +.option push
> > +.option arch,+zbb
> > +
> > + /*
> > + * Returns
> > + * a0 - comparison result, like strncmp
> > + *
> > + * Parameters
> > + * a0 - string1
> > + * a1 - string2
> > + * a2 - number of characters to compare
> > + *
> > + * Clobbers
> > + * t0, t1, t2, t3, t4, t5, t6
> > + */
> > + mv src2, a1
Why is this needed (i.e. why is src2 not a1, but t6)?
> > +
> > + or align, src1, src2
> > + li m1, -1
> > + and align, align, SZREG-1
> > + add limit, src1, len
> > + bnez align, 4f
> > +
> > + /* Adjust limit for fast-path. */
> > + addi limit, limit, -SZREG
>
> To avoid unnecessarily comparing the last SZREG bytes one at a time when
> 'len' is SZREG aligned, we should be using the explicitly aligned limit
> here (limit & ~(SZREG - 1)) rather than just subtracting SZREG. Then, down
> in the slow-path we use the original limit to finish off if len wasn't
> aligned. When it was aligned, we'll just immediately branch to the ret.
> (I suspect many times 'len' will be SZREG aligned, since string buffers
> are typically allocated with power-of-two sizes).
Yes, that's a good idea!
So I propose to keep the (original) limit and introduce a new register here:
    andi  fast_limit, limit, -SZREG    // ~(SZREG - 1) == -SZREG
...use that as the loop bound in the fast-path:
    bge   src1, fast_limit, 3f
...and remove the limit restore ("addi limit, limit, SZREG").
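Putting it together, roughly (untested sketch; "fast_limit" is a new
name and could e.g. reuse t2/align, which is dead after the alignment
check):

    add   limit, src1, len
    bnez  align, 3f

    /* Fast-path limit, rounded down to SZREG. */
    andi  fast_limit, limit, -SZREG

    /* Main loop for aligned string. */
    .p2align 3
1:
    bge   src1, fast_limit, 3f
    [...]

    /* Simple loop for misaligned strings / remaining bytes. */
    .p2align 3
3:
    bge   src1, limit, 6f
    [...]

When len is a multiple of SZREG, fast_limit == limit, so the word loop
consumes all n bytes and the byte loop immediately branches to the
final ret.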
>
> > +
> > + /* Main loop for aligned string. */
> > + .p2align 3
> > +1:
> > + bgt src1, limit, 3f
> > + REG_L data1, 0(src1)
> > + REG_L data2, 0(src2)
> > + orc.b data1_orcb, data1
> > + bne data1_orcb, m1, 2f
> > + addi src1, src1, SZREG
> > + addi src2, src2, SZREG
> > + beq data1, data2, 1b
> > +
> > + /*
> > + * Words don't match, and no null byte in the first
> > + * word. Get bytes in big-endian order and compare.
> > + */
> > +#ifndef CONFIG_CPU_BIG_ENDIAN
> > + rev8 data1, data1
> > + rev8 data2, data2
> > +#endif
> > +
> > + /* Synthesize (data1 >= data2) ? 1 : -1 in a branchless sequence. */
> > + sltu result, data1, data2
> > + neg result, result
> > + ori result, result, 1
> > + ret
> > +
> > +2:
> > + /*
> > + * Found a null byte.
> > + * If words don't match, fall back to simple loop.
> > + */
> > + bne data1, data2, 3f
> > +
> > + /* Otherwise, strings are equal. */
> > + li result, 0
> > + ret
> > +
> > + /* Simple loop for misaligned strings. */
> > +3:
> > + /* Restore limit for slow-path. */
> > + addi limit, limit, SZREG
> > + .p2align 3
> > +4:
> > + bge src1, limit, 6f
> > + lbu data1, 0(src1)
> > + lbu data2, 0(src2)
> > + addi src1, src1, 1
> > + addi src2, src2, 1
> > + bne data1, data2, 5f
> > + bnez data1, 4b
> > +
> > +5:
> > + sub result, data1, data2
> > + ret
> > +
> > +6:
> > + li result, 0
> > + ret
>
> As strcmp and strncmp have so much in common it's tempting to try and
> merge them somehow. Maybe with a handful of macros shared between them?
>
> > +
> > +.option pop
> > +#endif
> > SYM_FUNC_END(strncmp)
> > --
> > 2.35.1
> >
>
> Thanks,
> drew