[PATCH] riscv: lib: optimize strchr() with Zbb extension

David Laight david.laight.linux at gmail.com
Thu May 7 03:22:35 PDT 2026


On Thu,  7 May 2026 10:06:20 +0800
Zongmin Zhou <min_halo at 163.com> wrote:

> From: Zongmin Zhou <zhouzongmin at kylinos.cn>
> 
> Add a Zbb-powered optimization to the existing strchr() implementation
> using the 'orc.b' instruction, following the same pattern established
> by strnlen().
> 
> The Zbb variant processes data in word-sized chunks using orc.b to
> detect both NUL terminators and target characters in parallel. On
> systems without Zbb support, the original byte-by-byte implementation
> is used as a fallback via the alternatives mechanism.
> 
> Benchmark results (QEMU TCG, rv64):
>   Length | zbb=off (MB/s) | zbb=on (MB/s) | Improvement
>   -------|----------------|---------------|------------
>   1 B    | 27             | 25            | -7.4%
>   7 B    | 147            | 128           | -12.9%
>   16 B   | 216            | 372           | +72.2%
>   64 B   | 378            | 958           | +153.4%
>   512 B  | 480            | 1990          | +314.6%
>   4096 B | 501            | 2269          | +352.9%
> 
> The regression on very short strings (1-7 bytes) is due to the fixed
> overhead of the word-level path: broadcasting the target character to
> all byte lanes via multiplication and checking pointer alignment before
> entering the main loop. For strings shorter than one machine word, this
> setup cost outweighs the benefit of parallel comparison. As string
> length increases beyond 16 bytes, the word-at-a-time processing shows
> significant gains.
> 
> Signed-off-by: Zongmin Zhou <zhouzongmin at kylinos.cn>
> ---
>  arch/riscv/lib/strchr.S | 115 ++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 115 insertions(+)
> 
> diff --git a/arch/riscv/lib/strchr.S b/arch/riscv/lib/strchr.S
> index 48c3a9da53e3..600b19452bc2 100644
> --- a/arch/riscv/lib/strchr.S
> +++ b/arch/riscv/lib/strchr.S
> @@ -6,9 +6,15 @@
>  
>  #include <linux/linkage.h>
>  #include <asm/asm.h>
> +#include <asm/alternative-macros.h>
> +#include <asm/hwcap.h>
>  
>  /* char *strchr(const char *s, int c) */
>  SYM_FUNC_START(strchr)
> +
> +	__ALTERNATIVE_CFG("nop", "j strchr_zbb", 0, RISCV_ISA_EXT_ZBB,
> +		IS_ENABLED(CONFIG_RISCV_ISA_ZBB) && IS_ENABLED(CONFIG_TOOLCHAIN_HAS_ZBB))
> +
>  	/*
>  	 * Parameters
>  	 *   a0 - The string to be searched
> @@ -29,6 +35,115 @@ SYM_FUNC_START(strchr)
>  	li	a0, 0
>  2:
>  	ret
> +
> +/*
> + * Variant of strchr using the ZBB extension if available
> + *
> + * This implementation uses orc.b to detect both NUL terminators and target
> + * characters in parallel, processing word-sized chunks for efficiency.
> + */
> +#if defined(CONFIG_RISCV_ISA_ZBB) && defined(CONFIG_TOOLCHAIN_HAS_ZBB)
> +strchr_zbb:
> +
> +#ifdef CONFIG_CPU_BIG_ENDIAN
> +# define CZ	clz
> +#else
> +# define CZ	ctz
> +#endif
> +
> +.option push
> +.option arch,+zbb
> +
> +	/*
> +	 * Returns
> +	 *   a0 - Address of first occurrence of 'c' or NULL
> +	 *
> +	 * Parameters
> +	 *   a0 - String to search
> +	 *   a1 - Character to find
> +	 *
> +	 * Clobbers
> +	 *   t0, t1, t2, t3, t4
> +	 */
> +
> +	/*
> +	 * Prepare target character mask.
> +	 * Broadcast target character to all bytes using multiply.
> +	 */
> +	andi	a1, a1, 0xff
> +	li	t1, 0x01010101
> +#if __riscv_xlen == 64
> +	slli	t2, t1, 32
> +	or	t1, t1, t2
> +#endif
> +	mul	t2, a1, t1
> +
> +	/* All-ones mask for orc.b comparisons. */
> +	li	t4, -1
> +
> +	/* Check alignment. */
> +	andi	t0, a0, SZREG-1
> +	beqz	t0, 2f

It is almost certainly faster to jump 'out of line' for misaligned
strings and fall through for aligned ones.

> +
> +	/* Handle misaligned portion byte-by-byte. */
> +1:
> +	lbu	t1, 0(a0)
> +	beq	t1, a1, 9f
> +	beqz	t1, 8f
> +	addi	a0, a0, 1
> +	andi	t0, a0, SZREG-1
> +	bnez	t0, 1b
> +
> +	/* Main loop: process word-sized chunks. */

Tweak to remove a branch from the loop (the trailing 'j 2b', see below).
Pre-bias the pointer before entering the loop:
	addi	a0, a0, -SZREG

> +2:
> +	REG_L	t0, 0(a0)

Do the read first, then advance the pointer, for better instruction scheduling:
	REG_L	t0, SZREG(a0)
	addi	a0, a0, SZREG

> +
> +	/* Check for NUL terminator. */
> +	orc.b	t1, t0
> +	bne	t1, t4, 3f
> +
> +	/* Check for target character. */
> +	xor	t1, t0, t2
> +	orc.b	t1, t1
> +	bne	t1, t4, 4f

	beq	t1, t4, 2b
and move the code at '4:' here.

-- David

> +
> +	addi	a0, a0, SZREG
> +	j	2b
> +
> +3:
> +	/* NUL found in current chunk. Check if target appears before NUL. */
> +	not	t1, t1
> +
> +	xor	t3, t0, t2
> +	orc.b	t3, t3
> +	not	t3, t3
> +
> +	CZ	t3, t3
> +	CZ	t1, t1
> +
> +	/* If NUL appears before target, character not found. */
> +	bltu	t1, t3, 8f
> +
> +	srli	t3, t3, 3
> +	add	a0, a0, t3
> +	ret
> +
> +4:
> +	/* Target found in chunk without NUL. */
> +	not	t1, t1
> +	CZ	t1, t1
> +	srli	t1, t1, 3
> +	add	a0, a0, t1
> +	ret
> +
> +8:
> +	/* Character not found, return NULL. */
> +	li	a0, 0
> +9:
> +	ret
> +
> +.option pop
> +#endif
>  SYM_FUNC_END(strchr)
>  
>  SYM_FUNC_ALIAS_WEAK(__pi_strchr, strchr)




More information about the linux-riscv mailing list