[PATCH v3 4/6] RISC-V: Use Zicboz in clear_page when available
Palmer Dabbelt
palmer at dabbelt.com
Wed Feb 1 20:35:54 PST 2023
On Mon, 30 Jan 2023 04:01:26 PST (-0800), ajones at ventanamicro.com wrote:
> Using memset() to zero a 4K page takes 563 total instructions
> where 20 are branches. clear_page() with Zicboz takes 150 total
> instructions where 16 are branches. We could reduce the numbers
> by further unrolling, but, since the cboz block size isn't fixed,
> we'd need a Duff device to ensure we don't execute too many
> unrolled steps. Also, cbo.zero doesn't take an offset, so each
> unrolled step requires it and an add instruction. This increases
> the chance for icache misses if we unroll many times. For these
> reasons we only unroll four times. Unrolling four times should be
> safe as it supports cboz block sizes up to 1K when used with 4K
> pages and it's only 24 to 32 bytes of unrolled instructions.
>
> Another note about the Duff device idea is that it would probably
> be best to store the number of steps needed at boot time and then
> load the value in clear_page(). Calculating it in clear_page(),
> particularly without the Zbb extension, would not be efficient.
>
> Signed-off-by: Andrew Jones <ajones at ventanamicro.com>
> Acked-by: Conor Dooley <conor.dooley at microchip.com>
> ---
> arch/riscv/Kconfig | 13 +++++++++++
> arch/riscv/include/asm/insn-def.h | 4 ++++
> arch/riscv/include/asm/page.h | 6 +++++-
> arch/riscv/lib/Makefile | 1 +
> arch/riscv/lib/clear_page.S | 36 +++++++++++++++++++++++++++++++
> 5 files changed, 59 insertions(+), 1 deletion(-)
> create mode 100644 arch/riscv/lib/clear_page.S
>
> diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
> index 33bbdc33cef8..3759a2f6edd5 100644
> --- a/arch/riscv/Kconfig
> +++ b/arch/riscv/Kconfig
> @@ -432,6 +432,19 @@ config RISCV_ISA_ZICBOM
>
> If you don't know what to do here, say Y.
>
> +config RISCV_ISA_ZICBOZ
> + bool "Zicboz extension support for faster zeroing of memory"
> + depends on !XIP_KERNEL && MMU
> + select RISCV_ALTERNATIVE
> + default y
> + help
> + Enable the use of the ZICBOZ extension (cbo.zero instruction)
> + when available.
> +
> + The Zicboz extension is used for faster zeroing of memory.
> +
> + If you don't know what to do here, say Y.
> +
> config TOOLCHAIN_HAS_ZIHINTPAUSE
> bool
> default y
> diff --git a/arch/riscv/include/asm/insn-def.h b/arch/riscv/include/asm/insn-def.h
> index e01ab51f50d2..6960beb75f32 100644
> --- a/arch/riscv/include/asm/insn-def.h
> +++ b/arch/riscv/include/asm/insn-def.h
> @@ -192,4 +192,8 @@
> INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \
> RS1(base), SIMM12(2))
>
> +#define CBO_zero(base) \
> + INSN_I(OPCODE_MISC_MEM, FUNC3(2), __RD(0), \
> + RS1(base), SIMM12(4))
> +
> #endif /* __ASM_INSN_DEF_H */
> diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h
> index 9f432c1b5289..ccd168fe29d2 100644
> --- a/arch/riscv/include/asm/page.h
> +++ b/arch/riscv/include/asm/page.h
> @@ -49,10 +49,14 @@
>
> #ifndef __ASSEMBLY__
>
> +#ifdef CONFIG_RISCV_ISA_ZICBOZ
> +void clear_page(void *page);
> +#else
> #define clear_page(pgaddr) memset((pgaddr), 0, PAGE_SIZE)
> +#endif
> #define copy_page(to, from) memcpy((to), (from), PAGE_SIZE)
>
> -#define clear_user_page(pgaddr, vaddr, page) memset((pgaddr), 0, PAGE_SIZE)
> +#define clear_user_page(pgaddr, vaddr, page) clear_page(pgaddr)
> #define copy_user_page(vto, vfrom, vaddr, topg) \
> memcpy((vto), (vfrom), PAGE_SIZE)
>
> diff --git a/arch/riscv/lib/Makefile b/arch/riscv/lib/Makefile
> index 25d5c9664e57..9ee5e2ab5143 100644
> --- a/arch/riscv/lib/Makefile
> +++ b/arch/riscv/lib/Makefile
> @@ -5,5 +5,6 @@ lib-y += memset.o
> lib-y += memmove.o
> lib-$(CONFIG_MMU) += uaccess.o
> lib-$(CONFIG_64BIT) += tishift.o
> +lib-$(CONFIG_RISCV_ISA_ZICBOZ) += clear_page.o
>
> obj-$(CONFIG_FUNCTION_ERROR_INJECTION) += error-inject.o
> diff --git a/arch/riscv/lib/clear_page.S b/arch/riscv/lib/clear_page.S
> new file mode 100644
> index 000000000000..49f29139a5b6
> --- /dev/null
> +++ b/arch/riscv/lib/clear_page.S
> @@ -0,0 +1,36 @@
> +/* SPDX-License-Identifier: GPL-2.0-only */
> +/*
> + * Copyright (c) 2023 Ventana Micro Systems Inc.
> + */
> +
> +#include <linux/linkage.h>
> +#include <asm/asm.h>
> +#include <asm/alternative-macros.h>
> +#include <asm/hwcap.h>
> +#include <asm/insn-def.h>
> +#include <asm/page.h>
> +
> +/* void clear_page(void *page) */
> +ENTRY(__clear_page)
> +WEAK(clear_page)
> + li a2, PAGE_SIZE
> + ALTERNATIVE("j .Lno_zicboz", "nop",
> + 0, RISCV_ISA_EXT_ZICBOZ, CONFIG_RISCV_ISA_ZICBOZ)
> + la a1, riscv_cboz_block_size
> + lw a1, 0(a1)
You should be able to just "lw a1, riscv_cboz_block_size", which can
sometimes generate better code.
> + add a2, a0, a2
> +.Lzero_loop:
> + CBO_zero(a0)
> + add a0, a0, a1
> + CBO_zero(a0)
> + add a0, a0, a1
We were talking about this in the patchwork call: this risks overflow if
the block size is bigger than a quarter of a page. That's probably
pretty rare, but given that there's already an alternative for the jump
it's easy to check.
> + CBO_zero(a0)
> + add a0, a0, a1
> + CBO_zero(a0)
> + add a0, a0, a1
> + bltu a0, a2, .Lzero_loop
> + ret
> +.Lno_zicboz:
> + li a1, 0
> + tail __memset
> +END(__clear_page)
More information about the linux-riscv
mailing list