[PATCH v2 17/18] arm64: fpsimd: Move SME save/restore inline

Vladimir Murzin vladimir.murzin at arm.com
Fri May 29 01:51:06 PDT 2026


On 5/28/26 17:54, Mark Rutland wrote:
> Currently the SME register save/restore sequences are written in
> out-of-line assembly routines. While this works, it's somewhat painful:
> 
> * For KVM to use the sequences, portions of the logic will need to be
>   duplicated in KVM hyp code. While the common logic can be shared in
>   assembly macros, this is very likely to lead to unnecessary divergence
>   and be a maintenance burden.
> 
> * For historical reasons, the assembly macros take some register
>   arguments as numerical indices (e.g. "sme_save_za 0, x2, 12" uses x0,
>   x2, and x12), which is simply confusing.
> 
> * Address generation and control flow are far clearer in C than in
>   assembly.
> 
> * The assembly sequences can't be instrumented, and so it's harder than
>   necessary to catch memory safety issues.
> 
> To handle the above, move the SME register save/restore sequences
> to inline assembly.
> 
> Neither GCC nor LLVM instrument memory arguments to inline assembly, so
> explicit instrumentation is added in the same manner as other assembly
> routines. This instrumentation is implicitly disabled by Kbuild for nVHE
> hyp code.
> 
> Signed-off-by: Mark Rutland <mark.rutland at arm.com>
> Cc: Catalin Marinas <catalin.marinas at arm.com>
> Cc: Fuad Tabba <tabba at google.com>
> Cc: James Morse <james.morse at arm.com>
> Cc: Marc Zyngier <maz at kernel.org>
> Cc: Mark Brown <broonie at kernel.org>
> Cc: Oliver Upton <oupton at kernel.org>
> Cc: Will Deacon <will at kernel.org>
> ---
>  arch/arm64/include/asm/fpsimd.h       | 106 +++++++++++++++++++++++++-
>  arch/arm64/include/asm/fpsimdmacros.h |  76 ------------------
>  arch/arm64/kernel/Makefile            |   2 +-
>  arch/arm64/kernel/entry-fpsimd.S      |  48 ------------
>  4 files changed, 104 insertions(+), 128 deletions(-)
>  delete mode 100644 arch/arm64/kernel/entry-fpsimd.S
> 
> diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
> index 550987b36206a..3ab271320d1de 100644
> --- a/arch/arm64/include/asm/fpsimd.h
> +++ b/arch/arm64/include/asm/fpsimd.h
> @@ -357,9 +357,6 @@ static inline void sve_flush_live(void)
>  	);
>  }
>  
> -extern void sme_save_state(struct sme_state *state, int zt);
> -extern void sme_load_state(const struct sme_state *state, int zt);
> -
>  struct arm64_cpu_capabilities;
>  extern void cpu_enable_fpsimd(const struct arm64_cpu_capabilities *__unused);
>  extern void cpu_enable_sve(const struct arm64_cpu_capabilities *__unused);
> @@ -639,6 +636,106 @@ static inline size_t __sme_state_size(unsigned int sme_vl)
>  	return size;
>  }
>  
> +static inline void __sme_save_za(struct sme_state *state, unsigned long svl)
> +{
> +	/*
> +	 * The <Wv> argument to LDR/STR (array vector) can only encode W12-W15.
> +	 * The "Ucj" constraint exists for this, but is only supported by GCC
> +	 * 14.1.0+ and LLVM 18.1.0+.
> +	 */
> +	register unsigned int v asm ("w12");
> +
> +	instrument_write(state, svl * svl);
> +	for (v = 0; v < svl; v++) {
> +		void *pav = (void *)state + v * svl;
> +
> +		asm volatile(
> +		__SME_PREAMBLE
> +		"	str	za[%w[v], #0], [%[pav]]\n"
> +		:
> +		: [v] "r" (v),
> +		  [pav] "r" (pav)
> +		: "memory"
> +		);
> +	}
> +}
> +
> +static inline void __sme_load_za(const struct sme_state *state, unsigned long svl)
> +{
> +	/* See comment in __sme_save_za */
> +	register unsigned int v asm ("w12");
> +
> +	instrument_read(state, svl * svl);
> +	for (v = 0; v < svl; v++) {
> +		void *pav = (void *)state + v * svl;
> +
> +		asm volatile(
> +		__SME_PREAMBLE
> +		"	ldr	za[%w[v], #0], [%[pav]]\n"
> +		:
> +		: [v] "r" (v),
> +		  [pav] "r" (pav)
> +		: "memory"
> +		);
> +	}
> +}
> +
> +static inline void __sme_save_zt(struct sme_state *state, unsigned long svl)
> +{
> +	void *pzt = (void *)state + svl * svl;
> +
> +	instrument_write(pzt, svl);
> +	asm volatile(
> +	__DEFINE_ASM_GPR_NUMS
> +	/*
> +	 * STR ZT0, [<Xn|SP>]
> +	 * Supported by binutils 2.41+.
> +	 * Supported by LLVM 16+
> +	 */
> +	"	.inst	0xe13f8000 | ((.L__gpr_num_%[pzt]) << 5)\n"
> +	:
> +	: [pzt] "r" (pzt)
> +	: "memory"
> +	);
> +}
> +
> +static inline void __sme_load_zt(const struct sme_state *state, unsigned long svl)
> +{
> +	void *pzt = (void *)state + svl * svl;
> +
> +	instrument_read(pzt, svl);
> +	asm volatile(
> +	__DEFINE_ASM_GPR_NUMS
> +	/*
> +	 * LDR ZT0, [<Xn|SP>]
> +	 * Supported by binutils 2.41+.
> +	 * Supported by LLVM 16+
> +	 */
> +	"	.inst	0xe11f8000 | ((.L__gpr_num_%[pzt]) << 5)\n"
> +	:
> +	: [pzt] "r" (pzt)
> +	: "memory"
> +	);
> +}
> +
> +static inline void sme_save_state(struct sme_state *state, bool zt)
> +{
> +	unsigned long svl = sme_get_vl();
> +
> +	__sme_save_za(state, svl);
> +	if (zt)
> +		__sme_save_zt(state, svl);
> +}
> +
> +static inline void sme_load_state(const struct sme_state *state, bool zt)
> +{
> +	unsigned long svl = sme_get_vl();
> +
> +	__sme_load_za(state, svl);
> +	if (zt)
> +		__sme_load_zt(state, svl);
> +}
> +
>  /*
>   * Return how many bytes of memory are required to store the full SME
>   * specific state for task, given task's currently configured vector
> @@ -695,6 +792,9 @@ static inline size_t sme_state_size(struct task_struct const *task)
>  	return 0;
>  }
>  
> +static inline void sme_save_state(struct sme_state *state, bool zt) { BUILD_BUG(); }
> +static inline void sme_load_state(const struct sme_state *state, bool zt) { BUILD_BUG(); }
> +
>  static inline void sme_enter_from_user_mode(void) { }
>  static inline void sme_exit_to_user_mode(void) { }
>  
> diff --git a/arch/arm64/include/asm/fpsimdmacros.h b/arch/arm64/include/asm/fpsimdmacros.h
> index 9e352b5c6b764..a763fd03ffef3 100644
> --- a/arch/arm64/include/asm/fpsimdmacros.h
> +++ b/arch/arm64/include/asm/fpsimdmacros.h
> @@ -40,60 +40,6 @@
>  	.endif
>  .endm
>  
> -/* Deprecated macros for SME instructions */
> -
> -/* RDSVL X\nx, #\imm */
> -.macro _sme_rdsvl nx, imm
> -	.arch_extension sme
> -	rdsvl x\nx, #\imm
> -.endm
> -
> -/*
> - * STR (vector from ZA array):
> - *	STR ZA[W\nw, #\offset], [X\nxbase, #\offset, MUL VL]
> - */
> -.macro _sme_str_zav nw, nxbase, offset=0
> -	.arch_extension sme
> -	str	za[w\nw, #\offset], [x\nxbase, #\offset, MUL VL]
> -.endm
> -
> -/*
> - * LDR (vector to ZA array):
> - *	LDR ZA[w\nw, #\offset], [X\nxbase, #\offset, MUL VL]
> - */
> -.macro _sme_ldr_zav nw, nxbase, offset=0
> -	.arch_extension sme
> -	ldr	za[w\nw, #\offset], [x\nxbase, #\offset, MUL VL]
> -.endm
> -
> -/*
> - * SME2 instruction encodings for older assemblers.
> - * Supported by binutils 2.41+.
> - * Supported by LLVM 16+
> - */
> -
> -/*
> - * LDR (ZT0)
> - *
> - *	LDR ZT0, nx
> - */
> -.macro _ldr_zt nx
> -	_check_general_reg \nx
> -	.inst	0xe11f8000	\
> -		 | (\nx << 5)
> -.endm
> -
> -/*
> - * STR (ZT0)
> - *
> - *	STR ZT0, nx
> - */
> -.macro _str_zt nx
> -	_check_general_reg \nx
> -	.inst	0xe13f8000		\
> -		| (\nx << 5)
> -.endm
> -
>  .macro __for from:req, to:req
>  	.if (\from) == (\to)
>  		_for__body %\from
> @@ -116,25 +62,3 @@
>  
>  	.purgem _for__body
>  .endm
> -
> -.macro sme_save_za nxbase, xvl, nw
> -	mov	w\nw, #0
> -
> -423:
> -	_sme_str_zav \nw, \nxbase
> -	add	x\nxbase, x\nxbase, \xvl
> -	add	x\nw, x\nw, #1
> -	cmp	\xvl, x\nw
> -	bne	423b
> -.endm
> -
> -.macro sme_load_za nxbase, xvl, nw
> -	mov	w\nw, #0
> -
> -423:
> -	_sme_ldr_zav \nw, \nxbase
> -	add	x\nxbase, x\nxbase, \xvl
> -	add	x\nw, x\nw, #1
> -	cmp	\xvl, x\nw
> -	bne	423b
> -.endm
> diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile
> index 74b76bb704523..d2690c3ec5288 100644
> --- a/arch/arm64/kernel/Makefile
> +++ b/arch/arm64/kernel/Makefile
> @@ -27,7 +27,7 @@ KCOV_INSTRUMENT_idle.o := n
>  
>  # Object file lists.
>  obj-y			:= debug-monitors.o entry.o irq.o fpsimd.o		\
> -			   entry-common.o entry-fpsimd.o process.o ptrace.o	\
> +			   entry-common.o process.o ptrace.o			\
>  			   setup.o signal.o sys.o stacktrace.o time.o traps.o	\
>  			   io.o vdso.o hyp-stub.o psci.o cpu_ops.o		\
>  			   return_address.o cpuinfo.o cpu_errata.o		\
> diff --git a/arch/arm64/kernel/entry-fpsimd.S b/arch/arm64/kernel/entry-fpsimd.S
> deleted file mode 100644
> index bff941eea9566..0000000000000
> --- a/arch/arm64/kernel/entry-fpsimd.S
> +++ /dev/null
> @@ -1,48 +0,0 @@
> -/* SPDX-License-Identifier: GPL-2.0-only */
> -/*
> - * FP/SIMD state saving and restoring
> - *
> - * Copyright (C) 2012 ARM Ltd.
> - * Author: Catalin Marinas <catalin.marinas at arm.com>
> - */
> -
> -#include <linux/linkage.h>
> -
> -#include <asm/assembler.h>
> -#include <asm/fpsimdmacros.h>
> -
> -#ifdef CONFIG_ARM64_SME
> -
> -/*
> - * Save the ZA and ZT state
> - *
> - * x0 - pointer to buffer for state
> - * x1 - number of ZT registers to save
> - */
> -SYM_FUNC_START(sme_save_state)
> -	_sme_rdsvl	2, 1		// x2 = VL/8
> -	sme_save_za 0, x2, 12		// Leaves x0 pointing to the end of ZA
> -
> -	cbz	x1, 1f
> -	_str_zt 0
> -1:
> -	ret
> -SYM_FUNC_END(sme_save_state)
> -
> -/*
> - * Load the ZA and ZT state
> - *
> - * x0 - pointer to buffer for state
> - * x1 - number of ZT registers to save
> - */
> -SYM_FUNC_START(sme_load_state)
> -	_sme_rdsvl	2, 1		// x2 = VL/8
> -	sme_load_za 0, x2, 12		// Leaves x0 pointing to the end of ZA
> -
> -	cbz	x1, 1f
> -	_ldr_zt 0
> -1:
> -	ret
> -SYM_FUNC_END(sme_load_state)
> -
> -#endif /* CONFIG_ARM64_SME */
> -- 2.30.2
> 

FWIW,

Reviewed-by: Vladimir Murzin <vladimir.murzin at arm.com>




More information about the linux-arm-kernel mailing list