[PATCH v2 1/2] perf: arm64: Add SVE vector granule register to user regs

Will Deacon will at kernel.org
Mon Jun 27 04:13:20 PDT 2022


On Tue, May 17, 2022 at 11:07:42AM +0100, James Clark wrote:
> Dwarf based unwinding in a function that pushes SVE registers onto
> the stack requires the unwinder to know the length of the SVE register
> to calculate the stack offsets correctly. This was added to the Arm
> specific Dwarf spec as the VG pseudo register[1].
> 
> Add the vector length at position 46 if it's requested by userspace and
> SVE is supported. If it's not supported then fail to open the event.
> 
> The vector length must be on each sample because it can be changed
> at runtime via a prctl or ptrace call. Also by adding it as a register
> rather than a separate attribute, minimal changes will be required in an
> unwinder that already indexes into the register list.
> 
> [1]: https://github.com/ARM-software/abi-aa/blob/main/aadwarf64/aadwarf64.rst
> 
> Reviewed-by: Mark Brown <broonie at kernel.org>
> Signed-off-by: James Clark <james.clark at arm.com>
> ---
>  arch/arm64/include/uapi/asm/perf_regs.h |  7 +++++-
>  arch/arm64/kernel/perf_regs.c           | 30 +++++++++++++++++++++++--
>  drivers/perf/arm_pmu.c                  |  2 +-
>  3 files changed, 35 insertions(+), 4 deletions(-)
> 
> diff --git a/arch/arm64/include/uapi/asm/perf_regs.h b/arch/arm64/include/uapi/asm/perf_regs.h
> index d54daafa89e3..fd157f46727e 100644
> --- a/arch/arm64/include/uapi/asm/perf_regs.h
> +++ b/arch/arm64/include/uapi/asm/perf_regs.h
> @@ -36,6 +36,11 @@ enum perf_event_arm_regs {
>  	PERF_REG_ARM64_LR,
>  	PERF_REG_ARM64_SP,
>  	PERF_REG_ARM64_PC,
> -	PERF_REG_ARM64_MAX,
> +
> +	/* Extended/pseudo registers */
> +	PERF_REG_ARM64_VG = 46, // SVE Vector Granule
> +
> +	PERF_REG_ARM64_MAX = PERF_REG_ARM64_PC + 1,
> +	PERF_REG_ARM64_EXTENDED_MAX = PERF_REG_ARM64_VG + 1

I think you can leave PERF_REG_ARM64_MAX alone and just add:

	PERF_REG_ARM64_VG = 46,
	PERF_REG_ARM64_EXTENDED_MAX,

no?

>  };
>  #endif /* _ASM_ARM64_PERF_REGS_H */
> diff --git a/arch/arm64/kernel/perf_regs.c b/arch/arm64/kernel/perf_regs.c
> index f6f58e6265df..b4eece3eb17d 100644
> --- a/arch/arm64/kernel/perf_regs.c
> +++ b/arch/arm64/kernel/perf_regs.c
> @@ -9,9 +9,27 @@
>  #include <asm/perf_regs.h>
>  #include <asm/ptrace.h>
>  
> +static u64 perf_ext_regs_value(int idx)
> +{
> +	switch (idx) {
> +	case PERF_REG_ARM64_VG:
> +		if (WARN_ON_ONCE(!system_supports_sve()))
> +			return 0;
> +
> +		/*
> +		 * Vector granule is current length in bits of SVE registers
> +		 * divided by 64.
> +		 */
> +		return (task_get_sve_vl(current) * 8) / 64;

Is 'current' the right thing to use here? We pass the regs everywhere else,
so I'd prefer to stick with that if possible.

> +	default:
> +		WARN_ON_ONCE(true);
> +		return 0;
> +	}
> +}
> +
>  u64 perf_reg_value(struct pt_regs *regs, int idx)
>  {
> -	if (WARN_ON_ONCE((u32)idx >= PERF_REG_ARM64_MAX))
> +	if (WARN_ON_ONCE((u32)idx >= PERF_REG_ARM64_EXTENDED_MAX))
>  		return 0;
>  
>  	/*
> @@ -51,6 +69,9 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
>  	if ((u32)idx == PERF_REG_ARM64_PC)
>  		return regs->pc;
>  
> +	if ((u32)idx >= PERF_REG_ARM64_MAX)
> +		return perf_ext_regs_value(idx);
> +
>  	return regs->regs[idx];
>  }
>  
> @@ -58,7 +79,12 @@ u64 perf_reg_value(struct pt_regs *regs, int idx)
>  
>  int perf_reg_validate(u64 mask)
>  {
> -	if (!mask || mask & REG_RESERVED)
> +	u64 reserved_mask = REG_RESERVED;
> +
> +	if (system_supports_sve())
> +		reserved_mask &= ~(1ULL << PERF_REG_ARM64_VG);
> +
> +	if (!mask || mask & reserved_mask)
>  		return -EINVAL;
>  
>  	return 0;
> diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
> index 59d3980b8ca2..3f07df5a7e95 100644
> --- a/drivers/perf/arm_pmu.c
> +++ b/drivers/perf/arm_pmu.c
> @@ -894,7 +894,7 @@ static struct arm_pmu *__armpmu_alloc(gfp_t flags)
>  		 * pmu::filter_match callback and pmu::event_init group
>  		 * validation).
>  		 */
> -		.capabilities	= PERF_PMU_CAP_HETEROGENEOUS_CPUS,
> +		.capabilities	= PERF_PMU_CAP_HETEROGENEOUS_CPUS | PERF_PMU_CAP_EXTENDED_REGS,

How does userspace know this capability is available? Should we advertise
the set of extended registers that we support, rather than make this a
one-trick pony for the vector length?

Also, you don't appear to #define PERF_REG_EXTENDED_MASK so I don't
understand how userspace is supposed to interact with this. Won't
has_extended_regs() always return false?

Will



More information about the linux-arm-kernel mailing list