[PATCH 3/5] arm64/perf: Add Broadcom Vulcan PMU support

Will Deacon will.deacon at arm.com
Tue Mar 22 03:01:11 PDT 2016


Hi Ashok,

On Wed, Mar 16, 2016 at 06:01:47AM -0700, Ashok Kumar wrote:
> Broadcom Vulcan uses ARMv8 PMUv3 and supports most of
> the ARMv8 recommended implementation-defined events.
> 
> Added Vulcan events mapping for perf and perf_cache.
> 
> Created a separate event_attrs structure for Vulcan as
> it supports more events and doesn't support a few events
> (like PC_WRITE, MEM_ERROR) from the generic armv8
> event_attrs structure.
> 
> Signed-off-by: Ashok Kumar <ashoks at broadcom.com>
> ---
>  arch/arm64/kernel/perf_event.c | 253 +++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 253 insertions(+)
> 
> diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
> index 3207b5f..1bb06d3 100644
> --- a/arch/arm64/kernel/perf_event.c
> +++ b/arch/arm64/kernel/perf_event.c
> @@ -232,6 +232,20 @@ static const unsigned armv8_thunder_perf_map[PERF_COUNT_HW_MAX] = {
>  	[PERF_COUNT_HW_STALLED_CYCLES_BACKEND]	= ARMV8_PMUV3_PERFCTR_STALL_BACKEND,
>  };
>  
> +/* Broadcom Vulcan events mapping */
> +static const unsigned armv8_vulcan_perf_map[PERF_COUNT_HW_MAX] = {
> +	PERF_MAP_ALL_UNSUPPORTED,
> +	[PERF_COUNT_HW_CPU_CYCLES]		= ARMV8_PMUV3_PERFCTR_CLOCK_CYCLES,
> +	[PERF_COUNT_HW_INSTRUCTIONS]		= ARMV8_PMUV3_PERFCTR_INSTR_EXECUTED,
> +	[PERF_COUNT_HW_CACHE_REFERENCES]	= ARMV8_PMUV3_PERFCTR_L1D_CACHE_ACCESS,
> +	[PERF_COUNT_HW_CACHE_MISSES]		= ARMV8_PMUV3_PERFCTR_L1D_CACHE_REFILL,
> +	[PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= ARMV8_PMUV3_PERFCTR_BR_RETIRED,
> +	[PERF_COUNT_HW_BRANCH_MISSES]		= ARMV8_PMUV3_PERFCTR_PC_BRANCH_MIS_PRED,
> +	[PERF_COUNT_HW_BUS_CYCLES]		= ARMV8_PMUV3_PERFCTR_BUS_CYCLES,
> +	[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND]	= ARMV8_PMUV3_PERFCTR_STALL_FRONTEND,
> +	[PERF_COUNT_HW_STALLED_CYCLES_BACKEND]	= ARMV8_PMUV3_PERFCTR_STALL_BACKEND,
> +};
> +
>  static const unsigned armv8_pmuv3_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
>  						[PERF_COUNT_HW_CACHE_OP_MAX]
>  						[PERF_COUNT_HW_CACHE_RESULT_MAX] = {
> @@ -324,6 +338,36 @@ static const unsigned armv8_thunder_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
>  	[C(BPU)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV8_PMUV3_PERFCTR_PC_BRANCH_MIS_PRED,
>  };
>  
> +static const unsigned armv8_vulcan_perf_cache_map[PERF_COUNT_HW_CACHE_MAX]
> +					      [PERF_COUNT_HW_CACHE_OP_MAX]
> +					      [PERF_COUNT_HW_CACHE_RESULT_MAX] = {
> +	PERF_CACHE_MAP_ALL_UNSUPPORTED,
> +
> +	[C(L1D)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_ACCESS_LD,
> +	[C(L1D)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_LD,
> +	[C(L1D)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_ACCESS_ST,
> +	[C(L1D)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_ST,
> +
> +	[C(L1I)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_PMUV3_PERFCTR_L1I_CACHE_ACCESS,
> +	[C(L1I)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_PMUV3_PERFCTR_L1I_CACHE_REFILL,
> +
> +	[C(ITLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_PMUV3_PERFCTR_L1I_TLB_REFILL,
> +	[C(ITLB)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_PMUV3_PERFCTR_L1I_TLB_ACCESS,
> +
> +	[C(DTLB)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_L1D_TLB_ACCESS_LD,
> +	[C(DTLB)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_L1D_TLB_ACCESS_ST,
> +	[C(DTLB)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_LD,
> +	[C(DTLB)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_ST,
> +
> +	[C(BPU)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_PMUV3_PERFCTR_PC_BRANCH_PRED,
> +	[C(BPU)][C(OP_READ)][C(RESULT_MISS)]	= ARMV8_PMUV3_PERFCTR_PC_BRANCH_MIS_PRED,
> +	[C(BPU)][C(OP_WRITE)][C(RESULT_ACCESS)]	= ARMV8_PMUV3_PERFCTR_PC_BRANCH_PRED,
> +	[C(BPU)][C(OP_WRITE)][C(RESULT_MISS)]	= ARMV8_PMUV3_PERFCTR_PC_BRANCH_MIS_PRED,
> +
> +	[C(NODE)][C(OP_READ)][C(RESULT_ACCESS)]	= ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_LD,
> +	[C(NODE)][C(OP_WRITE)][C(RESULT_ACCESS)] = ARMV8_IMPDEF_PERFCTR_BUS_ACCESS_ST,
> +};

I'm fine with this part of the patch...

>  #define ARMV8_EVENT_ATTR_RESOLVE(m) #m
>  #define ARMV8_EVENT_ATTR(name, config) \
>  	PMU_EVENT_ATTR_STRING(name, armv8_event_attr_##name, \
> @@ -379,6 +423,74 @@ ARMV8_EVENT_ATTR(l2i_tlb_refill, ARMV8_PMUV3_PERFCTR_L2I_TLB_REFILL);
>  ARMV8_EVENT_ATTR(l2d_tlb_access, ARMV8_PMUV3_PERFCTR_L2D_TLB_ACCESS);
>  ARMV8_EVENT_ATTR(l2i_tlb_access, ARMV8_PMUV3_PERFCTR_L2I_TLB_ACCESS);
>  
> +ARMV8_EVENT_ATTR(l1d_cache_access_ld, ARMV8_IMPDEF_PERFCTR_L1D_CACHE_ACCESS_LD);
> +ARMV8_EVENT_ATTR(l1d_cache_refill_ld, ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_LD);
> +ARMV8_EVENT_ATTR(l1d_cache_access_st, ARMV8_IMPDEF_PERFCTR_L1D_CACHE_ACCESS_ST);
> +ARMV8_EVENT_ATTR(l1d_cache_refill_st, ARMV8_IMPDEF_PERFCTR_L1D_CACHE_REFILL_ST);
> +ARMV8_EVENT_ATTR(l1d_tlb_access_ld, ARMV8_IMPDEF_PERFCTR_L1D_TLB_ACCESS_LD);
> +ARMV8_EVENT_ATTR(l1d_tlb_access_st, ARMV8_IMPDEF_PERFCTR_L1D_TLB_ACCESS_ST);
> +ARMV8_EVENT_ATTR(l1d_tlb_refill_ld, ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_LD);
> +ARMV8_EVENT_ATTR(l1d_tlb_refill_st, ARMV8_IMPDEF_PERFCTR_L1D_TLB_REFILL_ST);

[...]

> +static struct attribute *vulcan_pmuv3_event_attrs[] = {
> +	&armv8_event_attr_sw_incr.attr.attr,
> +	&armv8_event_attr_l1i_cache_refill.attr.attr,
> +	&armv8_event_attr_l1i_tlb_refill.attr.attr,
> +	&armv8_event_attr_l1d_cache_refill.attr.attr,
> +	&armv8_event_attr_l1d_cache_access.attr.attr,
> +	&armv8_event_attr_l1d_tlb_refill.attr.attr,
> +	&armv8_event_attr_ld_retired.attr.attr,
> +	&armv8_event_attr_st_retired.attr.attr,
> +	&armv8_event_attr_inst_retired.attr.attr,
> +	&armv8_event_attr_exc_taken.attr.attr,
> +	&armv8_event_attr_exc_return.attr.attr,

... but I'm not keen on having these tables in the kernel for each CPU
PMU we support. Where I'd like to get to is:

  * We expose the architected events (0x0-0x3f) in /sys using the existing
    PMUv3 tables in conjunction with PMCEIDn_EL0 (Jan mentioned this before)

  * Userspace knows about the micro-architecture-specific events for a
    given PMU

If there really is a need to have this in the kernel, then I think we
should construct the tables at runtime using a bitmap, much like I'd
like to do with PMCEIDn_EL0. That would mean having a bitmap for each
compatible string, as opposed to a table of pointers in the kernel image.

I still need to be convinced that this doesn't belong in userspace,
though.

Will



More information about the linux-arm-kernel mailing list