[PATCH 8/8] drivers/perf: Add Apple icestorm/firestorm CPU PMU driver

Marc Zyngier maz at kernel.org
Sun Nov 14 10:35:47 PST 2021


On Sun, 14 Nov 2021 13:45:30 +0000,
Alyssa Rosenzweig <alyssa at rosenzweig.io> wrote:
> 
> > +/* Counters */
> > +#define SYS_IMP_APL_PMC0_EL1	sys_reg(3, 2, 15, 0, 0)
> > +#define SYS_IMP_APL_PMC1_EL1	sys_reg(3, 2, 15, 1, 0)
> > +#define SYS_IMP_APL_PMC2_EL1	sys_reg(3, 2, 15, 2, 0)
> > +#define SYS_IMP_APL_PMC3_EL1	sys_reg(3, 2, 15, 3, 0)
> > +#define SYS_IMP_APL_PMC4_EL1	sys_reg(3, 2, 15, 4, 0)
> > +#define SYS_IMP_APL_PMC5_EL1	sys_reg(3, 2, 15, 5, 0)
> > +#define SYS_IMP_APL_PMC6_EL1	sys_reg(3, 2, 15, 6, 0)
> > +#define SYS_IMP_APL_PMC7_EL1	sys_reg(3, 2, 15, 7, 0)
> --gap--
> > +#define SYS_IMP_APL_PMC8_EL1	sys_reg(3, 2, 15, 9, 0)
> > +#define SYS_IMP_APL_PMC9_EL1	sys_reg(3, 2, 15, 10, 0)
> 
> Do we know what the gap is?

If it exists, it is an IMPDEF register.

> 
> > +/*
> > + * Description of the events we actually know about, as well as those with
> > + * a specific counter affinity. Yes, this is a grand total of two known
> > + * counters, and the rest is anybody's guess.
> > + *
> > + * Not all counters can count all events. Counters #0 and #1 are wired to
> > + * count cycles and instructions respectively, and some events have
> > + * bizarre mappings (every other counter, or even *one* counter). These
> > + * restrictins equally apply to both P and E cores.
> 
> restrictions
> 
> > +/* Low level accessors. No synchronisation. */
> > +#define PMU_READ_COUNTER(_idx)						\
> > +	case _idx:	return read_sysreg_s(SYS_IMP_APL_PMC## _idx ##_EL1)
> > +
> > +#define PMU_WRITE_COUNTER(_val, _idx)					\
> > +	case _idx:							\
> > +		write_sysreg_s(_val, SYS_IMP_APL_PMC## _idx ##_EL1);	\
> > +		return
> > +
> > +static u64 m1_pmu_read_hw_counter(unsigned int index)
> > +{
> > +	switch (index) {
> > +		PMU_READ_COUNTER(0);
> > +		PMU_READ_COUNTER(1);
> > +		PMU_READ_COUNTER(2);
> > +		PMU_READ_COUNTER(3);
> > +		PMU_READ_COUNTER(4);
> > +		PMU_READ_COUNTER(5);
> > +		PMU_READ_COUNTER(6);
> > +		PMU_READ_COUNTER(7);
> > +		PMU_READ_COUNTER(8);
> > +		PMU_READ_COUNTER(9);
> > +	}
> > +
> > +	BUG();
> > +}
> > +
> > +static void m1_pmu_write_hw_counter(u64 val, unsigned int index)
> > +{
> > +	switch (index) {
> > +		PMU_WRITE_COUNTER(val, 0);
> > +		PMU_WRITE_COUNTER(val, 1);
> > +		PMU_WRITE_COUNTER(val, 2);
> > +		PMU_WRITE_COUNTER(val, 3);
> > +		PMU_WRITE_COUNTER(val, 4);
> > +		PMU_WRITE_COUNTER(val, 5);
> > +		PMU_WRITE_COUNTER(val, 6);
> > +		PMU_WRITE_COUNTER(val, 7);
> > +		PMU_WRITE_COUNTER(val, 8);
> > +		PMU_WRITE_COUNTER(val, 9);
> > +	}
> > +
> > +	BUG();
> > +}
> 
> Probably cleaner to use a single switch and no macros, registers become
> greppable and the code is shorter too. Caveat: didn't check if it
> compiles.

I can tell you haven't!

> 
> 	static inline u64 m1_pmu_hw_counter(unsigned int index)
> 	{
> 		switch (index) {
> 		case 0: return SYS_IMP_APL_PMC0_EL1;
> 		case 1: return SYS_IMP_APL_PMC1_EL1;
> 		case 2: return SYS_IMP_APL_PMC2_EL1;
> 		case 3: return SYS_IMP_APL_PMC3_EL1;
> 		case 4: return SYS_IMP_APL_PMC4_EL1;
> 		case 5: return SYS_IMP_APL_PMC5_EL1;
> 		case 6: return SYS_IMP_APL_PMC6_EL1;
> 		case 7: return SYS_IMP_APL_PMC7_EL1;
> 		case 8: return SYS_IMP_APL_PMC8_EL1;
> 		case 9: return SYS_IMP_APL_PMC9_EL1;
> 		}
> 
> 		BUG();
> 	}
> 
> 	static u64 m1_pmu_read_hw_counter(unsigned int index) {
> 		return read_sysreg_s(m1_pmu_hw_counter(index));

read/write_sysreg_s() cannot take a variable as a parameter. They rely
on direct macro expansion to generate the inline asm. See the various
examples all over the code base where we have similar constructs for
this exact reason.

> 	}
> 
> 
> 	static void m1_pmu_write_hw_counter(u64 val, unsigned int index)
> 	{
> 		write_sysreg_s(val, m1_pmu_hw_counter(index));
> 	}
> 
> > +static void __m1_pmu_enable_counter(unsigned int index, bool en)
> > +{
> > +	u64 val, bit;
> > +
> > +	switch (index) {
> > +	case 0 ... 7:
> > +		bit = BIT(get_bit_offset(index, PMCR0_CNT_ENABLE_0_7));
> > +		break;
> > +	case 8 ... 9:
> > +		bit = BIT(get_bit_offset(index - 8, PMCR0_CNT_ENABLE_8_9));
> > +		break;
> > +	default:
> > +		BUG();
> > +	}
> > +
> > +	val = read_sysreg_s(SYS_IMP_APL_PMCR0_EL1);
> > +
> > +	if (en)
> > +		val |= bit;
> > +	else
> > +		val &= ~bit;
> > +
> > +	write_sysreg_s(val, SYS_IMP_APL_PMCR0_EL1);
> > +}
> ...
> > +static void __m1_pmu_enable_counter_interrupt(unsigned int index, bool en)
> > +{
> > +	u64 val, bit;
> > +
> > +	switch (index) {
> > +	case 0 ... 7:
> > +		bit = BIT(get_bit_offset(index, PMCR0_PMI_ENABLE_0_7));
> > +		break;
> > +	case 8 ... 9:
> > +		bit = BIT(get_bit_offset(index - 8, PMCR0_PMI_ENABLE_8_9));
> > +		break;
> > +	default:
> > +		BUG();
> > +	}
> > +
> > +	val = read_sysreg_s(SYS_IMP_APL_PMCR0_EL1);
> > +
> > +	if (en)
> > +		val |= bit;
> > +	else
> > +		val &= ~bit;
> > +
> > +	write_sysreg_s(val, SYS_IMP_APL_PMCR0_EL1);
> > +}
> 
> These two helper functions have basically the same logic -- maybe
> worth combining?

I've tried, and I find the result pretty unconvincing, see below.

diff --git a/drivers/perf/apple_m1_cpu_pmu.c b/drivers/perf/apple_m1_cpu_pmu.c
index bc991fc892eb..40f9e14b12da 100644
--- a/drivers/perf/apple_m1_cpu_pmu.c
+++ b/drivers/perf/apple_m1_cpu_pmu.c
@@ -223,29 +223,38 @@ static void m1_pmu_write_hw_counter(u64 val, unsigned int index)
 
 #define get_bit_offset(index, mask)	(__ffs(mask) + (index))
 
-static void __m1_pmu_enable_counter(unsigned int index, bool en)
+static void __m1_pmu_write_pmcr0(unsigned int index, bool en, u64 mask)
 {
 	u64 val, bit;
 
+	bit = BIT(get_bit_offset(index, mask));
+	val = read_sysreg_s(SYS_IMP_APL_PMCR0_EL1);
+
+	if (en)
+		val |= bit;
+	else
+		val &= ~bit;
+
+	write_sysreg_s(val, SYS_IMP_APL_PMCR0_EL1);
+}
+
+static void __m1_pmu_enable_counter(unsigned int index, bool en)
+{
+	u64 mask;
+
 	switch (index) {
 	case 0 ... 7:
-		bit = BIT(get_bit_offset(index, PMCR0_CNT_ENABLE_0_7));
+		mask = PMCR0_CNT_ENABLE_0_7;
 		break;
 	case 8 ... 9:
-		bit = BIT(get_bit_offset(index - 8, PMCR0_CNT_ENABLE_8_9));
+		mask = PMCR0_CNT_ENABLE_8_9;
+		index -= 8;
 		break;
 	default:
 		BUG();
 	}
 
-	val = read_sysreg_s(SYS_IMP_APL_PMCR0_EL1);
-
-	if (en)
-		val |= bit;
-	else
-		val &= ~bit;
-
-	write_sysreg_s(val, SYS_IMP_APL_PMCR0_EL1);
+	__m1_pmu_write_pmcr0(index, en, mask);
 }
 
 static void m1_pmu_enable_counter(unsigned int index)
@@ -260,27 +269,21 @@ static void m1_pmu_disable_counter(unsigned int index)
 
 static void __m1_pmu_enable_counter_interrupt(unsigned int index, bool en)
 {
-	u64 val, bit;
+	u64 mask;
 
 	switch (index) {
 	case 0 ... 7:
-		bit = BIT(get_bit_offset(index, PMCR0_PMI_ENABLE_0_7));
+		mask =  PMCR0_PMI_ENABLE_0_7;
 		break;
 	case 8 ... 9:
-		bit = BIT(get_bit_offset(index - 8, PMCR0_PMI_ENABLE_8_9));
+		mask = PMCR0_PMI_ENABLE_8_9;
+		index -= 8;
 		break;
 	default:
 		BUG();
 	}
 
-	val = read_sysreg_s(SYS_IMP_APL_PMCR0_EL1);
-
-	if (en)
-		val |= bit;
-	else
-		val &= ~bit;
-
-	write_sysreg_s(val, SYS_IMP_APL_PMCR0_EL1);
+	__m1_pmu_write_pmcr0(index, en, mask);
 }
 
 static void m1_pmu_enable_counter_interrupt(unsigned int index)

>
> > +static void m1_pmu_configure_counter(unsigned int index, u8 event,
> > +				     bool user, bool kernel)
> > +{
> ....
> > +	switch (index) {
> > +	case 0 ... 1:
> > +		/* 0 and 1 have fixed events */
> > +		break;
> > +	case 2 ... 5:
> > +		shift = (index - 2) * 8;
> > +		val = read_sysreg_s(SYS_IMP_APL_PMESR0_EL1);
> > +		val &= ~((u64)0xff << shift);
> > +		val |= (u64)event << shift;
> > +		write_sysreg_s(val, SYS_IMP_APL_PMESR0_EL1);
> > +		break;
> > +	case 6 ... 9:
> > +		shift = (index - 6) * 8;
> > +		val = read_sysreg_s(SYS_IMP_APL_PMESR1_EL1);
> > +		val &= ~((u64)0xff << shift);
> > +		val |= (u64)event << shift;
> > +		write_sysreg_s(val, SYS_IMP_APL_PMESR1_EL1);
> > +		break;
> > +	}
> > +}
> 
> I'd love an explanation what's happening here.

PMESR{0,1} contain the event numbers for counters 2-5, resp 6-9, and
we happily shove an event number there. Is that the sort of
information you are after?

> 
> > +	/*
> > +	 * Place the event on the first free counter that can count
> > +	 * this event.
> > +	 *
> > +	 * We could do a better job if we had a view of all the events
> > +	 * counting on the PMU at any given time, and by placing the
> > +	 * most constraint events first.
> > +	 */
> 
> constraining
> 
> > +static int m1_pmu_device_probe(struct platform_device *pdev)
> > +{
> > +	int ret;
> > +
> > +	ret = arm_pmu_device_probe(pdev, m1_pmu_of_device_ids, NULL);
> > +	if (!ret) {
> > +		/*
> > +		 * If probe succeeds, taint the kernel as this is all
> > +		 * undocumented, implementation defined black magic.
> > +		 */
> > +		add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
> > +	}
> > +
> > +	return ret;
> > +}
> 
> What are the implications of this taint? You could say that about every
> driver we've written for the M1, but...

This taint status appears in a kernel crash dump, telling people the
kernel is using CPU features that are not documented and about which
we can only hope that they work as we think they work. The same thing
takes place with KVM and the use of unadvertised GICv3 CPU registers.

Ideally, this would happen at early boot because of the fixed VHE
crap, but this is more effort than it deserves.

Thanks,

	M.

-- 
Without deviation from the norm, progress is not possible.



More information about the linux-arm-kernel mailing list