[PATCH 8/8] drivers/perf: Add Apple icestorm/firestorm CPU PMU driver

Alyssa Rosenzweig alyssa at rosenzweig.io
Sun Nov 14 05:45:30 PST 2021


> +/* Counters */
> +#define SYS_IMP_APL_PMC0_EL1	sys_reg(3, 2, 15, 0, 0)
> +#define SYS_IMP_APL_PMC1_EL1	sys_reg(3, 2, 15, 1, 0)
> +#define SYS_IMP_APL_PMC2_EL1	sys_reg(3, 2, 15, 2, 0)
> +#define SYS_IMP_APL_PMC3_EL1	sys_reg(3, 2, 15, 3, 0)
> +#define SYS_IMP_APL_PMC4_EL1	sys_reg(3, 2, 15, 4, 0)
> +#define SYS_IMP_APL_PMC5_EL1	sys_reg(3, 2, 15, 5, 0)
> +#define SYS_IMP_APL_PMC6_EL1	sys_reg(3, 2, 15, 6, 0)
> +#define SYS_IMP_APL_PMC7_EL1	sys_reg(3, 2, 15, 7, 0)
> +#define SYS_IMP_APL_PMC8_EL1	sys_reg(3, 2, 15, 9, 0)
> +#define SYS_IMP_APL_PMC9_EL1	sys_reg(3, 2, 15, 10, 0)

Do we know what the gap is? PMC7 is sys_reg(3, 2, 15, 7, 0) but PMC8
jumps to sys_reg(3, 2, 15, 9, 0), so the CRm == 8 encoding is skipped.

> +/*
> + * Description of the events we actually know about, as well as those with
> + * a specific counter affinity. Yes, this is a grand total of two known
> + * counters, and the rest is anybody's guess.
> + *
> + * Not all counters can count all events. Counters #0 and #1 are wired to
> + * count cycles and instructions respectively, and some events have
> + * bizarre mappings (every other counter, or even *one* counter). These
> + * restrictins equally apply to both P and E cores.

restrictions

> +/* Low level accessors. No synchronisation. */
> +#define PMU_READ_COUNTER(_idx)						\
> +	case _idx:	return read_sysreg_s(SYS_IMP_APL_PMC## _idx ##_EL1)
> +
> +#define PMU_WRITE_COUNTER(_val, _idx)					\
> +	case _idx:							\
> +		write_sysreg_s(_val, SYS_IMP_APL_PMC## _idx ##_EL1);	\
> +		return
> +
> +static u64 m1_pmu_read_hw_counter(unsigned int index)
> +{
> +	switch (index) {
> +		PMU_READ_COUNTER(0);
> +		PMU_READ_COUNTER(1);
> +		PMU_READ_COUNTER(2);
> +		PMU_READ_COUNTER(3);
> +		PMU_READ_COUNTER(4);
> +		PMU_READ_COUNTER(5);
> +		PMU_READ_COUNTER(6);
> +		PMU_READ_COUNTER(7);
> +		PMU_READ_COUNTER(8);
> +		PMU_READ_COUNTER(9);
> +	}
> +
> +	BUG();
> +}
> +
> +static void m1_pmu_write_hw_counter(u64 val, unsigned int index)
> +{
> +	switch (index) {
> +		PMU_WRITE_COUNTER(val, 0);
> +		PMU_WRITE_COUNTER(val, 1);
> +		PMU_WRITE_COUNTER(val, 2);
> +		PMU_WRITE_COUNTER(val, 3);
> +		PMU_WRITE_COUNTER(val, 4);
> +		PMU_WRITE_COUNTER(val, 5);
> +		PMU_WRITE_COUNTER(val, 6);
> +		PMU_WRITE_COUNTER(val, 7);
> +		PMU_WRITE_COUNTER(val, 8);
> +		PMU_WRITE_COUNTER(val, 9);
> +	}
> +
> +	BUG();
> +}

Probably cleaner to use a single switch and no macros; the registers
become greppable and the code is shorter too. Caveat: didn't check if it
compiles -- read_sysreg_s()/write_sysreg_s() stringify their argument
into inline asm, which wants a compile-time-constant encoding, so this
may not work without the macros after all.

	static inline u64 m1_pmu_hw_counter(unsigned int index)
	{
		switch (index) {
		case 0: return SYS_IMP_APL_PMC0_EL1;
		case 1: return SYS_IMP_APL_PMC1_EL1;
		case 2: return SYS_IMP_APL_PMC2_EL1;
		case 3: return SYS_IMP_APL_PMC3_EL1;
		case 4: return SYS_IMP_APL_PMC4_EL1;
		case 5: return SYS_IMP_APL_PMC5_EL1;
		case 6: return SYS_IMP_APL_PMC6_EL1;
		case 7: return SYS_IMP_APL_PMC7_EL1;
		case 8: return SYS_IMP_APL_PMC8_EL1;
		case 9: return SYS_IMP_APL_PMC9_EL1;
		}

		BUG();
	}

	static u64 m1_pmu_read_hw_counter(unsigned int index)
	{
		return read_sysreg_s(m1_pmu_hw_counter(index));
	}

	static void m1_pmu_write_hw_counter(u64 val, unsigned int index)
	{
		write_sysreg_s(val, m1_pmu_hw_counter(index));
	}

> +static void __m1_pmu_enable_counter(unsigned int index, bool en)
> +{
> +	u64 val, bit;
> +
> +	switch (index) {
> +	case 0 ... 7:
> +		bit = BIT(get_bit_offset(index, PMCR0_CNT_ENABLE_0_7));
> +		break;
> +	case 8 ... 9:
> +		bit = BIT(get_bit_offset(index - 8, PMCR0_CNT_ENABLE_8_9));
> +		break;
> +	default:
> +		BUG();
> +	}
> +
> +	val = read_sysreg_s(SYS_IMP_APL_PMCR0_EL1);
> +
> +	if (en)
> +		val |= bit;
> +	else
> +		val &= ~bit;
> +
> +	write_sysreg_s(val, SYS_IMP_APL_PMCR0_EL1);
> +}
...
> +static void __m1_pmu_enable_counter_interrupt(unsigned int index, bool en)
> +{
> +	u64 val, bit;
> +
> +	switch (index) {
> +	case 0 ... 7:
> +		bit = BIT(get_bit_offset(index, PMCR0_PMI_ENABLE_0_7));
> +		break;
> +	case 8 ... 9:
> +		bit = BIT(get_bit_offset(index - 8, PMCR0_PMI_ENABLE_8_9));
> +		break;
> +	default:
> +		BUG();
> +	}
> +
> +	val = read_sysreg_s(SYS_IMP_APL_PMCR0_EL1);
> +
> +	if (en)
> +		val |= bit;
> +	else
> +		val &= ~bit;
> +
> +	write_sysreg_s(val, SYS_IMP_APL_PMCR0_EL1);
> +}

These two helper functions have basically the same logic -- maybe worth combining?
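
Something like this, perhaps -- a sketch only (the __m1_pmu_ctl_counter()
name is made up, and it assumes the patch's get_bit_offset() is happy
with a non-constant mask):

	static void __m1_pmu_ctl_counter(unsigned int index, u64 mask_0_7,
					 u64 mask_8_9, bool en)
	{
		u64 val, bit;

		switch (index) {
		case 0 ... 7:
			bit = BIT(get_bit_offset(index, mask_0_7));
			break;
		case 8 ... 9:
			bit = BIT(get_bit_offset(index - 8, mask_8_9));
			break;
		default:
			BUG();
		}

		val = read_sysreg_s(SYS_IMP_APL_PMCR0_EL1);

		if (en)
			val |= bit;
		else
			val &= ~bit;

		write_sysreg_s(val, SYS_IMP_APL_PMCR0_EL1);
	}

	static void __m1_pmu_enable_counter(unsigned int index, bool en)
	{
		__m1_pmu_ctl_counter(index, PMCR0_CNT_ENABLE_0_7,
				     PMCR0_CNT_ENABLE_8_9, en);
	}

	static void __m1_pmu_enable_counter_interrupt(unsigned int index,
						      bool en)
	{
		__m1_pmu_ctl_counter(index, PMCR0_PMI_ENABLE_0_7,
				     PMCR0_PMI_ENABLE_8_9, en);
	}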

> +static void m1_pmu_configure_counter(unsigned int index, u8 event,
> +				     bool user, bool kernel)
> +{
...
> +	switch (index) {
> +	case 0 ... 1:
> +		/* 0 and 1 have fixed events */
> +		break;
> +	case 2 ... 5:
> +		shift = (index - 2) * 8;
> +		val = read_sysreg_s(SYS_IMP_APL_PMESR0_EL1);
> +		val &= ~((u64)0xff << shift);
> +		val |= (u64)event << shift;
> +		write_sysreg_s(val, SYS_IMP_APL_PMESR0_EL1);
> +		break;
> +	case 6 ... 9:
> +		shift = (index - 6) * 8;
> +		val = read_sysreg_s(SYS_IMP_APL_PMESR1_EL1);
> +		val &= ~((u64)0xff << shift);
> +		val |= (u64)event << shift;
> +		write_sysreg_s(val, SYS_IMP_APL_PMESR1_EL1);
> +		break;
> +	}
> +}

I'd love an explanation of what's happening here.
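
If I'm reading the shifts right, PMESR0 and PMESR1 each hold four 8-bit
event selectors, for counters 2-5 and 6-9 respectively. A comment along
these lines would help (my guess at the semantics, untested):

	/*
	 * Counters 0 and 1 count fixed events and have no selector.
	 * PMESR0 holds one event selector byte per counter for
	 * counters 2-5 (counter 2 in bits [7:0]), and PMESR1 does
	 * the same for counters 6-9.
	 */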

> +	/*
> +	 * Place the event on the first free counter that can count
> +	 * this event.
> +	 *
> +	 * We could do a better job if we had a view of all the events
> +	 * counting on the PMU at any given time, and by placing the
> +	 * most constraint events first.
> +	 */

constrained
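
For reference, the placement this comment describes presumably boils
down to a loop like the one below (a reconstruction, not the patch's
code -- m1_pmu_event_affinity[] and M1_PMU_NR_COUNTERS are stand-in
names):

	static int m1_pmu_get_event_idx(struct pmu_hw_events *cpuc,
					struct perf_event *event)
	{
		unsigned long affinity;
		int idx;

		affinity = m1_pmu_event_affinity[event->hw.config_base & 0xff];

		/* First free counter whose affinity mask allows this event */
		for_each_set_bit(idx, &affinity, M1_PMU_NR_COUNTERS) {
			if (!test_and_set_bit(idx, cpuc->used_mask))
				return idx;
		}

		return -EAGAIN;
	}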

> +static int m1_pmu_device_probe(struct platform_device *pdev)
> +{
> +	int ret;
> +
> +	ret = arm_pmu_device_probe(pdev, m1_pmu_of_device_ids, NULL);
> +	if (!ret) {
> +		/*
> +		 * If probe succeeds, taint the kernel as this is all
> +		 * undocumented, implementation defined black magic.
> +		 */
> +		add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
> +	}
> +
> +	return ret;
> +}

What are the implications of this taint? You could say that about every
driver we've written for the M1, but...


