[PATCH 2/3] perf/arm-cmn: Rework DTC counters (again)

Ilkka Koskinen ilkka at os.amperecomputing.com
Fri Oct 20 15:53:10 PDT 2023


On Fri, 20 Oct 2023, Robin Murphy wrote:
> The bitmap-based scheme for tracking DTC counter usage turns out to be a
> complete dead-end for its imagined purpose, since by the time we have to
> keep track of a per-DTC counter index anyway, we already have enough
> information to make the bitmap itself redundant. Revert the remains of
> it back to almost the original scheme, but now expanded to track per-DTC
> indices, in preparation for making use of them in anger.
>
> Note that since cycle count events always use a dedicated counter on a
> single DTC, we reuse the field to encode their DTC index directly.
>
> Signed-off-by: Robin Murphy <robin.murphy at arm.com>

Thanks! I had that on my task list but never had time to start working on 
it.

Reviewed-by: Ilkka Koskinen <ilkka at os.amperecomputing.com>

Cheers, Ilkka

> ---
> drivers/perf/arm-cmn.c | 126 +++++++++++++++++++++--------------------
> 1 file changed, 64 insertions(+), 62 deletions(-)
>
> diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
> index f1ac8d0cdb3b..675f1638013e 100644
> --- a/drivers/perf/arm-cmn.c
> +++ b/drivers/perf/arm-cmn.c
> @@ -281,16 +281,13 @@ struct arm_cmn_node {
> 	u16 id, logid;
> 	enum cmn_node_type type;
>
> -	int dtm;
> -	union {
> -		/* DN/HN-F/CXHA */
> -		struct {
> -			u8 val : 4;
> -			u8 count : 4;
> -		} occupid[SEL_MAX];
> -		/* XP */
> -		u8 dtc;
> -	};
> +	u8 dtm;
> +	s8 dtc;
> +	/* DN/HN-F/CXHA */
> +	struct {
> +		u8 val : 4;
> +		u8 count : 4;
> +	} occupid[SEL_MAX];
> 	union {
> 		u8 event[4];
> 		__le32 event_sel;
> @@ -540,12 +537,12 @@ static int arm_cmn_map_show(struct seq_file *s, void *data)
>
> 		seq_puts(s, "\n     |");
> 		for (x = 0; x < cmn->mesh_x; x++) {
> -			u8 dtc = cmn->xps[xp_base + x].dtc;
> +			s8 dtc = cmn->xps[xp_base + x].dtc;
>
> -			if (dtc & (dtc - 1))
> +			if (dtc < 0)
> 				seq_puts(s, " DTC ?? |");
> 			else
> -				seq_printf(s, " DTC %ld  |", __ffs(dtc));
> +				seq_printf(s, " DTC %d  |", dtc);
> 		}
> 		seq_puts(s, "\n     |");
> 		for (x = 0; x < cmn->mesh_x; x++)
> @@ -589,8 +586,7 @@ static void arm_cmn_debugfs_init(struct arm_cmn *cmn, int id) {}
> struct arm_cmn_hw_event {
> 	struct arm_cmn_node *dn;
> 	u64 dtm_idx[4];
> -	unsigned int dtc_idx;
> -	u8 dtcs_used;
> +	s8 dtc_idx[CMN_MAX_DTCS];
> 	u8 num_dns;
> 	u8 dtm_offset;
> 	bool wide_sel;
> @@ -600,6 +596,10 @@ struct arm_cmn_hw_event {
> #define for_each_hw_dn(hw, dn, i) \
> 	for (i = 0, dn = hw->dn; i < hw->num_dns; i++, dn++)
>
> +/* @i is the DTC number, @idx is the counter index on that DTC */
> +#define for_each_hw_dtc_idx(hw, i, idx) \
> +	for (int i = 0, idx; i < CMN_MAX_DTCS; i++) if ((idx = hw->dtc_idx[i]) >= 0)
> +
> static struct arm_cmn_hw_event *to_cmn_hw(struct perf_event *event)
> {
> 	BUILD_BUG_ON(sizeof(struct arm_cmn_hw_event) > offsetof(struct hw_perf_event, target));
> @@ -1429,12 +1429,11 @@ static void arm_cmn_init_counter(struct perf_event *event)
> {
> 	struct arm_cmn *cmn = to_cmn(event->pmu);
> 	struct arm_cmn_hw_event *hw = to_cmn_hw(event);
> -	unsigned int i, pmevcnt = CMN_DT_PMEVCNT(hw->dtc_idx);
> 	u64 count;
>
> -	for (i = 0; hw->dtcs_used & (1U << i); i++) {
> -		writel_relaxed(CMN_COUNTER_INIT, cmn->dtc[i].base + pmevcnt);
> -		cmn->dtc[i].counters[hw->dtc_idx] = event;
> +	for_each_hw_dtc_idx(hw, i, idx) {
> +		writel_relaxed(CMN_COUNTER_INIT, cmn->dtc[i].base + CMN_DT_PMEVCNT(idx));
> +		cmn->dtc[i].counters[idx] = event;
> 	}
>
> 	count = arm_cmn_read_dtm(cmn, hw, false);
> @@ -1447,11 +1446,9 @@ static void arm_cmn_event_read(struct perf_event *event)
> 	struct arm_cmn_hw_event *hw = to_cmn_hw(event);
> 	u64 delta, new, prev;
> 	unsigned long flags;
> -	unsigned int i;
>
> -	if (hw->dtc_idx == CMN_DT_NUM_COUNTERS) {
> -		i = __ffs(hw->dtcs_used);
> -		delta = arm_cmn_read_cc(cmn->dtc + i);
> +	if (CMN_EVENT_TYPE(event) == CMN_TYPE_DTC) {
> +		delta = arm_cmn_read_cc(cmn->dtc + hw->dtc_idx[0]);
> 		local64_add(delta, &event->count);
> 		return;
> 	}
> @@ -1461,8 +1458,8 @@ static void arm_cmn_event_read(struct perf_event *event)
> 	delta = new - prev;
>
> 	local_irq_save(flags);
> -	for (i = 0; hw->dtcs_used & (1U << i); i++) {
> -		new = arm_cmn_read_counter(cmn->dtc + i, hw->dtc_idx);
> +	for_each_hw_dtc_idx(hw, i, idx) {
> +		new = arm_cmn_read_counter(cmn->dtc + i, idx);
> 		delta += new << 16;
> 	}
> 	local_irq_restore(flags);
> @@ -1518,7 +1515,7 @@ static void arm_cmn_event_start(struct perf_event *event, int flags)
> 	int i;
>
> 	if (type == CMN_TYPE_DTC) {
> -		i = __ffs(hw->dtcs_used);
> +		i = hw->dtc_idx[0];
> 		writeq_relaxed(CMN_CC_INIT, cmn->dtc[i].base + CMN_DT_PMCCNTR);
> 		cmn->dtc[i].cc_active = true;
> 	} else if (type == CMN_TYPE_WP) {
> @@ -1549,7 +1546,7 @@ static void arm_cmn_event_stop(struct perf_event *event, int flags)
> 	int i;
>
> 	if (type == CMN_TYPE_DTC) {
> -		i = __ffs(hw->dtcs_used);
> +		i = hw->dtc_idx[0];
> 		cmn->dtc[i].cc_active = false;
> 	} else if (type == CMN_TYPE_WP) {
> 		int wp_idx = arm_cmn_wp_idx(event);
> @@ -1735,12 +1732,19 @@ static int arm_cmn_event_init(struct perf_event *event)
> 	hw->dn = arm_cmn_node(cmn, type);
> 	if (!hw->dn)
> 		return -EINVAL;
> +
> +	memset(hw->dtc_idx, -1, sizeof(hw->dtc_idx));
> 	for (dn = hw->dn; dn->type == type; dn++) {
> 		if (bynodeid && dn->id != nodeid) {
> 			hw->dn++;
> 			continue;
> 		}
> 		hw->num_dns++;
> +		if (dn->dtc < 0)
> +			memset(hw->dtc_idx, 0, cmn->num_dtcs);
> +		else
> +			hw->dtc_idx[dn->dtc] = 0;
> +
> 		if (bynodeid)
> 			break;
> 	}
> @@ -1752,12 +1756,6 @@ static int arm_cmn_event_init(struct perf_event *event)
> 			nodeid, nid.x, nid.y, nid.port, nid.dev, type);
> 		return -EINVAL;
> 	}
> -	/*
> -	 * Keep assuming non-cycles events count in all DTC domains; turns out
> -	 * it's hard to make a worthwhile optimisation around this, short of
> -	 * going all-in with domain-local counter allocation as well.
> -	 */
> -	hw->dtcs_used = (1U << cmn->num_dtcs) - 1;
>
> 	return arm_cmn_validate_group(cmn, event);
> }
> @@ -1783,28 +1781,25 @@ static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event,
> 	}
> 	memset(hw->dtm_idx, 0, sizeof(hw->dtm_idx));
>
> -	for (i = 0; hw->dtcs_used & (1U << i); i++)
> -		cmn->dtc[i].counters[hw->dtc_idx] = NULL;
> +	for_each_hw_dtc_idx(hw, j, idx)
> +		cmn->dtc[j].counters[idx] = NULL;
> }
>
> static int arm_cmn_event_add(struct perf_event *event, int flags)
> {
> 	struct arm_cmn *cmn = to_cmn(event->pmu);
> 	struct arm_cmn_hw_event *hw = to_cmn_hw(event);
> -	struct arm_cmn_dtc *dtc = &cmn->dtc[0];
> 	struct arm_cmn_node *dn;
> 	enum cmn_node_type type = CMN_EVENT_TYPE(event);
> -	unsigned int i, dtc_idx, input_sel;
> +	unsigned int input_sel, i = 0;
>
> 	if (type == CMN_TYPE_DTC) {
> -		i = 0;
> 		while (cmn->dtc[i].cycles)
> 			if (++i == cmn->num_dtcs)
> 				return -ENOSPC;
>
> 		cmn->dtc[i].cycles = event;
> -		hw->dtc_idx = CMN_DT_NUM_COUNTERS;
> -		hw->dtcs_used = 1U << i;
> +		hw->dtc_idx[0] = i;
>
> 		if (flags & PERF_EF_START)
> 			arm_cmn_event_start(event, 0);
> @@ -1812,17 +1807,22 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
> 	}
>
> 	/* Grab a free global counter first... */
> -	dtc_idx = 0;
> -	while (dtc->counters[dtc_idx])
> -		if (++dtc_idx == CMN_DT_NUM_COUNTERS)
> -			return -ENOSPC;
> -
> -	hw->dtc_idx = dtc_idx;
> +	for_each_hw_dtc_idx(hw, j, idx) {
> +		if (j > 0) {
> +			idx = hw->dtc_idx[0];
> +		} else {
> +			idx = 0;
> +			while (cmn->dtc[j].counters[idx])
> +				if (++idx == CMN_DT_NUM_COUNTERS)
> +					goto free_dtms;
> +		}
> +		hw->dtc_idx[j] = idx;
> +	}
>
> 	/* ...then the local counters to feed it. */
> 	for_each_hw_dn(hw, dn, i) {
> 		struct arm_cmn_dtm *dtm = &cmn->dtms[dn->dtm] + hw->dtm_offset;
> -		unsigned int dtm_idx, shift;
> +		unsigned int dtm_idx, shift, d = 0;
> 		u64 reg;
>
> 		dtm_idx = 0;
> @@ -1841,11 +1841,11 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
>
> 			tmp = dtm->wp_event[wp_idx ^ 1];
> 			if (tmp >= 0 && CMN_EVENT_WP_COMBINE(event) !=
> -					CMN_EVENT_WP_COMBINE(dtc->counters[tmp]))
> +					CMN_EVENT_WP_COMBINE(cmn->dtc[d].counters[tmp]))
> 				goto free_dtms;
>
> 			input_sel = CMN__PMEVCNT0_INPUT_SEL_WP + wp_idx;
> -			dtm->wp_event[wp_idx] = dtc_idx;
> +			dtm->wp_event[wp_idx] = hw->dtc_idx[d];
> 			writel_relaxed(cfg, dtm->base + CMN_DTM_WPn_CONFIG(wp_idx));
> 		} else {
> 			struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id);
> @@ -1865,7 +1865,7 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
> 		dtm->input_sel[dtm_idx] = input_sel;
> 		shift = CMN__PMEVCNTn_GLOBAL_NUM_SHIFT(dtm_idx);
> 		dtm->pmu_config_low &= ~(CMN__PMEVCNT0_GLOBAL_NUM << shift);
> -		dtm->pmu_config_low |= FIELD_PREP(CMN__PMEVCNT0_GLOBAL_NUM, dtc_idx) << shift;
> +		dtm->pmu_config_low |= FIELD_PREP(CMN__PMEVCNT0_GLOBAL_NUM, hw->dtc_idx[d]) << shift;
> 		dtm->pmu_config_low |= CMN__PMEVCNT_PAIRED(dtm_idx);
> 		reg = (u64)le32_to_cpu(dtm->pmu_config_high) << 32 | dtm->pmu_config_low;
> 		writeq_relaxed(reg, dtm->base + CMN_DTM_PMU_CONFIG);
> @@ -1893,7 +1893,7 @@ static void arm_cmn_event_del(struct perf_event *event, int flags)
> 	arm_cmn_event_stop(event, PERF_EF_UPDATE);
>
> 	if (type == CMN_TYPE_DTC)
> -		cmn->dtc[__ffs(hw->dtcs_used)].cycles = NULL;
> +		cmn->dtc[hw->dtc_idx[0]].cycles = NULL;
> 	else
> 		arm_cmn_event_clear(cmn, event, hw->num_dns);
> }
> @@ -2074,7 +2074,6 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn)
> {
> 	struct arm_cmn_node *dn, *xp;
> 	int dtc_idx = 0;
> -	u8 dtcs_present = (1 << cmn->num_dtcs) - 1;
>
> 	cmn->dtc = devm_kcalloc(cmn->dev, cmn->num_dtcs, sizeof(cmn->dtc[0]), GFP_KERNEL);
> 	if (!cmn->dtc)
> @@ -2084,23 +2083,26 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn)
>
> 	cmn->xps = arm_cmn_node(cmn, CMN_TYPE_XP);
>
> +	if (cmn->part == PART_CMN600 && cmn->num_dtcs > 1) {
> +		/* We do at least know that a DTC's XP must be in that DTC's domain */
> +		dn = arm_cmn_node(cmn, CMN_TYPE_DTC);
> +		for (int i = 0; i < cmn->num_dtcs; i++)
> +			arm_cmn_node_to_xp(cmn, dn + i)->dtc = i;
> +	}
> +
> 	for (dn = cmn->dns; dn->type; dn++) {
> -		if (dn->type == CMN_TYPE_XP) {
> -			dn->dtc &= dtcs_present;
> +		if (dn->type == CMN_TYPE_XP)
> 			continue;
> -		}
>
> 		xp = arm_cmn_node_to_xp(cmn, dn);
> +		dn->dtc = xp->dtc;
> 		dn->dtm = xp->dtm;
> 		if (cmn->multi_dtm)
> 			dn->dtm += arm_cmn_nid(cmn, dn->id).port / 2;
>
> 		if (dn->type == CMN_TYPE_DTC) {
> -			int err;
> -			/* We do at least know that a DTC's XP must be in that DTC's domain */
> -			if (xp->dtc == 0xf)
> -				xp->dtc = 1 << dtc_idx;
> -			err = arm_cmn_init_dtc(cmn, dn, dtc_idx++);
> +			int err = arm_cmn_init_dtc(cmn, dn, dtc_idx++);
> +
> 			if (err)
> 				return err;
> 		}
> @@ -2258,9 +2260,9 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
> 			cmn->mesh_x = xp->logid;
>
> 		if (cmn->part == PART_CMN600)
> -			xp->dtc = 0xf;
> +			xp->dtc = -1;
> 		else
> -			xp->dtc = 1 << arm_cmn_dtc_domain(cmn, xp_region);
> +			xp->dtc = arm_cmn_dtc_domain(cmn, xp_region);
>
> 		xp->dtm = dtm - cmn->dtms;
> 		arm_cmn_init_dtm(dtm++, xp, 0);
> -- 
> 2.39.2.101.g768bb238c484.dirty
>
>



More information about the linux-arm-kernel mailing list