[PATCH 3/3] perf/arm-ni: Support sharing IRQs within an NI instance

Will Deacon will at kernel.org
Mon May 19 05:29:57 PDT 2025


On Tue, May 13, 2025 at 04:39:00PM +0100, Robin Murphy wrote:
> From: Shouping Wang <allen.wang at hj-micro.com>
> 
> NI-700 has a distinct PMU interrupt output for each Clock Domain,
> however some integrations may still combine these together externally.
> The initial driver didn't attempt to support this, in anticipation of a
> more general solution for IRQ sharing between system PMU instances, but
> that's still a way off, so let's make this intermediate step for now to
> at least allow sharing IRQs within an individual NI instance.
> 
> Now that CPU affinity and migration are cleaned up, it's fairly
> straightforward to adopt similar logic to arm-cmn, to identify CDs with
> a common interrupt and loop over them directly in the handler.
> 
> Signed-off-by: Shouping Wang <allen.wang at hj-micro.com>
> [ rm: Rework for affinity handling, cosmetics, new commit message ]
> Signed-off-by: Robin Murphy <robin.murphy at arm.com>
> ---
>  drivers/perf/arm-ni.c | 78 ++++++++++++++++++++++++++++---------------
>  1 file changed, 51 insertions(+), 27 deletions(-)
> 
> diff --git a/drivers/perf/arm-ni.c b/drivers/perf/arm-ni.c
> index 168750e78fc4..ca033c8785ff 100644
> --- a/drivers/perf/arm-ni.c
> +++ b/drivers/perf/arm-ni.c
> @@ -102,6 +102,7 @@ struct arm_ni_unit {
>  struct arm_ni_cd {
>  	void __iomem *pmu_base;
>  	u16 id;
> +	s8 irq_friend;

Why not just store the pointer to the cd as opposed to a relative offset?

>  	int num_units;
>  	int irq;
>  	struct pmu pmu;
> @@ -448,33 +449,37 @@ static irqreturn_t arm_ni_handle_irq(int irq, void *dev_id)
>  {
>  	struct arm_ni_cd *cd = dev_id;
>  	irqreturn_t ret = IRQ_NONE;
> -	u32 reg = readl_relaxed(cd->pmu_base + NI_PMOVSCLR);
>  
> -	if (reg & (1U << NI_CCNT_IDX)) {
> -		ret = IRQ_HANDLED;
> -		if (!(WARN_ON(!cd->ccnt))) {
> -			arm_ni_event_read(cd->ccnt);
> -			arm_ni_init_ccnt(cd);
> +	for (;;) {

Could this be a do { ... } while (cd) loop instead?

> +		u32 reg = readl_relaxed(cd->pmu_base + NI_PMOVSCLR);
> +
> +		if (reg & (1U << NI_CCNT_IDX)) {
> +			ret = IRQ_HANDLED;
> +			if (!(WARN_ON(!cd->ccnt))) {
> +				arm_ni_event_read(cd->ccnt);
> +				arm_ni_init_ccnt(cd);
> +			}
>  		}
> -	}
> -	for (int i = 0; i < NI_NUM_COUNTERS; i++) {
> -		if (!(reg & (1U << i)))
> -			continue;
> -		ret = IRQ_HANDLED;
> -		if (!(WARN_ON(!cd->evcnt[i]))) {
> -			arm_ni_event_read(cd->evcnt[i]);
> -			arm_ni_init_evcnt(cd, i);
> +		for (int i = 0; i < NI_NUM_COUNTERS; i++) {
> +			if (!(reg & (1U << i)))
> +				continue;
> +			ret = IRQ_HANDLED;
> +			if (!(WARN_ON(!cd->evcnt[i]))) {
> +				arm_ni_event_read(cd->evcnt[i]);
> +				arm_ni_init_evcnt(cd, i);
> +			}
>  		}
> +		writel_relaxed(reg, cd->pmu_base + NI_PMOVSCLR);
> +		if (!cd->irq_friend)
> +			return ret;
> +		cd += cd->irq_friend;
>  	}
> -	writel_relaxed(reg, cd->pmu_base + NI_PMOVSCLR);
> -	return ret;
>  }
>  
>  static int arm_ni_init_cd(struct arm_ni *ni, struct arm_ni_node *node, u64 res_start)
>  {
>  	struct arm_ni_cd *cd = ni->cds + node->id;
>  	const char *name;
> -	int err;
>  
>  	cd->id = node->id;
>  	cd->num_units = node->num_components;
> @@ -534,20 +539,11 @@ static int arm_ni_init_cd(struct arm_ni *ni, struct arm_ni_node *node, u64 res_s
>  		       cd->pmu_base + NI_PMCR);
>  	writel_relaxed(U32_MAX, cd->pmu_base + NI_PMCNTENCLR);
>  	writel_relaxed(U32_MAX, cd->pmu_base + NI_PMOVSCLR);
> -	writel_relaxed(U32_MAX, cd->pmu_base + NI_PMINTENSET);
>  
>  	cd->irq = platform_get_irq(to_platform_device(ni->dev), cd->id);
>  	if (cd->irq < 0)
>  		return cd->irq;
>  
> -	err = devm_request_irq(ni->dev, cd->irq, arm_ni_handle_irq,
> -			       IRQF_NOBALANCING | IRQF_NO_THREAD,
> -			       dev_name(ni->dev), cd);
> -	if (err)
> -		return err;
> -
> -	irq_set_affinity(cd->irq, cpumask_of(ni->cpu));
> -
>  	cd->pmu = (struct pmu) {
>  		.module = THIS_MODULE,
>  		.parent = ni->dev,
> @@ -593,6 +589,30 @@ static void arm_ni_probe_domain(void __iomem *base, struct arm_ni_node *node)
>  	node->num_components = readl_relaxed(base + NI_CHILD_NODE_INFO);
>  }
>  
> +static int arm_ni_init_irqs(struct arm_ni *ni)
> +{
> +	int err;
> +
> +	ni_for_each_cd(ni, cd) {
> +		for (struct arm_ni_cd *prev = cd; prev-- > ni->cds; ) {
> +			if (prev->irq == cd->irq) {
> +				prev->irq_friend = cd - prev;

Can't this race with the read of `irq_friend` in the interrupt handler?

Similarly, how do you handle the race between the interrupt firing on
one CD and the friend CD being torn down on e.g. the remove path?

> +				goto enable_irq;
> +			}
> +		}
> +		err = devm_request_irq(ni->dev, cd->irq, arm_ni_handle_irq,
> +				       IRQF_NOBALANCING | IRQF_NO_THREAD,
> +				       dev_name(ni->dev), cd);

What's the reason not to use IRQF_SHARED and register multiple handlers?
I'm sure there is one, but capturing that in a comment would help to
justify the manual multiplexing.

Will



More information about the linux-arm-kernel mailing list