[PATCH V17 3/9] drivers: perf: arm_pmu: Add infrastructure for branch stack sampling
Anshuman Khandual
anshuman.khandual at arm.com
Thu Apr 4 19:46:33 PDT 2024
In order to support the Branch Record Buffer Extension (BRBE), we need to
extend the arm_pmu framework with some basic infrastructure for branch
stack sampling, which arm_pmu drivers can opt in to via a new feature flag
called 'has_branch_stack'. Subsequent patches will use this to add support
for BRBE in the PMUv3 driver.
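As a rough illustration only (not part of this patch), a driver opting in
would set the new 'has_branch_stack' flag and populate the branch callbacks
added below; the brbe_* names here are placeholders for what a later patch
in this series wires up in the PMUv3 driver:

static int brbe_pmu_branch_probe(struct arm_pmu *cpu_pmu)
{
	/* Placeholder hookup: the real PMUv3/BRBE wiring comes later */
	cpu_pmu->has_branch_stack	= 1;
	cpu_pmu->branch_stack_init	= brbe_branch_stack_init;
	cpu_pmu->branch_stack_add	= brbe_branch_stack_add;
	cpu_pmu->branch_stack_del	= brbe_branch_stack_del;
	cpu_pmu->branch_stack_reset	= brbe_branch_stack_reset;
	cpu_pmu->sched_task		= brbe_sched_task;
	return 0;
}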
Branch stack sampling support, i.e. capturing branch records during
execution, rides along in core perf with the normal HW events being
scheduled on the PMU. This prepares the ARMv8 PMU framework for branch
stack support on PMUs that implement the required hardware.
With BRBE, the hardware records branches into a hardware FIFO, which is
sampled by software when perf events overflow. A task may be context-
switched an arbitrary number of times between overflows, and to avoid
losing samples we need to save the current records when a task is context-
switched out. To do this we'll need the pmu::sched_task() callback, and
we'll also need to allocate per-task storage space via the
PERF_ATTACH_TASK_DATA event flag.
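Sketching the idea with placeholder code (again, not this patch's code): a
driver's branch_stack_init() could request per-task storage for task-bound
events, and its sched_task() hook would save or discard the hardware
records across a context switch, e.g.:

static bool brbe_branch_stack_init(struct perf_event *event)
{
	/*
	 * Per-task events need per-task storage; PERF_ATTACH_TASK_DATA
	 * asks core perf to allocate it when the event is attached.
	 */
	if (event->attach_state & PERF_ATTACH_TASK)
		event->attach_state |= PERF_ATTACH_TASK_DATA;

	/* A real driver would also validate attr.branch_sample_type here */
	return true;
}

static void brbe_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
{
	struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);

	/*
	 * Don't let records from the previous task leak into the incoming
	 * task's samples: save them on sched out (into the per-task data
	 * above) and reset the hardware state on sched in.
	 */
	if (sched_in)
		armpmu->branch_stack_reset();
}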
Cc: Catalin Marinas <catalin.marinas at arm.com>
Cc: Will Deacon <will at kernel.org>
Cc: Mark Rutland <mark.rutland at arm.com>
Cc: linux-arm-kernel at lists.infradead.org
Cc: linux-kernel at vger.kernel.org
Signed-off-by: Anshuman Khandual <anshuman.khandual at arm.com>
---
Changes in V17:
- ARM PMUV3 changes have been moved into the BRBE driver patch instead
- Moved down branch_stack_add() in armpmu_add() after event's basic checks
- Added new callbacks in struct arm_pmu e.g. branch_stack_[init|add|del]()
- Renamed the struct arm_pmu callback branch_reset() to branch_stack_reset()
- Dropped the comment in armpmu_event_init()
- Renamed 'pmu_hw_events' elements from 'brbe_' to the more generic 'branch_'
drivers/perf/arm_pmu.c | 25 ++++++++++++++++++++++---
include/linux/perf/arm_pmu.h | 32 +++++++++++++++++++++++++++++++-
2 files changed, 53 insertions(+), 4 deletions(-)
diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c
index 8458fe2cebb4..e694103e811f 100644
--- a/drivers/perf/arm_pmu.c
+++ b/drivers/perf/arm_pmu.c
@@ -317,6 +317,9 @@ armpmu_del(struct perf_event *event, int flags)
struct hw_perf_event *hwc = &event->hw;
int idx = hwc->idx;
+ if (has_branch_stack(event))
+ armpmu->branch_stack_del(event, hw_events);
+
armpmu_stop(event, PERF_EF_UPDATE);
hw_events->events[idx] = NULL;
armpmu->clear_event_idx(hw_events, event);
@@ -342,6 +345,9 @@ armpmu_add(struct perf_event *event, int flags)
if (idx < 0)
return idx;
+ if (has_branch_stack(event))
+ armpmu->branch_stack_add(event, hw_events);
+
/*
* If there is an event in the counter we are going to use then make
* sure it is disabled.
@@ -511,13 +517,25 @@ static int armpmu_event_init(struct perf_event *event)
!cpumask_test_cpu(event->cpu, &armpmu->supported_cpus))
return -ENOENT;
- /* does not support taken branch sampling */
- if (has_branch_stack(event))
- return -EOPNOTSUPP;
+ if (has_branch_stack(event)) {
+ if (!armpmu->has_branch_stack)
+ return -EOPNOTSUPP;
+
+ if (!armpmu->branch_stack_init(event))
+ return -EOPNOTSUPP;
+ }
return __hw_perf_event_init(event);
}
+static void armpmu_sched_task(struct perf_event_pmu_context *pmu_ctx, bool sched_in)
+{
+ struct arm_pmu *armpmu = to_arm_pmu(pmu_ctx->pmu);
+
+ if (armpmu->sched_task)
+ armpmu->sched_task(pmu_ctx, sched_in);
+}
+
static void armpmu_enable(struct pmu *pmu)
{
struct arm_pmu *armpmu = to_arm_pmu(pmu);
@@ -864,6 +882,7 @@ struct arm_pmu *armpmu_alloc(void)
}
pmu->pmu = (struct pmu) {
+ .sched_task = armpmu_sched_task,
.pmu_enable = armpmu_enable,
.pmu_disable = armpmu_disable,
.event_init = armpmu_event_init,
diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h
index b3b34f6670cf..9eda16dd684e 100644
--- a/include/linux/perf/arm_pmu.h
+++ b/include/linux/perf/arm_pmu.h
@@ -46,6 +46,18 @@ static_assert((PERF_EVENT_FLAG_ARCH & ARMPMU_EVT_63BIT) == ARMPMU_EVT_63BIT);
}, \
}
+/*
+ * Maximum branch record entries which could be processed
+ * for core perf branch stack sampling support, regardless
+ * of the hardware support available on a given ARM PMU.
+ */
+#define MAX_BRANCH_RECORDS 64
+
+struct branch_records {
+ struct perf_branch_stack branch_stack;
+ struct perf_branch_entry branch_entries[MAX_BRANCH_RECORDS];
+};
+
/* The events for a given PMU register set. */
struct pmu_hw_events {
/*
@@ -66,6 +78,17 @@ struct pmu_hw_events {
struct arm_pmu *percpu_pmu;
int irq;
+
+ struct branch_records *branches;
+
+ /* Active context for task events */
+ void *branch_context;
+
+ /* Active events requesting branch records */
+ unsigned int branch_users;
+
+ /* Active branch sample type filters */
+ unsigned long branch_sample_type;
};
enum armpmu_attr_groups {
@@ -96,8 +119,15 @@ struct arm_pmu {
void (*stop)(struct arm_pmu *);
void (*reset)(void *);
int (*map_event)(struct perf_event *event);
+ void (*sched_task)(struct perf_event_pmu_context *pmu_ctx, bool sched_in);
+ bool (*branch_stack_init)(struct perf_event *event);
+ void (*branch_stack_add)(struct perf_event *event, struct pmu_hw_events *cpuc);
+ void (*branch_stack_del)(struct perf_event *event, struct pmu_hw_events *cpuc);
+ void (*branch_stack_reset)(void);
int num_events;
- bool secure_access; /* 32-bit ARM only */
+ unsigned int secure_access : 1, /* 32-bit ARM only */
+ has_branch_stack: 1, /* 64-bit ARM only */
+ reserved : 30;
#define ARMV8_PMUV3_MAX_COMMON_EVENTS 0x40
DECLARE_BITMAP(pmceid_bitmap, ARMV8_PMUV3_MAX_COMMON_EVENTS);
#define ARMV8_PMUV3_EXT_COMMON_EVENT_BASE 0x4000
--
2.25.1