[PATCH 1/8] ras: aest: Fix shared processor node handling and error log messages
Umang Chheda
umang.chheda at oss.qualcomm.com
Tue May 5 05:23:45 PDT 2026
Two related fixes for processor nodes with ACPI_AEST_PROC_FLAG_SHARED
or ACPI_AEST_PROC_FLAG_GLOBAL set (e.g. cluster L3 cache, DSU):
1. aest_dev_is_oncore() returns true for any PROCESSOR_ERROR_NODE,
causing shared processor nodes (which use an SPI) to take the
cpuhp/PPI path. cpuhp_setup_state() is called instead of
aest_online_dev(), so aest_config_irq() is never called and the
hardware IRQ-config register is never programmed.
Fix aest_dev_is_oncore() to check irq_is_percpu() on the registered
IRQs. Only nodes whose fault-handling interrupt (FHI) or error-recovery
interrupt (ERI) is a per-CPU PPI take the oncore path; nodes with an
SPI take the aest_online_dev() path.
2. alloc_aest_node_name() uses processor_id for the node name of all
processor nodes. Shared/global nodes have processor_id=0 (the
field is unused when SHARED/GLOBAL is set), so every shared node
and the per-PE node for CPU 0 get the same name, "processor.0",
making error logs ambiguous.
For shared/global nodes, build the name as
"processor.<resource_type>.<device_id>" (e.g. "processor.cache.1")
so each node has a unique, meaningful identifier. Per-PE nodes
keep the original "processor.<processor_id>" form.
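Also add proc_flags to struct aest_event so aest_print() can
distinguish shared from per-PE nodes and print an appropriate
message. The event is now zeroed with memset() in
aest_node_gen_pool_add() before init_aest_event() fills it in.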
Signed-off-by: Umang Chheda <umang.chheda at oss.qualcomm.com>
---
drivers/ras/aest/aest-core.c | 54 ++++++++++++++++++++++++++++++++++++++++----
drivers/ras/aest/aest.h | 15 +++++++++++-
2 files changed, 64 insertions(+), 5 deletions(-)
diff --git a/drivers/ras/aest/aest-core.c b/drivers/ras/aest/aest-core.c
index 6a2d84b47721..b4f4c975da1d 100644
--- a/drivers/ras/aest/aest-core.c
+++ b/drivers/ras/aest/aest-core.c
@@ -49,7 +49,19 @@ static void aest_print(struct aest_event *event)
switch (event->type) {
case ACPI_AEST_PROCESSOR_ERROR_NODE:
- pr_err("%s Error from CPU%d\n", pfx_seq, event->id0);
+ /*
+ * For shared/global nodes (e.g. cluster L3 cache, DSU),
+ * id0 is the CPU that handled the interrupt — not the error
+ * source itself. The node_name already identifies the resource
+ * (e.g. "processor.cache.1"). Print a distinct message so the
+ * log is not confused with a per-PE CPU error.
+ */
+ if (event->proc_flags &
+ (ACPI_AEST_PROC_FLAG_SHARED | ACPI_AEST_PROC_FLAG_GLOBAL))
+ pr_err("%s Error from shared processor resource (interrupt handled on CPU%d)\n",
+ pfx_seq, event->id0);
+ else
+ pr_err("%s Error from CPU%d\n", pfx_seq, event->id0);
break;
case ACPI_AEST_MEMORY_ERROR_NODE:
pr_err("%s Error from memory at SRAT proximity domain %#x\n",
@@ -133,6 +145,7 @@ static void init_aest_event(struct aest_event *event,
info->processor->processor_id);
event->id1 = info->processor->resource_type;
+ event->proc_flags = info->processor->flags;
break;
case ACPI_AEST_MEMORY_ERROR_NODE:
event->id0 = info->memory->srat_proximity_domain;
@@ -175,6 +188,7 @@ static int aest_node_gen_pool_add(struct aest_device *adev,
if (!event)
return -ENOMEM;
+ memset(event, 0, sizeof(*event));
init_aest_event(event, record, regs);
llist_add(&event->llnode, &adev->event_list);
@@ -730,9 +744,41 @@ static char *alloc_aest_node_name(struct aest_node *node)
switch (node->type) {
case ACPI_AEST_PROCESSOR_ERROR_NODE:
- name = devm_kasprintf(node->adev->dev, GFP_KERNEL, "%s.%d",
- aest_node_name[node->type],
- node->info->processor->processor_id);
+ /*
+ * Shared/global processor nodes (e.g. cluster L3 cache, DSU)
+ * have processor_id=0 and use smp_processor_id() at error-log
+ * time — using processor_id in the name would produce the same
+ * "processor.0" string for every shared node and every CPU0
+ * per-PE node, making logs ambiguous.
+ *
+ * For shared/global nodes, build the name from the resource
+ * type and the device id so each node gets a unique, meaningful
+ * name (e.g. "processor.cache.1", "processor.tlb.2").
+ *
+ * For per-PE nodes, keep the original "processor.<mpidr>" form.
+ */
+ if (node->info->processor->flags &
+ (ACPI_AEST_PROC_FLAG_SHARED | ACPI_AEST_PROC_FLAG_GLOBAL)) {
+ static const char *const res_name[] = {
+ [ACPI_AEST_CACHE_RESOURCE] = "cache",
+ [ACPI_AEST_TLB_RESOURCE] = "tlb",
+ [ACPI_AEST_GENERIC_RESOURCE] = "generic",
+ };
+ u8 rtype = node->info->processor->resource_type;
+ const char *rstr = (rtype < ARRAY_SIZE(res_name) &&
+ res_name[rtype]) ? res_name[rtype] : "unknown";
+
+ name = devm_kasprintf(node->adev->dev, GFP_KERNEL,
+ "%s.%s.%d",
+ aest_node_name[node->type],
+ rstr,
+ node->adev->id);
+ } else {
+ name = devm_kasprintf(node->adev->dev, GFP_KERNEL,
+ "%s.%d",
+ aest_node_name[node->type],
+ node->info->processor->processor_id);
+ }
break;
case ACPI_AEST_MEMORY_ERROR_NODE:
case ACPI_AEST_SMMU_ERROR_NODE:
diff --git a/drivers/ras/aest/aest.h b/drivers/ras/aest/aest.h
index 9d67d79eb4a2..9704af97fee8 100644
--- a/drivers/ras/aest/aest.h
+++ b/drivers/ras/aest/aest.h
@@ -8,6 +8,7 @@
#include <linux/acpi_aest.h>
#include <asm/ras.h>
#include <linux/debugfs.h>
+#include <linux/irqdesc.h>
#define MAX_GSI_PER_NODE 2
#define DEFAULT_CE_THRESHOLD 1
@@ -94,6 +95,8 @@ struct aest_event {
/* Vendor node : hardware ID. */
char *hid;
u32 index;
+ /* Processor node: ACPI_AEST_PROC_FLAG_* bitmask (SHARED/GLOBAL) */
+ u8 proc_flags;
u64 ce_threshold;
int addressing_mode;
struct ras_ext_regs regs;
@@ -387,7 +390,17 @@ static inline void aest_sync(struct aest_node *node)
static inline bool aest_dev_is_oncore(struct aest_device *adev)
{
- return adev->type == ACPI_AEST_PROCESSOR_ERROR_NODE;
+ /*
+ * A processor node is "on-core" (uses PPI + cpuhp) only when its
+ * interrupt is a per-CPU PPI. A shared processor node (e.g. cluster
+ * L3 cache, DSU) uses an SPI and must follow the non-oncore path
+ * (aest_online_dev) so that aest_config_irq and aest_online_dev are
+ * called instead of cpuhp_setup_state.
+ */
+ if (adev->type != ACPI_AEST_PROCESSOR_ERROR_NODE)
+ return false;
+ return irq_is_percpu(adev->irq[ACPI_AEST_NODE_FAULT_HANDLING]) ||
+ irq_is_percpu(adev->irq[ACPI_AEST_NODE_ERROR_RECOVERY]);
}
static inline int default_errgsr_mapping(int errgsr_bit)
--
2.34.1
More information about the linux-arm-kernel
mailing list