[PATCH v7 08/16] arm64: ras: Handle memory failure for uncorrectable errors

Ruidong Tian tianruidong at linux.alibaba.com
Tue Jun 2 00:15:31 PDT 2026


When an uncorrected error (UE) or deferred error (DE) is detected and
the error record reports a System Physical Address (SPA), queue the
affected page for memory_failure() handling so that it is taken
offline. This prevents further consumption of corrupted data.

Signed-off-by: Ruidong Tian <tianruidong at linux.alibaba.com>
---
 arch/arm64/include/asm/ras.h |  4 ++++
 drivers/acpi/arm64/aest.c    |  5 ++++-
 drivers/ras/arm64/ras-core.c | 21 +++++++++++++++++++++
 drivers/ras/arm64/ras.h      | 26 ++++++++++++++++++++++++++
 include/linux/acpi_aest.h    |  3 +++
 5 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/ras.h b/arch/arm64/include/asm/ras.h
index 42900e1e9a19..7bef631a395c 100644
--- a/arch/arm64/include/asm/ras.h
+++ b/arch/arm64/include/asm/ras.h
@@ -31,6 +31,10 @@
 #define ERR_STATUS_UET_UEO		2
 #define ERR_STATUS_UET_UER		3
 
+/* ERR<n>ADDR */
+#define ERR_ADDR_AI			BIT(61)
+#define ERR_ADDR_PADDR			GENMASK_ULL(55, 0)
+
 /* ERR<n>CTLR */
 #define ERR_CTLR_CFI			BIT(8)
 #define ERR_CTLR_FI			BIT(3)
diff --git a/drivers/acpi/arm64/aest.c b/drivers/acpi/arm64/aest.c
index 5733c91c8e0d..1b020ab7eccd 100644
--- a/drivers/acpi/arm64/aest.c
+++ b/drivers/acpi/arm64/aest.c
@@ -153,6 +153,9 @@ aest_init_node_props(struct acpi_aest_hdr *hdr, struct property_entry *props,
 	props[(*p)++] = PROPERTY_ENTRY_U64_ARRAY_LEN("arm,status-reporting",
 						     status_reporting,
 						     group_len);
+	props[(*p)++] = PROPERTY_ENTRY_U64_ARRAY_LEN("arm,addressing-mode",
+						     addressing_mode,
+						     group_len);
 	props[(*p)++] = PROPERTY_ENTRY_U64("arm,error-group-base",
 					   common->error_group_register_base);
 	props[(*p)++] = PROPERTY_ENTRY_U64("arm,fault-inject-base",
@@ -173,7 +176,7 @@ aest_init_node_props(struct acpi_aest_hdr *hdr, struct property_entry *props,
 static int __init
 aest_create_node_fwnode(struct acpi_aest_hdr *hdr, struct platform_device *pdev)
 {
-	struct property_entry props[15] = { };
+	struct property_entry props[16] = { };
 	int p = 0;
 	int ret;
 
diff --git a/drivers/ras/arm64/ras-core.c b/drivers/ras/arm64/ras-core.c
index 8c6d202882ed..babb390b795f 100644
--- a/drivers/ras/arm64/ras-core.c
+++ b/drivers/ras/arm64/ras-core.c
@@ -127,7 +127,17 @@ static void ras_print(struct ras_record *record, struct ras_ext_regs *regs)
 
 static void ras_do_proc(struct ras_record *record, struct ras_ext_regs *regs)
 {
+	u64 status = regs->err_status, addr = regs->err_addr;
+
 	ras_print(record, regs);
+	/* Nothing to offline when the record flags a corrected error. */
+	if (status & ERR_STATUS_CE)
+		return;
+	/* Only an SPA without the ERR<n>ADDR.AI flag can be mapped to a page. */
+	if (record->addressing_mode == AEST_ADDRESS_LA || (addr & ERR_ADDR_AI))
+		return;
+	/* memory_failure_queue() takes a PFN, not a physical address. */
+	memory_failure_queue(PHYS_PFN(addr & PHYS_MASK), 0);
 }
 
 static void ras_panic(struct ras_record *record, struct ras_ext_regs *regs,
@@ -360,7 +370,10 @@ static int ras_init_record(struct ras_record *record, int i, struct ras_node *no
 	record->access = &ras_access[node->access_type];
 	record->index = i;
 	record->node = node;
+	record->addressing_mode = test_bit(i, node->addressing_mode);
 
+	ras_record_dbg(record, "record initialized, addressing mode: %s\n",
+		       record->addressing_mode ? "LA" : "SPA");
 	return 0;
 }
 
@@ -598,6 +611,11 @@ static struct ras_node *ras_init_node(struct platform_device *pdev)
 					GFP_KERNEL);
 	if (!node->status_reporting)
 		return ERR_PTR(-ENOMEM);
+	node->addressing_mode = devm_bitmap_zalloc(dev,
+					node->group->errgsr_num * BITS_PER_TYPE(u64),
+					GFP_KERNEL);
+	if (!node->addressing_mode)
+		return ERR_PTR(-ENOMEM);
 
 	ret = device_property_read_u64_array(dev, "arm,record-implemented",
 					     (u64 *)node->record_implemented,
@@ -605,6 +623,9 @@ static struct ras_node *ras_init_node(struct platform_device *pdev)
 	ret = ret ?: device_property_read_u64_array(dev, "arm,status-reporting",
 						    (u64 *)node->status_reporting,
 						    node->group->errgsr_num);
+	ret = ret ?: device_property_read_u64_array(dev, "arm,addressing-mode",
+						    (u64 *)node->addressing_mode,
+						    node->group->errgsr_num);
 	if (ret)
 		return ERR_PTR(ret);
 
diff --git a/drivers/ras/arm64/ras.h b/drivers/ras/arm64/ras.h
index c26a0aae26c5..11c6def1e4bf 100644
--- a/drivers/ras/arm64/ras.h
+++ b/drivers/ras/arm64/ras.h
@@ -70,6 +70,16 @@ struct ras_record {
 	const struct ras_access *access;
 
 	int index;
+	/*
+	 * This bit specifies the addressing mode to populate the ERR_ADDR
+	 * register:
+	 *   0b: Error record reports System Physical Addresses (SPA) in
+	 *       the ERR_ADDR register.
+	 *   1b: Error record reports error node-specific Logical Addresses (LA)
+	 *       in the ERR_ADDR register. OS must use other means to translate
+	 *       the reported LA into SPA.
+	 */
+	int addressing_mode;
 };
 
 struct ras_group {
@@ -116,6 +126,22 @@ struct ras_node {
 	 *              error events.
 	 */
 	unsigned long *status_reporting;
+	/*
+	 * This bitmap specifies the addressing mode used by each
+	 * error record within this error node to populate the
+	 * ERR<n>_ADDR register.
+	 * Bit[n] of this field pertains to error record corresponding
+	 * to index n in the error group.
+	 * Bit[n] = 0b: Error record at index n reports System
+	 *		Physical Addresses (SPA) in the ERR<n>_ADDR
+	 *		register.
+	 * Bit[n] = 1b: Error record at index n reports error
+	 *		node-specific Logical Addresses (LA) in the
+	 *		ERR<n>_ADDR register.
+	 * OS must use other means to translate the reported LA
+	 * into SPA
+	 */
+	unsigned long *addressing_mode;
 	struct ras_record *records;
 
 	u32 specific_data_size;
diff --git a/include/linux/acpi_aest.h b/include/linux/acpi_aest.h
index 9cb0fcb52c39..9a8aa234d9e5 100644
--- a/include/linux/acpi_aest.h
+++ b/include/linux/acpi_aest.h
@@ -13,6 +13,9 @@
 #define ACPI_AEST_PROC_FLAG_GLOBAL	BIT(0)
 #define ACPI_AEST_PROC_FLAG_SHARED	BIT(1)
 
+#define AEST_ADDRESS_SPA	0	/* System Physical Address in ERR<n>_ADDR */
+#define AEST_ADDRESS_LA		1	/* node-specific Logical Address */
+
 /* AEST interrupt */
 #define AEST_INTERRUPT_MODE BIT(0)
 
-- 
2.51.2.612.gdc70283dfc




More information about the linux-arm-kernel mailing list