[PATCH v4 12/18] EDAC/synopsys: Read full data+ecc pattern on errors

Serge Semin fancer.lancer at gmail.com
Wed Sep 20 12:26:57 PDT 2023


DW uMCTL2 DDRC calculates ECC for the Full DQ-bus word. If a non-Full bus
width mode is activated the leftover DQ-bits will be padded with zeros,
but the ECC code is calculated for the whole width anyway [1]. For some
reason the DW uMCTL2 DDRC driver currently doesn't read the whole SDRAM
word in if an ECC error happens even though the 64-bit DQ-bus has been
supported for a long time. Moreover a Full ECC value is also available in
the ECC(C|U)SYN2 register. In setups with a DQ-bus narrower than 64 bits
the higher ECC bits are just unused.

So update the error handler to read the entire data+ecc pattern:
extend the data field of the ECC error info structure since it may contain
64-bit data; add a new ECC field there since it's a part of the erroneous
data pattern; read the upper 32-bit part of the data pattern only if an
ECC error happens and the DDR controller has been configured with the
64-bit DQ bus; read the full ECC value from the ECC(C|U)SYN2 register.
The data+ecc pair will be printed as a part of the custom error message
which is then passed to the edac_mc_handle_error() method.

Note since the full data+ecc info is now always logged into the EDAC core
there is no longer a need for the debug print of the Syndrome Registers
content. Drop it then.

[1] DesignWare® Cores Enhanced Universal DDR Memory Controller (uMCTL2)
    Databook, Version 3.91a, October 2020, p.424-425

Signed-off-by: Serge Semin <fancer.lancer at gmail.com>

---

Changelog v4:
- Retrieve ECC too.
---
 drivers/edac/synopsys_edac.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/drivers/edac/synopsys_edac.c b/drivers/edac/synopsys_edac.c
index fbf1f8af9788..7376a0fc6394 100644
--- a/drivers/edac/synopsys_edac.c
+++ b/drivers/edac/synopsys_edac.c
@@ -305,6 +305,7 @@ struct snps_ddrc_info {
  * @syndrome:	Error syndrome.
  * @bitpos:	Bit position.
  * @data:	Data causing the error.
+ * @ecc:	Data ECC.
  */
 struct snps_ecc_error_info {
 	u32 row;
@@ -313,7 +314,8 @@ struct snps_ecc_error_info {
 	u32 bankgrp;
 	u32 syndrome;
 	u32 bitpos;
-	u32 data;
+	u64 data;
+	u32 ecc;
 };
 
 /**
@@ -422,10 +424,10 @@ static int snps_get_error_info(struct snps_edac_priv *priv)
 	p->ceinfo.col = FIELD_GET(ECC_CEADDR1_COL_MASK, regval);
 
 	p->ceinfo.data = readl(base + ECC_CSYND0_OFST);
+	if (priv->info.dq_width == SNPS_DQ_64)
+		p->ceinfo.data |= (u64)readl(base + ECC_CSYND1_OFST) << 32;
 
-	edac_dbg(2, "ECCCSYN0: 0x%08X ECCCSYN1: 0x%08X ECCCSYN2: 0x%08X\n",
-		 readl(base + ECC_CSYND0_OFST), readl(base + ECC_CSYND1_OFST),
-		 readl(base + ECC_CSYND2_OFST));
+	p->ceinfo.ecc = readl(base + ECC_CSYND2_OFST);
 
 ue_err:
 	if (!p->ue_cnt)
@@ -440,6 +442,11 @@ static int snps_get_error_info(struct snps_edac_priv *priv)
 	p->ueinfo.col = FIELD_GET(ECC_CEADDR1_COL_MASK, regval);
 
 	p->ueinfo.data = readl(base + ECC_UESYND0_OFST);
+	if (priv->info.dq_width == SNPS_DQ_64)
+		p->ueinfo.data |= (u64)readl(base + ECC_UESYND1_OFST) << 32;
+
+	p->ueinfo.ecc = readl(base + ECC_UESYND2_OFST);
+
 out:
 	spin_lock_irqsave(&priv->reglock, flags);
 
@@ -469,9 +476,9 @@ static void snps_handle_error(struct mem_ctl_info *mci, struct snps_ecc_status *
 		pinf = &p->ceinfo;
 
 		snprintf(priv->message, SNPS_EDAC_MSG_SIZE,
-			 "Row %d Col %d Bank %d Bank Group %d Bit %d Data 0x%08x",
+			 "Row %d Col %d Bank %d Bank Group %d Bit %d Data 0x%08llx:0x%02x",
 			 pinf->row, pinf->col, pinf->bank, pinf->bankgrp,
-			 pinf->bitpos, pinf->data);
+			 pinf->bitpos, pinf->data, pinf->ecc);
 
 		edac_mc_handle_error(HW_EVENT_ERR_CORRECTED, mci,
 				     p->ce_cnt, 0, 0, pinf->syndrome, 0, 0, -1,
@@ -482,8 +489,9 @@ static void snps_handle_error(struct mem_ctl_info *mci, struct snps_ecc_status *
 		pinf = &p->ueinfo;
 
 		snprintf(priv->message, SNPS_EDAC_MSG_SIZE,
-			 "Row %d Col %d Bank %d Bank Group %d",
-			 pinf->row, pinf->col, pinf->bank, pinf->bankgrp);
+			 "Row %d Col %d Bank %d Bank Group %d Data 0x%08llx:0x%02x",
+			 pinf->row, pinf->col, pinf->bank, pinf->bankgrp,
+			 pinf->data, pinf->ecc);
 
 		edac_mc_handle_error(HW_EVENT_ERR_UNCORRECTED, mci,
 				     p->ue_cnt, 0, 0, 0, 0, 0, -1,
-- 
2.41.0




More information about the linux-arm-kernel mailing list