[PATCH 3/3] spi: cadence-quadspi: Add Rx tuning support for Xilinx Versal OSPI

Sai Krishna Potthuri sai.krishna.potthuri at amd.com
Mon Feb 6 22:09:24 PST 2023


Add Rx tuning support for the Xilinx Versal Octal SPI Controller (OSPI).
The Xilinx Versal OSPI controller requires Rx tuning to find the optimal
tap value when running at higher clock rates in DTR protocol. As there is
no common tuning data pattern defined across all vendors, the Xilinx
Versal OSPI uses a READ ID based tuning algorithm, for which the
device_id field in spi_mem is used as the tuning data pattern.
After enabling the DTR protocol, spi-nor sends the READ ID command to
verify whether the switch to the DTR protocol was successful. If
execute_tuning is defined (not NULL) and clk_tuned is not set, the Rx
tuning algorithm is executed before the ID is actually read in DTR
protocol.
The Xilinx Versal tuning algorithm (as suggested by the IP design team;
a simplified sketch follows the list):
- Sweep all 127 possible taps, reading the flash ID at each tap.
- A tap is considered valid only if the correct ID is read 10 times in a
  row.
- Once a valid window is found (window size >= 3), stop parsing the
  remaining taps.
- Repeat the above steps with one extra dummy clock.
- Identify the larger window between extra dummy clock 0 and 1 and
  configure the median of the min and max valid taps of that window.
- If the larger window is the one found with the extra dummy clock, all
  subsequent read operations are performed with one extra dummy clock.

Signed-off-by: Sai Krishna Potthuri <sai.krishna.potthuri at amd.com>
---
 drivers/spi/spi-cadence-quadspi.c | 201 ++++++++++++++++++++++++++++++
 1 file changed, 201 insertions(+)

diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c
index b55b763003f0..bc4395736def 100644
--- a/drivers/spi/spi-cadence-quadspi.c
+++ b/drivers/spi/spi-cadence-quadspi.c
@@ -93,6 +93,8 @@ struct cqspi_st {
 	bool			slow_sram;
 	bool			use_phy;
 	bool			use_dqs;
+	bool			clk_tuned;
+	u8			extra_dummy_clks;
 };
 
 struct cqspi_driver_platdata {
@@ -101,6 +103,7 @@ struct cqspi_driver_platdata {
 	int (*indirect_read_dma)(struct cqspi_flash_pdata *f_pdata,
 				 u_char *rxbuf, loff_t from_addr, size_t n_rx);
 	u32 (*get_dma_status)(struct cqspi_st *cqspi);
+	int (*execute_tuning)(struct spi_mem *mem, const struct spi_mem_op *op);
 };
 
 /* Operation timeout value */
@@ -192,6 +195,7 @@ struct cqspi_driver_platdata {
 
 #define CQSPI_REG_IRQSTATUS			0x40
 #define CQSPI_REG_IRQMASK			0x44
+#define CQSPI_REG_VERSAL_ECO			0x48
 
 #define CQSPI_REG_INDIRECTRD			0x60
 #define CQSPI_REG_INDIRECTRD_START_MASK		BIT(0)
@@ -238,6 +242,19 @@ struct cqspi_driver_platdata {
 #define CQSPI_REG_POLLING_STATUS		0xB0
 #define CQSPI_REG_POLLING_STATUS_DUMMY_LSB	16
 
+#define CQSPI_REG_VERSAL_PHY_CONFIG		0xB4
+#define CQSPI_REG_PHY_CONFIG_RESYNC_FLD_MASK	BIT(31)
+#define CQSPI_REG_PHY_CONFIG_RESET_FLD_MASK	BIT(30)
+#define CQSPI_REG_PHY_CONFIG_TX_DLL_DLY_LSB	16
+
+#define CQSPI_REG_VERSAL_PHY_MASTER_CTRL	0xB8
+#define CQSPI_REG_DLL_LOWER			0xBC
+#define CQSPI_REG_DLL_LOWER_LPBK_LOCK_MASK	BIT(15)
+#define CQSPI_REG_DLL_LOWER_DLL_LOCK_MASK	BIT(0)
+
+#define CQSPI_REG_VERSAL_DLL_OBSVBLE_UPPER	0xC0
+#define CQSPI_REG_DLL_UPPER_RX_FLD_MASK		0x7F
+
 #define CQSPI_REG_OP_EXT_LOWER			0xE0
 #define CQSPI_REG_OP_EXT_READ_LSB		24
 #define CQSPI_REG_OP_EXT_WRITE_LSB		16
@@ -282,6 +299,12 @@ struct cqspi_driver_platdata {
 #define CQSPI_DMA_UNALIGN		0x3
 
 #define CQSPI_REG_VERSAL_DMA_VAL		0x602
+#define CQSPI_VERSAL_MAX_DLL_TAPS		127
+#define CQSPI_VERSAL_TX_TAP_MASTER		0x1E
+#define CQSPI_VERSAL_ECO_MIN_FREQ		120000000
+#define CQSPI_VERSAL_INIT_DLY			4
+#define CQSPI_VERSAL_RDID_MAX_CNT		10
+#define CQSPI_VERSAL_MIN_WINDOW_SZ		3
 
 static int cqspi_wait_for_bit(void __iomem *reg, const u32 mask, bool clr)
 {
@@ -540,6 +563,14 @@ static int cqspi_command_read(struct cqspi_flash_pdata *f_pdata,
 	if (dummy_clk > CQSPI_DUMMY_CLKS_MAX)
 		return -EOPNOTSUPP;
 
+	/*
+	 * Based on the tuning, some controllers require extra dummy clocks due to
+	 * additional delays in the read path. For such controllers, add the required
+	 * extra dummy clocks to all subsequent read operations.
+	 */
+	if (cqspi->extra_dummy_clks)
+		dummy_clk += cqspi->extra_dummy_clks;
+
 	if (dummy_clk)
 		reg |= (dummy_clk & CQSPI_REG_CMDCTRL_DUMMY_MASK)
 		     << CQSPI_REG_CMDCTRL_DUMMY_LSB;
@@ -662,6 +693,14 @@ static int cqspi_read_setup(struct cqspi_flash_pdata *f_pdata,
 	if (dummy_clk > CQSPI_DUMMY_CLKS_MAX)
 		return -EOPNOTSUPP;
 
+	/*
+	 * Based on the tuning, some controllers require extra dummy clocks due to
+	 * additional delays in the read path. For such controllers, add the required
+	 * extra dummy clocks to all subsequent read operations.
+	 */
+	if (cqspi->extra_dummy_clks)
+		dummy_clk += cqspi->extra_dummy_clks;
+
 	if (dummy_clk)
 		reg |= (dummy_clk & CQSPI_REG_RD_INSTR_DUMMY_MASK)
 		       << CQSPI_REG_RD_INSTR_DUMMY_LSB;
@@ -1056,6 +1095,157 @@ static int cqspi_indirect_write_execute(struct cqspi_flash_pdata *f_pdata,
 	return ret;
 }
 
+static int cqspi_versal_execute_tuning(struct spi_mem *mem, const struct spi_mem_op *op)
+{
+	struct cqspi_st *cqspi = spi_master_get_devdata(mem->spi->master);
+	u8 dummy_incr, dummy_flag = 0, count, max_index, min_index;
+	u8 min_rxtap, max_rxtap, avg_index, max_tap, windowsize;
+	struct platform_device *pdev = cqspi->pdev;
+	struct cqspi_flash_pdata *f_pdata;
+	bool id_matched, rxtapfound;
+	u32 val, i, reg, txtap = 0;
+	s8 max_windowsize = -1;
+	int ret;
+
+	f_pdata = &cqspi->f_pdata[mem->spi->chip_select];
+
+	reg = readl(cqspi->iobase + CQSPI_REG_CONFIG);
+	reg &= CQSPI_REG_CONFIG_ENABLE_MASK;
+	if (!reg)
+		return 0;
+
+	if (cqspi->master_ref_clk_hz >= CQSPI_VERSAL_ECO_MIN_FREQ)
+		writel(1, cqspi->iobase + CQSPI_REG_VERSAL_ECO);
+
+	/* Drive DLL reset bit to low */
+	writel(0, cqspi->iobase + CQSPI_REG_VERSAL_PHY_CONFIG);
+
+	/* Set initial delay value */
+	writel(CQSPI_VERSAL_INIT_DLY, cqspi->iobase + CQSPI_REG_VERSAL_PHY_MASTER_CTRL);
+
+	/* Set DLL reset bit */
+	writel(CQSPI_REG_PHY_CONFIG_RESET_FLD_MASK,
+	       cqspi->iobase + CQSPI_REG_VERSAL_PHY_CONFIG);
+
+	/* Check for loopback lock */
+	ret = cqspi_wait_for_bit(cqspi->iobase + CQSPI_REG_DLL_LOWER,
+				 CQSPI_REG_DLL_LOWER_LPBK_LOCK_MASK, 0);
+	if (ret) {
+		dev_err(&pdev->dev, "Loopback lock bit error (%i)\n", ret);
+		return ret;
+	}
+
+	/* Re-synchronize slave DLLs */
+	writel(CQSPI_REG_PHY_CONFIG_RESET_FLD_MASK, cqspi->iobase + CQSPI_REG_VERSAL_PHY_CONFIG);
+	writel(CQSPI_REG_PHY_CONFIG_RESET_FLD_MASK | CQSPI_REG_PHY_CONFIG_RESYNC_FLD_MASK,
+	       cqspi->iobase + CQSPI_REG_VERSAL_PHY_CONFIG);
+
+	txtap = CQSPI_VERSAL_TX_TAP_MASTER << CQSPI_REG_PHY_CONFIG_TX_DLL_DLY_LSB;
+	max_tap = CQSPI_VERSAL_MAX_DLL_TAPS;
+
+	for (dummy_incr = 0; dummy_incr <= 1; dummy_incr++) {
+		/* Update the extra dummy clocks required to read the ID */
+		cqspi->extra_dummy_clks = dummy_incr;
+		rxtapfound = false;
+		min_rxtap = 0;
+		max_rxtap = 0;
+		max_index = 0;
+		min_index = 0;
+
+		for (i = 0; i <= max_tap; i++) {
+			val = txtap | i | CQSPI_REG_PHY_CONFIG_RESET_FLD_MASK;
+			writel(val, cqspi->iobase + CQSPI_REG_VERSAL_PHY_CONFIG);
+			writel(val | CQSPI_REG_PHY_CONFIG_RESYNC_FLD_MASK,
+			       cqspi->iobase + CQSPI_REG_VERSAL_PHY_CONFIG);
+			ret = cqspi_wait_for_bit(cqspi->iobase + CQSPI_REG_DLL_LOWER,
+						 CQSPI_REG_DLL_LOWER_DLL_LOCK_MASK, 0);
+			if (ret)
+				return ret;
+
+			/*
+			 * To consider the tap as valid, read the Flash ID 10 times
+			 * (recommended by the IP design team). This makes the algorithm
+			 * more robust, gives more confidence in the valid tap, and avoids
+			 * accepting a tap that returned the correct ID only by a fluke.
+			 * If in any iteration the Flash ID does not match the device_id,
+			 * that tap should not be considered valid.
+			 */
+			count = 0;
+			do {
+				/* Execute the command and wait for completion */
+				ret = cqspi_command_read(f_pdata, op);
+				if (ret < 0) {
+					dev_err(&pdev->dev,
+						"error %d reading JEDEC ID\n", ret);
+					return ret;
+				}
+
+				id_matched = true;
+				if (memcmp(mem->device_id, op->data.buf.in, op->data.nbytes)) {
+					id_matched = false;
+					break;
+				}
+			} while (id_matched && ++count < CQSPI_VERSAL_RDID_MAX_CNT);
+
+			if (id_matched) {
+				u8 current_rxtap = readl(cqspi->iobase +
+							 CQSPI_REG_VERSAL_DLL_OBSVBLE_UPPER) &
+							 CQSPI_REG_DLL_UPPER_RX_FLD_MASK;
+
+				if (!rxtapfound) {
+					min_rxtap = current_rxtap;
+					min_index = i;
+					rxtapfound = true;
+				}
+
+				max_rxtap = current_rxtap;
+				max_index = i;
+			}
+
+			if (!rxtapfound)
+				continue;
+
+			/* If ID doesn't match or reach the max tap, calculate the window size */
+			if (!id_matched || i == max_tap) {
+				windowsize = max_rxtap - min_rxtap + 1;
+				if (windowsize > max_windowsize) {
+					dummy_flag = dummy_incr;
+					max_windowsize = windowsize;
+					avg_index = (max_index + min_index) / 2;
+				}
+
+				rxtapfound = false;
+				if (windowsize >= CQSPI_VERSAL_MIN_WINDOW_SZ)
+					break;
+			}
+		}
+	}
+
+	/*
+	 * Require a minimum window size to ensure reliable data sampling even when
+	 * the sampling point drifts due to PVT, in either direction.
+	 */
+	if (max_windowsize < CQSPI_VERSAL_MIN_WINDOW_SZ)
+		return -EINVAL;
+
+	/* Update the extra dummy clocks based on the maximum window size */
+	cqspi->extra_dummy_clks = dummy_flag;
+
+	val = txtap | avg_index | CQSPI_REG_PHY_CONFIG_RESET_FLD_MASK;
+	writel(val, cqspi->iobase + CQSPI_REG_VERSAL_PHY_CONFIG);
+	writel((CQSPI_REG_PHY_CONFIG_RESYNC_FLD_MASK | val),
+	       cqspi->iobase + CQSPI_REG_VERSAL_PHY_CONFIG);
+
+	ret = cqspi_wait_for_bit(cqspi->iobase + CQSPI_REG_DLL_LOWER,
+				 CQSPI_REG_DLL_LOWER_DLL_LOCK_MASK, 0);
+	if (ret)
+		return ret;
+
+	cqspi->clk_tuned = true;
+
+	return 0;
+}
+
 static void cqspi_chipselect(struct cqspi_flash_pdata *f_pdata)
 {
 	struct cqspi_st *cqspi = f_pdata->cqspi;
@@ -1345,11 +1535,21 @@ static ssize_t cqspi_read(struct cqspi_flash_pdata *f_pdata,
 static int cqspi_mem_process(struct spi_mem *mem, const struct spi_mem_op *op)
 {
 	struct cqspi_st *cqspi = spi_master_get_devdata(mem->spi->master);
+	const struct cqspi_driver_platdata *ddata;
+	struct device *dev = &cqspi->pdev->dev;
 	struct cqspi_flash_pdata *f_pdata;
 
+	ddata = of_device_get_match_data(dev);
 	f_pdata = &cqspi->f_pdata[mem->spi->chip_select];
 	cqspi_configure(f_pdata, mem->spi->max_speed_hz);
 
+	if (op->cmd.dtr && ddata && ddata->execute_tuning && !cqspi->clk_tuned) {
+		int ret = ddata->execute_tuning(mem, op);
+
+		if (ret)
+			return ret;
+	}
+
 	if (op->data.dir == SPI_MEM_DATA_IN && op->data.buf.in) {
 		if (!op->addr.nbytes)
 			return cqspi_command_read(f_pdata, op);
@@ -1846,6 +2046,7 @@ static const struct cqspi_driver_platdata versal_ospi = {
 			| CQSPI_HAS_PHY | CQSPI_HAS_DQS,
 	.indirect_read_dma = cqspi_versal_indirect_read_dma,
 	.get_dma_status = cqspi_get_versal_dma_status,
+	.execute_tuning = cqspi_versal_execute_tuning,
 };
 
 static const struct of_device_id cqspi_dt_ids[] = {
-- 
2.25.1