[PATCH v3 08/13] spi: cadence-quadspi: add PHY tuning support

Mon Jun 1 01:44:01 PDT 2026

On 28/05/26 14:24, Miquel Raynal wrote:
> On 27/05/2026 at 23:25:22 +0530, Santhosh Kumar K <s-k6 at ti.com> wrote:
> 
>> The Cadence QSPI controller supports a delay-line PHY for high-speed
>> operation. Without calibration the PHY is unused and read capture relies
>> on a fixed delay, limiting throughput at frequencies above the base
>> operating speed.
>>
>> Add an execute_tuning callback that performs delay-line calibration using
>> a known data pattern written to a dedicated flash region. The pattern is
>> either read from a NOR partition identified by the DT property
>> cdns,phy-pattern-partition, or written to the NAND page cache before
>> each calibration read.
>>
>> For DDR protocols (8D-8D-8D) a 2D sweep of (rx_delay, tx_delay) pairs
>> is performed to find the widest passing region in the combined RX/TX
>> space. Binary search locates the gap boundary between passing regions
>> when two separate windows exist; the final operating point is placed at
>> the centre of the larger region with a small temperature-dependent
>> offset.
>>
>> For SDR protocols a 1D sweep of the RX delay is sufficient. Two windows
>> at adjacent read_delay values are measured; the wider one's midpoint is
>> selected.
>>
>> The tuning infrastructure is platform-specific: only am654-based OSPI
>> controllers populate the execute_tuning hook. All other platform data
>> entries return -EOPNOTSUPP and are unaffected.
>>
>> spi-max-frequency may carry two values in DT; the second (higher) value
>> is the tuned target rate stored in max_clk_rate. When only one value is
>> present max_clk_rate is zero and tuning is skipped.
>>
>> Signed-off-by: Santhosh Kumar K <s-k6 at ti.com>
>> ---
> 
> There are more than 1800 new lines for the PHY tuning procedure. I left
> that decision to Mark of course, by maybe we should move that into
> another .c file.
> 
>> +static int cqspi_am654_ospi_execute_tuning(struct spi_mem *mem,
>> +					   struct spi_mem_op *read_op,
>> +					   struct spi_mem_op *write_op)
>> +{
>> +	struct cqspi_st *cqspi =
>> +		spi_controller_get_devdata(mem->spi->controller);
>> +	struct cqspi_flash_pdata *f_pdata;
>> +	struct device *dev = &cqspi->pdev->dev;
>> +	u32 base_speed;
>> +	u32 phy_offset = 0;
>> +	int ret;
>> +
>> +	f_pdata = &cqspi->f_pdata[spi_get_chipselect(mem->spi, 0)];
>> +
>> +	/*
>> +	 * A second spi-max-frequency value (the higher clock rate) must be
>> +	 * present for tiered speed support.  Without it there is nothing to
>> +	 * calibrate towards, so skip tuning gracefully.
>> +	 */
>> +	if (!f_pdata->max_clk_rate) {
>> +		dev_dbg(dev, "No higher clock rate configured, skipping tuning\n");
>> +		return 0;
>> +	}
>> +
>> +	base_speed = mem->spi->base_speed_hz;
>> +
>> +	if (write_op) {
>> +		/*
>> +		 * For NAND: write the calibration pattern to the page cache.
>> +		 * This uses write_op at the safe base speed (base_speed_hz is
>> +		 * still active) so the write itself is reliable.
>> +		 */
>> +		ret = cqspi_write_pattern_to_cache(f_pdata, mem, write_op);
>> +		if (ret) {
>> +			dev_warn(dev,
>> +				 "failed to write pattern to cache: %d, skipping tuning\n",
>> +				 ret);
>> +			goto out;
>> +		}
>> +
>> +		f_pdata->phy_write_op = *write_op;
>> +	} else {
>> +		ret = cqspi_get_phy_pattern_offset(dev, &phy_offset);
>> +		if (ret) {
>> +			dev_warn(dev,
>> +				 "pattern partition not found: %d, skipping tuning\n",
>> +				 ret);
>> +			goto out;
>> +		}
>> +
>> +		read_op->addr.val = phy_offset;
>> +	}
>> +
>> +	/*
>> +	 * Verify the calibration pattern exists using the conservative base
>> +	 * speed.  At high clock rates the DLL is not yet trained, so DTR
>> +	 * data capture is unreliable and the read would return garbage.
>> +	 * Setting max_freq to 0 here causes apply_base_freq_cap() to cap the
>> +	 * read to base_speed_hz, which is well within reliable DTR margins.
>> +	 * max_freq is restored to max_speed_hz for the tuning-loop reads
>> +	 * after base_speed_hz is cleared below.
>> +	 */
>> +	f_pdata->phy_read_op = *read_op;
>> +	f_pdata->phy_read_op.max_freq = 0;
>> +
>> +	ret = cqspi_phy_check_pattern(f_pdata, mem);
>> +	if (ret) {
>> +		dev_err(dev, "pattern not found: %d, skipping tuning\n", ret);
>> +		goto out;
>> +	}
>> +
>> +	/*
>> +	 * Pattern confirmed.  Now clear base_speed_hz so that tuning-loop
>> +	 * exec_op calls run at max_speed_hz, and restore phy_read_op.max_freq
>> +	 * so those reads also use the full speed.
>> +	 */
>> +	mem->spi->base_speed_hz = 0;
> 
> If there is a way to avoid touching the core parameters, I would be for
> using it, but maybe it is simpler to do it this way.
> 
>> +	f_pdata->phy_read_op.max_freq = mem->spi->max_speed_hz;
>> +
>> +	if (read_op->cmd.dtr || read_op->addr.dtr || read_op->dummy.dtr ||
>> +	    read_op->data.dtr) {
>> +		f_pdata->use_dqs = true;
>> +		cqspi_phy_pre_config(cqspi, f_pdata, false);
>> +		ret = cqspi_phy_tuning_ddr(f_pdata, mem);
>> +	} else {
>> +		f_pdata->use_dqs = false;
>> +		cqspi_phy_pre_config(cqspi, f_pdata, true);
>> +		ret = cqspi_phy_tuning_sdr(f_pdata, mem);
>> +	}
>> +
>> +	if (ret)
>> +		dev_warn(dev, "tuning failed: %d\n", ret);
>> +
>> +	cqspi_phy_post_config(cqspi, f_pdata->read_delay);
>> +
>> +out:
>> +	/*
>> +	 * Always restore the conservative base speed cap.  On success, write
>> +	 * back the validated maximum speed into the caller's op templates so
>> +	 * that those specific ops bypass the cap in subsequent exec_op calls.
>> +	 */
>> +	mem->spi->base_speed_hz = base_speed;
>> +	if (!ret) {
>> +		read_op->max_freq = mem->spi->max_speed_hz;
>> +		if (write_op)
>> +			write_op->max_freq = mem->spi->max_speed_hz;
>> +	}
> 
> Neat.
> 
>> +
>> +	return ret;
>> +}
>> +
>> +static int cqspi_mem_op_execute_tuning(struct spi_mem *mem,
>> +				       struct spi_mem_op *read_op,
>> +				       struct spi_mem_op *write_op)
>> +{
>> +	struct cqspi_st *cqspi =
>> +		spi_controller_get_devdata(mem->spi->controller);
>> +
>> +	if (!cqspi->ddata->execute_tuning)
>> +		return -EOPNOTSUPP;
>> +
>> +	return cqspi->ddata->execute_tuning(mem, read_op, write_op);
>> +}
>> +
>>   static int cqspi_of_get_flash_pdata(struct platform_device *pdev,
>>   				    struct cqspi_flash_pdata *f_pdata,
>>   				    struct device_node *np)
>>   {
>> +	int nfreq, ret;
>> +
>>   	if (of_property_read_u32(np, "cdns,read-delay", &f_pdata->read_delay)) {
>>   		dev_err(&pdev->dev, "couldn't determine read-delay\n");
>>   		return -ENXIO;
>> @@ -1584,7 +3343,26 @@ static int cqspi_of_get_flash_pdata(struct platform_device *pdev,
>>   		return -ENXIO;
>>   	}
>>   
>> -	if (of_property_read_u32(np, "spi-max-frequency", &f_pdata->clk_rate)) {
>> +	/*
>> +	 * spi-max-frequency accepts one or two values:
>> +	 *   <max-freq>             - single rate; no tiered speed support
>> +	 *   <base-freq max-freq>   - conservative default and higher maximum
>> +	 *
>> +	 * With two values the SPI core sets spi->base_speed_hz = base-freq and
>> +	 * spi->max_speed_hz = max-freq.  Store the second value here as the
>> +	 * controller's higher rate target for calibration.
>> +	 */
>> +	nfreq = of_property_count_u32_elems(np, "spi-max-frequency");
>> +	if (nfreq == 2) {
>> +		ret = of_property_read_u32_index(np, "spi-max-frequency", 1,
>> +						 &f_pdata->max_clk_rate);
>> +		if (ret) {
>> +			dev_err(&pdev->dev, "couldn't read spi-max-frequency[1]\n");
>> +			return ret;
>> +		}
>> +	} else if (nfreq == 1) {
>> +		f_pdata->max_clk_rate = 0;
>> +	} else {
>>   		dev_err(&pdev->dev, "couldn't determine spi-max-frequency\n");
>>   		return -ENXIO;
>>   	}
> 
> Why do we repeat that operation in the driver? Can't we just use what
> the core has already done for us? Seems like we are parsing the same
> data twice (even before this patchset).

Yeah, you're correct, they're redundant - will remove them in v4.

Thanks,
Santhosh.

> 
> Thanks,
> Miquèl