[RFC PATCH] i3c: dw: add DMA support for data transfers

Frank Li Frank.li at nxp.com
Fri May 15 09:38:15 PDT 2026


On Fri, May 15, 2026 at 11:56:35PM +0800, Haoyu Lu wrote:
> Add DMA support for the DesignWare I3C master controller to
> accelerate data transfers.
>
> The driver now requests tx and rx DMA channels during probe and
> uses them for data transfers of 4 bytes or more. Transfers smaller
> than 4 bytes, or transfers where DMA setup fails, fall back to PIO
> mode. DMA tail bytes (non-4-byte-aligned remainder) are handled
> with PIO as well.
>
> DMA transfers may sleep, so the xferqueue spinlock is temporarily
> dropped for DMA operations. This is safe because xferqueue.cur is
> only changed when a transfer completes, and the calling context
> serializes simultaneous transfers.
>
> The DMA enable bit (DEV_CTRL_DMA_EN, BIT(28)) is set when DMA is
> used and cleared otherwise, to ensure the hardware uses the correct
> data transfer path.
>
> Signed-off-by: Haoyu Lu <hechushiguitu666 at gmail.com>
> ---
> This is an RFC patch to gather feedback on the overall approach before
> a formal submission. A few specific questions for reviewers:
>
> 1. DMA threshold: The current implementation uses DMA for transfers
>    >= 4 bytes, with tail bytes handled by PIO. Is 4 bytes a reasonable
>    threshold, or should we consider a higher value (e.g. 16 or 32) to
>    avoid DMA setup overhead for very small transfers?
>
> 2. Spinlock safety: dw_i3c_master_start_xfer_locked() and
>    dw_i3c_master_end_xfer_locked() temporarily drop xferqueue.lock
>    around DMA operations. The analysis is that xferqueue.cur is only
>    modified on transfer completion, and the caller serializes
>    simultaneous transfers. Is this safe, or should we restructure the
>    locking (e.g., convert to mutex)?
>
> 3. DMA channel handling: dw_i3c_dma_request() is called during probe
>    and silently degrades to PIO if DMA channels are unavailable. Is
>    this the right behavior, or should probe fail on DMA errors other
>    than -ENODEV/-EPROBE_DEFER?
>
> 4. Error handling for DMA timeouts: when a DMA timeout occurs,
>    dmaengine_terminate_all() is called but the transfer proceeds as if
>    it were successful, potentially reading stale data. Should we instead
>    signal an error to the I3C core?
>
> 5. IBI path: dw_i3c_master_read_ibi_fifo() still uses PIO. Should
>    IBI reads also use DMA, or is the IBI payload typically small enough
>    that PIO is preferred?
>
> Any other feedback on the design or implementation is welcome.
>
>  drivers/i3c/master/dw-i3c-master.c | 260 ++++++++++++++++++++++++++++-
>  drivers/i3c/master/dw-i3c-master.h |  17 ++
>  2 files changed, 270 insertions(+), 7 deletions(-)
>
> diff --git a/drivers/i3c/master/dw-i3c-master.c b/drivers/i3c/master/dw-i3c-master.c
> index d6bdb32397fb..7b6c8681ff0c 100644
> --- a/drivers/i3c/master/dw-i3c-master.c
> +++ b/drivers/i3c/master/dw-i3c-master.c
> @@ -13,6 +13,7 @@
>  #include <linux/errno.h>
>  #include <linux/i3c/master.h>
>  #include <linux/interrupt.h>
> +#include <linux/io.h>
>  #include <linux/ioport.h>
>  #include <linux/iopoll.h>
>  #include <linux/list.h>
> @@ -24,6 +25,10 @@
>  #include <linux/reset.h>
>  #include <linux/slab.h>
>
> +#include <linux/dma-direction.h>
> +#include <linux/dmaengine.h>
> +#include <linux/dma-mapping.h>
> +
>  #include "../internals.h"
>  #include "dw-i3c-master.h"
>
> @@ -32,6 +37,7 @@
>  #define DEV_CTRL_RESUME			BIT(30)
>  #define DEV_CTRL_HOT_JOIN_NACK		BIT(8)
>  #define DEV_CTRL_I2C_SLAVE_PRESENT	BIT(7)
> +#define DEV_CTRL_DMA_EN			BIT(28)
>
>  #define DEVICE_ADDR			0x4
>  #define DEV_ADDR_DYNAMIC_ADDR_VALID	BIT(31)
> @@ -359,16 +365,123 @@ static int dw_i3c_master_get_free_pos(struct dw_i3c_master *master)
>  	return ffs(master->free_pos) - 1;
>  }
>
> +static void dw_i3c_dma_callback(void *arg)
> +{
> +	struct dw_i3c_dma *dma = arg;
> +
> +	dma_unmap_single(dma->chan_using->device->dev, dma->dma_buf,
> +			 dma->dma_len, dma->dma_data_dir);
> +	complete(&dma->complete);
> +}
> +
> +static int dw_i3c_master_dma_xfer(struct dw_i3c_dma *dma, const u8 *bytes)
> +{
> +	struct device *chan_dev = dma->chan_using->device->dev;
> +	struct dma_async_tx_descriptor *txdesc;
> +
> +	dma->dma_buf = dma_map_single(chan_dev, (void *)bytes, dma->dma_len,
> +				      dma->dma_data_dir);
> +	if (dma_mapping_error(chan_dev, dma->dma_buf))
> +		return -EINVAL;
> +
> +	txdesc = dmaengine_prep_slave_single(dma->chan_using, dma->dma_buf,
> +					     dma->dma_len, dma->dma_transfer_dir,
> +					     DMA_PREP_INTERRUPT);
> +	if (!txdesc) {
> +		dma_unmap_single(chan_dev, dma->dma_buf, dma->dma_len,
> +				 dma->dma_data_dir);
> +		return -ENOMEM;
> +	}

There are help function i3c_master_dma_map_single() to help bounce unalign
buffer.

> +
> +	reinit_completion(&dma->complete);
> +	txdesc->callback = dw_i3c_dma_callback;
> +	txdesc->callback_param = dma;
> +	if (dma_submit_error(dmaengine_submit(txdesc))) {
> +		dma_unmap_single(chan_dev, dma->dma_buf, dma->dma_len,
> +				 dma->dma_data_dir);
> +		return -EIO;
> +	}
> +
> +	dma_async_issue_pending(dma->chan_using);
> +	return 0;
> +}
> +
>  static void dw_i3c_master_wr_tx_fifo(struct dw_i3c_master *master,
>  				     const u8 *bytes, int nbytes)
>  {
> -	i3c_writel_fifo(master->regs + RX_TX_DATA_PORT, bytes, nbytes);
> +	struct dw_i3c_dma *dma = master->dma;
> +	unsigned long time_left;
> +	int ret;
> +
> +	if (!master->use_dma || nbytes < 4) {
> +		i3c_writel_fifo(master->regs + RX_TX_DATA_PORT, bytes, nbytes);
> +		return;
> +	}
> +
> +	dma->chan_using = dma->chan_tx;
> +	dma->dma_transfer_dir = DMA_MEM_TO_DEV;
> +	dma->dma_data_dir = DMA_TO_DEVICE;
> +	dma->dma_len = ALIGN_DOWN(nbytes, 4);
> +
> +	ret = dw_i3c_master_dma_xfer(dma, bytes);
> +	if (ret) {
> +		dev_err(&master->base.dev, "DMA TX setup failed (%d)\n", ret);
> +		i3c_writel_fifo(master->regs + RX_TX_DATA_PORT, bytes, nbytes);
> +		return;
> +	}
> +
> +	time_left = wait_for_completion_timeout(&dma->complete,
> +						XFER_TIMEOUT);
> +	if (!time_left) {
> +		dmaengine_terminate_all(dma->chan_using);
> +		dev_err(&master->base.dev, "DMA TX timeout\n");
> +	}
> +
> +	if (nbytes & 3) {
> +		u32 tmp = 0;
> +
> +		memcpy(&tmp, bytes + (nbytes & ~3), nbytes & 3);
> +		writesl(master->regs + RX_TX_DATA_PORT, &tmp, 1);
> +	}
>  }
>
>  static void dw_i3c_master_read_rx_fifo(struct dw_i3c_master *master,
>  				       u8 *bytes, int nbytes)
>  {
> -	i3c_readl_fifo(master->regs + RX_TX_DATA_PORT, bytes, nbytes);
> +	struct dw_i3c_dma *dma = master->dma;
> +	unsigned long time_left;
> +	int ret;
> +
> +	if (!master->use_dma || nbytes < 4) {
> +		i3c_readl_fifo(master->regs + RX_TX_DATA_PORT, bytes, nbytes);
> +		return;
> +	}
> +
> +	dma->chan_using = dma->chan_rx;
> +	dma->dma_transfer_dir = DMA_DEV_TO_MEM;
> +	dma->dma_data_dir = DMA_FROM_DEVICE;
> +	dma->dma_len = ALIGN_DOWN(nbytes, 4);
> +
> +	ret = dw_i3c_master_dma_xfer(dma, bytes);
> +	if (ret) {
> +		dev_err(&master->base.dev, "DMA RX setup failed (%d)\n", ret);
> +		i3c_readl_fifo(master->regs + RX_TX_DATA_PORT, bytes, nbytes);
> +		return;
> +	}
> +
> +	time_left = wait_for_completion_timeout(&dma->complete,
> +						XFER_TIMEOUT);
> +	if (!time_left) {
> +		dmaengine_terminate_all(dma->chan_using);
> +		dev_err(&master->base.dev, "DMA RX timeout\n");
> +	}
> +
> +	if (nbytes & 3) {
> +		u32 tmp;
> +
> +		readsl(master->regs + RX_TX_DATA_PORT, &tmp, 1);
> +		memcpy(bytes + (nbytes & ~3), &tmp, nbytes & 3);
> +	}
>  }
>
>  static void dw_i3c_master_read_ibi_fifo(struct dw_i3c_master *master,
> @@ -403,14 +516,29 @@ static void dw_i3c_master_start_xfer_locked(struct dw_i3c_master *master)
>  	struct dw_i3c_xfer *xfer = master->xferqueue.cur;
>  	unsigned int i;
>  	u32 thld_ctrl;
> +	u32 dev_ctrl;
>
>  	if (!xfer)
>  		return;
>
> -	for (i = 0; i < xfer->ncmds; i++) {
> -		struct dw_i3c_cmd *cmd = &xfer->cmds[i];
> -
> -		dw_i3c_master_wr_tx_fifo(master, cmd->tx_buf, cmd->tx_len);
> +	/*
> +	 * DMA transfers may sleep, so we must drop the spinlock while
> +	 * writing the TX FIFO. The xferqueue lock is held by our caller;
> +	 * xferqueue.cur is safe because:
> +	 * - New xfers are only added to the list (not cur) when cur is set.
> +	 * - Timeout-based dequeue is blocked until the caller calls
> +	 *   wait_for_completion_timeout(), which happens after we return.
> +	 */
> +	if (master->use_dma) {
> +		spin_unlock(&master->xferqueue.lock);
> +		for (i = 0; i < xfer->ncmds; i++)
> +			dw_i3c_master_wr_tx_fifo(master, xfer->cmds[i].tx_buf,
> +						 xfer->cmds[i].tx_len);

Here have risk, you need queue whole transfer once. because

1. First cmd
2. dma transfer
3. dma irq, which may delay or schedule by OS, there are 100us timeout by
target.

4. target may timeout, so treat REPEAT start as START,
5. target may raise IBI here, but host side think it is repeat START, don't
check address arbitration.

Frank




More information about the linux-i3c mailing list