[RFC PATCH 5/5] dma-mapping: Allow batched DMA sync operations if supported by the arch
Barry Song
21cnbao at gmail.com
Tue Oct 28 19:31:15 PDT 2025
From: Barry Song <v-songbaohua at oppo.com>
This enables dma_direct_sync_sg_for_device(), dma_direct_sync_sg_for_cpu(),
dma_direct_map_sg(), and dma_direct_unmap_sg() to use batched DMA sync
operations when the architecture supports them. This significantly improves
performance for devices that are not DMA coherent.
Tangquan's initial results show that batched synchronization can reduce
dma_map_sg() time by 64.61% and dma_unmap_sg() time by 66.60% on an MTK
phone platform (MediaTek Dimensity 9500). The tests pinned the task to
CPU7, fixed the CPU frequency at 2.6 GHz, and ran dma_map_sg() and
dma_unmap_sg() on 10 MB buffers (10 MB / 4 KB = 2560 sg entries per
buffer) for 200 iterations, averaging the results.
Cc: Catalin Marinas <catalin.marinas at arm.com>
Cc: Will Deacon <will at kernel.org>
Cc: Marek Szyprowski <m.szyprowski at samsung.com>
Cc: Robin Murphy <robin.murphy at arm.com>
Cc: Ada Couprie Diaz <ada.coupriediaz at arm.com>
Cc: Ard Biesheuvel <ardb at kernel.org>
Cc: Marc Zyngier <maz at kernel.org>
Cc: Anshuman Khandual <anshuman.khandual at arm.com>
Cc: Ryan Roberts <ryan.roberts at arm.com>
Cc: Suren Baghdasaryan <surenb at google.com>
Cc: Tangquan Zheng <zhengtangquan at oppo.com>
Cc: linux-arm-kernel at lists.infradead.org
Cc: linux-kernel at vger.kernel.org
Cc: iommu at lists.linux.dev
Signed-off-by: Barry Song <v-songbaohua at oppo.com>
---
kernel/dma/direct.c | 53 +++++++++++++++++++++++++---
kernel/dma/direct.h | 86 +++++++++++++++++++++++++++++++++++++++------
2 files changed, 123 insertions(+), 16 deletions(-)
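
Note for reviewers (not part of the patch): the dma-direct changes below
rely on only two arch hooks from the earlier patches in this series,
arch_sync_dma_for_{device,cpu}_batch_add() to queue a physical range and
arch_sync_dma_batch_flush() to carry out the deferred maintenance in one
go. The snippet below is a purely illustrative sketch of that contract,
not the arm64 implementation from this series; the queue layout,
DMA_SYNC_BATCH_MAX, and the arch_clean_or_inval_range() /
arch_dma_sync_barrier() helpers are invented here for illustration.

	/* Kernel-context sketch; assumes <linux/types.h> and <linux/dma-direction.h>. */

	#define DMA_SYNC_BATCH_MAX	64

	struct dma_sync_range {
		phys_addr_t		paddr;
		size_t			size;
		enum dma_data_direction	dir;
	};

	/*
	 * Sketch only: a real implementation needs per-CPU (or caller-local)
	 * state, since map/unmap paths can run concurrently and be preempted.
	 */
	static struct dma_sync_range dma_sync_batch[DMA_SYNC_BATCH_MAX];
	static unsigned int dma_sync_batch_len;

	void arch_sync_dma_batch_flush(void);

	void arch_sync_dma_for_device_batch_add(phys_addr_t paddr, size_t size,
						enum dma_data_direction dir)
	{
		/* Queue full: flush what we have so far and start a new batch. */
		if (dma_sync_batch_len == DMA_SYNC_BATCH_MAX)
			arch_sync_dma_batch_flush();

		dma_sync_batch[dma_sync_batch_len++] = (struct dma_sync_range) {
			.paddr = paddr, .size = size, .dir = dir,
		};
	}

	void arch_sync_dma_batch_flush(void)
	{
		unsigned int i;

		/*
		 * Do the maintenance for every queued range, then pay for the
		 * completion barrier once rather than once per sg entry; this
		 * is presumably where the dma_map_sg()/dma_unmap_sg() savings
		 * quoted above come from.
		 */
		for (i = 0; i < dma_sync_batch_len; i++)
			arch_clean_or_inval_range(dma_sync_batch[i].paddr,  /* hypothetical helper */
						  dma_sync_batch[i].size,
						  dma_sync_batch[i].dir);
		arch_dma_sync_barrier();  /* hypothetical, e.g. a single dsb(sy) on arm64 */

		dma_sync_batch_len = 0;
	}

The point of the contract is that any fixed per-call cost (on arm64, the
completion barrier after cache maintenance) is paid once per scatterlist
rather than once per entry.
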
diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
index 1f9ee9759426..a0b45f84a91f 100644
--- a/kernel/dma/direct.c
+++ b/kernel/dma/direct.c
@@ -403,9 +403,16 @@ void dma_direct_sync_sg_for_device(struct device *dev,
swiotlb_sync_single_for_device(dev, paddr, sg->length, dir);
if (!dev_is_dma_coherent(dev))
- arch_sync_dma_for_device(paddr, sg->length,
- dir);
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+ arch_sync_dma_for_device_batch_add(paddr, sg->length, dir);
+#else
+ arch_sync_dma_for_device(paddr, sg->length, dir);
+#endif
}
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_batch_flush();
+#endif
}
#endif
@@ -422,7 +429,11 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
if (!dev_is_dma_coherent(dev))
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+ arch_sync_dma_for_cpu_batch_add(paddr, sg->length, dir);
+#else
arch_sync_dma_for_cpu(paddr, sg->length, dir);
+#endif
swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir);
@@ -430,8 +441,12 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
arch_dma_mark_clean(paddr, sg->length);
}
- if (!dev_is_dma_coherent(dev))
+ if (!dev_is_dma_coherent(dev)) {
arch_sync_dma_for_cpu_all();
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+ arch_sync_dma_batch_flush();
+#endif
+ }
}
/*
@@ -443,14 +458,29 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
{
struct scatterlist *sg;
int i;
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+ bool need_sync = false;
+#endif
for_each_sg(sgl, sg, nents, i) {
- if (sg_dma_is_bus_address(sg))
+ if (sg_dma_is_bus_address(sg)) {
sg_dma_unmark_bus_address(sg);
- else
+ } else {
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+ need_sync = true;
+ dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
+ sg_dma_len(sg), dir, attrs);
+
+#else
dma_direct_unmap_phys(dev, sg->dma_address,
sg_dma_len(sg), dir, attrs);
+#endif
+ }
}
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+ if (need_sync && !dev_is_dma_coherent(dev))
+ arch_sync_dma_batch_flush();
+#endif
}
#endif
@@ -460,6 +490,9 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
struct pci_p2pdma_map_state p2pdma_state = {};
struct scatterlist *sg;
int i, ret;
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+ bool need_sync = false;
+#endif
for_each_sg(sgl, sg, nents, i) {
switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) {
@@ -471,8 +504,14 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
*/
break;
case PCI_P2PDMA_MAP_NONE:
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+ need_sync = true;
+ sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
+ sg->length, dir, attrs);
+#else
sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg),
sg->length, dir, attrs);
+#endif
if (sg->dma_address == DMA_MAPPING_ERROR) {
ret = -EIO;
goto out_unmap;
@@ -490,6 +529,10 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
sg_dma_len(sg) = sg->length;
}
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+ if (need_sync && !dev_is_dma_coherent(dev))
+ arch_sync_dma_batch_flush();
+#endif
return nents;
out_unmap:
diff --git a/kernel/dma/direct.h b/kernel/dma/direct.h
index da2fadf45bcd..a211bab26478 100644
--- a/kernel/dma/direct.h
+++ b/kernel/dma/direct.h
@@ -64,15 +64,11 @@ static inline void dma_direct_sync_single_for_device(struct device *dev,
arch_sync_dma_for_device(paddr, size, dir);
}
-static inline void dma_direct_sync_single_for_cpu(struct device *dev,
- dma_addr_t addr, size_t size, enum dma_data_direction dir)
+static inline void __dma_direct_sync_single_for_cpu(struct device *dev,
+ phys_addr_t paddr, size_t size, enum dma_data_direction dir)
{
- phys_addr_t paddr = dma_to_phys(dev, addr);
-
- if (!dev_is_dma_coherent(dev)) {
- arch_sync_dma_for_cpu(paddr, size, dir);
+ if (!dev_is_dma_coherent(dev))
arch_sync_dma_for_cpu_all();
- }
swiotlb_sync_single_for_cpu(dev, paddr, size, dir);
@@ -80,7 +76,31 @@ static inline void dma_direct_sync_single_for_cpu(struct device *dev,
arch_dma_mark_clean(paddr, size);
}
-static inline dma_addr_t dma_direct_map_phys(struct device *dev,
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+static inline void dma_direct_sync_single_for_cpu_batch_add(struct device *dev,
+ dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+ phys_addr_t paddr = dma_to_phys(dev, addr);
+
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_for_cpu_batch_add(paddr, size, dir);
+
+ __dma_direct_sync_single_for_cpu(dev, paddr, size, dir);
+}
+#endif
+
+static inline void dma_direct_sync_single_for_cpu(struct device *dev,
+ dma_addr_t addr, size_t size, enum dma_data_direction dir)
+{
+ phys_addr_t paddr = dma_to_phys(dev, addr);
+
+ if (!dev_is_dma_coherent(dev))
+ arch_sync_dma_for_cpu(paddr, size, dir);
+
+ __dma_direct_sync_single_for_cpu(dev, paddr, size, dir);
+}
+
+static inline dma_addr_t __dma_direct_map_phys(struct device *dev,
phys_addr_t phys, size_t size, enum dma_data_direction dir,
unsigned long attrs)
{
@@ -108,9 +128,6 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
}
}
- if (!dev_is_dma_coherent(dev) &&
- !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
- arch_sync_dma_for_device(phys, size, dir);
return dma_addr;
err_overflow:
@@ -121,6 +138,53 @@ static inline dma_addr_t dma_direct_map_phys(struct device *dev,
return DMA_MAPPING_ERROR;
}
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+static inline dma_addr_t dma_direct_map_phys_batch_add(struct device *dev,
+ phys_addr_t phys, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ dma_addr_t dma_addr = __dma_direct_map_phys(dev, phys, size, dir, attrs);
+
+ if (dma_addr != DMA_MAPPING_ERROR && !dev_is_dma_coherent(dev) &&
+ !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
+ arch_sync_dma_for_device_batch_add(phys, size, dir);
+
+ return dma_addr;
+}
+#endif
+
+static inline dma_addr_t dma_direct_map_phys(struct device *dev,
+ phys_addr_t phys, size_t size, enum dma_data_direction dir,
+ unsigned long attrs)
+{
+ dma_addr_t dma_addr = __dma_direct_map_phys(dev, phys, size, dir, attrs);
+
+ if (dma_addr != DMA_MAPPING_ERROR && !dev_is_dma_coherent(dev) &&
+ !(attrs & (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_MMIO)))
+ arch_sync_dma_for_device(phys, size, dir);
+
+ return dma_addr;
+}
+
+#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
+static inline void dma_direct_unmap_phys_batch_add(struct device *dev, dma_addr_t addr,
+ size_t size, enum dma_data_direction dir, unsigned long attrs)
+{
+ phys_addr_t phys;
+
+ if (attrs & DMA_ATTR_MMIO)
+ /* nothing to do: uncached and no swiotlb */
+ return;
+
+ phys = dma_to_phys(dev, addr);
+ if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC))
+ dma_direct_sync_single_for_cpu_batch_add(dev, addr, size, dir);
+
+ swiotlb_tbl_unmap_single(dev, phys, size, dir,
+ attrs | DMA_ATTR_SKIP_CPU_SYNC);
+}
+#endif
+
static inline void dma_direct_unmap_phys(struct device *dev, dma_addr_t addr,
size_t size, enum dma_data_direction dir, unsigned long attrs)
{
--
2.39.3 (Apple Git-146)