[RFC PATCH 5/5] dma-mapping: Allow batched DMA sync operations if supported by the arch

Marek Szyprowski m.szyprowski at samsung.com
Fri Nov 21 08:09:16 PST 2025


Hi Barry,

On 17.11.2025 22:12, Barry Song wrote:
> On Fri, Nov 14, 2025 at 2:19 AM Catalin Marinas <catalin.marinas at arm.com> wrote:
>> On Wed, Oct 29, 2025 at 10:31:15AM +0800, Barry Song wrote:
>>> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
>>> index 1f9ee9759426..a0b45f84a91f 100644
>>> --- a/kernel/dma/direct.c
>>> +++ b/kernel/dma/direct.c
>>> @@ -403,9 +403,16 @@ void dma_direct_sync_sg_for_device(struct device *dev,
>>>                swiotlb_sync_single_for_device(dev, paddr, sg->length, dir);
>>>
>>>                if (!dev_is_dma_coherent(dev))
>>> -                     arch_sync_dma_for_device(paddr, sg->length,
>>> -                                     dir);
>>> +#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
>>> +                     arch_sync_dma_for_device_batch_add(paddr, sg->length, dir);
>>> +#else
>>> +                     arch_sync_dma_for_device(paddr, sg->length, dir);
>>> +#endif
>>>        }
>>> +#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
>>> +     if (!dev_is_dma_coherent(dev))
>>> +             arch_sync_dma_batch_flush();
>>> +#endif
>>>   }
>>>   #endif
>> Just a high-level comment for now. I'm not opposed to the idea of
>> batching the DSB barriers; we do this for ptes. However, the way it's
>
> Thanks, Catalin. I agree we need batching, as phones and embedded systems
> often use many DMA buffers while some of their chips lack DMA coherency.
>
>
>> implemented in the generic files, with lots of #ifdefs, makes the code
>> pretty unreadable.
>>
>> Can we have something like arch_sync_dma_begin/end() and let the arch
>> code handle the barriers as they see fit?
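
[For illustration, a minimal sketch of that begin/end shape against the
dma_direct_sync_sg_for_device() body quoted above. The
arch_sync_dma_begin()/arch_sync_dma_end() names come from the suggestion
itself and are hypothetical, not an upstream API; they default to no-ops
and an arch that wants batching overrides them:

/*
 * Sketch only: hypothetical arch_sync_dma_begin()/arch_sync_dma_end()
 * hooks with no-op defaults; an arch that batches barriers overrides them.
 */
#ifndef arch_sync_dma_begin
static inline void arch_sync_dma_begin(void) { }
static inline void arch_sync_dma_end(void) { }
#endif

void dma_direct_sync_sg_for_device(struct device *dev,
		struct scatterlist *sgl, int nents, enum dma_data_direction dir)
{
	struct scatterlist *sg;
	int i;

	arch_sync_dma_begin();
	for_each_sg(sgl, sg, nents, i) {
		phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));

		swiotlb_sync_single_for_device(dev, paddr, sg->length, dir);
		if (!dev_is_dma_coherent(dev))
			arch_sync_dma_for_device(paddr, sg->length, dir);
	}
	arch_sync_dma_end();	/* e.g. one DSB on arm64 instead of one per entry */
}

This keeps the generic loop free of #ifdefs entirely and leaves the
barrier placement to the arch.]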
>
> I guess I can refactor it as below and then remove the #ifdef/#else/#endif blocks.
>
> diff --git a/include/linux/dma-map-ops.h b/include/linux/dma-map-ops.h
> index 8fcd0a9c1f39..73bca4d7149d 100644
> --- a/include/linux/dma-map-ops.h
> +++ b/include/linux/dma-map-ops.h
> @@ -373,6 +373,20 @@ void arch_sync_dma_for_device_batch_add(phys_addr_t paddr, size_t size,
>   void arch_sync_dma_for_cpu_batch_add(phys_addr_t paddr, size_t size,
>   		enum dma_data_direction dir);
>   void arch_sync_dma_batch_flush(void);
> +#else
> +static inline void arch_sync_dma_for_device_batch_add(phys_addr_t paddr, size_t size,
> +		enum dma_data_direction dir)
> +{
> +	arch_sync_dma_for_device(paddr, size, dir);
> +}
> +static inline void arch_sync_dma_for_cpu_batch_add(phys_addr_t paddr, size_t size,
> +		enum dma_data_direction dir)
> +{
> +	arch_sync_dma_for_cpu(paddr, size, dir);
> +}
> +static inline void arch_sync_dma_batch_flush(void)
> +{
> +}
>   #endif
>   
>   #ifdef CONFIG_ARCH_HAS_SYNC_DMA_FOR_CPU_ALL
> diff --git a/kernel/dma/direct.c b/kernel/dma/direct.c
> index a0b45f84a91f..69b14b0c0501 100644
> --- a/kernel/dma/direct.c
> +++ b/kernel/dma/direct.c
> @@ -403,16 +403,10 @@ void dma_direct_sync_sg_for_device(struct device *dev,
>   		swiotlb_sync_single_for_device(dev, paddr, sg->length, dir);
>   
>   		if (!dev_is_dma_coherent(dev))
> -#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
>   			arch_sync_dma_for_device_batch_add(paddr, sg->length, dir);
> -#else
> -			arch_sync_dma_for_device(paddr, sg->length, dir);
> -#endif
>   	}
> -#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
>   	if (!dev_is_dma_coherent(dev))
>   		arch_sync_dma_batch_flush();
> -#endif
>   }
>   #endif
>   
> @@ -429,11 +423,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
>   		phys_addr_t paddr = dma_to_phys(dev, sg_dma_address(sg));
>   
>   		if (!dev_is_dma_coherent(dev))
> -#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
>   			arch_sync_dma_for_cpu_batch_add(paddr, sg->length, dir);
> -#else
> -			arch_sync_dma_for_cpu(paddr, sg->length, dir);
> -#endif
>   
>   		swiotlb_sync_single_for_cpu(dev, paddr, sg->length, dir);
>   
> @@ -443,9 +433,7 @@ void dma_direct_sync_sg_for_cpu(struct device *dev,
>   
>   	if (!dev_is_dma_coherent(dev)) {
>   		arch_sync_dma_for_cpu_all();
> -#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
>   		arch_sync_dma_batch_flush();
> -#endif
>   	}
>   }
>   
> @@ -458,29 +446,19 @@ void dma_direct_unmap_sg(struct device *dev, struct scatterlist *sgl,
>   {
>   	struct scatterlist *sg;
>   	int i;
> -#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
>   	bool need_sync = false;
> -#endif
>   
>   	for_each_sg(sgl, sg, nents, i) {
>   		if (sg_dma_is_bus_address(sg)) {
>   			sg_dma_unmark_bus_address(sg);
>   		} else {
> -#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
>   			need_sync = true;
>   			dma_direct_unmap_phys_batch_add(dev, sg->dma_address,
>   					      sg_dma_len(sg), dir, attrs);
> -
> -#else
> -			dma_direct_unmap_phys(dev, sg->dma_address,
> -					      sg_dma_len(sg), dir, attrs);
> -#endif
>   		}
>   	}
> -#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
>   	if (need_sync && !dev_is_dma_coherent(dev))
>   		arch_sync_dma_batch_flush();
> -#endif
>   }
>   #endif
>   
> @@ -490,9 +468,7 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
>   	struct pci_p2pdma_map_state p2pdma_state = {};
>   	struct scatterlist *sg;
>   	int i, ret;
> -#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
>   	bool need_sync = false;
> -#endif
>   
>   	for_each_sg(sgl, sg, nents, i) {
>   		switch (pci_p2pdma_state(&p2pdma_state, dev, sg_page(sg))) {
> @@ -504,14 +480,9 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
>   			 */
>   			break;
>   		case PCI_P2PDMA_MAP_NONE:
> -#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
>   			need_sync = true;
>   			sg->dma_address = dma_direct_map_phys_batch_add(dev, sg_phys(sg),
>   					sg->length, dir, attrs);
> -#else
> -			sg->dma_address = dma_direct_map_phys(dev, sg_phys(sg),
> -					sg->length, dir, attrs);
> -#endif
>   			if (sg->dma_address == DMA_MAPPING_ERROR) {
>   				ret = -EIO;
>   				goto out_unmap;
> @@ -529,10 +500,8 @@ int dma_direct_map_sg(struct device *dev, struct scatterlist *sgl, int nents,
>   		sg_dma_len(sg) = sg->length;
>   	}
>   
> -#ifdef CONFIG_ARCH_WANT_BATCHED_DMA_SYNC
>   	if (need_sync && !dev_is_dma_coherent(dev))
>   		arch_sync_dma_batch_flush();
> -#endif
>   	return nents;
>   
>   out_unmap:
>
>
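
[For context, a rough arm64-flavoured sketch of what the batched backend
above could look like. The dcache_clean_poc_nosync()/dcache_inval_poc_nosync()
helpers are assumptions (barrier-less variants of the existing maintenance
routines), not the actual arm64 implementation; the point is simply that the
_batch_add variants defer the completion barrier and
arch_sync_dma_batch_flush() issues a single dsb(sy) for the whole batch:

/*
 * Hypothetical arm64-style backend (arm64 context assumed).
 * dcache_clean_poc_nosync()/dcache_inval_poc_nosync() are assumed
 * barrier-less variants of the usual cache maintenance helpers.
 */
void arch_sync_dma_for_device_batch_add(phys_addr_t paddr, size_t size,
		enum dma_data_direction dir)
{
	unsigned long start = (unsigned long)phys_to_virt(paddr);

	/* clean to PoC so the device observes the CPU's writes */
	dcache_clean_poc_nosync(start, start + size);
}

void arch_sync_dma_for_cpu_batch_add(phys_addr_t paddr, size_t size,
		enum dma_data_direction dir)
{
	unsigned long start = (unsigned long)phys_to_virt(paddr);

	if (dir == DMA_TO_DEVICE)
		return;

	/* invalidate so the CPU re-reads what the device wrote */
	dcache_inval_poc_nosync(start, start + size);
}

void arch_sync_dma_batch_flush(void)
{
	dsb(sy);	/* one barrier completes all queued maintenance ops */
}

The saving is the one the thread is after: N scatterlist entries cost one
barrier instead of N.]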
This version looks a bit better to me. Similar batching could also be
added to the dma_iova_link()/dma_iova_sync() paths.

Best regards
-- 
Marek Szyprowski, PhD
Samsung R&D Institute Poland



