[RFC PATCH] iommu: Optimize IOMMU UnMap

Ashish Mhetre amhetre at nvidia.com
Wed May 22 20:19:35 PDT 2024


When the page-table walker is not coherent (!cfg->coherent_walk), the
current __arm_lpae_unmap() syncs each PTE to the device individually
after clearing it, i.e. one dma_sync call per PTE. By updating
__arm_lpae_unmap() to clear all PTEs first and then issue a single
dma_sync call covering the whole range, average unmap latency improves
by roughly 25% for large buffer sizes, at the cost of a small (~3%)
regression for single-page unmaps.
Below is a detailed analysis of the average unmap latency (in us), with
and without this optimization, obtained by running dma_map_benchmark
for different buffer sizes.

Size	Time w/o opt (us)	Time with opt (us)	Improvement (%)

4KB	3.0			3.1			-3.33
1MB	250.3			187.9			24.93
2MB	493.7			368.7			25.32
4MB	974.7			723.4			25.78
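
As a rough illustration of where the gain comes from, here is a
standalone user-space sketch of the two patterns. This is not the
driver code; sync_to_device() is a hypothetical stand-in for the
dma_sync_single_for_device() call that __arm_lpae_sync_pte() makes:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef uint64_t iopte;

static int sync_calls;

/* Hypothetical stand-in for dma_sync_single_for_device(); in the real
 * driver this is the expensive cache-maintenance step. */
static void sync_to_device(iopte *start, size_t num)
{
	(void)start;
	(void)num;
	sync_calls++;
}

/* Old pattern: clear and sync one PTE at a time. */
static void clear_ptes_per_entry(iopte *ptep, size_t num)
{
	for (size_t i = 0; i < num; i++) {
		ptep[i] = 0;
		sync_to_device(&ptep[i], 1);
	}
}

/* New pattern: clear all PTEs first, then issue a single sync. */
static void clear_ptes_batched(iopte *ptep, size_t num)
{
	for (size_t i = 0; i < num; i++)
		ptep[i] = 0;
	sync_to_device(ptep, num);
}

int main(void)
{
	iopte table[512];

	memset(table, 0xff, sizeof(table));
	clear_ptes_per_entry(table, 512);
	printf("per-entry: %d sync calls\n", sync_calls);	/* 512 */

	sync_calls = 0;
	memset(table, 0xff, sizeof(table));
	clear_ptes_batched(table, 512);
	printf("batched:   %d sync calls\n", sync_calls);	/* 1 */
	return 0;
}

With 512 PTEs per table, the per-entry pattern issues up to 512 sync
calls while the batched pattern issues a single call covering the same
range, which is where the large-buffer gains above come from.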

Signed-off-by: Ashish Mhetre <amhetre at nvidia.com>
---
 drivers/iommu/io-pgtable-arm.c | 34 +++++++++++++++++++++++++---------
 1 file changed, 25 insertions(+), 9 deletions(-)

diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 3d23b924cec1..94094b711cba 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -256,13 +256,15 @@ static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, int num_entries,
 				   sizeof(*ptep) * num_entries, DMA_TO_DEVICE);
 }
 
-static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg)
+static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg, int num_entries)
 {
+	int i;
 
-	*ptep = 0;
+	for (i = 0; i < num_entries; i++)
+		ptep[i] = 0;
 
 	if (!cfg->coherent_walk)
-		__arm_lpae_sync_pte(ptep, 1, cfg);
+		__arm_lpae_sync_pte(ptep, num_entries, cfg);
 }
 
 static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
@@ -633,13 +635,25 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
 	if (size == ARM_LPAE_BLOCK_SIZE(lvl, data)) {
 		max_entries = ARM_LPAE_PTES_PER_TABLE(data) - unmap_idx_start;
 		num_entries = min_t(int, pgcount, max_entries);
-
-		while (i < num_entries) {
-			pte = READ_ONCE(*ptep);
+		arm_lpae_iopte *pte_flush;
+		int j = 0;
+
+		pte_flush = kvcalloc(num_entries, sizeof(*pte_flush), GFP_ATOMIC);
+		if (pte_flush) {
+			for (j = 0; j < num_entries; j++) {
+				pte_flush[j] = READ_ONCE(ptep[j]);
+				if (WARN_ON(!pte_flush[j]))
+					break;
+			}
+			__arm_lpae_clear_pte(ptep, &iop->cfg, j);
+		}
+		while (i < (pte_flush ? j : num_entries)) {
+			pte = pte_flush ? pte_flush[i] : READ_ONCE(*ptep);
 			if (WARN_ON(!pte))
 				break;
 
-			__arm_lpae_clear_pte(ptep, &iop->cfg);
+			if (!pte_flush)
+				__arm_lpae_clear_pte(ptep, &iop->cfg, 1);
 
 			if (!iopte_leaf(pte, lvl, iop->fmt)) {
 				/* Also flush any partial walks */
@@ -649,10 +663,12 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
 			} else if (!iommu_iotlb_gather_queued(gather)) {
 				io_pgtable_tlb_add_page(iop, gather, iova + i * size, size);
 			}
-
-			ptep++;
+			if (!pte_flush)
+				ptep++;
 			i++;
 		}
+		if (pte_flush)
+			kvfree(pte_flush);
 
 		return i * size;
 	} else if (iopte_leaf(pte, lvl, iop->fmt)) {
-- 
2.17.1
