[RFC PATCH] iommu: Optimize IOMMU UnMap
Ashish Mhetre
amhetre at nvidia.com
Wed May 22 20:19:35 PDT 2024
Currently, __arm_lpae_unmap() calls dma_sync() on each individual PTE after
clearing it. By updating __arm_lpae_unmap() to call dma_sync() once for all
of the cleared PTEs, overall unmap performance can be improved by about 25%
for large buffer sizes.
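To illustrate the change outside of the patch context, below is a minimal
standalone sketch of the batching idea. It is only a sketch: sync_for_device(),
clear_ptes_per_entry() and clear_ptes_batched() are hypothetical helpers made
up for this example, standing in for the dma_sync_single_for_device() call
that __arm_lpae_sync_pte() performs in the driver.

/*
 * Illustrative sketch only (not the kernel code): clear every PTE in the
 * run first, then issue a single "sync" covering the whole range, instead
 * of one sync per PTE.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef uint64_t iopte_t;

/* Hypothetical stand-in for the per-range cache maintenance call. */
static void sync_for_device(void *addr, size_t len)
{
	printf("sync %zu bytes at %p\n", len, addr);
}

/* Old behaviour: one sync per cleared PTE. */
static void clear_ptes_per_entry(iopte_t *ptep, int num_entries)
{
	for (int i = 0; i < num_entries; i++) {
		ptep[i] = 0;
		sync_for_device(&ptep[i], sizeof(*ptep));
	}
}

/* Optimized behaviour: clear all PTEs, then sync the whole run once. */
static void clear_ptes_batched(iopte_t *ptep, int num_entries)
{
	for (int i = 0; i < num_entries; i++)
		ptep[i] = 0;

	sync_for_device(ptep, num_entries * sizeof(*ptep));
}

int main(void)
{
	iopte_t table[8];

	memset(table, 0xff, sizeof(table));
	clear_ptes_per_entry(table, 4);		/* 4 sync calls */
	clear_ptes_batched(table + 4, 4);	/* 1 sync call */
	return 0;
}

The only behavioural difference is that, for non-coherent page-table walks, one
maintenance operation is issued over the whole run of cleared PTEs rather than
one operation per entry.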
Below is a detailed analysis of the average unmap latency (in us), with and
without this optimization, obtained by running dma_map_benchmark for
different buffer sizes.
Size   Time W/O Optimization (us)   Time With Optimization (us)   % Improvement
4KB      3.0                          3.1                          -3.33
1MB    250.3                        187.9                          24.93
2MB    493.7                        368.7                          25.32
4MB    974.7                        723.4                          25.78
Signed-off-by: Ashish Mhetre <amhetre at nvidia.com>
---
drivers/iommu/io-pgtable-arm.c | 34 +++++++++++++++++++++++++---------
1 file changed, 25 insertions(+), 9 deletions(-)
diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c
index 3d23b924cec1..94094b711cba 100644
--- a/drivers/iommu/io-pgtable-arm.c
+++ b/drivers/iommu/io-pgtable-arm.c
@@ -256,13 +256,15 @@ static void __arm_lpae_sync_pte(arm_lpae_iopte *ptep, int num_entries,
 				   sizeof(*ptep) * num_entries, DMA_TO_DEVICE);
 }
-static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg)
+static void __arm_lpae_clear_pte(arm_lpae_iopte *ptep, struct io_pgtable_cfg *cfg, int num_entries)
 {
+	int i;
-	*ptep = 0;
+	for (i = 0; i < num_entries; i++)
+		ptep[i] = 0;
 	if (!cfg->coherent_walk)
-		__arm_lpae_sync_pte(ptep, 1, cfg);
+		__arm_lpae_sync_pte(ptep, num_entries, cfg);
 }
 static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
@@ -633,13 +635,25 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
 	if (size == ARM_LPAE_BLOCK_SIZE(lvl, data)) {
 		max_entries = ARM_LPAE_PTES_PER_TABLE(data) - unmap_idx_start;
 		num_entries = min_t(int, pgcount, max_entries);
-
-		while (i < num_entries) {
-			pte = READ_ONCE(*ptep);
+		arm_lpae_iopte *pte_flush;
+		int j = 0;
+
+		pte_flush = kvcalloc(num_entries, sizeof(*pte_flush), GFP_ATOMIC);
+		if (pte_flush) {
+			for (j = 0; j < num_entries; j++) {
+				pte_flush[j] = READ_ONCE(ptep[j]);
+				if (WARN_ON(!pte_flush[j]))
+					break;
+			}
+			__arm_lpae_clear_pte(ptep, &iop->cfg, j);
+		}
+		while (i < (pte_flush ? j : num_entries)) {
+			pte = pte_flush ? pte_flush[i] : READ_ONCE(*ptep);
 			if (WARN_ON(!pte))
 				break;
-			__arm_lpae_clear_pte(ptep, &iop->cfg);
+			if (!pte_flush)
+				__arm_lpae_clear_pte(ptep, &iop->cfg, 1);
 			if (!iopte_leaf(pte, lvl, iop->fmt)) {
 				/* Also flush any partial walks */
@@ -649,10 +663,12 @@ static size_t __arm_lpae_unmap(struct arm_lpae_io_pgtable *data,
 			} else if (!iommu_iotlb_gather_queued(gather)) {
 				io_pgtable_tlb_add_page(iop, gather, iova + i * size, size);
 			}
-
-			ptep++;
+			if (!pte_flush)
+				ptep++;
 			i++;
 		}
+		if (pte_flush)
+			kvfree(pte_flush);
 		return i * size;
 	} else if (iopte_leaf(pte, lvl, iop->fmt)) {
--
2.17.1