[PATCH v2 2/2] ARM: dma-mapping: sort the pages after allocation
Douglas Anderson
dianders at chromium.org
Fri Dec 18 14:27:02 PST 2015
After doing allocation, make one last-ditch effort to get contiguous
regions of pages to optimize TLB usage. This is a rather simplistic
approach that could be later optimized, but it doesn't hurt and should
only have the opportunity to help.
>From my testing the sort took less than 400us for a 4MB allocation.
That's much faster than the actual allocation which was more than a
millisecond even in the fastest case (and was often several hundred ms).
Signed-off-by: Douglas Anderson <dianders at chromium.org>
---
Changes in v2:
- Sort patch new for v2 (and optional if people hate it).
arch/arm/mm/dma-mapping.c | 40 ++++++++++++++++++++++++++++++++++++++++
1 file changed, 40 insertions(+)
diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c
index 9887d432cf1f..d1b3d3e6fe47 100644
--- a/arch/arm/mm/dma-mapping.c
+++ b/arch/arm/mm/dma-mapping.c
@@ -23,6 +23,7 @@
#include <linux/highmem.h>
#include <linux/memblock.h>
#include <linux/slab.h>
+#include <linux/sort.h>
#include <linux/iommu.h>
#include <linux/io.h>
#include <linux/vmalloc.h>
@@ -1122,6 +1123,21 @@ static inline void __free_iova(struct dma_iommu_mapping *mapping,
spin_unlock_irqrestore(&mapping->lock, flags);
}
+static int cmp_pfns(const void *a, const void *b)
+{
+ unsigned long a_pfn;
+ unsigned long b_pfn;
+
+ a_pfn = page_to_pfn(*(struct page **)a);
+ b_pfn = page_to_pfn(*(struct page **)b);
+
+ if (a_pfn < b_pfn)
+ return -1;
+ else if (a_pfn > b_pfn)
+ return 1;
+ return 0;
+}
+
/* We'll try 2M, 1M, 64K, and finally 4K; array must end with 0! */
static const int iommu_order_array[] = { 9, 8, 4, 0 };
@@ -1133,6 +1149,7 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
int array_size = count * sizeof(struct page *);
int i = 0;
int order_idx = 0;
+ int first_order_zero = -1;
if (array_size <= PAGE_SIZE)
pages = kzalloc(array_size, GFP_KERNEL);
@@ -1171,6 +1188,7 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
/* Drop down when we get small */
if (__fls(count) < order) {
order_idx++;
+ /* Don't update first_order_zero; no need to sort end */
continue;
}
@@ -1181,6 +1199,8 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
/* Go down a notch at first sign of pressure */
if (!pages[i]) {
order_idx++;
+ if (iommu_order_array[order_idx] == 0)
+ first_order_zero = i;
continue;
}
} else {
@@ -1201,6 +1221,26 @@ static struct page **__iommu_alloc_buffer(struct device *dev, size_t size,
count -= 1 << order;
}
+ /*
+ * If we folded under memory pressure, try one last ditch event to get
+ * contiguous pages via sorting. Under testing this sometimes helped
+ * get a few more contiguous pages and didn't cost much compared to
+ * the above allocations.
+ *
+ * Note that we only sort the order zero pages so that we don't mess
+ * up the higher order allocations by sticking small pages in between
+ * them.
+ *
+ * If someone wanted to optimize this more, they could insert extra
+ * (out of order) single pages in places to help keep virtual and
+ * physical pages aligned with each other. As it is we often get
+ * lucky and get the needed alignment but we're not guaranteed.
+ */
+ if (first_order_zero >= 0)
+ sort(pages + first_order_zero,
+ (size >> PAGE_SHIFT) - first_order_zero, sizeof(*pages),
+ cmp_pfns, NULL);
+
return pages;
error:
while (i--)
--
2.6.0.rc2.230.g3dd15c0
More information about the linux-arm-kernel
mailing list