[RFC PATCH 1/3] mm/vmalloc.c: try to flush vmap_area one by one

Baoquan He bhe at redhat.com
Fri May 19 05:01:27 PDT 2023


In the current __purge_vmap_area_lazy(), when flushing the TLB for
vmalloc areas, the flush range is calculated as [min:max] over all vas.
That calculated range can be big because of the gaps between the vas.

E.g. in the graph below, only 12 pages (4 from va_1, 8 from va_2) are
mapped, while the calculated flush range is 58 pages.

  VA_1                               VA_2
 |....|-------------------------|............|
10   12                         60           68

. mapped;
- not mapped.

Sometimes the calculated flush range can be surprisingly huge because
the vas can span two kernel virtual address areas, e.g. the vmalloc area
and the kernel module area, which are very far away from each other on
some architectures.

So on systems which lack a full TLB flush, flushing a long range is
a big problem (it takes time). Flushing the vas one by one becomes
necessary in that case.

Hence, introduce flush_tlb_kernel_vas() to flush the vas one by one,
and add CONFIG_ARCH_HAS_FLUSH_TLB_KERNEL_VAS (the symbol defined in
arch/Kconfig) to indicate whether an architecture provides a
flush_tlb_kernel_vas() implementation. Otherwise, fall back to the old
way of calculating and flushing the whole range.

Signed-off-by: Thomas Gleixner <tglx at linutronix.de>
Signed-off-by: Baoquan He <bhe at redhat.com> #Fix error of 'undefined reference to `flush_tlb_kernel_vas''
---
 arch/Kconfig              |  4 ++++
 arch/arm/Kconfig          |  1 +
 arch/arm/kernel/smp_tlb.c | 23 +++++++++++++++++++++++
 arch/x86/Kconfig          |  1 +
 arch/x86/mm/tlb.c         | 22 ++++++++++++++++++++++
 include/linux/vmalloc.h   |  8 ++++++++
 mm/vmalloc.c              | 32 ++++++++++++++++++++++----------
 7 files changed, 81 insertions(+), 10 deletions(-)

diff --git a/arch/Kconfig b/arch/Kconfig
index 205fd23e0cad..ca5413f1e4e0 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -270,6 +270,10 @@ config ARCH_HAS_SET_MEMORY
 config ARCH_HAS_SET_DIRECT_MAP
 	bool
 
+# Select if architecture provides flush_tlb_kernel_vas()
+config ARCH_HAS_FLUSH_TLB_KERNEL_VAS
+	bool
+
 #
 # Select if the architecture provides the arch_dma_set_uncached symbol to
 # either provide an uncached segment alias for a DMA allocation, or
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 0fb4b218f665..c4de7f38f9a7 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -10,6 +10,7 @@ config ARM
 	select ARCH_HAS_DMA_WRITE_COMBINE if !ARM_DMA_MEM_BUFFERABLE
 	select ARCH_HAS_ELF_RANDOMIZE
 	select ARCH_HAS_FORTIFY_SOURCE
+	select ARCH_HAS_FLUSH_TLB_KERNEL_VAS
 	select ARCH_HAS_KEEPINITRD
 	select ARCH_HAS_KCOV
 	select ARCH_HAS_MEMBARRIER_SYNC_CORE
diff --git a/arch/arm/kernel/smp_tlb.c b/arch/arm/kernel/smp_tlb.c
index d4908b3736d8..22ec9b982cb1 100644
--- a/arch/arm/kernel/smp_tlb.c
+++ b/arch/arm/kernel/smp_tlb.c
@@ -7,6 +7,7 @@
 #include <linux/preempt.h>
 #include <linux/smp.h>
 #include <linux/uaccess.h>
+#include <linux/vmalloc.h>
 
 #include <asm/smp_plat.h>
 #include <asm/tlbflush.h>
@@ -69,6 +70,19 @@ static inline void ipi_flush_tlb_kernel_range(void *arg)
 	local_flush_tlb_kernel_range(ta->ta_start, ta->ta_end);
 }
 
+static inline void local_flush_tlb_kernel_vas(struct list_head *vmap_list)
+{	/* Flush every vmap_area linked on @vmap_list, one area at a time. */
+	struct vmap_area *va;
+
+	list_for_each_entry(va, vmap_list, list)
+		local_flush_tlb_kernel_range(va->va_start, va->va_end); /* this area's range only */
+}
+
+static inline void ipi_flush_tlb_kernel_vas(void *arg)
+{	/* on_each_cpu() callback; @arg is the list head passed by flush_tlb_kernel_vas(). */
+	local_flush_tlb_kernel_vas(arg);
+}
+
 static inline void ipi_flush_bp_all(void *ignored)
 {
 	local_flush_bp_all();
@@ -244,6 +258,15 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 	broadcast_tlb_a15_erratum();
 }
 
+void flush_tlb_kernel_vas(struct list_head *vmap_list, unsigned long num_entries)
+{	/* Flush each vmap_area on @vmap_list; @num_entries is not used by this implementation. */
+	if (tlb_ops_need_broadcast()) {
+		on_each_cpu(ipi_flush_tlb_kernel_vas, vmap_list, 1); /* run the per-area flush via on_each_cpu() */
+	} else
+		local_flush_tlb_kernel_vas(vmap_list); /* no broadcast needed: call the per-area flush directly */
+	broadcast_tlb_a15_erratum(); /* same trailing call as flush_tlb_kernel_range() above */
+}
+
 void flush_bp_all(void)
 {
 	if (tlb_ops_need_broadcast())
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 53bab123a8ee..7d7a44810a0b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -77,6 +77,7 @@ config X86
 	select ARCH_HAS_DEVMEM_IS_ALLOWED
 	select ARCH_HAS_EARLY_DEBUG		if KGDB
 	select ARCH_HAS_ELF_RANDOMIZE
+	select ARCH_HAS_FLUSH_TLB_KERNEL_VAS
 	select ARCH_HAS_FAST_MULTIPLIER
 	select ARCH_HAS_FORTIFY_SOURCE
 	select ARCH_HAS_GCOV_PROFILE_ALL
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 267acf27480a..c39d77eb37e4 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -10,6 +10,7 @@
 #include <linux/debugfs.h>
 #include <linux/sched/smt.h>
 #include <linux/task_work.h>
+#include <linux/vmalloc.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -1081,6 +1082,27 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 	}
 }
 
+static void do_flush_tlb_vas(void *arg)
+{	/* on_each_cpu() callback; @arg is the list head of the vmap_areas to flush. */
+	struct list_head *vmap_list = arg;
+	struct vmap_area *va;
+	unsigned long addr;
+
+	list_for_each_entry(va, vmap_list, list) {
+		/* Flush the area one page at a time ('invlpg' per page via flush_tlb_one_kernel()). */
+		for (addr = va->va_start; addr < va->va_end; addr += PAGE_SIZE)
+			flush_tlb_one_kernel(addr);
+	}
+}
+
+void flush_tlb_kernel_vas(struct list_head *vmap_list, unsigned long num_entries)
+{	/* @num_entries: page count the caller computed for the areas on @vmap_list. */
+	if (num_entries > tlb_single_page_flush_ceiling)
+		on_each_cpu(do_flush_tlb_all, NULL, 1); /* above the ceiling: run do_flush_tlb_all instead of per-page flushes */
+	else
+		on_each_cpu(do_flush_tlb_vas, vmap_list, 1); /* otherwise flush page by page, area by area */
+}
+
 /*
  * This can be used from process context to figure out what the value of
  * CR3 is without needing to do a (slow) __read_cr3().
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index c720be70c8dd..a9a1e488261d 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -295,4 +295,12 @@ bool vmalloc_dump_obj(void *object);
 static inline bool vmalloc_dump_obj(void *object) { return false; }
 #endif
 
+#if defined(CONFIG_ARCH_HAS_FLUSH_TLB_KERNEL_VAS) /* must match the symbol defined in arch/Kconfig */
+void flush_tlb_kernel_vas(struct list_head *list, unsigned long num_entries);
+#else /* no arch implementation: empty stub so callers still compile */
+static inline void flush_tlb_kernel_vas(struct list_head *list, unsigned long num_entries)
+{
+}
+#endif /* CONFIG_ARCH_HAS_FLUSH_TLB_KERNEL_VAS */
+
 #endif /* _LINUX_VMALLOC_H */
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index c0f80982eb06..31e8d9e93650 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1724,7 +1724,8 @@ static void purge_fragmented_blocks_allcpus(void);
  */
 static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
 {
-	unsigned long resched_threshold;
+	unsigned long resched_threshold, num_entries = 0, num_alias_entries = 0;
+	struct vmap_area alias_va = { .va_start = start, .va_end = end };
 	unsigned int num_purged_areas = 0;
 	struct list_head local_purge_list;
 	struct vmap_area *va, *n_va;
@@ -1736,18 +1737,29 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
 	list_replace_init(&purge_vmap_area_list, &local_purge_list);
 	spin_unlock(&purge_vmap_area_lock);
 
-	if (unlikely(list_empty(&local_purge_list)))
-		goto out;
+	start = min(start, list_first_entry(&local_purge_list, struct vmap_area, list)->va_start);
+	end = max(end, list_last_entry(&local_purge_list, struct vmap_area, list)->va_end);
+
+	if (IS_ENABLED(CONFIG_HAVE_FLUSH_TLB_KERNEL_VAS)) {
+		list_for_each_entry(va, &local_purge_list, list)
+			num_entries += (va->va_end - va->va_start) >> PAGE_SHIFT;
+
+		if (unlikely(!num_entries))
+			goto out;
+
+		if (alias_va.va_end > alias_va.va_start) {
+			num_alias_entries = (alias_va.va_end - alias_va.va_start) >> PAGE_SHIFT;
+			list_add(&alias_va.list, &local_purge_list);
+		}
 
-	start = min(start,
-		list_first_entry(&local_purge_list,
-			struct vmap_area, list)->va_start);
+		flush_tlb_kernel_vas(&local_purge_list, num_entries + num_alias_entries);
 
-	end = max(end,
-		list_last_entry(&local_purge_list,
-			struct vmap_area, list)->va_end);
+		if (num_alias_entries)
+			list_del(&alias_va.list);
+	} else {
+		flush_tlb_kernel_range(start, end);
+	}
 
-	flush_tlb_kernel_range(start, end);
 	resched_threshold = lazy_max_pages() << 1;
 
 	spin_lock(&free_vmap_area_lock);
-- 
2.34.1




More information about the linux-arm-kernel mailing list