[PATCH v1] riscv: bring up batched unmap tlb flush

Wed Oct 5 07:43:24 PDT 2022

For riscv, this feature can reduce the number of flush_tlb_pages()
calls and the number of broadcasts on SMP systems.
Compared to x86, riscv adds @start and @end to struct
arch_tlbflush_unmap_batch; they record the minimum and maximum addresses
in the current flush batch so that the flush range can be reduced. If
there are a lot of pages to reclaim, or pages are shared by many tasks
(as in server applications), this feature is more beneficial.

Signed-off-by: Jinyu Tang <tjytimi at 163.com>
---

This patch may conflict with v4 of the arm64 batched unmap tlb flush
patch and v9 of the IPI support for riscv patch. If this patch is OK, I
will rebase it on top of them.

I have tested this patch in QEMU, based on:
https://lore.kernel.org/all/20220921084302.43631-3-yangyicong@huawei.com/

 #include <sys/types.h>
 #include <unistd.h>
 #include <sys/mman.h>
 #include <string.h>
 #include <time.h>
 #include <stdio.h>
 #include <stdlib.h>
 /*
  * Benchmark for batched unmap TLB flush: repeatedly touch every page of a
  * 500 MiB shared anonymous mapping (swap in), then page it out again with
  * MADV_PAGEOUT, and print the processor time used by each of five rounds.
  *
  * Returns 0 on success, 1 if mmap() or madvise() fails.
  */
 int main(void)
 {
 	clock_t start, finish;
 	double dur;
 #define SIZE (1 * 1024 * 1024 * 500)
 	volatile unsigned char *p = mmap(NULL, SIZE, PROT_READ | PROT_WRITE,
 					 MAP_SHARED | MAP_ANONYMOUS, -1, 0);

 	/* mmap() reports failure with MAP_FAILED, not NULL. */
 	if (p == MAP_FAILED) {
 		perror("mmap");
 		return 1;
 	}

 	for (int tt = 0; tt < 5; tt++) {
 		/* memset() takes a non-volatile pointer; cast explicitly. */
 		memset((void *)p, 0x88, SIZE);
 		start = clock();
 		for (int k = 0; k < 10; k++) {
 			/* swap in */
 			for (int i = 0; i < SIZE; i += 4096)
 				(void)p[i];

 			/*
 			 * swap out; a failure here would make the timing
 			 * meaningless, so stop instead of ignoring it.
 			 */
 			if (madvise((void *)p, SIZE, MADV_PAGEOUT) != 0) {
 				perror("madvise");
 				return 1;
 			}
 		}
 		finish = clock();
 		dur = (double)(finish - start) / CLOCKS_PER_SEC;
 		printf("%f seconds this time\n", dur);
 	}
 	return 0;
 }
In riscv QEMU, this patch decreases the average time cost of this test
app from 27.1s to 26.7s. The decrease may be more obvious on a real CPU.

 arch/riscv/Kconfig                |  8 ++++++++
 arch/riscv/include/asm/tlbbatch.h | 18 ++++++++++++++++++
 arch/riscv/include/asm/tlbflush.h | 20 ++++++++++++++++++++
 arch/riscv/mm/tlbflush.c          | 18 ++++++++++++++++++
 arch/x86/include/asm/tlbflush.h   |  2 +-
 mm/rmap.c                         |  9 ++++-----
 6 files changed, 69 insertions(+), 6 deletions(-)
 create mode 100644 arch/riscv/include/asm/tlbbatch.h

diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig
index 59d18881f35b..03304d0e0c0a 100644
--- a/arch/riscv/Kconfig
+++ b/arch/riscv/Kconfig
@@ -498,6 +498,14 @@ config ARCH_HAS_KEXEC_PURGATORY
 	depends on CRYPTO=y
 	depends on CRYPTO_SHA256=y
 
+config ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
+	bool "Support batched unmap tlb flush"
+	help
+	  Batch TLB flushes when unmapping pages. If there are a lot of
+	  pages to reclaim, or pages are shared by many tasks (as on a
+	  server), this will decrease the number of TLB flushes and
+	  broadcasts.
+
 config CRASH_DUMP
 	bool "Build kdump crash kernel"
 	help
diff --git a/arch/riscv/include/asm/tlbbatch.h b/arch/riscv/include/asm/tlbbatch.h
new file mode 100644
index 000000000000..357bb0f46cff
--- /dev/null
+++ b/arch/riscv/include/asm/tlbbatch.h
@@ -0,0 +1,18 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ARCH_RISCV_TLBBATCH_H
+#define _ARCH_RISCV_TLBBATCH_H
+
+#include <linux/cpumask.h>
+
+struct arch_tlbflush_unmap_batch {
+	/*
+	 * Each bit set is a CPU that potentially has a TLB entry for one of
+	 * the PFNs being flushed. The start and end fields record the address
+	 * range to be flushed in this batch.
+	 */
+	struct cpumask cpumask;
+	unsigned long start;
+	unsigned long end;
+};
+
+#endif /* _ARCH_RISCV_TLBBATCH_H */
diff --git a/arch/riscv/include/asm/tlbflush.h b/arch/riscv/include/asm/tlbflush.h
index 801019381dea..e2d9dad08bd0 100644
--- a/arch/riscv/include/asm/tlbflush.h
+++ b/arch/riscv/include/asm/tlbflush.h
@@ -10,6 +10,8 @@
 #include <linux/mm_types.h>
 #include <asm/smp.h>
 #include <asm/errata_list.h>
+#include <asm/tlbbatch.h>
+#include <asm/page.h>
 
 #ifdef CONFIG_MMU
 static inline void local_flush_tlb_all(void)
@@ -59,4 +61,22 @@ static inline void flush_tlb_kernel_range(unsigned long start,
 	flush_tlb_all();
 }
 
+static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
+					unsigned long address, struct mm_struct *mm)
+{
+	/*
+	 * An empty cpumask marks a new batch (arch_tlbbatch_flush() clears it):
+	 * seed the range from this page instead of widening stale/unset bounds.
+	 */
+	bool first = cpumask_empty(&batch->cpumask);
+
+	cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
+	if (first || address < batch->start)
+		batch->start = address;
+	if (first || address + PAGE_SIZE > batch->end)
+		batch->end = address + PAGE_SIZE;
+}
+
+extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
+
 #endif /* _ASM_RISCV_TLBFLUSH_H */
diff --git a/arch/riscv/mm/tlbflush.c b/arch/riscv/mm/tlbflush.c
index 37ed760d007c..6b117732200b 100644
--- a/arch/riscv/mm/tlbflush.c
+++ b/arch/riscv/mm/tlbflush.c
@@ -86,3 +86,21 @@ void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start,
 	__sbi_tlb_flush_range(vma->vm_mm, start, end - start, PMD_SIZE);
 }
 #endif
+void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
+{
+	unsigned int this_cpu = get_cpu();
+
+	/*
+	 * If any CPU other than this one is in the mask, issue a remote flush
+	 * over the batched range; otherwise flush only the local TLB.
+	 */
+	if (cpumask_any_but(&batch->cpumask, this_cpu) < nr_cpu_ids)
+		sbi_remote_sfence_vma(&batch->cpumask, batch->start,
+				      batch->end - batch->start);
+	else
+		local_flush_tlb_all();
+
+	cpumask_clear(&batch->cpumask);
+
+	put_cpu();
+}
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index cda3118f3b27..9e7027ae256e 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -252,7 +252,7 @@ static inline u64 inc_mm_tlb_gen(struct mm_struct *mm)
 }
 
 static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch,
-					struct mm_struct *mm)
+					unsigned long address, struct mm_struct *mm)
 {
 	inc_mm_tlb_gen(mm);
 	cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm));
diff --git a/mm/rmap.c b/mm/rmap.c
index 93d5a6f793d2..edae27aa20a9 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -645,12 +645,12 @@ void try_to_unmap_flush_dirty(void)
 #define TLB_FLUSH_BATCH_PENDING_LARGE			\
 	(TLB_FLUSH_BATCH_PENDING_MASK / 2)
 
-static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm, unsigned long address, bool writable)
 {
 	struct tlbflush_unmap_batch *tlb_ubc = &current->tlb_ubc;
 	int batch, nbatch;
 
-	arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
+	arch_tlbbatch_add_mm(&tlb_ubc->arch, address, mm);
 	tlb_ubc->flush_required = true;
 
 	/*
@@ -735,7 +735,7 @@ void flush_tlb_batched_pending(struct mm_struct *mm)
 	}
 }
 #else
-static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
+static void set_tlb_ubc_flush_pending(struct mm_struct *mm, unsigned long address, bool writable)
 {
 }
 
@@ -1596,8 +1596,7 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma,
 				 * and traps if the PTE is unmapped.
 				 */
 				pteval = ptep_get_and_clear(mm, address, pvmw.pte);
-
-				set_tlb_ubc_flush_pending(mm, pte_dirty(pteval));
+				set_tlb_ubc_flush_pending(mm, address, pte_dirty(pteval));
 			} else {
 				pteval = ptep_clear_flush(vma, address, pvmw.pte);
 			}
-- 
2.34.1