[PATCH v3] RISC-V: KVM: Batch stage-2 remote TLB flushes

Anup Patel apatel at ventanamicro.com
Wed Apr 8 23:03:02 PDT 2026


On Wed, Apr 8, 2026 at 9:43 PM Jinyu Tang <tjytimi at 163.com> wrote:
>
> Currently, KVM RISC-V triggers a TLB flush for every single stage-2
> PTE modification (unmap or write-protect). Although KVM coalesces the
> hardware IPIs, the software overhead of executing the flush work
> for every 4K page is large, especially during dirty page tracking.
>
> Following the approach used in x86 and arm64, this patch optimizes
> the MMU logic by making the PTE manipulation functions return a boolean
> indicating if a leaf PTE was actually changed. The outer MMU functions
> bubble up this flag to batch the remote TLB flushes.
>
> Consequently, the flush operation is executed only once per batch.
> Moving it outside of the `mmu_lock` also reduces lock contention.
>
> Tested with tools/testing/selftests/kvm on a 4-vCPU guest (Host
> environment: QEMU 10.2.1 RISC-V)
> 1. demand_paging_test (1GB memory)
>         # time ./demand_paging_test -b 1G -v 4
>    - Total execution time reduced from ~2m33s to ~2m25s
> 2. dirty_log_perf_test (1GB memory)
>         # ./dirty_log_perf_test -b 1G -v 4
>    - "Clear dirty log time" per iteration dropped significantly
>      from ~3.02s to ~0.19s
>
> Reviewed-by: Nutty Liu <nutty.liu at hotmail.com>
> Signed-off-by: Jinyu Tang <tjytimi at 163.com>
> ---
> v2 -> v3:
> Addressed review comments from Anup Patel:
> - Removed gstage_tlb_flush() for non-leaf PTEs; only set the flush flag
> - Removed KVM_GSTAGE_FLAGS_LOCAL check
> - Used kvm_flush_remote_tlbs_range() instead of full flushes in
>   kvm_arch_flush_shadow_memslot() and kvm_unmap_gfn_range() to avoid
>   unnecessary global TLB flush.
>
> v1 -> v2:
> - Fixed alignment issues in multi-line function calls, as pointed out
>   by Nutty Liu.
>
>  arch/riscv/include/asm/kvm_gstage.h |  6 ++---
>  arch/riscv/kvm/gstage.c             | 35 +++++++++++++++----------
>  arch/riscv/kvm/mmu.c                | 40 ++++++++++++++++++++++-------
>  3 files changed, 56 insertions(+), 25 deletions(-)

This patch conflicts with other patches in the riscv_kvm_next
branch at https://github.com/kvm-riscv/linux.git

Please rebase on the latest riscv_kvm_next and send a v4?

Regards,
Anup

>
> diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h
> index 595e21831..b003a07f1 100644
> --- a/arch/riscv/include/asm/kvm_gstage.h
> +++ b/arch/riscv/include/asm/kvm_gstage.h
> @@ -59,13 +59,13 @@ enum kvm_riscv_gstage_op {
>         GSTAGE_OP_WP,           /* Write-protect */
>  };
>
> -void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
> +bool kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
>                              pte_t *ptep, u32 ptep_level, enum kvm_riscv_gstage_op op);
>
> -void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
> +bool kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
>                                   gpa_t start, gpa_t size, bool may_block);
>
> -void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end);
> +bool kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end);
>
>  void kvm_riscv_gstage_mode_detect(void);
>
> diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c
> index b67d60d72..f008ccf1d 100644
> --- a/arch/riscv/kvm/gstage.c
> +++ b/arch/riscv/kvm/gstage.c
> @@ -209,35 +209,36 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
>         return kvm_riscv_gstage_set_pte(gstage, pcache, out_map);
>  }
>
> -void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
> +bool kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
>                              pte_t *ptep, u32 ptep_level, enum kvm_riscv_gstage_op op)
>  {
>         int i, ret;
>         pte_t old_pte, *next_ptep;
>         u32 next_ptep_level;
>         unsigned long next_page_size, page_size;
> +       bool flush = false;
>
>         ret = gstage_level_to_page_size(ptep_level, &page_size);
>         if (ret)
> -               return;
> +               return false;
>
>         WARN_ON(addr & (page_size - 1));
>
>         if (!pte_val(ptep_get(ptep)))
> -               return;
> +               return false;
>
>         if (ptep_level && !gstage_pte_leaf(ptep)) {
>                 next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
>                 next_ptep_level = ptep_level - 1;
>                 ret = gstage_level_to_page_size(next_ptep_level, &next_page_size);
>                 if (ret)
> -                       return;
> +                       return false;
>
>                 if (op == GSTAGE_OP_CLEAR)
>                         set_pte(ptep, __pte(0));
>                 for (i = 0; i < PTRS_PER_PTE; i++)
> -                       kvm_riscv_gstage_op_pte(gstage, addr + i * next_page_size,
> -                                               &next_ptep[i], next_ptep_level, op);
> +                       flush |= kvm_riscv_gstage_op_pte(gstage, addr + i * next_page_size,
> +                                                        &next_ptep[i], next_ptep_level, op);
>                 if (op == GSTAGE_OP_CLEAR)
>                         put_page(virt_to_page(next_ptep));
>         } else {
> @@ -247,11 +248,13 @@ void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
>                 else if (op == GSTAGE_OP_WP)
>                         set_pte(ptep, __pte(pte_val(ptep_get(ptep)) & ~_PAGE_WRITE));
>                 if (pte_val(*ptep) != pte_val(old_pte))
> -                       gstage_tlb_flush(gstage, ptep_level, addr);
> +                       flush = true;
>         }
> +
> +       return flush;
>  }
>
> -void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
> +bool kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
>                                   gpa_t start, gpa_t size, bool may_block)
>  {
>         int ret;
> @@ -260,6 +263,7 @@ void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
>         bool found_leaf;
>         unsigned long page_size;
>         gpa_t addr = start, end = start + size;
> +       bool flush = false;
>
>         while (addr < end) {
>                 found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
> @@ -271,8 +275,8 @@ void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
>                         goto next;
>
>                 if (!(addr & (page_size - 1)) && ((end - addr) >= page_size))
> -                       kvm_riscv_gstage_op_pte(gstage, addr, ptep,
> -                                               ptep_level, GSTAGE_OP_CLEAR);
> +                       flush |= kvm_riscv_gstage_op_pte(gstage, addr, ptep,
> +                                                        ptep_level, GSTAGE_OP_CLEAR);
>
>  next:
>                 addr += page_size;
> @@ -284,9 +288,11 @@ void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
>                 if (!(gstage->flags & KVM_GSTAGE_FLAGS_LOCAL) && may_block && addr < end)
>                         cond_resched_lock(&gstage->kvm->mmu_lock);
>         }
> +
> +       return flush;
>  }
>
> -void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end)
> +bool kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end)
>  {
>         int ret;
>         pte_t *ptep;
> @@ -294,6 +300,7 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end
>         bool found_leaf;
>         gpa_t addr = start;
>         unsigned long page_size;
> +       bool flush = false;
>
>         while (addr < end) {
>                 found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
> @@ -305,12 +312,14 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end
>                         goto next;
>
>                 if (!(addr & (page_size - 1)) && ((end - addr) >= page_size))
> -                       kvm_riscv_gstage_op_pte(gstage, addr, ptep,
> -                                               ptep_level, GSTAGE_OP_WP);
> +                       flush |= kvm_riscv_gstage_op_pte(gstage, addr, ptep,
> +                                                        ptep_level, GSTAGE_OP_WP);
>
>  next:
>                 addr += page_size;
>         }
> +
> +       return flush;
>  }
>
>  void __init kvm_riscv_gstage_mode_detect(void)
> diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
> index 0b75eb2a1..b9a57f0a9 100644
> --- a/arch/riscv/kvm/mmu.c
> +++ b/arch/riscv/kvm/mmu.c
> @@ -23,6 +23,7 @@ static void mmu_wp_memory_region(struct kvm *kvm, int slot)
>         phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
>         phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
>         struct kvm_gstage gstage;
> +       bool flush;
>
>         gstage.kvm = kvm;
>         gstage.flags = 0;
> @@ -30,9 +31,10 @@ static void mmu_wp_memory_region(struct kvm *kvm, int slot)
>         gstage.pgd = kvm->arch.pgd;
>
>         spin_lock(&kvm->mmu_lock);
> -       kvm_riscv_gstage_wp_range(&gstage, start, end);
> +       flush = kvm_riscv_gstage_wp_range(&gstage, start, end);
>         spin_unlock(&kvm->mmu_lock);
> -       kvm_flush_remote_tlbs_memslot(kvm, memslot);
> +       if (flush)
> +               kvm_flush_remote_tlbs_memslot(kvm, memslot);
>  }
>
>  int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
> @@ -88,6 +90,7 @@ int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
>  void kvm_riscv_mmu_iounmap(struct kvm *kvm, gpa_t gpa, unsigned long size)
>  {
>         struct kvm_gstage gstage;
> +       bool flush;
>
>         gstage.kvm = kvm;
>         gstage.flags = 0;
> @@ -95,8 +98,12 @@ void kvm_riscv_mmu_iounmap(struct kvm *kvm, gpa_t gpa, unsigned long size)
>         gstage.pgd = kvm->arch.pgd;
>
>         spin_lock(&kvm->mmu_lock);
> -       kvm_riscv_gstage_unmap_range(&gstage, gpa, size, false);
> +       flush = kvm_riscv_gstage_unmap_range(&gstage, gpa, size, false);
>         spin_unlock(&kvm->mmu_lock);
> +
> +       if (flush)
> +               kvm_flush_remote_tlbs_range(kvm, gpa >> PAGE_SHIFT,
> +                                           size >> PAGE_SHIFT);
>  }
>
>  void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
> @@ -108,13 +115,17 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
>         phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
>         phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
>         struct kvm_gstage gstage;
> +       bool flush;
>
>         gstage.kvm = kvm;
>         gstage.flags = 0;
>         gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
>         gstage.pgd = kvm->arch.pgd;
>
> -       kvm_riscv_gstage_wp_range(&gstage, start, end);
> +       flush = kvm_riscv_gstage_wp_range(&gstage, start, end);
> +       if (flush)
> +               kvm_flush_remote_tlbs_range(kvm, start >> PAGE_SHIFT,
> +                                           (end - start) >> PAGE_SHIFT);
>  }
>
>  void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
> @@ -140,6 +151,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
>         gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
>         phys_addr_t size = slot->npages << PAGE_SHIFT;
>         struct kvm_gstage gstage;
> +       bool flush;
>
>         gstage.kvm = kvm;
>         gstage.flags = 0;
> @@ -147,8 +159,11 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
>         gstage.pgd = kvm->arch.pgd;
>
>         spin_lock(&kvm->mmu_lock);
> -       kvm_riscv_gstage_unmap_range(&gstage, gpa, size, false);
> +       flush = kvm_riscv_gstage_unmap_range(&gstage, gpa, size, false);
>         spin_unlock(&kvm->mmu_lock);
> +       if (flush)
> +               kvm_flush_remote_tlbs_range(kvm, gpa >> PAGE_SHIFT,
> +                                           size >> PAGE_SHIFT);
>  }
>
>  void kvm_arch_commit_memory_region(struct kvm *kvm,
> @@ -253,9 +268,11 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
>         gstage.flags = 0;
>         gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
>         gstage.pgd = kvm->arch.pgd;
> -       kvm_riscv_gstage_unmap_range(&gstage, range->start << PAGE_SHIFT,
> -                                    (range->end - range->start) << PAGE_SHIFT,
> -                                    range->may_block);
> +       if (kvm_riscv_gstage_unmap_range(&gstage, range->start << PAGE_SHIFT,
> +                                        (range->end - range->start) << PAGE_SHIFT,
> +                                        range->may_block))
> +               kvm_flush_remote_tlbs_range(kvm, range->start,
> +                                           range->end - range->start);
>         return false;
>  }
>
> @@ -579,6 +596,7 @@ void kvm_riscv_mmu_free_pgd(struct kvm *kvm)
>  {
>         struct kvm_gstage gstage;
>         void *pgd = NULL;
> +       bool flush = false;
>
>         spin_lock(&kvm->mmu_lock);
>         if (kvm->arch.pgd) {
> @@ -586,13 +604,17 @@ void kvm_riscv_mmu_free_pgd(struct kvm *kvm)
>                 gstage.flags = 0;
>                 gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
>                 gstage.pgd = kvm->arch.pgd;
> -               kvm_riscv_gstage_unmap_range(&gstage, 0UL, kvm_riscv_gstage_gpa_size, false);
> +               flush = kvm_riscv_gstage_unmap_range(&gstage, 0UL,
> +                                                    kvm_riscv_gstage_gpa_size, false);
>                 pgd = READ_ONCE(kvm->arch.pgd);
>                 kvm->arch.pgd = NULL;
>                 kvm->arch.pgd_phys = 0;
>         }
>         spin_unlock(&kvm->mmu_lock);
>
> +       if (flush)
> +               kvm_flush_remote_tlbs(kvm);
> +
>         if (pgd)
>                 free_pages((unsigned long)pgd, get_order(kvm_riscv_gstage_pgd_size));
>  }
> --
> 2.43.0
>
>
> --
> kvm-riscv mailing list
> kvm-riscv at lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kvm-riscv



More information about the linux-riscv mailing list