[PATCH v2] RISC-V: KVM: Batch stage-2 remote TLB flushes

Anup Patel anup at brainfault.org
Wed Apr 1 23:11:31 PDT 2026


On Wed, Mar 18, 2026 at 6:05 PM Jinyu Tang <tjytimi at 163.com> wrote:
>
> KVM RISC-V triggers a TLB flush for every single stage-2 PTE
> modification (unmap or write-protect) now. Although KVM coalesces the
> hardware IPIs, the software overhead of executing the flush work
> for every 4K page is large, especially during dirty page tracking.
>
> Following the approach used in x86 and arm64, this patch optimizes
> the MMU logic by making the PTE manipulation functions return a boolean
> indicating if a leaf PTE was actually changed. The outer MMU functions
> bubble up this flag to batch the remote TLB flushes. This change makes
> the generic KVM MMU callbacks (e.g., `kvm_unmap_gfn_range()`) actually
> effective by returning the true flush requirement rather than `false`.
>
> Consequently, the flush operation is executed only once per batch.
> Moving it outside of the `mmu_lock` also reduces lock contention.
> For directory pages, we keep the synchronous flush to ensure safety.
>
> Tested with tools/testing/selftests/kvm on a 4-vCPU guest (Host
> environment: QEMU 10.2.1 RISC-V)
> 1. demand_paging_test (1GB memory)
>         # time ./demand_paging_test -b 1G -v 4
>    - Total execution time reduced from ~2m33s to ~2m25s
> 2. dirty_log_perf_test (1GB memory)
>         # ./dirty_log_perf_test -b 1G -v 4
>         - "Clear dirty log time" per iteration dropped significantly from
>      ~3.02s to ~0.20s
>
> Reviewed-by: Nutty Liu <nutty.liu at hotmail.com>
> Signed-off-by: Jinyu Tang <tjytimi at 163.com>
> ---
> v1 -> v2:
> Changes in v2:
> - Fixed alignment issues in multi-line function calls, as suggested by
>   Nutty Liu.
>
>  arch/riscv/include/asm/kvm_gstage.h |  6 ++--
>  arch/riscv/kvm/gstage.c             | 46 +++++++++++++++++++----------
>  arch/riscv/kvm/mmu.c                | 38 +++++++++++++++++-------
>  3 files changed, 62 insertions(+), 28 deletions(-)
>
> diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h
> index 595e21831..b003a07f1 100644
> --- a/arch/riscv/include/asm/kvm_gstage.h
> +++ b/arch/riscv/include/asm/kvm_gstage.h
> @@ -59,13 +59,13 @@ enum kvm_riscv_gstage_op {
>         GSTAGE_OP_WP,           /* Write-protect */
>  };
>
> -void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
> +bool kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
>                              pte_t *ptep, u32 ptep_level, enum kvm_riscv_gstage_op op);
>
> -void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
> +bool kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
>                                   gpa_t start, gpa_t size, bool may_block);
>
> -void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end);
> +bool kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end);
>
>  void kvm_riscv_gstage_mode_detect(void);
>
> diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c
> index b67d60d72..798719ac1 100644
> --- a/arch/riscv/kvm/gstage.c
> +++ b/arch/riscv/kvm/gstage.c
> @@ -209,49 +209,59 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
>         return kvm_riscv_gstage_set_pte(gstage, pcache, out_map);
>  }
>
> -void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
> +bool kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
>                              pte_t *ptep, u32 ptep_level, enum kvm_riscv_gstage_op op)
>  {
>         int i, ret;
>         pte_t old_pte, *next_ptep;
>         u32 next_ptep_level;
>         unsigned long next_page_size, page_size;
> +       bool flush = false;
>
>         ret = gstage_level_to_page_size(ptep_level, &page_size);
>         if (ret)
> -               return;
> +               return false;
>
>         WARN_ON(addr & (page_size - 1));
>
>         if (!pte_val(ptep_get(ptep)))
> -               return;
> +               return false;
>
>         if (ptep_level && !gstage_pte_leaf(ptep)) {
>                 next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
>                 next_ptep_level = ptep_level - 1;
>                 ret = gstage_level_to_page_size(next_ptep_level, &next_page_size);
>                 if (ret)
> -                       return;
> +                       return false;
>
>                 if (op == GSTAGE_OP_CLEAR)
>                         set_pte(ptep, __pte(0));
>                 for (i = 0; i < PTRS_PER_PTE; i++)
> -                       kvm_riscv_gstage_op_pte(gstage, addr + i * next_page_size,
> -                                               &next_ptep[i], next_ptep_level, op);
> -               if (op == GSTAGE_OP_CLEAR)
> +                       flush |= kvm_riscv_gstage_op_pte(gstage, addr + i * next_page_size,
> +                                                        &next_ptep[i], next_ptep_level, op);
> +               if (op == GSTAGE_OP_CLEAR) {
> +                       gstage_tlb_flush(gstage, ptep_level, addr);
> +                       flush = false;
>                         put_page(virt_to_page(next_ptep));
> +               }

gstage_tlb_flush() should not be called for non-leaf PTEs.

Only "flush |= kvm_riscv_gstage_op_pte(...)" is sufficient over here.

>         } else {
>                 old_pte = *ptep;
>                 if (op == GSTAGE_OP_CLEAR)
>                         set_pte(ptep, __pte(0));
>                 else if (op == GSTAGE_OP_WP)
>                         set_pte(ptep, __pte(pte_val(ptep_get(ptep)) & ~_PAGE_WRITE));
> -               if (pte_val(*ptep) != pte_val(old_pte))
> -                       gstage_tlb_flush(gstage, ptep_level, addr);
> +               if (pte_val(*ptep) != pte_val(old_pte)) {
> +                       if (gstage->flags & KVM_GSTAGE_FLAGS_LOCAL)
> +                               gstage_tlb_flush(gstage, ptep_level, addr);
> +                       else
> +                               flush = true;

Since, we are requiring callers of kvm_riscv_gstage_op_pte() to do the
TLB flushing, this should only set "flush = true".

> +               }
>         }
> +
> +       return flush;
>  }
>
> -void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
> +bool kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
>                                   gpa_t start, gpa_t size, bool may_block)
>  {
>         int ret;
> @@ -260,6 +270,7 @@ void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
>         bool found_leaf;
>         unsigned long page_size;
>         gpa_t addr = start, end = start + size;
> +       bool flush = false;
>
>         while (addr < end) {
>                 found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
> @@ -271,8 +282,8 @@ void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
>                         goto next;
>
>                 if (!(addr & (page_size - 1)) && ((end - addr) >= page_size))
> -                       kvm_riscv_gstage_op_pte(gstage, addr, ptep,
> -                                               ptep_level, GSTAGE_OP_CLEAR);
> +                       flush |= kvm_riscv_gstage_op_pte(gstage, addr, ptep,
> +                                                        ptep_level, GSTAGE_OP_CLEAR);
>
>  next:
>                 addr += page_size;
> @@ -284,9 +295,11 @@ void kvm_riscv_gstage_unmap_range(struct kvm_gstage *gstage,
>                 if (!(gstage->flags & KVM_GSTAGE_FLAGS_LOCAL) && may_block && addr < end)
>                         cond_resched_lock(&gstage->kvm->mmu_lock);
>         }
> +
> +       return flush;
>  }
>
> -void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end)
> +bool kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end)
>  {
>         int ret;
>         pte_t *ptep;
> @@ -294,6 +307,7 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end
>         bool found_leaf;
>         gpa_t addr = start;
>         unsigned long page_size;
> +       bool flush = false;
>
>         while (addr < end) {
>                 found_leaf = kvm_riscv_gstage_get_leaf(gstage, addr, &ptep, &ptep_level);
> @@ -305,12 +319,14 @@ void kvm_riscv_gstage_wp_range(struct kvm_gstage *gstage, gpa_t start, gpa_t end
>                         goto next;
>
>                 if (!(addr & (page_size - 1)) && ((end - addr) >= page_size))
> -                       kvm_riscv_gstage_op_pte(gstage, addr, ptep,
> -                                               ptep_level, GSTAGE_OP_WP);
> +                       flush |= kvm_riscv_gstage_op_pte(gstage, addr, ptep,
> +                                                        ptep_level, GSTAGE_OP_WP);
>
>  next:
>                 addr += page_size;
>         }
> +
> +       return flush;
>  }
>
>  void __init kvm_riscv_gstage_mode_detect(void)
> diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
> index 0b75eb2a1..dde5afe57 100644
> --- a/arch/riscv/kvm/mmu.c
> +++ b/arch/riscv/kvm/mmu.c
> @@ -23,6 +23,7 @@ static void mmu_wp_memory_region(struct kvm *kvm, int slot)
>         phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
>         phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
>         struct kvm_gstage gstage;
> +       bool flush;
>
>         gstage.kvm = kvm;
>         gstage.flags = 0;
> @@ -30,9 +31,10 @@ static void mmu_wp_memory_region(struct kvm *kvm, int slot)
>         gstage.pgd = kvm->arch.pgd;
>
>         spin_lock(&kvm->mmu_lock);
> -       kvm_riscv_gstage_wp_range(&gstage, start, end);
> +       flush = kvm_riscv_gstage_wp_range(&gstage, start, end);
>         spin_unlock(&kvm->mmu_lock);
> -       kvm_flush_remote_tlbs_memslot(kvm, memslot);
> +       if (flush)
> +               kvm_flush_remote_tlbs_memslot(kvm, memslot);
>  }
>
>  int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
> @@ -88,6 +90,7 @@ int kvm_riscv_mmu_ioremap(struct kvm *kvm, gpa_t gpa, phys_addr_t hpa,
>  void kvm_riscv_mmu_iounmap(struct kvm *kvm, gpa_t gpa, unsigned long size)
>  {
>         struct kvm_gstage gstage;
> +       bool flush;
>
>         gstage.kvm = kvm;
>         gstage.flags = 0;
> @@ -95,8 +98,12 @@ void kvm_riscv_mmu_iounmap(struct kvm *kvm, gpa_t gpa, unsigned long size)
>         gstage.pgd = kvm->arch.pgd;
>
>         spin_lock(&kvm->mmu_lock);
> -       kvm_riscv_gstage_unmap_range(&gstage, gpa, size, false);
> +       flush = kvm_riscv_gstage_unmap_range(&gstage, gpa, size, false);
>         spin_unlock(&kvm->mmu_lock);
> +
> +       if (flush)
> +               kvm_flush_remote_tlbs_range(kvm, gpa >> PAGE_SHIFT,
> +                                           size >> PAGE_SHIFT);
>  }
>
>  void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
> @@ -108,13 +115,17 @@ void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
>         phys_addr_t start = (base_gfn +  __ffs(mask)) << PAGE_SHIFT;
>         phys_addr_t end = (base_gfn + __fls(mask) + 1) << PAGE_SHIFT;
>         struct kvm_gstage gstage;
> +       bool flush;
>
>         gstage.kvm = kvm;
>         gstage.flags = 0;
>         gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
>         gstage.pgd = kvm->arch.pgd;
>
> -       kvm_riscv_gstage_wp_range(&gstage, start, end);
> +       flush = kvm_riscv_gstage_wp_range(&gstage, start, end);
> +       if (flush)
> +               kvm_flush_remote_tlbs_range(kvm, start >> PAGE_SHIFT,
> +                                           (end - start) >> PAGE_SHIFT);
>  }
>
>  void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot)
> @@ -140,6 +151,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
>         gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
>         phys_addr_t size = slot->npages << PAGE_SHIFT;
>         struct kvm_gstage gstage;
> +       bool flush;
>
>         gstage.kvm = kvm;
>         gstage.flags = 0;
> @@ -147,8 +159,10 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
>         gstage.pgd = kvm->arch.pgd;
>
>         spin_lock(&kvm->mmu_lock);
> -       kvm_riscv_gstage_unmap_range(&gstage, gpa, size, false);
> +       flush = kvm_riscv_gstage_unmap_range(&gstage, gpa, size, false);
>         spin_unlock(&kvm->mmu_lock);
> +       if (flush)
> +               kvm_flush_remote_tlbs(kvm);

Why nuke the whole TLB over here?
I think kvm_flush_remote_tlbs_range() is sufficient here.

>  }
>
>  void kvm_arch_commit_memory_region(struct kvm *kvm,
> @@ -253,10 +267,9 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
>         gstage.flags = 0;
>         gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
>         gstage.pgd = kvm->arch.pgd;
> -       kvm_riscv_gstage_unmap_range(&gstage, range->start << PAGE_SHIFT,
> -                                    (range->end - range->start) << PAGE_SHIFT,
> -                                    range->may_block);
> -       return false;
> +       return kvm_riscv_gstage_unmap_range(&gstage, range->start << PAGE_SHIFT,
> +                                           (range->end - range->start) << PAGE_SHIFT,
> +                                           range->may_block);

The caller of this function (aka kvm_handle_hva_range()) nukes the
entire TLB which is not efficient.

Better to do kvm_flush_remote_tlbs_range() over here and return false
like before.

>  }
>
>  bool kvm_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
> @@ -579,6 +592,7 @@ void kvm_riscv_mmu_free_pgd(struct kvm *kvm)
>  {
>         struct kvm_gstage gstage;
>         void *pgd = NULL;
> +       bool flush = false;
>
>         spin_lock(&kvm->mmu_lock);
>         if (kvm->arch.pgd) {
> @@ -586,13 +600,17 @@ void kvm_riscv_mmu_free_pgd(struct kvm *kvm)
>                 gstage.flags = 0;
>                 gstage.vmid = READ_ONCE(kvm->arch.vmid.vmid);
>                 gstage.pgd = kvm->arch.pgd;
> -               kvm_riscv_gstage_unmap_range(&gstage, 0UL, kvm_riscv_gstage_gpa_size, false);
> +               flush = kvm_riscv_gstage_unmap_range(&gstage, 0UL,
> +                                                    kvm_riscv_gstage_gpa_size, false);
>                 pgd = READ_ONCE(kvm->arch.pgd);
>                 kvm->arch.pgd = NULL;
>                 kvm->arch.pgd_phys = 0;
>         }
>         spin_unlock(&kvm->mmu_lock);
>
> +       if (flush)
> +               kvm_flush_remote_tlbs(kvm);
> +
>         if (pgd)
>                 free_pages((unsigned long)pgd, get_order(kvm_riscv_gstage_pgd_size));
>  }
> --
> 2.43.0
>

Overall, this is a good change. Please address the above comments.

Regards,
Anup



More information about the linux-riscv mailing list