Re: [PATCH v2 3/3] RISC-V: KVM: Recover gstage huge page mappings during disable-dirty-log
Anup Patel
anup at brainfault.org
Sun May 31 23:22:56 PDT 2026
On Wed, Apr 29, 2026 at 3:05 PM <wang.yechao255 at zte.com.cn> wrote:
>
> From: Wang Yechao <wang.yechao255 at zte.com.cn>
>
> When dirty logging is enabled, the gstage mappings are split into
> 4K pages to track dirty pages. If the migration fails or is canceled,
> in order to keep the VM's performance consistent with that before
> dirty logging was enabled, the gstage huge page mappings are recovered
> when dirty logging is disabled.
>
> With this patch, dirty_log_perf_test shows a decrease in the number of
> vCPU faults:
>
> $ perf stat -e kvm:kvm_page_fault \
> /dirty_log_perf_test -s anonymous_hugetlb_1gb -v 1 -e -b 1G
>
> Before: 524,819 kvm:kvm_page_fault
> After : 263,211 kvm:kvm_page_fault
>
> Signed-off-by: Wang Yechao <wang.yechao255 at zte.com.cn>
> ---
> arch/riscv/include/asm/kvm_gstage.h | 4 +++
> arch/riscv/kvm/gstage.c | 42 ++++++++++++++++++++++++++
> arch/riscv/kvm/mmu.c | 47 +++++++++++++++++++++++++++++
> 3 files changed, 93 insertions(+)
>
> diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h
> index 9c908432bc17..a074493596d1 100644
> --- a/arch/riscv/include/asm/kvm_gstage.h
> +++ b/arch/riscv/include/asm/kvm_gstage.h
> @@ -64,6 +64,10 @@ int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage,
> struct kvm_mmu_memory_cache *pcache,
> gpa_t addr, u32 target_level, bool flush);
>
> +void kvm_riscv_gstage_recover_huge(struct kvm_gstage *gstage, gpa_t addr,
> + unsigned long taget_page_size,
> + unsigned long *page_size);
> +
> enum kvm_riscv_gstage_op {
> GSTAGE_OP_NOP = 0, /* Nothing */
> GSTAGE_OP_CLEAR, /* Clear/Unmap */
> diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c
> index d9fe8be2a151..e2c8cbac30d9 100644
> --- a/arch/riscv/kvm/gstage.c
> +++ b/arch/riscv/kvm/gstage.c
> @@ -337,6 +337,48 @@ int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage,
> return 0;
> }
>
> +void kvm_riscv_gstage_recover_huge(struct kvm_gstage *gstage, gpa_t addr,
> + unsigned long target_page_size,
> + unsigned long *page_size)
> +{
> + u32 current_level = gstage->pgd_levels - 1;
> + pte_t *next_ptep = (pte_t *)gstage->pgd;
> + u32 target_level, out_level;
> + pte_t *ptep, *child_ptep;
> + int ret;
> +
> + out_level = 0;
> + ret = gstage_page_size_to_level(gstage, target_page_size, &target_level);
> + if (ret)
> + goto out;
> +
> + while (current_level >= target_level) {
> + ptep = (pte_t *)&next_ptep[gstage_pte_index(gstage, addr, current_level)];
> +
> + out_level = current_level;
> + if (!pte_val(ptep_get(ptep)))
> + goto out;
> +
> + /* The mapping is already a huge page mapping. */
> + if (gstage_pte_leaf(ptep))
> + goto out;
> +
> + next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
> + current_level--;
> + }
> +
> + /* Replace the huge PTE with the first PTE entry of the child.*/
> + child_ptep = (pte_t *)&next_ptep[0];
> + if (gstage_pte_leaf(child_ptep)) {
> + set_pte(ptep, __pte(pte_val(ptep_get(child_ptep))));
> + gstage_tlb_flush(gstage, target_level, addr);
> + put_page(virt_to_page(next_ptep));
> + }
> +
> +out:
> + gstage_level_to_page_size(gstage, out_level, page_size);
> +}
> +
> void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
> pte_t *ptep, u32 ptep_level, enum kvm_riscv_gstage_op op)
> {
> diff --git a/arch/riscv/kvm/mmu.c b/arch/riscv/kvm/mmu.c
> index dc6a8efde99e..789e3362144f 100644
> --- a/arch/riscv/kvm/mmu.c
> +++ b/arch/riscv/kvm/mmu.c
> @@ -16,6 +16,8 @@
> #include <asm/kvm_mmu.h>
> #include <asm/kvm_nacl.h>
>
> +static void kvm_mmu_recover_huge_pages(struct kvm *kvm, int slot);
> +
> static void mmu_wp_memory_region(struct kvm *kvm, int slot)
> {
> struct kvm_memslots *slots = kvm_memslots(kvm);
> @@ -160,6 +162,17 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
> if (kvm_dirty_log_manual_protect_and_init_set(kvm))
> return;
> mmu_wp_memory_region(kvm, new->id);
> + } else {
> + /*
> + * Recover huge page mappings in the slot now that dirty logging
> + * is disabled, i.e. now that KVM does not have to track guest
> + * writes at 4KiB granularity.
> + *
> + * Dirty logging might be disabled by userspace if an ongoing VM
> + * live migration is cancelled and the VM must continue running
> + * on the source.
> + */
> + kvm_mmu_recover_huge_pages(kvm, new->id);
> }
> }
>
> @@ -598,3 +611,37 @@ void kvm_riscv_mmu_update_hgatp(struct kvm_vcpu *vcpu)
> if (!kvm_riscv_gstage_vmid_bits())
> kvm_riscv_local_hfence_gvma_all();
> }
> +
> +static void kvm_mmu_recover_huge_pages(struct kvm *kvm, int slot)
> +{
> + struct kvm_memslots *slots = kvm_memslots(kvm);
> + struct kvm_memory_slot *memslot = id_to_memslot(slots, slot);
> + unsigned long hva = gfn_to_hva(kvm, memslot->base_gfn);
> + phys_addr_t start = memslot->base_gfn << PAGE_SHIFT;
> + phys_addr_t end = (memslot->base_gfn + memslot->npages) << PAGE_SHIFT;
> + phys_addr_t addr = start;
> + struct kvm_gstage gstage;
> + unsigned long page_size;
> +
> + if (!fault_supports_gstage_huge_mapping(memslot, hva))
> + return;
> +
> + kvm_riscv_gstage_init(&gstage, kvm);
> +
> + spin_lock(&kvm->mmu_lock);
> +
> + while (addr < end) {
> + if (get_hva_mapping_size(kvm, hva) < PMD_SIZE) {
This is hard-coded (via PMD_SIZE) to work only for 2MB huge pages.
> + addr += PMD_SIZE;
> + hva += PMD_SIZE;
> + continue;
> + }
> +
> + kvm_riscv_gstage_recover_huge(&gstage, addr, PMD_SIZE, &page_size);
> +
> + addr += page_size;
> + hva += page_size;
> + cond_resched_lock(&kvm->mmu_lock);
> + }
> + spin_unlock(&kvm->mmu_lock);
> +}
> --
> 2.43.5
Regards,
Anup
More information about the kvm-riscv
mailing list