[PATCH v3 2/3] RISC-V: KVM: Allow splitting huge pages to arbitrary level

Sun Mar 29 23:25:56 PDT 2026

On Mon, Mar 16, 2026 at 11:49 AM <wang.yechao255 at zte.com.cn> wrote:
>
> From: Wang Yechao <wang.yechao255 at zte.com.cn>
>
> This patch introduces the function kvm_riscv_gstage_split_huge().
> It splits the huge page covering a given guest physical address down
> to a specified target level (e.g., from 1G to 2M or 4K). The caller
> provides a memory cache for allocating any intermediate page tables
> and may request a TLB flush after the split.
>
> This functionality will be used by subsequent patches to split huge
> pages before handling the write-protection fault, or for other operations
> that require page-level granularity.
>
> Signed-off-by: Wang Yechao <wang.yechao255 at zte.com.cn>
> ---
>  arch/riscv/include/asm/kvm_gstage.h |  4 ++
>  arch/riscv/kvm/gstage.c             | 69 +++++++++++++++++++++++++++++
>  2 files changed, 73 insertions(+)
>
> diff --git a/arch/riscv/include/asm/kvm_gstage.h b/arch/riscv/include/asm/kvm_gstage.h
> index 595e2183173e..373748c6745e 100644
> --- a/arch/riscv/include/asm/kvm_gstage.h
> +++ b/arch/riscv/include/asm/kvm_gstage.h
> @@ -53,6 +53,10 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
>                               bool page_rdonly, bool page_exec,
>                               struct kvm_gstage_mapping *out_map);
>
> +int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage,
> +                                struct kvm_mmu_memory_cache *pcache,
> +                                gpa_t addr, u32 target_level, bool flush);
> +
>  enum kvm_riscv_gstage_op {
>         GSTAGE_OP_NOP = 0,      /* Nothing */
>         GSTAGE_OP_CLEAR,        /* Clear/Unmap */
> diff --git a/arch/riscv/kvm/gstage.c b/arch/riscv/kvm/gstage.c
> index d2001d508046..5356abb18932 100644
> --- a/arch/riscv/kvm/gstage.c
> +++ b/arch/riscv/kvm/gstage.c
> @@ -209,6 +209,75 @@ int kvm_riscv_gstage_map_page(struct kvm_gstage *gstage,
>         return kvm_riscv_gstage_set_pte(gstage, pcache, out_map);
>  }
>
> +static inline unsigned long make_child_pte(unsigned long huge_pte, int index,
> +                                          unsigned long child_page_size)
> +{
> +       unsigned long child_pte = huge_pte;
> +       unsigned long child_pfn_offset;
> +
> +       /*
> +        * The child_pte already has the base address of the huge page being
> +        * split. So we just have to OR in the offset to the page at the next
> +        * lower level for the given index.
> +        */
> +       child_pfn_offset = index * (child_page_size / PAGE_SIZE);
> +       child_pte |= pte_val(pfn_pte(child_pfn_offset, __pgprot(0)));
> +
> +       return child_pte;
> +}
> +
> +int kvm_riscv_gstage_split_huge(struct kvm_gstage *gstage,
> +                               struct kvm_mmu_memory_cache *pcache,
> +                               gpa_t addr, u32 target_level, bool flush)
> +{
> +       u32 current_level = kvm_riscv_gstage_pgd_levels - 1;
> +       pte_t *next_ptep = (pte_t *)gstage->pgd;
> +       pte_t *ptep;
> +       unsigned long huge_pte, child_pte;
> +       unsigned long child_page_size;
> +       int i, ret;

Declare local variables in inverted pyramid fashion.

> +
> +       while(current_level > target_level) {
> +               ptep = (pte_t *)&next_ptep[gstage_pte_index(addr, current_level)];
> +
> +               if (!pte_val(ptep_get(ptep)))
> +                       break;
> +
> +               if (!gstage_pte_leaf(ptep)) {
> +                       next_ptep = (pte_t *)gstage_pte_page_vaddr(ptep_get(ptep));
> +                       current_level--;
> +                       continue;
> +               }
> +
> +               huge_pte = pte_val(ptep_get(ptep));
> +
> +               ret = gstage_level_to_page_size(current_level - 1, &child_page_size);
> +               if (ret)
> +                       return ret;
> +
> +               if (!pcache)
> +                       return -ENOMEM;

This check needs to be outside the while-loop.

> +               next_ptep = kvm_mmu_memory_cache_alloc(pcache);
> +               if (!next_ptep)
> +                       return -ENOMEM;
> +
> +               for (i = 0; i < PTRS_PER_PTE; i++) {
> +                       child_pte = make_child_pte(huge_pte, i, child_page_size);
> +                       set_pte((pte_t *)&next_ptep[i], __pte(child_pte));
> +               }
> +
> +               set_pte(ptep, pfn_pte(PFN_DOWN(__pa(next_ptep)),
> +                               __pgprot(_PAGE_TABLE)));
> +
> +               if (flush)
> +                       gstage_tlb_flush(gstage, current_level, addr);
> +
> +               current_level--;
> +       }
> +
> +       return 0;
> +}
> +
>  void kvm_riscv_gstage_op_pte(struct kvm_gstage *gstage, gpa_t addr,
>                              pte_t *ptep, u32 ptep_level, enum kvm_riscv_gstage_op op)
>  {
> --
> 2.27.0

The kvm_riscv_gstage_split_huge() function introduced here
is only used in PATCH3, so it is better to squash this patch into PATCH3.

Regards,
Anup