[PATCH v2 02/30] KVM: arm64: Introduce struct kvm_s2_fault to user_mem_abort()
Anshuman Khandual
anshuman.khandual at arm.com
Sun Mar 29 21:18:18 PDT 2026
On 27/03/26 5:05 PM, Marc Zyngier wrote:
> From: Fuad Tabba <tabba at google.com>
>
> The user_mem_abort() function takes many arguments and defines a large
> number of local variables. Passing all these variables around to helper
> functions would result in functions with too many arguments.
>
> Introduce struct kvm_s2_fault to encapsulate the stage-2 fault state.
> This structure holds both the input parameters and the intermediate
> state required during the fault handling process.
>
> Update user_mem_abort() to initialize this structure and replace the
> usage of local variables with fields from the new structure.
>
> This prepares the ground for further extracting parts of
> user_mem_abort() into smaller helper functions that can simply take a
> pointer to the fault state structure.
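Nice cleanup. Just to make sure I'm reading the direction right: once this is
in, a follow-up helper only needs the fault pointer instead of half a dozen
locals. A minimal sketch of the kind of helper I'd expect (the helper name is
invented here, not something this patch adds, and it assumes struct
kvm_s2_fault exactly as introduced in the hunk below):

static bool kvm_s2_fault_needs_memcache(struct kvm_s2_fault *fault)
{
        /*
         * Mirrors the topup_memcache logic below: non-permission faults
         * always top up, permission faults only when dirty logging may
         * have to turn a block mapping into a table on a write.
         */
        return !fault->fault_is_perm ||
               (fault->logging_active && fault->write_fault);
}

That matches the shape of helpers the commit message describes, so this looks
like the right foundation to me.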
>
> Reviewed-by: Joey Gouly <joey.gouly at arm.com>
> Signed-off-by: Fuad Tabba <tabba at google.com>
> Signed-off-by: Marc Zyngier <maz at kernel.org>
Reviewed-by: Anshuman Khandual <anshuman.khandual at arm.com>
> ---
> arch/arm64/kvm/mmu.c | 212 +++++++++++++++++++++++++------------------
> 1 file changed, 123 insertions(+), 89 deletions(-)
>
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index f8064b2d32045..b366bde15a429 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1710,38 +1710,68 @@ static short kvm_s2_resolve_vma_size(struct vm_area_struct *vma,
> return vma_shift;
> }
>
> +struct kvm_s2_fault {
> + struct kvm_vcpu *vcpu;
> + phys_addr_t fault_ipa;
> + struct kvm_s2_trans *nested;
> + struct kvm_memory_slot *memslot;
> + unsigned long hva;
> + bool fault_is_perm;
> +
> + bool write_fault;
> + bool exec_fault;
> + bool writable;
> + bool topup_memcache;
> + bool mte_allowed;
> + bool is_vma_cacheable;
> + bool s2_force_noncacheable;
> + bool vfio_allow_any_uc;
> + unsigned long mmu_seq;
> + phys_addr_t ipa;
> + short vma_shift;
> + gfn_t gfn;
> + kvm_pfn_t pfn;
> + bool logging_active;
> + bool force_pte;
> + long vma_pagesize;
> + long fault_granule;
> + enum kvm_pgtable_prot prot;
> + struct page *page;
> + vm_flags_t vm_flags;
> +};
> +
> static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> struct kvm_s2_trans *nested,
> struct kvm_memory_slot *memslot, unsigned long hva,
> bool fault_is_perm)
> {
> int ret = 0;
> - bool topup_memcache;
> - bool write_fault, writable;
> - bool exec_fault, mte_allowed, is_vma_cacheable;
> - bool s2_force_noncacheable = false, vfio_allow_any_uc = false;
> - unsigned long mmu_seq;
> - phys_addr_t ipa = fault_ipa;
> + struct kvm_s2_fault fault_data = {
> + .vcpu = vcpu,
> + .fault_ipa = fault_ipa,
> + .nested = nested,
> + .memslot = memslot,
> + .hva = hva,
> + .fault_is_perm = fault_is_perm,
> + .ipa = fault_ipa,
> + .logging_active = memslot_is_logging(memslot),
> + .force_pte = memslot_is_logging(memslot),
> + .s2_force_noncacheable = false,
> + .vfio_allow_any_uc = false,
> + .prot = KVM_PGTABLE_PROT_R,
> + };
> + struct kvm_s2_fault *fault = &fault_data;
> struct kvm *kvm = vcpu->kvm;
> struct vm_area_struct *vma;
> - short vma_shift;
> void *memcache;
> - gfn_t gfn;
> - kvm_pfn_t pfn;
> - bool logging_active = memslot_is_logging(memslot);
> - bool force_pte = logging_active;
> - long vma_pagesize, fault_granule;
> - enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R;
> struct kvm_pgtable *pgt;
> - struct page *page;
> - vm_flags_t vm_flags;
> enum kvm_pgtable_walk_flags flags = KVM_PGTABLE_WALK_SHARED;
>
> - if (fault_is_perm)
> - fault_granule = kvm_vcpu_trap_get_perm_fault_granule(vcpu);
> - write_fault = kvm_is_write_fault(vcpu);
> - exec_fault = kvm_vcpu_trap_is_exec_fault(vcpu);
> - VM_WARN_ON_ONCE(write_fault && exec_fault);
> + if (fault->fault_is_perm)
> + fault->fault_granule = kvm_vcpu_trap_get_perm_fault_granule(fault->vcpu);
> + fault->write_fault = kvm_is_write_fault(fault->vcpu);
> + fault->exec_fault = kvm_vcpu_trap_is_exec_fault(fault->vcpu);
> + VM_WARN_ON_ONCE(fault->write_fault && fault->exec_fault);
>
> /*
> * Permission faults just need to update the existing leaf entry,
> @@ -1749,8 +1779,9 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> * only exception to this is when dirty logging is enabled at runtime
> * and a write fault needs to collapse a block entry into a table.
> */
> - topup_memcache = !fault_is_perm || (logging_active && write_fault);
> - ret = prepare_mmu_memcache(vcpu, topup_memcache, &memcache);
> + fault->topup_memcache = !fault->fault_is_perm ||
> + (fault->logging_active && fault->write_fault);
> + ret = prepare_mmu_memcache(fault->vcpu, fault->topup_memcache, &memcache);
> if (ret)
> return ret;
>
> @@ -1759,33 +1790,33 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> * get block mapping for device MMIO region.
> */
> mmap_read_lock(current->mm);
> - vma = vma_lookup(current->mm, hva);
> + vma = vma_lookup(current->mm, fault->hva);
> if (unlikely(!vma)) {
> - kvm_err("Failed to find VMA for hva 0x%lx\n", hva);
> + kvm_err("Failed to find VMA for fault->hva 0x%lx\n", fault->hva);
> mmap_read_unlock(current->mm);
> return -EFAULT;
> }
>
> - vma_shift = kvm_s2_resolve_vma_size(vma, hva, memslot, nested,
> - &force_pte, &ipa);
> - vma_pagesize = 1UL << vma_shift;
> + fault->vma_shift = kvm_s2_resolve_vma_size(vma, fault->hva, fault->memslot, fault->nested,
> + &fault->force_pte, &fault->ipa);
> + fault->vma_pagesize = 1UL << fault->vma_shift;
>
> /*
> * Both the canonical IPA and fault IPA must be aligned to the
> * mapping size to ensure we find the right PFN and lay down the
> * mapping in the right place.
> */
> - fault_ipa = ALIGN_DOWN(fault_ipa, vma_pagesize);
> - ipa = ALIGN_DOWN(ipa, vma_pagesize);
> + fault->fault_ipa = ALIGN_DOWN(fault->fault_ipa, fault->vma_pagesize);
> + fault->ipa = ALIGN_DOWN(fault->ipa, fault->vma_pagesize);
>
> - gfn = ipa >> PAGE_SHIFT;
> - mte_allowed = kvm_vma_mte_allowed(vma);
> + fault->gfn = fault->ipa >> PAGE_SHIFT;
> + fault->mte_allowed = kvm_vma_mte_allowed(vma);
>
> - vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
> + fault->vfio_allow_any_uc = vma->vm_flags & VM_ALLOW_ANY_UNCACHED;
>
> - vm_flags = vma->vm_flags;
> + fault->vm_flags = vma->vm_flags;
>
> - is_vma_cacheable = kvm_vma_is_cacheable(vma);
> + fault->is_vma_cacheable = kvm_vma_is_cacheable(vma);
>
> /* Don't use the VMA after the unlock -- it may have vanished */
> vma = NULL;
> @@ -1798,24 +1829,25 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> * Rely on mmap_read_unlock() for an implicit smp_rmb(), which pairs
> * with the smp_wmb() in kvm_mmu_invalidate_end().
> */
> - mmu_seq = kvm->mmu_invalidate_seq;
> + fault->mmu_seq = kvm->mmu_invalidate_seq;
> mmap_read_unlock(current->mm);
>
> - pfn = __kvm_faultin_pfn(memslot, gfn, write_fault ? FOLL_WRITE : 0,
> - &writable, &page);
> - if (pfn == KVM_PFN_ERR_HWPOISON) {
> - kvm_send_hwpoison_signal(hva, vma_shift);
> + fault->pfn = __kvm_faultin_pfn(fault->memslot, fault->gfn,
> + fault->write_fault ? FOLL_WRITE : 0,
> + &fault->writable, &fault->page);
> + if (fault->pfn == KVM_PFN_ERR_HWPOISON) {
> + kvm_send_hwpoison_signal(fault->hva, fault->vma_shift);
> return 0;
> }
> - if (is_error_noslot_pfn(pfn))
> + if (is_error_noslot_pfn(fault->pfn))
> return -EFAULT;
>
> /*
> * Check if this is non-struct page memory PFN, and cannot support
> * CMOs. It could potentially be unsafe to access as cacheable.
> */
> - if (vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(pfn)) {
> - if (is_vma_cacheable) {
> + if (fault->vm_flags & (VM_PFNMAP | VM_MIXEDMAP) && !pfn_is_map_memory(fault->pfn)) {
> + if (fault->is_vma_cacheable) {
> /*
> * Whilst the VMA owner expects cacheable mapping to this
> * PFN, hardware also has to support the FWB and CACHE DIC
> @@ -1833,25 +1865,25 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> } else {
> /*
> * If the page was identified as device early by looking at
> - * the VMA flags, vma_pagesize is already representing the
> + * the VMA flags, fault->vma_pagesize is already representing the
> * largest quantity we can map. If instead it was mapped
> - * via __kvm_faultin_pfn(), vma_pagesize is set to PAGE_SIZE
> + * via __kvm_faultin_pfn(), fault->vma_pagesize is set to PAGE_SIZE
> * and must not be upgraded.
> *
> * In both cases, we don't let transparent_hugepage_adjust()
> * change things at the last minute.
> */
> - s2_force_noncacheable = true;
> + fault->s2_force_noncacheable = true;
> }
> - } else if (logging_active && !write_fault) {
> + } else if (fault->logging_active && !fault->write_fault) {
> /*
> - * Only actually map the page as writable if this was a write
> + * Only actually map the page as fault->writable if this was a write
> * fault.
> */
> - writable = false;
> + fault->writable = false;
> }
>
> - if (exec_fault && s2_force_noncacheable)
> + if (fault->exec_fault && fault->s2_force_noncacheable)
> ret = -ENOEXEC;
>
> if (ret)
> @@ -1860,21 +1892,21 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> /*
> * Guest performs atomic/exclusive operations on memory with unsupported
> * attributes (e.g. ld64b/st64b on normal memory when no FEAT_LS64WB)
> - * and trigger the exception here. Since the memslot is valid, inject
> + * and trigger the exception here. Since the fault->memslot is valid, inject
> * the fault back to the guest.
> */
> - if (esr_fsc_is_excl_atomic_fault(kvm_vcpu_get_esr(vcpu))) {
> - kvm_inject_dabt_excl_atomic(vcpu, kvm_vcpu_get_hfar(vcpu));
> + if (esr_fsc_is_excl_atomic_fault(kvm_vcpu_get_esr(fault->vcpu))) {
> + kvm_inject_dabt_excl_atomic(fault->vcpu, kvm_vcpu_get_hfar(fault->vcpu));
> ret = 1;
> goto out_put_page;
> }
>
> - if (nested)
> - adjust_nested_fault_perms(nested, &prot, &writable);
> + if (fault->nested)
> + adjust_nested_fault_perms(fault->nested, &fault->prot, &fault->writable);
>
> kvm_fault_lock(kvm);
> - pgt = vcpu->arch.hw_mmu->pgt;
> - if (mmu_invalidate_retry(kvm, mmu_seq)) {
> + pgt = fault->vcpu->arch.hw_mmu->pgt;
> + if (mmu_invalidate_retry(kvm, fault->mmu_seq)) {
> ret = -EAGAIN;
> goto out_unlock;
> }
> @@ -1883,78 +1915,80 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> * If we are not forced to use page mapping, check if we are
> * backed by a THP and thus use block mapping if possible.
> */
> - if (vma_pagesize == PAGE_SIZE && !(force_pte || s2_force_noncacheable)) {
> - if (fault_is_perm && fault_granule > PAGE_SIZE)
> - vma_pagesize = fault_granule;
> + if (fault->vma_pagesize == PAGE_SIZE &&
> + !(fault->force_pte || fault->s2_force_noncacheable)) {
> + if (fault->fault_is_perm && fault->fault_granule > PAGE_SIZE)
> + fault->vma_pagesize = fault->fault_granule;
> else
> - vma_pagesize = transparent_hugepage_adjust(kvm, memslot,
> - hva, &pfn,
> - &fault_ipa);
> + fault->vma_pagesize = transparent_hugepage_adjust(kvm, fault->memslot,
> + fault->hva, &fault->pfn,
> + &fault->fault_ipa);
>
> - if (vma_pagesize < 0) {
> - ret = vma_pagesize;
> + if (fault->vma_pagesize < 0) {
> + ret = fault->vma_pagesize;
> goto out_unlock;
> }
> }
>
> - if (!fault_is_perm && !s2_force_noncacheable && kvm_has_mte(kvm)) {
> + if (!fault->fault_is_perm && !fault->s2_force_noncacheable && kvm_has_mte(kvm)) {
> /* Check the VMM hasn't introduced a new disallowed VMA */
> - if (mte_allowed) {
> - sanitise_mte_tags(kvm, pfn, vma_pagesize);
> + if (fault->mte_allowed) {
> + sanitise_mte_tags(kvm, fault->pfn, fault->vma_pagesize);
> } else {
> ret = -EFAULT;
> goto out_unlock;
> }
> }
>
> - if (writable)
> - prot |= KVM_PGTABLE_PROT_W;
> + if (fault->writable)
> + fault->prot |= KVM_PGTABLE_PROT_W;
>
> - if (exec_fault)
> - prot |= KVM_PGTABLE_PROT_X;
> + if (fault->exec_fault)
> + fault->prot |= KVM_PGTABLE_PROT_X;
>
> - if (s2_force_noncacheable) {
> - if (vfio_allow_any_uc)
> - prot |= KVM_PGTABLE_PROT_NORMAL_NC;
> + if (fault->s2_force_noncacheable) {
> + if (fault->vfio_allow_any_uc)
> + fault->prot |= KVM_PGTABLE_PROT_NORMAL_NC;
> else
> - prot |= KVM_PGTABLE_PROT_DEVICE;
> + fault->prot |= KVM_PGTABLE_PROT_DEVICE;
> } else if (cpus_have_final_cap(ARM64_HAS_CACHE_DIC)) {
> - prot |= KVM_PGTABLE_PROT_X;
> + fault->prot |= KVM_PGTABLE_PROT_X;
> }
>
> - if (nested)
> - adjust_nested_exec_perms(kvm, nested, &prot);
> + if (fault->nested)
> + adjust_nested_exec_perms(kvm, fault->nested, &fault->prot);
>
> /*
> * Under the premise of getting a FSC_PERM fault, we just need to relax
> - * permissions only if vma_pagesize equals fault_granule. Otherwise,
> + * permissions only if fault->vma_pagesize equals fault->fault_granule. Otherwise,
> * kvm_pgtable_stage2_map() should be called to change block size.
> */
> - if (fault_is_perm && vma_pagesize == fault_granule) {
> + if (fault->fault_is_perm && fault->vma_pagesize == fault->fault_granule) {
> /*
> * Drop the SW bits in favour of those stored in the
> * PTE, which will be preserved.
> */
> - prot &= ~KVM_NV_GUEST_MAP_SZ;
> - ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault_ipa, prot, flags);
> + fault->prot &= ~KVM_NV_GUEST_MAP_SZ;
> + ret = KVM_PGT_FN(kvm_pgtable_stage2_relax_perms)(pgt, fault->fault_ipa, fault->prot,
> + flags);
> } else {
> - ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize,
> - __pfn_to_phys(pfn), prot,
> - memcache, flags);
> + ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault->fault_ipa, fault->vma_pagesize,
> + __pfn_to_phys(fault->pfn), fault->prot,
> + memcache, flags);
> }
>
> out_unlock:
> - kvm_release_faultin_page(kvm, page, !!ret, writable);
> + kvm_release_faultin_page(kvm, fault->page, !!ret, fault->writable);
> kvm_fault_unlock(kvm);
>
> /* Mark the page dirty only if the fault is handled successfully */
> - if (writable && !ret)
> - mark_page_dirty_in_slot(kvm, memslot, gfn);
> + if (fault->writable && !ret)
> + mark_page_dirty_in_slot(kvm, fault->memslot, fault->gfn);
>
> return ret != -EAGAIN ? ret : 0;
>
> out_put_page:
> - kvm_release_page_unused(page);
> + kvm_release_page_unused(fault->page);
> return ret;
> }
>