[PATCH v2 14/35] KVM: arm64: Handle aborts from protected VMs
Alexandru Elisei
alexandru.elisei at arm.com
Thu Feb 12 02:37:19 PST 2026
Hi Will,
On Mon, Jan 19, 2026 at 12:46:07PM +0000, Will Deacon wrote:
> Introduce a new abort handler for resolving stage-2 page faults from
> protected VMs by pinning and donating anonymous memory. This is
> considerably simpler than the infamous user_mem_abort() as we only have
> to deal with translation faults at the pte level.
>
> Signed-off-by: Will Deacon <will at kernel.org>
> ---
> arch/arm64/kvm/mmu.c | 89 ++++++++++++++++++++++++++++++++++++++++----
> 1 file changed, 81 insertions(+), 8 deletions(-)
>
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index a23a4b7f108c..b21a5bf3d104 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1641,6 +1641,74 @@ static int gmem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> return ret != -EAGAIN ? ret : 0;
> }
>
> +static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> + struct kvm_memory_slot *memslot, unsigned long hva)
> +{
> + unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
> + struct kvm_pgtable *pgt = vcpu->arch.hw_mmu->pgt;
> + struct mm_struct *mm = current->mm;
> + struct kvm *kvm = vcpu->kvm;
> + void *hyp_memcache;
> + struct page *page;
> + int ret;
> +
> + ret = prepare_mmu_memcache(vcpu, true, &hyp_memcache);
> + if (ret)
> + return -ENOMEM;
> +
> + ret = account_locked_vm(mm, 1, true);
> + if (ret)
> + return ret;
> +
> + mmap_read_lock(mm);
> + ret = pin_user_pages(hva, 1, flags, &page);
> + mmap_read_unlock(mm);
If the page is part of a large folio, pinning it keeps the entire folio
resident, not just the single page returned by pin_user_pages(). Do you reckon
that should be taken into account when calling account_locked_vm()?
> +
> + if (ret == -EHWPOISON) {
> + kvm_send_hwpoison_signal(hva, PAGE_SHIFT);
> + ret = 0;
> + goto dec_account;
> + } else if (ret != 1) {
> + ret = -EFAULT;
> + goto dec_account;
> + } else if (!folio_test_swapbacked(page_folio(page))) {
> + /*
> + * We really can't deal with page-cache pages returned by GUP
> + * because (a) we may trigger writeback of a page for which we
> + * no longer have access and (b) page_mkclean() won't find the
> + * stage-2 mapping in the rmap so we can get out-of-whack with
> + * the filesystem when marking the page dirty during unpinning
> + * (see cc5095747edf ("ext4: don't BUG if someone dirty pages
> + * without asking ext4 first")).
I've been trying to wrap my head around this. Would you mind providing a few
more hints about what the issue is? I'm sure the approach is correct; it's
probably just me not being familiar with the code.
> + *
> + * Ideally we'd just restrict ourselves to anonymous pages, but
> + * we also want to allow memfd (i.e. shmem) pages, so check for
> + * pages backed by swap in the knowledge that the GUP pin will
> + * prevent try_to_unmap() from succeeding.
> + */
> + ret = -EIO;
> + goto unpin;
> + }
> +
> + write_lock(&kvm->mmu_lock);
> + ret = pkvm_pgtable_stage2_map(pgt, fault_ipa, PAGE_SIZE,
> + page_to_phys(page), KVM_PGTABLE_PROT_RWX,
> + hyp_memcache, 0);
> + write_unlock(&kvm->mmu_lock);
> + if (ret) {
> + if (ret == -EAGAIN)
> + ret = 0;
> + goto unpin;
> + }
This looks correct to me: since the MMU notifiers are ignored, there's no need
to check the notifier sequence number, and concurrent faults on the same page
are handled by treating -EAGAIN as success.
> +
> + return 0;
> +unpin:
> + unpin_user_pages(&page, 1);
> +dec_account:
> + account_locked_vm(mm, 1, false);
> + return ret;
> +}
> +
> static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> struct kvm_s2_trans *nested,
> struct kvm_memory_slot *memslot, unsigned long hva,
> @@ -2190,15 +2258,20 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
> goto out_unlock;
> }
>
> - VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) &&
> - !write_fault && !kvm_vcpu_trap_is_exec_fault(vcpu));
> + if (kvm_vm_is_protected(vcpu->kvm)) {
> + ret = pkvm_mem_abort(vcpu, fault_ipa, memslot, hva);
I guess the reason this comes after the access fault handling is that you want
the WARN_ON() in pkvm_pgtable_stage2_mkyoung() to trigger.
Thanks,
Alex
> + } else {
> + VM_WARN_ON_ONCE(kvm_vcpu_trap_is_permission_fault(vcpu) &&
> + !write_fault &&
> + !kvm_vcpu_trap_is_exec_fault(vcpu));
>
> - if (kvm_slot_has_gmem(memslot))
> - ret = gmem_abort(vcpu, fault_ipa, nested, memslot,
> - esr_fsc_is_permission_fault(esr));
> - else
> - ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
> - esr_fsc_is_permission_fault(esr));
> + if (kvm_slot_has_gmem(memslot))
> + ret = gmem_abort(vcpu, fault_ipa, nested, memslot,
> + esr_fsc_is_permission_fault(esr));
> + else
> + ret = user_mem_abort(vcpu, fault_ipa, nested, memslot, hva,
> + esr_fsc_is_permission_fault(esr));
> + }
> if (ret == 0)
> ret = 1;
> out:
> --
> 2.52.0.457.g6b5491de43-goog
>
>