[PATCH 1/5] KVM: arm64: Grab KVM MMU write lock in kvm_arch_flush_shadow_all()

Bibo Mao maobibo at loongson.cn
Tue May 5 19:27:19 PDT 2026



On 2026/5/5 上午6:42, James Houghton wrote:
> kvm_arch_flush_shadow_all() may sometimes be called on the same `kvm`
> concurrently in the event that the KVM's `mm` is __mmput() at the
> same time that the last reference to the KVM is being dropped.
> 
> T1              T2
> KVM_CREATE_VM
>                  Get VM file from T1
> close VM
> exit_mm()       close VM
> 
> T1: exit_mm() -> kvm_mmu_notifier_release() -> kvm_flush_shadow_all(),
>      with only the KVM srcu read lock held.
> 
> T2: kvm_vm_release() ---> mmu_notifier_unregister() ->
>      kvm_mmu_notifier_release() -> kvm_flush_shadow_all(),
>      again, with only the KVM srcu read lock held.
Looking through the code, kvm_arch_destroy_vm() only frees the PGD page;
the page table walk is performed when a memslot is deleted or in exit_mm().

In the normal case, the life cycle of a VM is something like this:
   KVM_CREATE_VM
     Create_VCPUs
     Create memslots
     Destroy_VCPUs
     Destroy memslots
   close VM
   exit_mm()

Creating and destroying vCPUs is paired with kvm_get_kvm()/kvm_put_kvm()
calls, but there are no such calls for memslot operations. Is it
possible for a VM to be destroyed without its memslots being removed
first, as in the following sequence?
   KVM_CREATE_VM
     Create memslots
   close VM
   exit_mm()

Regards
Bibo Mao

> 
> This leads to a potential double-free of
> kvm->arch.kvm_mmu_free_memory_cache and now with NV
> kvm->arch.nested_mmus.
> 
> Cc: stable at vger.kernel.org
> Fixes: e7bf7a490c68 ("KVM: arm64: Split huge pages when dirty logging is enabled")
> Signed-off-by: James Houghton <jthoughton at google.com>
> ---
>   arch/arm64/include/asm/kvm_mmu.h |  1 +
>   arch/arm64/kvm/mmu.c             | 23 +++++++++++++++++++----
>   arch/arm64/kvm/nested.c          |  4 +++-
>   3 files changed, 23 insertions(+), 5 deletions(-)
> 
> diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h
> index 01e9c72d6aa7..30d5c24fcebb 100644
> --- a/arch/arm64/include/asm/kvm_mmu.h
> +++ b/arch/arm64/include/asm/kvm_mmu.h
> @@ -178,6 +178,7 @@ void stage2_unmap_vm(struct kvm *kvm);
>   int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long type);
>   void kvm_uninit_stage2_mmu(struct kvm *kvm);
>   void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu);
> +void kvm_free_stage2_pgd_locked(struct kvm_s2_mmu *mmu);
>   int kvm_phys_addr_ioremap(struct kvm *kvm, phys_addr_t guest_ipa,
>   			  phys_addr_t pa, unsigned long size, bool writable);
>   
> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
> index d089c107d9b7..4bab407d43bb 100644
> --- a/arch/arm64/kvm/mmu.c
> +++ b/arch/arm64/kvm/mmu.c
> @@ -1021,7 +1021,9 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
>   
>   void kvm_uninit_stage2_mmu(struct kvm *kvm)
>   {
> -	kvm_free_stage2_pgd(&kvm->arch.mmu);
> +	lockdep_assert_held_write(&kvm->mmu_lock);
> +
> +	kvm_free_stage2_pgd_locked(&kvm->arch.mmu);
>   	kvm_mmu_free_memory_cache(&kvm->arch.mmu.split_page_cache);
>   }
>   
> @@ -1095,12 +1097,14 @@ void stage2_unmap_vm(struct kvm *kvm)
>   	srcu_read_unlock(&kvm->srcu, idx);
>   }
>   
> -void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
> +static void __kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu, bool locked)
>   {
>   	struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
>   	struct kvm_pgtable *pgt = NULL;
>   
> -	write_lock(&kvm->mmu_lock);
> +	if (!locked)
> +		write_lock(&kvm->mmu_lock);
> +
>   	pgt = mmu->pgt;
>   	if (pgt) {
>   		mmu->pgd_phys = 0;
> @@ -1111,7 +1115,8 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
>   	if (kvm_is_nested_s2_mmu(kvm, mmu))
>   		kvm_init_nested_s2_mmu(mmu);
>   
> -	write_unlock(&kvm->mmu_lock);
> +	if (!locked)
> +		write_unlock(&kvm->mmu_lock);
>   
>   	if (pgt) {
>   		kvm_stage2_destroy(pgt);
> @@ -1119,6 +1124,16 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
>   	}
>   }
>   
> +void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
> +{
> +	__kvm_free_stage2_pgd(mmu, false);
> +}
> +
> +void kvm_free_stage2_pgd_locked(struct kvm_s2_mmu *mmu)
> +{
> +	__kvm_free_stage2_pgd(mmu, true);
> +}
> +
>   static void hyp_mc_free_fn(void *addr, void *mc)
>   {
>   	struct kvm_hyp_memcache *memcache = mc;
> diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
> index 883b6c1008fb..977598bff5e6 100644
> --- a/arch/arm64/kvm/nested.c
> +++ b/arch/arm64/kvm/nested.c
> @@ -1190,11 +1190,13 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
>   {
>   	int i;
>   
> +	guard(write_lock)(&kvm->mmu_lock);
> +
>   	for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
>   		struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
>   
>   		if (!WARN_ON(atomic_read(&mmu->refcnt)))
> -			kvm_free_stage2_pgd(mmu);
> +			kvm_free_stage2_pgd_locked(mmu);
>   	}
>   	kvfree(kvm->arch.nested_mmus);
>   	kvm->arch.nested_mmus = NULL;
> 




More information about the linux-arm-kernel mailing list