[PATCH v2] KVM: arm64: nv: Optimize unmapping of shadow S2-MMU tables
Vishnu Pajjuri
vishnu at os.amperecomputing.com
Wed Feb 25 03:59:05 PST 2026
Hi Marc,
On 24-11-2025 16:33, Ganapatrao Kulkarni wrote:
> On 10/28/2025 5:59 PM, Marc Zyngier wrote:
>> On Tue, 28 Oct 2025 06:02:03 +0000,
>> Ganapatrao Kulkarni <gankulkarni at os.amperecomputing.com> wrote:
>>>
>>> On 10/23/2025 8:05 PM, Marc Zyngier wrote:
>>>> On Thu, 23 Oct 2025 12:11:42 +0100,
>>>> Ganapatrao Kulkarni <gankulkarni at os.amperecomputing.com> wrote:
>>>>>
>>>>>
>>>>> Hi Marc, Oliver,
>>>>>
>>>>> On 10/13/2025 12:21 PM, Ganapatrao Kulkarni wrote:
>>>>>> As of commit ec14c272408a ("KVM: arm64: nv: Unmap/flush shadow
>>>>>> stage 2 page tables"), an unmap of a canonical IPA range mapped at L1
>>>>>> triggers invalidation in the L1 S2-MMU and in all active shadow (L2)
>>>>>> S2-MMU tables. Because there is no direct mapping to locate the
>>>>>> corresponding shadow IPAs, the code falls back to a full S2-MMU
>>>>>> page-table walk and invalidation across the entire L1 address space.
>>>>>>
>>>>>> For 4K pages this causes roughly 256K loop iterations (about 8M for
>>>>>> 64K pages) per unmap, which can severely impact performance on large
>>>>>> systems and even cause soft lockups during NV (L1/L2) boots with many
>>>>>> CPUs and large memory. It also causes long delays during L1 reboot.
>>>>>>
>>>>>> This patch adds a maple-tree-based lookup that records canonical-IPA
>>>>>> to shadow-IPA mappings whenever a page is mapped into any shadow (L2)
>>>>>> table. On unmap, the lookup is used to target only those shadow IPAs
>>>>>> which are fully or partially mapped in shadow S2-MMU tables, avoiding
>>>>>> a full-address-space walk and unnecessary unmap/flush operations.
>>>>>>
>>>>>> The lookup is updated on map/unmap operations so entries remain
>>>>>> consistent with shadow table state. Use it during unmap to invalidate
>>>>>> only affected shadow IPAs, avoiding unnecessary CPU work and reducing
>>>>>> latency when shadow mappings are sparse.
>>>>>>
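>>>>>> To illustrate the intent (a simplified sketch in terms of the generic
>>>>>> <linux/maple_tree.h> API; the real hooks are in the diff below, and
>>>>>> the helper name here is made up, not part of this patch):
>>>>>>
>>>>>> /*
>>>>>>  * Sketch: unmap only the shadow ranges recorded as overlapping the
>>>>>>  * canonical range [ipa, ipa + size). mt_find() advances @index past
>>>>>>  * each entry it returns.
>>>>>>  */
>>>>>> static void sketch_unmap_overlaps(struct kvm_s2_mmu *mmu, u64 ipa,
>>>>>> 				  u64 size, bool may_block)
>>>>>> {
>>>>>> 	unsigned long index = ipa;
>>>>>> 	struct shadow_ipa_map *entry;
>>>>>>
>>>>>> 	while ((entry = mt_find(&mmu->nested_mmu_mt, &index,
>>>>>> 				ipa + size - 1))) {
>>>>>> 		kvm_stage2_unmap_range(mmu, entry->shadow_ipa,
>>>>>> 				       entry->size, may_block);
>>>>>> 		kfree(mtree_erase(&mmu->nested_mmu_mt, entry->ipa));
>>>>>> 	}
>>>>>> }
>>>>>>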
>>>>>> Reviewed-by: Christoph Lameter (Ampere) <cl at gentwo.org>
>>>>>> Signed-off-by: Ganapatrao Kulkarni <gankulkarni at os.amperecomputing.com>
>>>>>> ---
>>>>>>
>>>>>> Changes since v1:
>>>>>> Rebased to 6.18-rc1.
>>>>>> Fixed an alignment issue when adding the shadow IPA range
>>>>>> to the lookup.
>>>>>>
>>>>>> Changes since RFC v1:
>>>>>> Added the maple-tree-based lookup and addressed the review
>>>>>> comments from [1].
>>>>>>
>>>>>> [1] https://lkml.indiana.edu/2403.0/03801.html
>>>>>>
>>>>>>  arch/arm64/include/asm/kvm_host.h   |   3 +
>>>>>>  arch/arm64/include/asm/kvm_nested.h |   9 +++
>>>>>>  arch/arm64/kvm/mmu.c                |  17 ++--
>>>>>>  arch/arm64/kvm/nested.c             | 120 ++++++++++++++++++++++++--
>>>>>>  4 files changed, 138 insertions(+), 11 deletions(-)
>>>>>>
>>>>>> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
>>>>>> index b763293281c8..e774681c6ba4 100644
>>>>>> --- a/arch/arm64/include/asm/kvm_host.h
>>>>>> +++ b/arch/arm64/include/asm/kvm_host.h
>>>>>> @@ -227,6 +227,9 @@ struct kvm_s2_mmu {
>>>>>>  	 * >0: Somebody is actively using this.
>>>>>>  	 */
>>>>>>  	atomic_t refcnt;
>>>>>> +
>>>>>> +	/* For IPA to shadow IPA lookup */
>>>>>> +	struct maple_tree nested_mmu_mt;
>>>>>>  };
>>>>>>
>>>>>>  struct kvm_arch_memory_slot {
>>>>>> diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h
>>>>>> index f7c06a840963..5b7c4e7ed18f 100644
>>>>>> --- a/arch/arm64/include/asm/kvm_nested.h
>>>>>> +++ b/arch/arm64/include/asm/kvm_nested.h
>>>>>> @@ -69,6 +69,8 @@ extern void kvm_init_nested(struct kvm *kvm);
>>>>>> extern int kvm_vcpu_init_nested(struct kvm_vcpu *vcpu);
>>>>>> extern void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu);
>>>>>> extern struct kvm_s2_mmu *lookup_s2_mmu(struct kvm_vcpu *vcpu);
>>>>>> +extern int add_to_shadow_ipa_lookup(struct kvm_pgtable *pgt, u64 shadow_ipa,
>>>>>> +				    u64 ipa, u64 size);
>>>>>> union tlbi_info;
>>>>>> @@ -95,6 +97,12 @@ struct kvm_s2_trans {
>>>>>>  	u64 desc;
>>>>>>  };
>>>>>>
>>>>>> +struct shadow_ipa_map {
>>>>>> +	u64 shadow_ipa;
>>>>>> +	u64 ipa;
>>>>>> +	u64 size;
>>>>>> +};
>>>>>> +
>>>>>>  static inline phys_addr_t kvm_s2_trans_output(struct kvm_s2_trans *trans)
>>>>>>  {
>>>>>>  	return trans->output;
>>>>>> @@ -132,6 +140,7 @@ extern int kvm_s2_handle_perm_fault(struct kvm_vcpu *vcpu,
>>>>>>  extern int kvm_inject_s2_fault(struct kvm_vcpu *vcpu, u64 esr_el2);
>>>>>>  extern void kvm_nested_s2_wp(struct kvm *kvm);
>>>>>>  extern void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block);
>>>>>> +extern void kvm_nested_s2_unmap_range(struct kvm *kvm, u64 ipa, u64 size, bool may_block);
>>>>>>  extern void kvm_nested_s2_flush(struct kvm *kvm);
>>>>>>
>>>>>>  unsigned long compute_tlb_inval_range(struct kvm_s2_mmu *mmu, u64 val);
>>>>>> diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c
>>>>>> index 7cc964af8d30..27c120556e1b 100644
>>>>>> --- a/arch/arm64/kvm/mmu.c
>>>>>> +++ b/arch/arm64/kvm/mmu.c
>>>>>> @@ -1872,6 +1872,10 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
>>>>>>  		ret = KVM_PGT_FN(kvm_pgtable_stage2_map)(pgt, fault_ipa, vma_pagesize,
>>>>>>  							 __pfn_to_phys(pfn), prot,
>>>>>>  							 memcache, flags);
>>>>>> +
>>>>>> +		/* Add to lookup, if canonical IPA range mapped to shadow mmu */
>>>>>> +		if (nested)
>>>>>> +			add_to_shadow_ipa_lookup(pgt, fault_ipa, ipa, vma_pagesize);
>>>>>>  	}
>>>>>>
>>>>>>  out_unlock:
>>>>>> @@ -2094,14 +2098,15 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
>>>>>>
>>>>>>  bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
>>>>>>  {
>>>>>> +	gpa_t start = range->start << PAGE_SHIFT;
>>>>>> +	gpa_t size = (range->end - range->start) << PAGE_SHIFT;
>>>>>> +	bool may_block = range->may_block;
>>>>>> +
>>>>>>  	if (!kvm->arch.mmu.pgt)
>>>>>>  		return false;
>>>>>>
>>>>>> -	__unmap_stage2_range(&kvm->arch.mmu, range->start << PAGE_SHIFT,
>>>>>> -			     (range->end - range->start) << PAGE_SHIFT,
>>>>>> -			     range->may_block);
>>>>>> -
>>>>>> -	kvm_nested_s2_unmap(kvm, range->may_block);
>>>>>> +	__unmap_stage2_range(&kvm->arch.mmu, start, size, may_block);
>>>>>> +	kvm_nested_s2_unmap_range(kvm, start, size, may_block);
>>>>>>
>>>>>>  	return false;
>>>>>>  }
>>>>>> @@ -2386,7 +2391,7 @@ void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
>>>>>>
>>>>>>  	write_lock(&kvm->mmu_lock);
>>>>>>  	kvm_stage2_unmap_range(&kvm->arch.mmu, gpa, size, true);
>>>>>> -	kvm_nested_s2_unmap(kvm, true);
>>>>>> +	kvm_nested_s2_unmap_range(kvm, gpa, size, true);
>>>>>>  	write_unlock(&kvm->mmu_lock);
>>>>>>  }
>>>>>> diff --git a/arch/arm64/kvm/nested.c b/arch/arm64/kvm/nested.c
>>>>>> index 7a045cad6bdf..3a7035e7526a 100644
>>>>>> --- a/arch/arm64/kvm/nested.c
>>>>>> +++ b/arch/arm64/kvm/nested.c
>>>>>> @@ -7,6 +7,7 @@
>>>>>>  #include <linux/bitfield.h>
>>>>>>  #include <linux/kvm.h>
>>>>>>  #include <linux/kvm_host.h>
>>>>>> +#include <linux/maple_tree.h>
>>>>>>  #include <asm/fixmap.h>
>>>>>>  #include <asm/kvm_arm.h>
>>>>>> @@ -725,6 +726,7 @@ void kvm_init_nested_s2_mmu(struct kvm_s2_mmu *mmu)
>>>>>>  	mmu->tlb_vttbr = VTTBR_CNP_BIT;
>>>>>>  	mmu->nested_stage2_enabled = false;
>>>>>>  	atomic_set(&mmu->refcnt, 0);
>>>>>> +	mt_init_flags(&mmu->nested_mmu_mt, MM_MT_FLAGS);
>>>>>>  }
>>>>>> void kvm_vcpu_load_hw_mmu(struct kvm_vcpu *vcpu)
>>>>>> @@ -1067,6 +1069,99 @@ void kvm_nested_s2_wp(struct kvm *kvm)
>>>>>>  	kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits));
>>>>>>  }
>>>>>> +/*
>>>>>> + * Store a range of canonical IPA mapped into a nested stage 2 mmu
>>>>>> + * table. The canonical IPA is used as the pivot in the maple tree,
>>>>>> + * so the entry can be looked up later on IPA unmap/flush.
>>>>>> + */
>>>>>> +int add_to_shadow_ipa_lookup(struct kvm_pgtable *pgt, u64 shadow_ipa,
>>>>>> +			     u64 ipa, u64 size)
>>>>>> +{
>>>>>> +	struct kvm_s2_mmu *mmu;
>>>>>> +	struct shadow_ipa_map *entry;
>>>>>> +	unsigned long start, end;
>>>>>> +	int ret;
>>>>>> +
>>>>>> +	start = ALIGN_DOWN(ipa, size);
>>>>>> +	end = start + size;
>>>>>> +	mmu = pgt->mmu;
>>>>>> +
>>>>>> +	entry = kzalloc(sizeof(struct shadow_ipa_map), GFP_KERNEL_ACCOUNT);
>>>>>> +	if (!entry)
>>>>>> +		return -ENOMEM;
>>>>>> +
>>>>>> +	entry->ipa = start;
>>>>>> +	entry->shadow_ipa = ALIGN_DOWN(shadow_ipa, size);
>>>>>> +	entry->size = size;
>>>>>> +	ret = mtree_store_range(&mmu->nested_mmu_mt, start, end - 1, entry,
>>>>>> +				GFP_KERNEL_ACCOUNT);
>>>>>> +	if (ret) {
>>>>>> +		kfree(entry);
>>>>>> +		WARN_ON(ret);
>>>>>> +	}
>>>>>> +
>>>>>> +	return ret;
>>>>>> +}
>>>>>> +
>>>>>> +static void nested_mtree_erase(struct maple_tree *mt, unsigned long start,
>>>>>> +			       unsigned long size)
>>>>>> +{
>>>>>> +	void *entry = NULL;
>>>>>> +
>>>>>> +	MA_STATE(mas, mt, start, start + size - 1);
>>>>>> +
>>>>>> +	mtree_lock(mt);
>>>>>> +	entry = mas_erase(&mas);
>>>>>> +	mtree_unlock(mt);
>>>>>> +	kfree(entry);
>>>>>> +}
>>>>>> +
>>>>>> +static void nested_mtree_erase_and_unmap_all(struct kvm_s2_mmu *mmu,
>>>>>> +					     unsigned long start,
>>>>>> +					     unsigned long end, bool may_block)
>>>>>> +{
>>>>>> +	struct shadow_ipa_map *entry;
>>>>>> +
>>>>>> +	mt_for_each(&mmu->nested_mmu_mt, entry, start, kvm_phys_size(mmu)) {
>>>>>> +		kvm_stage2_unmap_range(mmu, entry->shadow_ipa, entry->size,
>>>>>> +				       may_block);
>>>>>> +		kfree(entry);
>>>>>> +	}
>>>>>> +
>>>>>> +	mtree_destroy(&mmu->nested_mmu_mt);
>>>>>> +	mt_init_flags(&mmu->nested_mmu_mt, MM_MT_FLAGS);
>>>>>> +}
>>>>>> +
>>>>>> +void kvm_nested_s2_unmap_range(struct kvm *kvm, u64 ipa, u64 size,
>>>>>> +			       bool may_block)
>>>>>> +{
>>>>>> +	int i;
>>>>>> +	struct shadow_ipa_map *entry;
>>>>>> +
>>>>>> +	lockdep_assert_held_write(&kvm->mmu_lock);
>>>>>> +
>>>>>> +	for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
>>>>>> +		struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
>>>>>> +		unsigned long start = ipa;
>>>>>> +		unsigned long end = ipa + size;
>>>>>> +
>>>>>> +		if (!kvm_s2_mmu_valid(mmu))
>>>>>> +			continue;
>>>>>> +
>>>>>> +		do {
>>>>>> +			entry = mt_find(&mmu->nested_mmu_mt, &start, end - 1);
>>>>>> +			if (!entry)
>>>>>> +				break;
>>>>>> +
>>>>>> +			kvm_stage2_unmap_range(mmu, entry->shadow_ipa,
>>>>>> +					       entry->size, may_block);
>>>>>> +			start = entry->ipa + entry->size;
>>>>>> +			nested_mtree_erase(&mmu->nested_mmu_mt, entry->ipa,
>>>>>> +					   entry->size);
>>>>>> +		} while (start < end);
>>>>>> +	}
>>>>>> +}
>>>>>> +
>>>>>>  void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block)
>>>>>>  {
>>>>>>  	int i;
>>>>>> @@ -1076,8 +1171,10 @@ void kvm_nested_s2_unmap(struct kvm *kvm, bool may_block)
>>>>>>  	for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
>>>>>>  		struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
>>>>>>
>>>>>> -		if (kvm_s2_mmu_valid(mmu))
>>>>>> -			kvm_stage2_unmap_range(mmu, 0, kvm_phys_size(mmu), may_block);
>>>>>> +		if (!kvm_s2_mmu_valid(mmu))
>>>>>> +			continue;
>>>>>> +
>>>>>> +		nested_mtree_erase_and_unmap_all(mmu, 0, kvm_phys_size(mmu), may_block);
>>>>>>  	}
>>>>>>
>>>>>>  	kvm_invalidate_vncr_ipa(kvm, 0, BIT(kvm->arch.mmu.pgt->ia_bits));
>>>>>> @@ -1091,9 +1188,14 @@ void kvm_nested_s2_flush(struct kvm *kvm)
>>>>>>
>>>>>>  	for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
>>>>>>  		struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
>>>>>> +		struct shadow_ipa_map *entry;
>>>>>> +		unsigned long start = 0;
>>>>>>
>>>>>> -		if (kvm_s2_mmu_valid(mmu))
>>>>>> -			kvm_stage2_flush_range(mmu, 0, kvm_phys_size(mmu));
>>>>>> +		if (!kvm_s2_mmu_valid(mmu))
>>>>>> +			continue;
>>>>>> +
>>>>>> +		mt_for_each(&mmu->nested_mmu_mt, entry, start, kvm_phys_size(mmu))
>>>>>> +			kvm_stage2_flush_range(mmu, entry->shadow_ipa, entry->size);
>>>>>>  	}
>>>>>>  }
>>>>>> @@ -1104,8 +1206,16 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
>>>>>>
>>>>>>  	for (i = 0; i < kvm->arch.nested_mmus_size; i++) {
>>>>>>  		struct kvm_s2_mmu *mmu = &kvm->arch.nested_mmus[i];
>>>>>>
>>>>>> -		if (!WARN_ON(atomic_read(&mmu->refcnt)))
>>>>>> +		if (!WARN_ON(atomic_read(&mmu->refcnt))) {
>>>>>> +			struct shadow_ipa_map *entry;
>>>>>> +			unsigned long start = 0;
>>>>>> +
>>>>>>  			kvm_free_stage2_pgd(mmu);
>>>>>> +
>>>>>> +			mt_for_each(&mmu->nested_mmu_mt, entry, start, kvm_phys_size(mmu))
>>>>>> +				kfree(entry);
>>>>>> +			mtree_destroy(&mmu->nested_mmu_mt);
>>>>>> +		}
>>>>>>  	}
>>>>>>  	kvfree(kvm->arch.nested_mmus);
>>>>>>  	kvm->arch.nested_mmus = NULL;
>>>>>
>>>>> Any review comments or suggestions for this patch?
>>>>
>>>> None. This patch is obviously lacking the basic requirements that such
>>>> an "optimisation" should handle, such as dealing with multiple
>>>> mappings to the same IPA in the shadow S2, hence will happily fail to
>>>> correctly unmap stuff. There is no documentation, and no test.
>>>>
>>> Thanks for the comment.
>>> How about adding a list of the multiple mapping ranges to the maple
>>> tree entry/node when adding to the lookup, and later unmapping every
>>> range present in that list? Roughly as sketched below.
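>>>
>>> Something like this rough sketch (struct and field names are made up,
>>> not from the posted patch):
>>>
>>> struct shadow_ipa_map {
>>> 	u64 ipa;			/* canonical IPA, the maple tree pivot */
>>> 	u64 size;
>>> 	struct list_head shadow_list;	/* all shadow ranges mapping this IPA */
>>> };
>>>
>>> struct shadow_ipa_range {
>>> 	u64 shadow_ipa;
>>> 	u64 size;
>>> 	struct list_head node;		/* linked on shadow_list */
>>> };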
>>
>> How will that work when the various mappings don't have the same size?
>
> Can we merge them into a new pivot?
> Say the old IPA range runs from A to B and the newer range from C to D
> (intersecting partially or fully). Then we can create a new pivot
> (delete the old one and insert a new one) covering the range MIN(A,C)
> to MAX(B,D). The new pivot will carry the list of entries for both the
> old and the new shadow IPA ranges. This is repeated until all
> intersecting pivots are merged.
>
> While unmapping, on the first intersection detected, unmap all the
> listed shadow IPAs.
I'm Vishnu. Following Ganapat's departure from Ampere, I have taken over
his outstanding patch and would appreciate your guidance on the approach
proposed above; I have tried to make the merge idea concrete in the
sketch below. Do you have any recommendations or alternatives to suggest?
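
A rough, untested sketch of how intersecting pivots could be coalesced on
insert. To be clear about what is assumed: sketch_insert_merged() is a
made-up helper, and shadow_list/shadow_ipa_range are the per-entry list
from the proposal above, not fields in the current patch.

/*
 * Rough, untested sketch: coalesce every recorded range intersecting
 * [new->ipa, new->ipa + new->size) into one pivot spanning
 * MIN(A,C)..MAX(B,D), carrying the shadow ranges of all merged
 * entries. Stored ranges never overlap one another in a maple tree,
 * so a single forward scan suffices.
 */
static int sketch_insert_merged(struct maple_tree *mt,
				struct shadow_ipa_map *new)
{
	unsigned long index = new->ipa;
	unsigned long start = new->ipa;
	unsigned long last = new->ipa + new->size - 1;
	struct shadow_ipa_map *old;

	while ((old = mt_find(mt, &index, last))) {
		start = min_t(unsigned long, start, old->ipa);
		last = max_t(unsigned long, last, old->ipa + old->size - 1);
		/* Move the old entry's shadow ranges onto the new entry */
		list_splice_tail_init(&old->shadow_list, &new->shadow_list);
		mtree_erase(mt, old->ipa);
		kfree(old);
	}

	new->ipa = start;
	new->size = last - start + 1;
	return mtree_store_range(mt, start, last, new, GFP_KERNEL_ACCOUNT);
}

On unmap, the first mt_find() hit on the canonical range would then hand
back every shadow range that needs invalidating in one go.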
Regards,
-Vishnu.
>>
>>> I tested this patch on an AmpereOne system (2 NUMA nodes, 96 CPUs
>>> per node, NUMA balancing enabled) with large vCPU counts and large
>>> memory assigned to L1 and L2. The current full-address-space walk
>>> caused very large unmap/flush work and significant delays (exacerbated
>>> by NUMA balancing / page migration activity). The targeted unmap using
>>> the lookup removes only the affected mappings and reduces the unmap
>>> latency substantially in our workloads.
>>
>> I really don't care how badly things perform. The important thing is
>> that this is architecturally correct, while your approach isn't.
>>
>>> I booted multiple L1s, each hosting several L2s, and observed no
>>> panics or failures related to missing support for multiple-IPA
>>> mappings.
>>
>> I'm sorry, but Linux isn't a validation tool for the architecture. You
>> have clearly designed something around Linux's own behaviour, not the
>> architectural requirements.
>>
>>> If you have any test cases or scenarios that would validate support
>>> for multiple IPA mappings, could you please share them?
>>
>> The onus is on *you* to provide them, not me.
>>
> Using the kvm selftest code below, I could generate two shadow IPAs
> mapping to the same canonical IPA (when run from the L1 shell).
> The test creates two shadow IPA ranges of the same size (PAGE_SIZE),
> both mapped to the same canonical IPA.
>
> #include <stdio.h>
> #include <stdlib.h>
> #include <errno.h>
> #include <sys/mman.h>
> #include <linux/atomic.h>
> #include <linux/sizes.h>
>
> #include "kvm_util.h"
> #include "test_util.h"
> #include "processor.h"
> #include "ucall_common.h"
>
> #define PAGE_SIZE 4096
>
> static void guest_code(uint64_t slot1_gpa, uint64_t slot2_gpa,
> 		       uint64_t size, uint64_t page_size)
> {
> 	uint64_t gpa;
>
> 	for (gpa = slot1_gpa; gpa < slot1_gpa + size; gpa += page_size)
> 		vcpu_arch_put_guest(*((volatile uint64_t *)gpa), gpa);
>
> 	for (gpa = slot2_gpa; gpa < slot2_gpa + size; gpa += page_size)
> 		vcpu_arch_put_guest(*((volatile uint64_t *)gpa), gpa);
>
> 	/* Write to slot 1 in a loop so the host can migrate pages meanwhile */
> 	while (1) {
> 		for (gpa = slot1_gpa; gpa < slot1_gpa + size; gpa += page_size)
> 			vcpu_arch_put_guest(*((volatile uint64_t *)gpa), gpa);
> 	}
>
> 	/* Not reached; the host terminates the test */
> 	GUEST_PRINTF("Guest DONE\n");
> 	GUEST_DONE();
> }
>
> static void run_vcpu(struct kvm_vcpu *vcpu)
> {
> struct ucall uc;
>
> do {
> vcpu_run(vcpu);
> switch (get_ucall(vcpu, &uc)) {
> case UCALL_ABORT:
> REPORT_GUEST_ASSERT(uc);
> break;
> case UCALL_PRINTF:
> printf("%s", uc.buffer);
> break;
> case UCALL_DONE:
> break;
> default:
> TEST_FAIL("Unknown ucall %lu", uc.cmd);
> }
> } while (uc.cmd != UCALL_DONE);
> }
>
> int main(int argc, char *argv[])
> {
> 	uint64_t gpa, slot_size, slot_stride, i;
> 	int slot, fd;
> 	struct kvm_vcpu *vcpu;
> 	struct kvm_vm *vm;
> 	void *mem;
> 	int first_slot, nr_slots;
> 	uint64_t start_gpa;
>
> 	slot_size = SZ_1M * 128;
> 	slot_stride = SZ_2G;
> 	start_gpa = SZ_4G;
> 	first_slot = 1;
> 	nr_slots = 2;
>
> 	vm = __vm_create_with_vcpus(VM_SHAPE_DEFAULT, 1,
> 				    (nr_slots * slot_size) / PAGE_SIZE,
> 				    guest_code, &vcpu);
>
> 	/* Back both slots with the same host memory */
> 	fd = kvm_memfd_alloc(slot_size, false);
> 	mem = mmap(NULL, slot_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
> 	TEST_ASSERT(mem != MAP_FAILED, "mmap() failed");
>
> 	for (slot = first_slot; slot < first_slot + nr_slots; slot++) {
> 		gpa = start_gpa + ((slot - first_slot) * slot_stride);
>
> 		vm_set_user_memory_region(vm, slot, 0, gpa, slot_size, mem);
> 		for (i = 0; i < slot_size; i += vm->page_size)
> 			virt_pg_map(vm, gpa + i, gpa + i);
> 	}
>
> 	vcpu_args_set(vcpu, 4, start_gpa, start_gpa + SZ_2G, slot_size,
> 		      vm->page_size);
> 	run_vcpu(vcpu);
> }
>
> From the host, I tried triggering the L1 crash seen with PATCH v2 by
> forcing page migration with the migratepages command.