[PATCH v2 09/39] KVM: arm64: gic-v5: Create & manage VM and VPE tables
Sascha Bischoff
Sascha.Bischoff at arm.com
Thu May 21 07:52:13 PDT 2026
GICv5 uses a set of in-memory tables to track and manage VM
state. These must be allocated by the hypervisor, and provided to the
IRS to use.
The VMT (Virtual Machine Table) is a linear or two-level table
comprising VMT Entries (VMTE). Each VMTE describes the state for a
single VM. This state includes things such as the SPI and LPI IST
configuration (coming in a future commit), an implementation-defined
VM Descriptor, and a VPE Table (VPET).
The VPET contains one entry per VPE belonging to a VM, and is used to
mark a VPE as valid, as well as providing the address of an
implementation-defined VPE Descriptor, which is used by the hardware
to track and manage VPE state.
This commit adds support for allocating the VMT, and managing the
VMTEs. The VMTEs can be initialised or released for re-use. Allocation
and tracking of unused VMTEs is handled with an IDA.
Signed-off-by: Sascha Bischoff <sascha.bischoff at arm.com>
---
arch/arm64/kvm/Makefile | 2 +-
arch/arm64/kvm/vgic/vgic-init.c | 2 +
arch/arm64/kvm/vgic/vgic-v5-tables.c | 625 +++++++++++++++++++++++++++
arch/arm64/kvm/vgic/vgic-v5-tables.h | 76 ++++
arch/arm64/kvm/vgic/vgic-v5.c | 15 +
drivers/irqchip/irq-gic-v5-irs.c | 12 +-
include/kvm/arm_vgic.h | 4 +
include/linux/irqchip/arm-gic-v5.h | 14 +-
8 files changed, 740 insertions(+), 10 deletions(-)
create mode 100644 arch/arm64/kvm/vgic/vgic-v5-tables.c
create mode 100644 arch/arm64/kvm/vgic/vgic-v5-tables.h
diff --git a/arch/arm64/kvm/Makefile b/arch/arm64/kvm/Makefile
index 59612d2f277c1..431de9b145ca1 100644
--- a/arch/arm64/kvm/Makefile
+++ b/arch/arm64/kvm/Makefile
@@ -24,7 +24,7 @@ kvm-y += arm.o mmu.o mmio.o psci.o hypercalls.o pvtime.o \
vgic/vgic-mmio.o vgic/vgic-mmio-v2.o \
vgic/vgic-mmio-v3.o vgic/vgic-kvm-device.o \
vgic/vgic-its.o vgic/vgic-debug.o vgic/vgic-v3-nested.o \
- vgic/vgic-v5.o
+ vgic/vgic-v5.o vgic/vgic-v5-tables.o
kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o
kvm-$(CONFIG_ARM64_PTR_AUTH) += pauth.o
diff --git a/arch/arm64/kvm/vgic/vgic-init.c b/arch/arm64/kvm/vgic/vgic-init.c
index 625d352756fcf..079a57c2b18f6 100644
--- a/arch/arm64/kvm/vgic/vgic-init.c
+++ b/arch/arm64/kvm/vgic/vgic-init.c
@@ -154,6 +154,8 @@ int kvm_vgic_create(struct kvm *kvm, u32 type)
case KVM_DEV_TYPE_ARM_VGIC_V3:
INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions);
break;
+ case KVM_DEV_TYPE_ARM_VGIC_V5:
+ kvm->arch.vgic.gicv5_vm.vm_id = VGIC_V5_VM_ID_INVAL;
}
/*
diff --git a/arch/arm64/kvm/vgic/vgic-v5-tables.c b/arch/arm64/kvm/vgic/vgic-v5-tables.c
new file mode 100644
index 0000000000000..e9b92893b4e1f
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-v5-tables.c
@@ -0,0 +1,625 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2025, 2026 Arm Ltd.
+ */
+
+#include <kvm/arm_vgic.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
+#include <linux/sizes.h>
+#include <linux/slab.h>
+#include <linux/xarray.h>
+#include <asm/kvm_mmu.h>
+
+#include "vgic.h"
+#include "vgic-v5-tables.h"
+
+#define irs_caps kvm_vgic_global_state.vgic_v5_irs_caps
+
+static struct vgic_v5_vmt *vmt_info;
+/* Serialises lazy installation of shared second-level VMTs. */
+static DEFINE_MUTEX(vmt_l2_lock);
+static DEFINE_XARRAY(vm_info);
+
+/* Level 1 Virtual Machine Table Entry (a single 64-bit value) */
+#define GICV5_VMTEL1E_VALID		BIT_ULL(0)
+/*
+ * Note that there is no shift for the address by design. The mask is built
+ * with GENMASK_ULL() so that it is explicitly 64 bits wide, like the entry
+ * itself and like every other table-entry mask in this file.
+ */
+#define GICV5_VMTEL1E_L2_ADDR		GENMASK_ULL(51, 12)
+
+#define GICV5_VMTEL2E_SIZE 32ULL
+/* An L2 table (two-level VMT) is ALWAYS 4kB! */
+#define GICV5_VMT_L2_TABLE_SIZE 4096ULL
+#define GICV5_VMT_L2_TABLE_ENTRIES (GICV5_VMT_L2_TABLE_SIZE / GICV5_VMTEL2E_SIZE)
+
+/*
+ * As the L2 VMTE is a large data structure, we are splitting it into 4 parts.
+ * We only mask and shift WITHIN each part for simplicity.
+ */
+/* First 64-bit chunk */
+#define GICV5_VMTEL2E_VALID BIT_ULL(0)
+#define GICV5_VMTEL2E_VMD_ADDR_SHIFT 3ULL
+#define GICV5_VMTEL2E_VMD_ADDR GENMASK_ULL(55, 3)
+/* Second 64-bit chunk */
+#define GICV5_VMTEL2E_VPET_ADDR_SHIFT 3ULL
+#define GICV5_VMTEL2E_VPET_ADDR GENMASK_ULL(55, 3)
+#define GICV5_VMTEL2E_VPE_ID_BITS GENMASK_ULL(63, 59)
+/* Third & fourth 64-bit chunks (the encodings are the same for each) */
+#define GICV5_VMTEL2E_IST_VALID BIT_ULL(0)
+#define GICV5_VMTEL2E_IST_L2SZ GENMASK_ULL(2, 1)
+#define GICV5_VMTEL2E_IST_ADDR_SHIFT 6ULL
+#define GICV5_VMTEL2E_IST_ADDR GENMASK_ULL(55, 6)
+#define GICV5_VMTEL2E_IST_ISTSZ GENMASK_ULL(57, 56)
+#define GICV5_VMTEL2E_IST_STRUCTURE BIT_ULL(58)
+#define GICV5_VMTEL2E_IST_ID_BITS GENMASK_ULL(63, 59)
+
+/* Virtual PE Table Entry */
+#define GICV5_VPE_VALID BIT_ULL(0)
+/*
+ * The field holds bits [55:3] of the VPE Descriptor address: shift the
+ * physical address right by GICV5_VPED_ADDR_SHIFT before FIELD_PREP().
+ */
+#define GICV5_VPED_ADDR_SHIFT 3ULL
+#define GICV5_VPED_ADDR GENMASK_ULL(55, 3)
+
+/*
+ * Synchronise a range of memory that is shared with the IRS.
+ *
+ * A store barrier (DSB ISHST) is always issued first. If the IRS is reported
+ * as non-coherent, the range [va, va + size) is additionally cleaned and
+ * invalidated to the PoC; for a coherent IRS the barrier is all that is done.
+ */
+static void vgic_v5_clean_inval(void *va, size_t size)
+{
+	unsigned long start = (unsigned long)va;
+	unsigned long end = start + size;
+
+	dsb(ishst);
+
+	/* Coherent IRS: no explicit cache maintenance required */
+	if (!irs_caps.non_coherent)
+		return;
+
+	dcache_clean_inval_poc(start, end);
+}
+
+/*
+ * Create a linear VM Table: a single zeroed array of @num_entries L2-format
+ * VMTEs (32 bytes each), which is then synchronised for the IRS.
+ *
+ * NOTE(review): the intent is that sizing the allocation as a whole number of
+ * 32-byte VMTEs satisfies the alignment the GICv5 architecture requires for
+ * IRS_VMT_BASER - confirm against the allocator's alignment guarantees.
+ *
+ * Returns 0 on success or -ENOMEM if the table cannot be allocated.
+ */
+static int vgic_v5_alloc_vmt_linear(unsigned int num_entries)
+{
+	struct vmtl2_entry *table;
+
+	table = kzalloc_objs(*table, num_entries);
+	vmt_info->linear.vmt_base = table;
+	if (!table)
+		return -ENOMEM;
+
+	vgic_v5_clean_inval(table, num_entries * sizeof(*table));
+
+	return 0;
+}
+
+/*
+ * Allocate the first level of a two-level VM table, plus the host-side array
+ * that tracks the kernel addresses of the second-level tables. The
+ * second-level tables themselves are allocated on demand (by
+ * vgic_v5_alloc_l2_vmt()).
+ *
+ * Returns 0 on success or -ENOMEM if either allocation fails.
+ */
+static int vgic_v5_alloc_vmt_two_level(unsigned int num_entries)
+{
+	vmtl1_entry *l1_table;
+	unsigned int nr_l1;
+
+	/*
+	 * Each L2 VMT array is always 4k-sized (covering 128 VMs). This is
+	 * mandated by the GICv5 specification (GICv5 EAC0 Specification rule
+	 * D_LSPBK). Cover at least one full L2 table, and round up to a power
+	 * of two as the HW is given a number of VM ID bits.
+	 */
+	num_entries = max_t(unsigned int, num_entries,
+			    GICV5_VMT_L2_TABLE_ENTRIES);
+	num_entries = roundup_pow_of_two(num_entries);
+	nr_l1 = num_entries / GICV5_VMT_L2_TABLE_ENTRIES;
+
+	vmt_info->l2.num_l1_ents = nr_l1;
+
+	l1_table = kzalloc_objs(*l1_table, nr_l1);
+	vmt_info->l2.vmt_base = l1_table;
+	if (!l1_table)
+		return -ENOMEM;
+
+	vmt_info->l2.l2ptrs = kzalloc_objs(*vmt_info->l2.l2ptrs, nr_l1,
+					   GFP_KERNEL);
+	if (!vmt_info->l2.l2ptrs) {
+		kfree(l1_table);
+		return -ENOMEM;
+	}
+
+	vgic_v5_clean_inval(l1_table, nr_l1 * sizeof(*l1_table));
+
+	return 0;
+}
+
+/*
+ * Allocate and install the second-level VMT covering this VM's ID, if it is
+ * not already present. This can be called eagerly: with a linear VMT, or when
+ * the L2 table already exists, it returns 0 without doing anything.
+ *
+ * Returns 0 on success, -E2BIG if the VM ID lies outside the L1 table,
+ * -EINVAL if the VM has no vCPU 0 to act as a conduit to the IRS, -ENOMEM on
+ * allocation failure, or the error returned by irq_set_vcpu_affinity().
+ */
+static int vgic_v5_alloc_l2_vmt(struct kvm *kvm)
+{
+	struct kvm_vcpu *vcpu0 = kvm_get_vcpu(kvm, 0);
+	u16 vm_id = vgic_v5_vm_id(kvm);
+	enum gicv5_vcpu_cmd cmd = VMT_L2_MAP;
+	struct vmtl2_entry *l2_table;
+	unsigned int l1_index;
+	int ret;
+
+	/* Nothing to do if we have linear tables! */
+	if (!vmt_info->two_level)
+		return 0;
+
+	/*
+	 * We have 4k-sized L2 tables - this is mandated by the spec for
+	 * two-level VMTs (GICv5 EAC0 Specification rule D_LSPBK). This means
+	 * that we have 128 entries per L1 VMTE.
+	 */
+	l1_index = vm_id / GICV5_VMT_L2_TABLE_ENTRIES;
+
+	/*
+	 * Never index the L1 arrays with an out-of-range VM ID (for example
+	 * VGIC_V5_VM_ID_INVAL truncated to 16 bits). This mirrors the check
+	 * done by vgic_v5_get_l2_vmte().
+	 */
+	if (l1_index >= vmt_info->l2.num_l1_ents)
+		return -E2BIG;
+
+	guard(mutex)(&vmt_l2_lock);
+
+	/* Already valid? Great! */
+	if (vmt_info->l2.l2ptrs[l1_index])
+		return 0;
+
+	/* We need a vCPU's doorbell to talk to the IRS below */
+	if (!vcpu0)
+		return -EINVAL;
+
+	l2_table = kzalloc_objs(*l2_table, GICV5_VMT_L2_TABLE_ENTRIES);
+	if (!l2_table)
+		return -ENOMEM;
+
+	vgic_v5_clean_inval(l2_table, GICV5_VMT_L2_TABLE_SIZE);
+
+	/* Synchronise the L1 entry before rewriting it... */
+	vgic_v5_clean_inval(vmt_info->l2.vmt_base + l1_index,
+			    sizeof(vmtl1_entry));
+
+	WRITE_ONCE(vmt_info->l2.vmt_base[l1_index],
+		   cpu_to_le64(virt_to_phys(l2_table)));
+
+	/* ...and again afterwards so the IRS observes the new pointer */
+	vgic_v5_clean_inval(vmt_info->l2.vmt_base + l1_index,
+			    sizeof(vmtl1_entry));
+
+	/*
+	 * VMAP in the L2 VMT via the IRS. We use any of the VM's CPUs as a
+	 * conduit for interacting with the host's IRS. In the current case,
+	 * this lets us resolve the VM ID to pass to the hardware.
+	 */
+	ret = irq_set_vcpu_affinity(vgic_v5_vpe_db(vcpu0), &cmd);
+
+	/* We've failed to make the L2 VMT valid - things are very broken! */
+	if (ret) {
+		/* Remove the pointer from L1 table */
+		WRITE_ONCE(vmt_info->l2.vmt_base[l1_index], 0);
+
+		vgic_v5_clean_inval(vmt_info->l2.vmt_base + l1_index,
+				    sizeof(vmtl1_entry));
+
+		kfree(l2_table);
+
+		return ret;
+	}
+
+	/* Only publish the table host-side once the IRS has accepted it */
+	vmt_info->l2.l2ptrs[l1_index] = l2_table;
+
+	return 0;
+}
+
+/*
+ * Allocate the host-wide VMT tracking structure and the top-level VMT, which
+ * is either linear or two-level depending on what the IRS reports.
+ *
+ * @max_vpes is recorded as the upper bound on vPEs per VM. Returns 0 on
+ * success or a negative error code, in which case nothing is left allocated
+ * and vmt_info is NULL.
+ */
+int vgic_v5_vmt_allocate(unsigned int max_vpes)
+{
+	int err;
+
+	/* Allocate the tracking structure */
+	vmt_info = kzalloc_obj(*vmt_info, GFP_KERNEL);
+	if (!vmt_info)
+		return -ENOMEM;
+
+	ida_init(&vmt_info->vm_id_ida);
+
+	/* Snapshot the IRS capabilities we depend on */
+	vmt_info->two_level = irs_caps.two_level_vmt_support;
+	vmt_info->num_entries = irs_caps.max_vms;
+	vmt_info->vmd_size = irs_caps.vmd_size;
+	vmt_info->vped_size = irs_caps.vped_size;
+	vmt_info->max_vpes = max_vpes;
+
+	err = vmt_info->two_level ?
+		vgic_v5_alloc_vmt_two_level(vmt_info->num_entries) :
+		vgic_v5_alloc_vmt_linear(vmt_info->num_entries);
+	if (!err)
+		return 0;
+
+	/* The table allocation failed: drop the tracking structure too */
+	kfree(vmt_info);
+	vmt_info = NULL;
+
+	return err;
+}
+
+/*
+ * Tear down the VMT and its tracking structures. A no-op if nothing was
+ * allocated. This isn't strictly expected to be called in general operation,
+ * but exists for completeness. Always returns 0.
+ */
+int vgic_v5_vmt_free(void)
+{
+	unsigned int idx;
+
+	if (!vmt_info)
+		return 0;
+
+	if (vmt_info->two_level) {
+		/* L2 tables first; unpopulated slots are NULL, which kfree() accepts */
+		for (idx = 0; idx < vmt_info->l2.num_l1_ents; idx++)
+			kfree(vmt_info->l2.l2ptrs[idx]);
+		kfree(vmt_info->l2.l2ptrs);
+
+		/* Then the L1 table itself */
+		kfree(vmt_info->l2.vmt_base);
+	} else {
+		kfree(vmt_info->linear.vmt_base);
+	}
+
+	ida_destroy(&vmt_info->vm_id_ida);
+	kfree(vmt_info);
+	vmt_info = NULL;
+
+	return 0;
+}
+
+/*
+ * Look up the (L2-format) VMT Entry for a VM ID.
+ *
+ * Returns a pointer to the entry, ERR_PTR(-E2BIG) if the VM ID lies outside
+ * the table, or ERR_PTR(-EINVAL) if, for a two-level VMT, the L2 table
+ * covering this VM ID has not been allocated yet.
+ */
+static struct vmtl2_entry *vgic_v5_get_l2_vmte(u16 vm_id)
+{
+	unsigned int l1_index, l2_index;
+	struct vmtl2_entry *l2_table;
+
+	if (!vmt_info->two_level) {
+		/*
+		 * The linear table has exactly num_entries entries. Reject
+		 * anything beyond that (for example VGIC_V5_VM_ID_INVAL
+		 * truncated to 16 bits) rather than handing back a pointer
+		 * past the end of the allocation.
+		 */
+		if (vm_id >= vmt_info->num_entries)
+			return ERR_PTR(-E2BIG);
+
+		return &vmt_info->linear.vmt_base[vm_id];
+	}
+
+	l1_index = vm_id / GICV5_VMT_L2_TABLE_ENTRIES;
+	l2_index = vm_id % GICV5_VMT_L2_TABLE_ENTRIES;
+
+	if (l1_index >= vmt_info->l2.num_l1_ents)
+		return ERR_PTR(-E2BIG);
+
+	if (!vmt_info->l2.l2ptrs[l1_index])
+		return ERR_PTR(-EINVAL);
+
+	l2_table = vmt_info->l2.l2ptrs[l1_index];
+	return &l2_table[l2_index];
+}
+
+/*
+ * Zero the VMT Entry belonging to @kvm, with cache maintenance before and
+ * after the stores (a real flush only happens for a non-coherent IRS).
+ *
+ * Returns 0 on success, or the error from the VMTE lookup.
+ */
+static int vgic_v5_reset_vmte(struct kvm *kvm)
+{
+	struct vmtl2_entry *entry;
+	unsigned int i;
+
+	entry = vgic_v5_get_l2_vmte(vgic_v5_vm_id(kvm));
+	if (IS_ERR(entry))
+		return PTR_ERR(entry);
+
+	/*
+	 * The VMT is normal memory shared with the IRS. Invalidate before
+	 * rewriting the entry so that cacheline-granular maintenance cannot
+	 * later push stale data for neighbouring IRS-visible state back to
+	 * memory.
+	 */
+	vgic_v5_clean_inval(entry, sizeof(*entry));
+
+	/*
+	 * WRITE_ONCE() only stops the compiler from eliding or merging the
+	 * individual VMTE stores. Ordering and visibility to the IRS come
+	 * from the surrounding cache maintenance and command protocol.
+	 *
+	 * The same reasoning applies to the READ_ONCE() users in this file:
+	 * the field is read exactly once, without tearing or reuse, while
+	 * freshness for a non-coherent IRS still relies on the cache
+	 * maintenance around the access.
+	 */
+	for (i = 0; i < ARRAY_SIZE(entry->val); i++)
+		WRITE_ONCE(entry->val[i], cpu_to_le64(0ULL));
+
+	/* And make our write visible to the IRS (if non-coherent) */
+	vgic_v5_clean_inval(entry, sizeof(*entry));
+
+	return 0;
+}
+
+/*
+ * Allocate a VM ID from the IDA and record it in the VM's gicv5_vm state.
+ *
+ * Returns 0 on success, -EBUSY if this VM already holds a VM ID, or the
+ * (negative) error reported by the IDA, such as -ENOSPC when no IDs are left.
+ */
+int vgic_v5_allocate_vm_id(struct kvm *kvm)
+{
+	struct vgic_v5_vm *gicv5_vm = &kvm->arch.vgic.gicv5_vm;
+	int new_id;
+
+	if (gicv5_vm->vm_id != VGIC_V5_VM_ID_INVAL)
+		return -EBUSY;
+
+	/* Valid IDs are 0 .. num_entries - 1 */
+	new_id = ida_alloc_max(&vmt_info->vm_id_ida,
+			       vmt_info->num_entries - 1u, GFP_KERNEL);
+	if (new_id < 0)
+		return new_id;
+
+	gicv5_vm->vm_id = new_id;
+
+	return 0;
+}
+
+/*
+ * Hand the VM's ID back to the IDA so that it can be reallocated, and mark
+ * the VM as no longer owning one. Does nothing if the VM has no ID.
+ */
+void vgic_v5_release_vm_id(struct kvm *kvm)
+{
+	struct vgic_v5_vm *gicv5_vm = &kvm->arch.vgic.gicv5_vm;
+
+	if (gicv5_vm->vm_id != VGIC_V5_VM_ID_INVAL) {
+		ida_free(&vmt_info->vm_id_ida, gicv5_vm->vm_id);
+		gicv5_vm->vm_id = VGIC_V5_VM_ID_INVAL;
+	}
+}
+
+/*
+ * Initialise the VMT Entry for @kvm, based on its VM ID.
+ *
+ * This allocates the per-VM tracking structure, the VM Descriptor (if the IRS
+ * asks for one), the VPE Table and the array tracking VPE Descriptors, and
+ * writes the corresponding addresses into the VMTE.
+ *
+ * Note: We don't mark the VMTE as valid as this needs to be done by
+ * the hardware.
+ *
+ * Returns 0 on success, -E2BIG if the VM has more vCPUs than supported or the
+ * VM ID is out of range, -EINVAL if the VMTE is already valid, -ENOMEM on
+ * allocation failure, or an error from the L2 VMT or XArray helpers. On
+ * failure everything allocated here is released again and the VMTE is zeroed.
+ */
+int vgic_v5_vmte_init(struct kvm *kvm)
+{
+	int nr_cpus = atomic_read(&kvm->online_vcpus);
+	struct vgic_v5_vm_info *vmi = NULL;
+	u16 vm_id = vgic_v5_vm_id(kvm);
+	struct vmtl2_entry *vmte;
+	void **vped_ptrs = NULL;
+	vpe_entry *vpet = NULL;
+	void *vmd = NULL;
+	int ret;
+	u64 tmp;
+
+	if (nr_cpus > vmt_info->max_vpes)
+		return -E2BIG;
+
+	/*
+	 * If we're using two-level VMTs, L2 is allocated on demand. For linear
+	 * VMTs, this is a NOP.
+	 */
+	ret = vgic_v5_alloc_l2_vmt(kvm);
+	if (ret)
+		return ret;
+
+	vmte = vgic_v5_get_l2_vmte(vm_id);
+	if (IS_ERR(vmte))
+		return PTR_ERR(vmte);
+
+	/* If the entry is already valid, something went wrong */
+	vgic_v5_clean_inval(vmte, sizeof(*vmte));
+	if (le64_to_cpu(READ_ONCE(vmte->val[0])) & GICV5_VMTEL2E_VALID)
+		return -EINVAL;
+
+	ret = vgic_v5_reset_vmte(kvm);
+	if (ret)
+		return ret;
+
+	vmi = kzalloc_obj(*vmi);
+	if (!vmi) {
+		ret = -ENOMEM;
+		goto out_fail;
+	}
+
+	ret = xa_insert(&vm_info, vm_id, vmi, GFP_KERNEL);
+	if (ret) {
+		/*
+		 * Nothing was inserted, so the slot (if populated) is not
+		 * ours. Drop our structure here so that the common error
+		 * path does not erase an entry this call never added.
+		 */
+		kfree(vmi);
+		vmi = NULL;
+		goto out_fail;
+	}
+
+	/* Allocate and assign the VM Descriptor, if required. */
+	if (vmt_info->vmd_size != 0) {
+		vmd = kzalloc(vmt_info->vmd_size, GFP_KERNEL);
+		if (!vmd) {
+			ret = -ENOMEM;
+			goto out_fail;
+		}
+
+		/* Stash the VA so we can free it later */
+		vmi->vmd_base = vmd;
+
+		tmp = FIELD_PREP(GICV5_VMTEL2E_VMD_ADDR,
+				 virt_to_phys(vmd) >> GICV5_VMTEL2E_VMD_ADDR_SHIFT);
+		WRITE_ONCE(vmte->val[0], cpu_to_le64(tmp));
+	}
+
+	/*
+	 * Allocate and assign the VPE Table. Round up the number of CPUs to a
+	 * whole power of two as we cannot describe non-powers-of-two in the
+	 * VMTE field as it conveys the number of ID bits used and not the
+	 * number of vPEs.
+	 *
+	 * The IRS encodes the number of IAFFID bits as N - 1, so a VM with a
+	 * single vCPU must still allocate two VPET entries and expose 1 bit.
+	 *
+	 * Clamp to two *before* rounding: roundup_pow_of_two() is undefined
+	 * for zero, which is what we would see for a VM without vCPUs.
+	 */
+	nr_cpus = max(nr_cpus, 2);
+	nr_cpus = roundup_pow_of_two(nr_cpus);
+	vmi->vpe_id_bits = fls(nr_cpus) - 1;
+
+	vpet = kzalloc_objs(*vpet, nr_cpus);
+	if (!vpet) {
+		ret = -ENOMEM;
+		goto out_fail;
+	}
+
+	/* Stash the VA so we can free it later */
+	vmi->vpet_base = vpet;
+
+	tmp = FIELD_PREP(GICV5_VMTEL2E_VPET_ADDR,
+			 virt_to_phys(vpet) >> GICV5_VMTEL2E_VPET_ADDR_SHIFT);
+	tmp |= FIELD_PREP(GICV5_VMTEL2E_VPE_ID_BITS, vmi->vpe_id_bits);
+	WRITE_ONCE(vmte->val[1], cpu_to_le64(tmp));
+
+	/* One slot per VPET entry to remember each VPE Descriptor's VA */
+	vped_ptrs = kzalloc_objs(*vped_ptrs, nr_cpus, GFP_KERNEL);
+	if (!vped_ptrs) {
+		ret = -ENOMEM;
+		goto out_fail;
+	}
+	vmi->vped_ptrs = vped_ptrs;
+
+	/* Push everything the IRS will read out to memory (if non-coherent) */
+	if (vmd)
+		vgic_v5_clean_inval(vmd, vmt_info->vmd_size);
+	vgic_v5_clean_inval(vpet, sizeof(*vpet) * nr_cpus);
+	vgic_v5_clean_inval(vmte, sizeof(*vmte));
+
+	kvm->arch.vgic.gicv5_vm.vmte_allocated = true;
+
+	return 0;
+
+out_fail:
+	/* kfree(NULL) is safe so we can just kfree() at leisure */
+	kfree(vmd);
+	kfree(vpet);
+	kfree(vped_ptrs);
+	/* vmi is only non-NULL here if we inserted it into the XArray */
+	if (vmi)
+		xa_erase(&vm_info, vm_id);
+	kfree(vmi);
+
+	vgic_v5_reset_vmte(kvm);
+
+	return ret;
+}
+
+/*
+ * Release the VMT Entry of @kvm: free every data structure hanging off it and
+ * then zero the VMTE.
+ *
+ * The VMTE must have been marked as invalid beforehand. Returns 0 on success,
+ * -EINVAL if the VMTE is still valid, or an error from the VMTE lookup/reset.
+ */
+int vgic_v5_vmte_release(struct kvm *kvm)
+{
+	u16 vm_id = vgic_v5_vm_id(kvm);
+	struct vgic_v5_vm_info *info;
+	struct vmtl2_entry *entry;
+	int err;
+
+	entry = vgic_v5_get_l2_vmte(vm_id);
+	if (IS_ERR(entry))
+		return PTR_ERR(entry);
+
+	/* Reject if the VMTE has not been marked as invalid! */
+	vgic_v5_clean_inval(entry, sizeof(*entry));
+	if (le64_to_cpu(READ_ONCE(entry->val[0])) & GICV5_VMTEL2E_VALID)
+		return -EINVAL;
+
+	/*
+	 * If we didn't get far enough into allocating a VMTE to create the VM
+	 * info structure, then we just zero the VMTE and move on. There's
+	 * nothing else we can realistically do here.
+	 */
+	info = xa_load(&vm_info, vm_id);
+	if (!WARN_ON_ONCE(!info)) {
+		unsigned long nr_vpes = BIT(info->vpe_id_bits);
+
+		/* VPE Descriptors first, then the tables that referenced them */
+		for (unsigned long vpe = 0; vpe < nr_vpes; vpe++)
+			kfree(info->vped_ptrs[vpe]);
+		kfree(info->vped_ptrs);
+		kfree(info->vpet_base);
+		kfree(info->vmd_base);
+
+		xa_erase(&vm_info, vm_id);
+		kfree(info);
+	}
+
+	err = vgic_v5_reset_vmte(kvm);
+	if (err)
+		return err;
+
+	kvm->arch.vgic.gicv5_vm.vmte_allocated = false;
+
+	return 0;
+}
+
+/*
+ * Allocate a VPE descriptor for @vcpu and provide it to the hardware via the
+ * VM's VPE Table. The VPET entry is written without the valid bit set.
+ *
+ * Returns 0 on success, -E2BIG if the vPE ID exceeds what the hardware or the
+ * VM's VPE Table supports, -EINVAL if the VM has no tracking structure,
+ * -EBUSY if the VPET entry is already valid, or -ENOMEM.
+ */
+int vgic_v5_vmte_alloc_vpe(struct kvm_vcpu *vcpu)
+{
+	u16 vm_id = vgic_v5_vm_id(vcpu->kvm);
+	u16 vpe_id = vgic_v5_vpe_id(vcpu);
+	struct vgic_v5_vm_info *vmi;
+	vpe_entry *vpet_base;
+	void *vped;
+	/* CPU-endian staging value; converted with cpu_to_le64() on store */
+	u64 tmp;
+
+	/* Make sure we're not over what the hardware supports */
+	if (vpe_id >= vmt_info->max_vpes)
+		return -E2BIG;
+
+	vmi = xa_load(&vm_info, vm_id);
+	if (WARN_ON_ONCE(!vmi))
+		return -EINVAL;
+
+	/* The VPET was sized as 2^vpe_id_bits entries at VMTE init time */
+	if (vpe_id >= 1 << vmi->vpe_id_bits)
+		return -E2BIG;
+
+	vpet_base = vmi->vpet_base;
+
+	/* If the VPETE for this CPU is already valid we've gone wrong */
+	vgic_v5_clean_inval(&vpet_base[vpe_id], sizeof(*vpet_base));
+	if (le64_to_cpu(READ_ONCE(vpet_base[vpe_id])) & GICV5_VPE_VALID)
+		return -EBUSY;
+
+	/* Alloc VPE Descriptor. Only used by IRS. */
+	vped = kzalloc(vmt_info->vped_size, GFP_KERNEL);
+	if (!vped)
+		return -ENOMEM;
+
+	/* Stash the VA so we can free it later */
+	vmi->vped_ptrs[vpe_id] = vped;
+
+	tmp = FIELD_PREP(GICV5_VPED_ADDR, virt_to_phys(vped) >> GICV5_VPED_ADDR_SHIFT);
+	WRITE_ONCE(vpet_base[vpe_id], cpu_to_le64(tmp));
+
+	vgic_v5_clean_inval(vped, vmt_info->vped_size);
+	vgic_v5_clean_inval(vpet_base + vpe_id, sizeof(vpe_entry));
+
+	return 0;
+}
+
+/*
+ * Clear the VPE Table entry for @vcpu and free its VPE descriptor.
+ *
+ * Returns 0 on success, -EBUSY while the VM's VMTE is still valid, -EINVAL if
+ * the VM has no tracking structure, -E2BIG if the vPE ID does not fit the
+ * VM's VPE Table, or an error from the VMTE lookup.
+ */
+int vgic_v5_vmte_free_vpe(struct kvm_vcpu *vcpu)
+{
+	u16 vm_id = vgic_v5_vm_id(vcpu->kvm);
+	u16 vpe_id = vgic_v5_vpe_id(vcpu);
+	struct vgic_v5_vm_info *info;
+	struct vmtl2_entry *entry;
+	vpe_entry *vpete;
+	void *desc;
+
+	entry = vgic_v5_get_l2_vmte(vm_id);
+	if (IS_ERR(entry))
+		return PTR_ERR(entry);
+
+	/* Refuse to pull a VPE out from under a VMTE that is still valid */
+	vgic_v5_clean_inval(entry, sizeof(*entry));
+	if (le64_to_cpu(READ_ONCE(entry->val[0])) & GICV5_VMTEL2E_VALID)
+		return -EBUSY;
+
+	info = xa_load(&vm_info, vm_id);
+	if (!info)
+		return -EINVAL;
+
+	if (vpe_id >= 1 << info->vpe_id_bits)
+		return -E2BIG;
+
+	/* Zero the VPET entry and make that visible to the IRS */
+	vpete = &info->vpet_base[vpe_id];
+	WRITE_ONCE(*vpete, cpu_to_le64(0ULL));
+
+	vgic_v5_clean_inval(vpete, sizeof(*vpete));
+
+	/* Free VPE Descriptor. Only used by IRS. */
+	desc = info->vped_ptrs[vpe_id];
+	info->vped_ptrs[vpe_id] = NULL;
+	kfree(desc);
+
+	return 0;
+}
diff --git a/arch/arm64/kvm/vgic/vgic-v5-tables.h b/arch/arm64/kvm/vgic/vgic-v5-tables.h
new file mode 100644
index 0000000000000..3ca5bc7214fc9
--- /dev/null
+++ b/arch/arm64/kvm/vgic/vgic-v5-tables.h
@@ -0,0 +1,76 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+/*
+ * Copyright (C) 2025, 2026 Arm Ltd.
+ */
+
+#ifndef __KVM_ARM_VGICV5_TABLES_H__
+#define __KVM_ARM_VGICV5_TABLES_H__
+
+#include <linux/idr.h>
+#include <linux/irqchip/arm-gic-v5.h>
+
+/* Level 1 Virtual Machine Table Entry */
+typedef __le64 vmtl1_entry;
+
+/*
+ * Level 2 Virtual Machine Table Entry.
+ *
+ * A 32-byte entry handled as four little-endian 64-bit chunks. The
+ * GICV5_VMTEL2E_* masks in vgic-v5-tables.c are each relative to one chunk.
+ */
+struct vmtl2_entry {
+ __le64 val[4];
+};
+
+/* Virtual PE Table Entry */
+typedef __le64 vpe_entry;
+
+/*
+ * Per-VM bookkeeping for the memory handed to the IRS through the VMTE.
+ *
+ * These are ordinary kernel virtual addresses of kzalloc()ed memory, kept so
+ * that the allocations can be accessed and freed later. They are not MMIO
+ * mappings, so they must not carry the __iomem address-space annotation.
+ */
+struct vgic_v5_vm_info {
+	/* VM Descriptor, or NULL if the IRS does not require one */
+	void *vmd_base;
+	/* VPE Table, with 2^vpe_id_bits entries */
+	vpe_entry *vpet_base;
+	/* Per-vPE VPE Descriptor pointers, indexed like the VPE Table */
+	void **vped_ptrs;
+	/* Number of vPE ID bits programmed into the VMTE */
+	u8 vpe_id_bits;
+};
+
+/*
+ * Host-wide tracking state for the VM Table. A single instance is allocated
+ * by vgic_v5_vmt_allocate() and released by vgic_v5_vmt_free().
+ */
+struct vgic_v5_vmt {
+ /* Which member is in use is selected by @two_level. */
+ union {
+ /* Linear VMT: one flat array of L2-format entries. */
+ struct {
+ struct vmtl2_entry *vmt_base;
+ /* NOTE(review): not written by the code in this patch - confirm intended use. */
+ unsigned int num_ents;
+ } linear;
+ /* Two-level VMT: L1 table plus the kernel addresses of the L2 tables. */
+ struct {
+ vmtl1_entry *vmt_base;
+ /* One slot per L1 entry; NULL until that L2 table is allocated. */
+ struct vmtl2_entry **l2ptrs;
+ unsigned int num_l1_ents;
+ } l2;
+ };
+ /* True if the IRS reports two-level VMT support. */
+ bool two_level;
+ /* Maximum number of VMs reported by the IRS; bounds VM ID allocation. */
+ unsigned int num_entries;
+ /* Upper bound on vPEs per VM, as passed to vgic_v5_vmt_allocate(). */
+ unsigned int max_vpes;
+ /* Size in bytes of a VM Descriptor; 0 means none is allocated. */
+ size_t vmd_size;
+ /* Size in bytes of a VPE Descriptor. */
+ size_t vped_size;
+ /* Allocator for VM IDs. */
+ struct ida vm_id_ida;
+};
+
+/*
+ * Return the VM ID recorded for @kvm. The stored field is wider than the u16
+ * returned here, so VGIC_V5_VM_ID_INVAL (-1) is seen by callers as 0xffff.
+ */
+static inline u16 vgic_v5_vm_id(struct kvm *kvm)
+{
+ return kvm->arch.vgic.gicv5_vm.vm_id;
+}
+
+/* Return the vPE ID for @vcpu, which is simply its vcpu_id. */
+static inline u16 vgic_v5_vpe_id(struct kvm_vcpu *vcpu)
+{
+ return vcpu->vcpu_id;
+}
+
+/*
+ * Return the doorbell value stored for @vcpu's vPE. In this patch it is
+ * passed as the interrupt number to irq_set_vcpu_affinity().
+ */
+static inline int vgic_v5_vpe_db(struct kvm_vcpu *vcpu)
+{
+ return vcpu->arch.vgic_cpu.vgic_v5.gicv5_vpe.db;
+}
+
+int vgic_v5_vmt_allocate(unsigned int max_vpes);
+int vgic_v5_vmt_free(void);
+
+int vgic_v5_allocate_vm_id(struct kvm *kvm);
+void vgic_v5_release_vm_id(struct kvm *kvm);
+
+int vgic_v5_vmte_init(struct kvm *kvm);
+int vgic_v5_vmte_release(struct kvm *kvm);
+int vgic_v5_vmte_alloc_vpe(struct kvm_vcpu *vcpu);
+int vgic_v5_vmte_free_vpe(struct kvm_vcpu *vcpu);
+
+#endif
diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c
index 52924408ca990..adfe0b207ef40 100644
--- a/arch/arm64/kvm/vgic/vgic-v5.c
+++ b/arch/arm64/kvm/vgic/vgic-v5.c
@@ -153,6 +153,20 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
return 0;
}
+/*
+ * irq_set_vcpu_affinity() handler for vPE doorbells. @vcpu_info points to an
+ * enum gicv5_vcpu_cmd selecting the operation.
+ *
+ * No command is implemented yet, so every request - known or unknown -
+ * currently fails with -EINVAL.
+ */
+static int vgic_v5_db_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
+{
+ enum gicv5_vcpu_cmd *cmd = vcpu_info;
+
+ switch (*cmd) {
+ case VMT_L2_MAP:
+ case VMTE_MAKE_VALID:
+ case VMTE_MAKE_INVALID:
+ /* Not yet implemented */
+ default:
+ return -EINVAL;
+ }
+}
+
/*
* This set of irq_chip functions is specific for doorbells.
*/
@@ -164,6 +178,7 @@ static const struct irq_chip vgic_v5_db_irq_chip = {
.irq_set_affinity = irq_chip_set_affinity_parent,
.irq_get_irqchip_state = irq_chip_get_parent_state,
.irq_set_irqchip_state = irq_chip_set_parent_state,
+ .irq_set_vcpu_affinity = vgic_v5_db_set_vcpu_affinity,
.flags = IRQCHIP_SET_TYPE_MASKED | IRQCHIP_SKIP_SET_WAKE |
IRQCHIP_MASK_ON_SUSPEND,
};
diff --git a/drivers/irqchip/irq-gic-v5-irs.c b/drivers/irqchip/irq-gic-v5-irs.c
index 607e066821b52..70502b07ec8d7 100644
--- a/drivers/irqchip/irq-gic-v5-irs.c
+++ b/drivers/irqchip/irq-gic-v5-irs.c
@@ -269,24 +269,24 @@ int gicv5_irs_iste_alloc(const u32 lpi)
* itself is not supported) again serves to make it easier to find physically
* contiguous blocks of memory.
*/
-static unsigned int gicv5_irs_l2_sz(u32 idr2)
+unsigned int gicv5_irs_l2_sz(u32 l2sz)
{
switch (PAGE_SIZE) {
case SZ_64K:
- if (GICV5_IRS_IST_L2SZ_SUPPORT_64KB(idr2))
+ if (GICV5_IRS_IST_L2SZ_SUPPORT_64KB(l2sz))
return GICV5_IRS_IST_CFGR_L2SZ_64K;
fallthrough;
case SZ_4K:
- if (GICV5_IRS_IST_L2SZ_SUPPORT_4KB(idr2))
+ if (GICV5_IRS_IST_L2SZ_SUPPORT_4KB(l2sz))
return GICV5_IRS_IST_CFGR_L2SZ_4K;
fallthrough;
case SZ_16K:
- if (GICV5_IRS_IST_L2SZ_SUPPORT_16KB(idr2))
+ if (GICV5_IRS_IST_L2SZ_SUPPORT_16KB(l2sz))
return GICV5_IRS_IST_CFGR_L2SZ_16K;
break;
}
- if (GICV5_IRS_IST_L2SZ_SUPPORT_4KB(idr2))
+ if (GICV5_IRS_IST_L2SZ_SUPPORT_4KB(l2sz))
return GICV5_IRS_IST_CFGR_L2SZ_4K;
return GICV5_IRS_IST_CFGR_L2SZ_64K;
@@ -334,7 +334,7 @@ static int __init gicv5_irs_init_ist(struct gicv5_irs_chip_data *irs_data)
lpi_id_bits = min(lpi_id_bits, gicv5_global_data.cpuif_id_bits);
if (two_levels)
- l2sz = gicv5_irs_l2_sz(idr2);
+ l2sz = gicv5_irs_l2_sz(FIELD_GET(GICV5_IRS_IDR2_IST_L2SZ, idr2));
istmd = !!FIELD_GET(GICV5_IRS_IDR2_ISTMD, idr2);
diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index bff2b7c896d55..ba32cd71fe0a7 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -374,6 +374,8 @@ struct vgic_redist_region {
struct list_head list;
};
+#define VGIC_V5_VM_ID_INVAL (-1)
+
struct vgic_v5_vm {
/*
* We only expose a subset of PPIs to the guest. This subset is a
@@ -396,6 +398,8 @@ struct vgic_v5_vm {
struct fwnode_handle *fwnode;
struct irq_domain *domain;
int vpe_db_base;
+ u32 vm_id;
+ bool vmte_allocated;
};
struct vgic_dist {
diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h
index 1702b57527dee..64e31068d9d17 100644
--- a/include/linux/irqchip/arm-gic-v5.h
+++ b/include/linux/irqchip/arm-gic-v5.h
@@ -159,9 +159,9 @@
#define GICV5_IRS_IDR2_LPI BIT(5)
#define GICV5_IRS_IDR2_ID_BITS GENMASK(4, 0)
-#define GICV5_IRS_IST_L2SZ_SUPPORT_4KB(r) FIELD_GET(BIT(11), (r))
-#define GICV5_IRS_IST_L2SZ_SUPPORT_16KB(r) FIELD_GET(BIT(12), (r))
-#define GICV5_IRS_IST_L2SZ_SUPPORT_64KB(r) FIELD_GET(BIT(13), (r))
+#define GICV5_IRS_IST_L2SZ_SUPPORT_4KB(r) FIELD_GET(BIT(0), (r))
+#define GICV5_IRS_IST_L2SZ_SUPPORT_16KB(r) FIELD_GET(BIT(1), (r))
+#define GICV5_IRS_IST_L2SZ_SUPPORT_64KB(r) FIELD_GET(BIT(2), (r))
#define GICV5_IRS_IDR3_VMT_LEVELS BIT(10)
#define GICV5_IRS_IDR3_VM_ID_BITS GENMASK(9, 5)
@@ -573,6 +573,7 @@ int gicv5_irs_cpu_to_iaffid(int cpu_id, u16 *iaffid);
struct gicv5_irs_chip_data *gicv5_irs_lookup_by_spi_id(u32 spi_id);
int gicv5_spi_irq_set_type(struct irq_data *d, unsigned int type);
int gicv5_irs_iste_alloc(u32 lpi);
+unsigned int gicv5_irs_l2_sz(u32 l2sz);
void gicv5_irs_syncr(void);
/* Embedded in kvm.arch */
@@ -617,4 +618,11 @@ void gicv5_deinit_lpis(void);
void __init gicv5_its_of_probe(struct device_node *parent);
void __init gicv5_its_acpi_probe(void);
+
+/*
+ * Commands passed (by pointer) as the vcpu_info argument of
+ * irq_set_vcpu_affinity() on a vPE doorbell interrupt.
+ */
+enum gicv5_vcpu_cmd {
+ VMT_L2_MAP, /* Map in a L2 VMT - *may* happen on VM init */
+ VMTE_MAKE_VALID, /* Make the VMTE valid */
+ VMTE_MAKE_INVALID, /* Make the VMTE (et al.) invalid */
+};
+
#endif
--
2.34.1
More information about the linux-arm-kernel
mailing list