[PATCH 09/43] KVM: arm64: gic-v5: Implement VMT/vIST IRS MMIO Ops

Sascha Bischoff Sascha.Bischoff at arm.com
Mon Apr 27 09:09:06 PDT 2026


GICv5 has rules about which fields of a VMTE (or L1 VMT) may be
directly written by the host once the table is valid. This ensures
that no stale state is cached by the hardware, and provides a clear
interface for making VMs, ISTs, etc, valid.

The hypervisor is responsible for populating the VMTE for a
VM. However, it is not permitted to write the Valid bit (as the VM
table is already valid). Instead, the VM is made valid via an IRS MMIO
Op. The same applies to the ISTs - they must be made valid via the
host IRS.

This commit adds support for:

* Making level 2 VMTs valid (only), allowing for dynamic level 2 table
  allocation.
* Making VMTEs (VMs) valid or invalid
* Making SPI/LPI ISTs valid or invalid for a specific VM

When (successfully) probing for a GICv5, the VMT is allocated, and is
made valid via the IRS's MMIO interface.

This commit also extends the doorbell domain to allow the doorbells
themselves to act as a conduit for issuing commands - this is similar
to what exists for GICv4 support. Effectively, irq_set_vcpu_affinity()
becomes an ioctl-like interface for issuing commands specific to
either a VM or the particular VPE that the doorbell belongs to. This
change adds support for the following via the VPE doorbells:

        VMT_L2_MAP - Make a second level VM table valid
        VMTE_MAKE_VALID - Make a single VMTE (and hence VM) valid
        VMTE_MAKE_INVALID - Make a single VMTE (and hence VM) invalid
        SPI_VIST_MAKE_VALID - Make the SPI IST valid
        LPI_VIST_MAKE_VALID - Make the LPI IST valid
        LPI_VIST_MAKE_INVALID - Make the LPI IST invalid

Note: It is intentional that there is no SPI_VIST_MAKE_INVALID. The SPI
IST cannot be made invalid while the VM is live and, given that it is
allocated as part of VM creation, there is no need to make it invalid
again until the VM is destroyed, at which point the VMTE is already
invalid. Therefore, there's no need to do this via the host's IRS MMIO
interface, as the SPI IST can be directly marked as invalid and freed.
The LPI IST, on the other hand, is driven by the guest itself, and the
guest is theoretically free to invalidate and free it at any point.

Signed-off-by: Sascha Bischoff <sascha.bischoff at arm.com>
---
 arch/arm64/kvm/vgic/vgic-v5-tables.c |  25 +++
 arch/arm64/kvm/vgic/vgic-v5-tables.h |   2 +
 arch/arm64/kvm/vgic/vgic-v5.c        | 236 ++++++++++++++++++++++++++-
 include/linux/irqchip/arm-gic-v5.h   |  30 ++++
 4 files changed, 290 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kvm/vgic/vgic-v5-tables.c b/arch/arm64/kvm/vgic/vgic-v5-tables.c
index de905f37b61a5..0120c3205dea6 100644
--- a/arch/arm64/kvm/vgic/vgic-v5-tables.c
+++ b/arch/arm64/kvm/vgic/vgic-v5-tables.c
@@ -666,6 +666,26 @@ int vgic_v5_vmte_free_vpe(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+phys_addr_t vgic_v5_get_vmt_base(void)
+{
+	phys_addr_t vmt_base;
+
+	if (!vgic_v5_vmt_allocated())
+		return -ENXIO;
+
+	if (!vmt_info->two_level)
+		vmt_base = virt_to_phys(vmt_info->linear.vmt_base);
+	else
+		vmt_base = virt_to_phys(vmt_info->l2.vmt_base);
+
+	return vmt_base;
+}
+
+u8 vgic_v5_vmt_vpe_id_bits(void)
+{
+	return fls(vmt_info->max_vpes) - 1;
+}
+
 /*
  * Assign an already allocated IST to the VM by populating the fields in the
  * corresponding VMTE. We re-use this code for both an SPI IST and LPI IST, even
@@ -715,6 +735,11 @@ int vgic_v5_vmte_assign_ist(struct kvm *kvm, phys_addr_t ist_base,
 	/* Finally, mark the entry as valid */
 	cmd_info.cmd_type = spi_ist ? SPI_VIST_MAKE_VALID : LPI_VIST_MAKE_VALID;
 	ret = irq_set_vcpu_affinity(vgic_v5_vpe_db(vcpu0), &cmd_info);
+	if (ret) {
+		WRITE_ONCE(vmte->val[section], 0ULL);
+		vgic_v5_clean_inval(vmte, sizeof(*vmte), true, false);
+		return ret;
+	}
 
 	/* Any cached entries we now have are stale! */
 	vgic_v5_clean_inval(vmte, sizeof(*vmte), false, true);
diff --git a/arch/arm64/kvm/vgic/vgic-v5-tables.h b/arch/arm64/kvm/vgic/vgic-v5-tables.h
index 37e220cda1987..6a024337eba79 100644
--- a/arch/arm64/kvm/vgic/vgic-v5-tables.h
+++ b/arch/arm64/kvm/vgic/vgic-v5-tables.h
@@ -150,6 +150,8 @@ int vgic_v5_vmt_allocate(bool two_level, unsigned int num_entries,
 			 size_t vmd_size, size_t vped_size,
 			 unsigned int vpe_id_bits);
 int vgic_v5_vmt_free(void);
+phys_addr_t vgic_v5_get_vmt_base(void);
+u8 vgic_v5_vmt_vpe_id_bits(void);
 
 int vgic_v5_allocate_vm_id(struct kvm *kvm);
 void vgic_v5_release_vm_id(struct kvm *kvm);
diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c
index 4e0d52b309628..49eb01ca07961 100644
--- a/arch/arm64/kvm/vgic/vgic-v5.c
+++ b/arch/arm64/kvm/vgic/vgic-v5.c
@@ -36,6 +36,12 @@ static void vgic_v5_get_implemented_ppis(void)
 	__assign_bit(GICV5_ARCH_PPI_PMUIRQ, ppi_caps.impl_ppi_mask, system_supports_pmuv3());
 }
 
+/*
+ * The IRS MMIO interface is shared between all VMs, so serialise the VM/IST
+ * map and unmap operations that are issued through it.
+ */
+static DEFINE_RAW_SPINLOCK(vm_config_lock);
+
 static void __iomem *irs_base;
 
 static u32 irs_readl_relaxed(const u32 reg_offset)
@@ -43,6 +49,21 @@ static u32 irs_readl_relaxed(const u32 reg_offset)
 	return readl_relaxed(irs_base + reg_offset);
 }
 
+static void irs_writel_relaxed(const u32 val, const u32 reg_offset)
+{
+	writel_relaxed(val, irs_base + reg_offset);
+}
+
+static u64 irs_readq_relaxed(const u32 reg_offset)
+{
+	return readq_relaxed(irs_base + reg_offset);
+}
+
+static void irs_writeq_relaxed(const u64 val, const u32 reg_offset)
+{
+	writeq_relaxed(val, irs_base + reg_offset);
+}
+
 static int gicv5_irs_extract_vm_caps(const struct gic_kvm_info *info)
 {
 	u64 idr;
@@ -84,16 +105,22 @@ static int gicv5_irs_extract_vm_caps(const struct gic_kvm_info *info)
 	return 0;
 }
 
+/* Forward decl for cleaner code layout */
+static int vgic_v5_irs_assign_vmt(bool two_level, u8 vm_id_bits, phys_addr_t vmt_base);
+static int vgic_v5_irs_clear_vmt(void);
+
 /*
  * Probe for a vGICv5 compatible interrupt controller, returning 0 on success.
  */
 int vgic_v5_probe(const struct gic_kvm_info *info)
 {
+	struct vgic_v5_host_ist_caps *ist_caps;
 	bool v5_registered = false;
 	u64 ich_vtr_el2;
 	int ret;
 
 	kvm_vgic_global_state.type = VGIC_V5;
+	kvm_vgic_global_state.max_gic_vcpus = VGIC_V5_MAX_CPUS;
 
 	kvm_vgic_global_state.vcpu_base = 0;
 	kvm_vgic_global_state.vctrl_base = NULL;
@@ -114,13 +141,53 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
 	if (gicv5_irs_extract_vm_caps(info))
 		goto skip_v5;
 
-	kvm_vgic_global_state.max_gic_vcpus = VGIC_V5_MAX_CPUS;
+	ist_caps = vgic_v5_host_caps();
+
+	/*
+	 * Even if the HW supports more per-VM vCPUs, artificially cap as we
+	 * can't use them all.
+	 */
+	kvm_vgic_global_state.max_gic_vcpus = min(ist_caps->max_vpes,
+						  VGIC_V5_MAX_CPUS);
+
+	/*
+	 * GICv5 requires a set of tables to be allocated in order to manage
+	 * VMs. We allocate them in advance here, which alas means that we
+	 * already have to make a decision regarding the maximum number of VMs
+	 * we want to run. For now, we match the maximum number offered by the
+	 * hardware, but this might not be a wise choice in the long term.
+	 */
+	ret = vgic_v5_vmt_allocate(ist_caps->two_level_vmt_support,
+				   ist_caps->max_vms, ist_caps->vmd_size,
+				   ist_caps->vped_size,
+				   kvm_vgic_global_state.max_gic_vcpus);
+	if (ret) {
+		kvm_err("Failed to allocate the GICv5 VM tables; no GICv5 support\n");
+		goto skip_v5;
+	}
+
+	/*
+	 * We've now allocated the VM table, but the host's IRS doesn't know
+	 * about it yet. Provide the base address of the VMT to the IRS, as well
+	 * as the number of ID bits that it covers and the structure used
+	 * (linear/two-level).
+	 */
+	ret = vgic_v5_irs_assign_vmt(ist_caps->two_level_vmt_support,
+				     vgic_v5_vmt_vpe_id_bits(),
+				     vgic_v5_get_vmt_base());
+	if (ret) {
+		kvm_err("Failed to assign the GICv5 VM tables to the IRS; no GICv5 support\n");
+		vgic_v5_vmt_free();
+		goto skip_v5;
+	}
 
 	vgic_v5_get_implemented_ppis();
 
 	ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V5);
 	if (ret) {
 		kvm_err("Cannot register GICv5 KVM device.\n");
+		vgic_v5_irs_clear_vmt();
+		vgic_v5_vmt_free();
 		goto skip_v5;
 	}
 
@@ -148,12 +215,13 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
 	ret = kvm_register_vgic_device(KVM_DEV_TYPE_ARM_VGIC_V3);
 	if (ret) {
 		kvm_err("Cannot register GICv3-legacy KVM device.\n");
-		return ret;
+		/* vGICv5 should still work */
+		return v5_registered ? 0 : ret;
 	}
 
 	/* We potentially limit the max VCPUs further than we need to here */
 	kvm_vgic_global_state.max_gic_vcpus = min(VGIC_V3_MAX_CPUS,
-						  VGIC_V5_MAX_CPUS);
+						  kvm_vgic_global_state.max_gic_vcpus);
 
 	static_branch_enable(&kvm_vgic_global_state.gicv3_cpuif);
 	kvm_info("GCIE legacy system register CPU interface\n");
@@ -163,6 +231,167 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
 	return 0;
 }
 
+/*
+ * Wait for completion of a change in any of IRS_VMT_BASER, IRS_VMAP_L2_VMTR,
+ * IRS_VMAP_VMR, IRS_VMAP_VPER, IRS_VMAP_VISTR, IRS_VMAP_L2_VISTR.
+ */
+static int vgic_v5_irs_wait_for_vm_op(void)
+{
+	u32 statusr;
+	int ret;
+
+	ret = readl_relaxed_poll_timeout_atomic(
+		irs_base + GICV5_IRS_VMT_STATUSR, statusr,
+		FIELD_GET(GICV5_IRS_VMT_STATUSR_IDLE, statusr), 1,
+		USEC_PER_SEC);
+
+	if (ret == -ETIMEDOUT) {
+		pr_err_ratelimited("Time out waiting for IRS VM Op\n");
+		return ret;
+	}
+
+	return 0;
+}
+
+static int vgic_v5_irs_assign_vmt(bool two_level, u8 vm_id_bits, phys_addr_t vmt_base)
+{
+	u64 vmt_baser;
+	u32 vmt_cfgr;
+
+	vmt_baser = irs_readq_relaxed(GICV5_IRS_VMT_BASER);
+	if (!!FIELD_GET(GICV5_IRS_VMT_BASER_VALID, vmt_baser))
+		return -EBUSY;
+
+	vmt_cfgr = FIELD_PREP(GICV5_IRS_VMT_CFGR_VM_ID_BITS, vm_id_bits);
+	if (two_level)
+		vmt_cfgr |= FIELD_PREP(GICV5_IRS_VMT_CFGR_STRUCTURE,
+				       GICV5_IRS_VMT_CFGR_STRUCTURE_TWO_LEVEL);
+
+	irs_writel_relaxed(vmt_cfgr, GICV5_IRS_VMT_CFGR);
+
+	/* The base address is intentionally only masked and not shifted */
+	vmt_baser = FIELD_PREP(GICV5_IRS_VMT_BASER_VALID, true) |
+		    (vmt_base & GICV5_IRS_VMT_BASER_ADDR);
+	irs_writeq_relaxed(vmt_baser, GICV5_IRS_VMT_BASER);
+
+	return vgic_v5_irs_wait_for_vm_op();
+}
+
+static int vgic_v5_irs_clear_vmt(void)
+{
+	irs_writeq_relaxed(0ULL, GICV5_IRS_VMT_BASER);
+
+	return vgic_v5_irs_wait_for_vm_op();
+}
+
+static int vgic_v5_irs_vmap_l2_vmt(int vm_id)
+{
+	u64 vmap_l2_vmtr;
+	int ret = 0;
+
+	guard(raw_spinlock)(&vm_config_lock);
+
+	/* Make sure that we are idle to begin with */
+	ret = vgic_v5_irs_wait_for_vm_op();
+	if (ret)
+		return ret;
+
+	/* Make the level 2 VMT for this VM ID valid */
+	vmap_l2_vmtr = FIELD_PREP(GICV5_IRS_VMAP_L2_VMTR_VM_ID, vm_id) |
+		       FIELD_PREP(GICV5_IRS_VMAP_L2_VMTR_M, true);
+	irs_writeq_relaxed(vmap_l2_vmtr, GICV5_IRS_VMAP_L2_VMTR);
+
+	return vgic_v5_irs_wait_for_vm_op();
+}
+
+static int __vgic_v5_irs_vmap_vm(int vm_id, bool unmap)
+{
+	u64 vmap_vmr;
+	int ret;
+
+	guard(raw_spinlock)(&vm_config_lock);
+
+	/* Make sure that we are idle to begin with */
+	ret = vgic_v5_irs_wait_for_vm_op();
+	if (ret)
+		return ret;
+
+	/* Mark the VM as valid (or as invalid when unmapping) */
+	vmap_vmr = FIELD_PREP(GICV5_IRS_VMAP_VMR_VM_ID, vm_id) |
+		   FIELD_PREP(GICV5_IRS_VMAP_VMR_U, unmap) |
+		   FIELD_PREP(GICV5_IRS_VMAP_VMR_M, true);
+	irs_writeq_relaxed(vmap_vmr, GICV5_IRS_VMAP_VMR);
+
+	return vgic_v5_irs_wait_for_vm_op();
+}
+
+static int vgic_v5_irs_set_vm_valid(int vm_id)
+{
+	return __vgic_v5_irs_vmap_vm(vm_id, false);
+}
+
+static int vgic_v5_irs_set_vm_invalid(int vm_id)
+{
+	return __vgic_v5_irs_vmap_vm(vm_id, true);
+}
+
+static int __vgic_v5_irs_update_vist_validity(int vm_id, bool spi_ist, bool unmap)
+{
+	u8 type = spi_ist ? 0b011 : 0b010;
+	u64 vmap_vistr;
+	int ret;
+
+	guard(raw_spinlock)(&vm_config_lock);
+
+	/* Make sure that we are idle to begin with */
+	ret = vgic_v5_irs_wait_for_vm_op();
+	if (ret)
+		return ret;
+
+	/* Mark the IST as valid (or as invalid when unmapping) */
+	vmap_vistr = FIELD_PREP(GICV5_IRS_VMAP_VISTR_TYPE, type) |
+		     FIELD_PREP(GICV5_IRS_VMAP_VISTR_VM_ID, vm_id) |
+		     FIELD_PREP(GICV5_IRS_VMAP_VISTR_U, unmap) |
+		     FIELD_PREP(GICV5_IRS_VMAP_VISTR_M, true);
+	irs_writeq_relaxed(vmap_vistr, GICV5_IRS_VMAP_VISTR);
+
+	return vgic_v5_irs_wait_for_vm_op();
+}
+
+static int vgic_v5_irs_set_vist_valid(int vm_id, bool spi_ist)
+{
+	return __vgic_v5_irs_update_vist_validity(vm_id, spi_ist, false);
+}
+
+/* Note: Only used for the LPI IST; the SPI IST relies on the VM becoming invalid. */
+static int vgic_v5_irs_set_vist_invalid(int vm_id, bool spi_ist)
+{
+	return __vgic_v5_irs_update_vist_validity(vm_id, spi_ist, true);
+}
+
+static int vgic_v5_db_set_vcpu_affinity(struct irq_data *data, void *vcpu_info)
+{
+	struct vgic_v5_vm *vm = data->domain->host_data;
+	struct gicv5_cmd_info *cmd_info = vcpu_info;
+
+	switch (cmd_info->cmd_type) {
+	case VMT_L2_MAP:
+		return vgic_v5_irs_vmap_l2_vmt(vm->vm_id);
+	case VMTE_MAKE_VALID:
+		return vgic_v5_irs_set_vm_valid(vm->vm_id);
+	case VMTE_MAKE_INVALID:
+		return vgic_v5_irs_set_vm_invalid(vm->vm_id);
+	case SPI_VIST_MAKE_VALID:
+		return vgic_v5_irs_set_vist_valid(vm->vm_id, true);
+	case LPI_VIST_MAKE_VALID:
+		return vgic_v5_irs_set_vist_valid(vm->vm_id, false);
+	case LPI_VIST_MAKE_INVALID:
+		return vgic_v5_irs_set_vist_invalid(vm->vm_id, false);
+	default:
+		return -EINVAL;
+	}
+}
+
 /*
  * This set of irq_chip functions is specific for doorbells.
  */
@@ -174,6 +403,7 @@ static struct irq_chip vgic_v5_db_irq_chip = {
 	.irq_set_affinity = irq_chip_set_affinity_parent,
 	.irq_get_irqchip_state = irq_chip_get_parent_state,
 	.irq_set_irqchip_state = irq_chip_set_parent_state,
+	.irq_set_vcpu_affinity = vgic_v5_db_set_vcpu_affinity,
 	.flags = IRQCHIP_SET_TYPE_MASKED | IRQCHIP_SKIP_SET_WAKE |
 		 IRQCHIP_MASK_ON_SUSPEND,
 };
diff --git a/include/linux/irqchip/arm-gic-v5.h b/include/linux/irqchip/arm-gic-v5.h
index ccec0a045927c..ff5ad653252d2 100644
--- a/include/linux/irqchip/arm-gic-v5.h
+++ b/include/linux/irqchip/arm-gic-v5.h
@@ -87,6 +87,12 @@
 #define GICV5_IRS_IST_CFGR		0x0190
 #define GICV5_IRS_IST_STATUSR		0x0194
 #define GICV5_IRS_MAP_L2_ISTR		0x01c0
+#define GICV5_IRS_VMT_BASER		0x0200
+#define GICV5_IRS_VMT_CFGR		0x0210
+#define GICV5_IRS_VMT_STATUSR		0x0214
+#define GICV5_IRS_VMAP_L2_VMTR		0x02c0
+#define GICV5_IRS_VMAP_VMR		0x02c8
+#define GICV5_IRS_VMAP_VISTR		0x02d0
 
 #define GICV5_IRS_IDR0_VIRT		BIT(6)
 
@@ -181,6 +187,30 @@
 
 #define GICV5_IRS_MAP_L2_ISTR_ID	GENMASK(23, 0)
 
+#define GICV5_IRS_VMT_BASER_ADDR	GENMASK_ULL(51, 3)
+#define GICV5_IRS_VMT_BASER_ADDR_SHIFT	3ULL
+#define GICV5_IRS_VMT_BASER_VALID	BIT_ULL(0)
+
+#define GICV5_IRS_VMT_CFGR_STRUCTURE_TWO_LEVEL	0b1
+#define GICV5_IRS_VMT_CFGR_STRUCTURE_LINEAR	0b0
+
+#define GICV5_IRS_VMT_CFGR_STRUCTURE	BIT(16)
+#define GICV5_IRS_VMT_CFGR_VM_ID_BITS	GENMASK(4, 0)
+
+#define GICV5_IRS_VMT_STATUSR_IDLE	BIT(0)
+
+#define GICV5_IRS_VMAP_L2_VMTR_M	BIT_ULL(63)
+#define GICV5_IRS_VMAP_L2_VMTR_VM_ID	GENMASK_ULL(15, 0)
+
+#define GICV5_IRS_VMAP_VMR_M		BIT_ULL(63)
+#define GICV5_IRS_VMAP_VMR_U		BIT_ULL(62)
+#define GICV5_IRS_VMAP_VMR_VM_ID	GENMASK_ULL(15, 0)
+
+#define GICV5_IRS_VMAP_VISTR_M		BIT_ULL(63)
+#define GICV5_IRS_VMAP_VISTR_U		BIT_ULL(62)
+#define GICV5_IRS_VMAP_VISTR_VM_ID	GENMASK_ULL(47, 32)
+#define GICV5_IRS_VMAP_VISTR_TYPE	GENMASK_ULL(31, 29)
+
 #define GICV5_ISTL1E_VALID		BIT_ULL(0)
 #define GICV5_IRS_ISTL1E_SIZE		8UL
 
-- 
2.34.1



More information about the linux-arm-kernel mailing list