[PATCH v3 19/36] KVM: arm64: gic-v5: Implement PPI interrupt injection
Sascha Bischoff
Sascha.Bischoff at arm.com
Fri Jan 9 09:04:45 PST 2026
This change introduces interrupt injection for PPIs for GICv5-based
guests.
The lifecycle of PPIs is largely managed by the hardware for a GICv5
system. The hypervisor injects pending state into the guest by using
the ICH_PPI_PENDRx_EL2 registers. These are used by the hardware to
pick a Highest Priority Pending Interrupt (HPPI) for the guest based
on the enable state and priority of each individual interrupt. The
enable state and priority for each interrupt are provided by the
guest itself (through writes to the PPI registers).
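Injecting pending state into the shadow copy then boils down to a bit
operation on a two-word bitmap. Roughly along these lines (a sketch of
the indexing the patch uses; the helper name is illustrative):

  /* Mark PPI 'id' (0..127) pending/not-pending in the shadow bitmap. */
  static void ppi_shadow_assign_pending(u64 *pendr, u32 id, bool pending)
  {
  	unsigned long *p = (unsigned long *)&pendr[id / 64];

  	__assign_bit(id % 64, p, pending);
  }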
When Direct Virtual Interrupt (DVI) is set for a particular PPI, the
hypervisor is even able to skip the injection of the pending state
altogether - it all happens in hardware.
The result of the above is that no AP lists are required for GICv5,
unlike for older GICs. Instead, for PPIs the ICH_PPI_* registers
fulfil the same purpose for all 128 PPIs. Hence, as long as the
ICH_PPI_* registers are populated prior to guest entry, and merged
back into the KVM shadow state on exit, the PPI state is preserved,
and interrupts can be injected.
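For reference, the per-vCPU shadow state amounts to a handful of
two-element u64 arrays, one bit per PPI. A simplified view of the
fields touched below (the real definitions live in the CPU interface
and VM structures introduced earlier in the series):

  struct vgic_v5_ppi_shadow {		/* illustrative only */
  	u64 vgic_ppi_pendr[2];		/* pending state */
  	u64 vgic_ppi_activer[2];	/* active state */
  	u64 vgic_ppi_dvir[2];		/* direct virtual injection */
  };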
When injecting the state of a PPI, the state is merged into KVM's
shadow state using the set_pending_state irq_op. This directly sets
the relevant bit in the shadow ICH_PPI_PENDRx_EL2, which is presented
to the guest (and GICv5 hardware) on the next guest entry. The
queue_irq_unlock irq_op is required to kick the vCPU to ensure that
it sees the new state. The result is that no AP lists are used for
private interrupts on GICv5.
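For context, both callbacks hang off the per-IRQ ops structure, which
keeps the generic layer GIC-agnostic. The dispatch is roughly along
these lines (illustrative; the actual call sites live in the common
vgic code):

  if (irq->ops && irq->ops->set_pending_state)
  	irq->ops->set_pending_state(vcpu, irq);

  /* Drops irq->irq_lock; for GICv5 PPIs this just kicks the vCPU. */
  irq->ops->queue_irq_unlock(kvm, irq, flags);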
Prior to entering the guest, vgic_v5_flush_ppi_state is called from
kvm_vgic_flush_hwstate. This effectively snapshots the shadow PPI
pending state (twice - an entry and an exit copy) in order to track
any changes. These changes can come from a guest consuming an
interrupt or from a guest making an Edge-triggered interrupt pending.
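The entry copy is what makes change detection possible on the way
back out; schematically (simplified from vgic_v5_fold_ppi_state in
the diff):

  /* Anything differing between what was injected at entry and what is
   * read back at exit changed while the guest ran. */
  changed = (shadow_activer ^ activer_exit) |
  	    (pendr_entry ^ pendr_exit);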
When returning from running a guest, the guest's PPI state is merged
back into KVM's shadow state by vgic_v5_fold_ppi_state, called from
kvm_vgic_sync_hwstate. The Enable and Active state is synced back for
all PPIs, and the pending state is synced back for Edge PPIs (Level
is driven directly by the devices generating said levels). The
incoming pending state from the guest is OR'd with KVM's shadow state
to avoid losing any incoming interrupts.
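The merge on the return path therefore reduces to an OR under an
edge mask. Schematically (names simplified from the patch; the HMR
has a bit set for Level PPIs, so its inverse selects the Edge ones):

  /* OR in the exit pending state for Edge PPIs only, so that edges
   * which arrived while the guest ran are never lost. */
  cpu_if->vgic_ppi_pendr[reg] |= pendr_exit[reg] & ~hmr[reg];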
Signed-off-by: Sascha Bischoff <sascha.bischoff at arm.com>
---
arch/arm64/kvm/vgic/vgic-v5.c | 160 ++++++++++++++++++++++++++++++++++
arch/arm64/kvm/vgic/vgic.c | 40 +++++++--
arch/arm64/kvm/vgic/vgic.h | 25 ++++--
3 files changed, 209 insertions(+), 16 deletions(-)
diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c
index bf2c77bafa1d3..c1899add8f5c3 100644
--- a/arch/arm64/kvm/vgic/vgic-v5.c
+++ b/arch/arm64/kvm/vgic/vgic-v5.c
@@ -139,6 +139,166 @@ void vgic_v5_get_implemented_ppis(void)
ppi_caps->impl_ppi_mask[0] |= BIT_ULL(GICV5_ARCH_PPI_PMUIRQ);
}
+static bool vgic_v5_ppi_set_pending_state(struct kvm_vcpu *vcpu,
+ struct vgic_irq *irq)
+{
+ struct vgic_v5_cpu_if *cpu_if;
+ unsigned long *p;
+ u32 id;
+
+ if (!vcpu || !irq)
+ return false;
+
+ id = FIELD_GET(GICV5_HWIRQ_ID, irq->intid);
+
+ /*
+ * For DVI'd interrupts, the state is directly driven by the host
+ * hardware connected to the interrupt line. There is nothing for us to
+ * do here. Moreover, this is just broken!
+ */
+ if (WARN_ON(irq->directly_injected))
+ return true;
+
+ cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
+
+ p = (unsigned long *)&cpu_if->vgic_ppi_pendr[id / 64];
+ __assign_bit(id % 64, p, irq_is_pending(irq));
+
+ return true;
+}
+
+/*
+ * For GICv5, the PPIs are mostly directly managed by the hardware. We (the
+ * hypervisor) handle the pending, active, enable state save/restore, but don't
+ * need the PPIs to be queued on a per-VCPU AP list. Therefore, sanity check the
+ * state, unlock, and return.
+ */
+static bool vgic_v5_ppi_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
+ unsigned long flags)
+ __releases(&irq->irq_lock)
+{
+ struct kvm_vcpu *vcpu;
+
+ lockdep_assert_held(&irq->irq_lock);
+
+ if (WARN_ON_ONCE(!__irq_is_ppi(KVM_DEV_TYPE_ARM_VGIC_V5, irq->intid)))
+ goto out_unlock_fail;
+
+ vcpu = irq->target_vcpu;
+ if (WARN_ON_ONCE(!vcpu))
+ goto out_unlock_fail;
+
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+ /* Directly kick the target VCPU to make sure it sees the IRQ */
+ kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
+ kvm_vcpu_kick(vcpu);
+
+ return true;
+
+out_unlock_fail:
+ raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
+
+ return false;
+}
+
+static struct irq_ops vgic_v5_ppi_irq_ops = {
+ .set_pending_state = vgic_v5_ppi_set_pending_state,
+ .queue_irq_unlock = vgic_v5_ppi_queue_irq_unlock,
+};
+
+void vgic_v5_set_ppi_ops(struct vgic_irq *irq)
+{
+ if (WARN_ON(!irq))
+ return;
+
+ guard(raw_spinlock_irqsave)(&irq->irq_lock);
+
+ if (!WARN_ON(irq->ops))
+ irq->ops = &vgic_v5_ppi_irq_ops;
+}
+
+/*
+ * Detect any PPI state changes, and propagate the state into KVM's
+ * shadow structures.
+ */
+void vgic_v5_fold_ppi_state(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
+
+ for (int reg = 0; reg < 2; reg++) {
+ const u64 activer = host_data_ptr(vgic_v5_ppi_state)->activer_exit[reg];
+ const u64 pendr = host_data_ptr(vgic_v5_ppi_state)->pendr_exit[reg];
+ unsigned long changed_bits;
+ int i;
+
+ /*
+ * Track what changed across activer, pendr, but mask with
+ * ~DVI.
+ */
+ changed_bits = cpu_if->vgic_ppi_activer[reg] ^ activer;
+ changed_bits |= host_data_ptr(vgic_v5_ppi_state)->pendr_entry[reg] ^ pendr;
+ changed_bits &= ~cpu_if->vgic_ppi_dvir[reg];
+
+ for_each_set_bit(i, &changed_bits, 64) {
+ struct vgic_irq *irq;
+ u32 intid;
+
+ intid = FIELD_PREP(GICV5_HWIRQ_TYPE, GICV5_HWIRQ_TYPE_PPI);
+ intid |= FIELD_PREP(GICV5_HWIRQ_ID, reg * 64 + i);
+
+ irq = vgic_get_vcpu_irq(vcpu, intid);
+
+ scoped_guard(raw_spinlock_irqsave, &irq->irq_lock) {
+ irq->active = !!(activer & BIT(i));
+
+ /*
+ * This is an OR to avoid losing incoming
+ * edges!
+ */
+ if (irq->config == VGIC_CONFIG_EDGE)
+ irq->pending_latch |= !!(pendr & BIT(i));
+ }
+
+ vgic_put_irq(vcpu->kvm, irq);
+ }
+
+ /* Re-inject the exit state as entry state next time! */
+ cpu_if->vgic_ppi_activer[reg] = activer;
+
+ /*
+ * Pending state is a bit different. We only propagate back
+ * pending state for Edge interrupts. Moreover, this is OR'd
+ * with the incoming state to make sure we don't lose incoming
+ * edges. Use the (inverse) HMR to mask off all Level bits, and
+ * OR.
+ */
+ cpu_if->vgic_ppi_pendr[reg] |=
+ pendr & ~vcpu->kvm->arch.vgic.gicv5_vm.vgic_ppi_hmr[reg];
+ }
+}
+
+void vgic_v5_flush_ppi_state(struct kvm_vcpu *vcpu)
+{
+ struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
+
+ /*
+ * We're about to enter the guest. Copy the shadow state to the pending
+ * reg that will be written to the ICH_PPI_PENDRx_EL2 regs. While the
+ * guest is running we track any incoming changes to the pending state in
+ * vgic_ppi_pendr. The incoming changes are merged with the outgoing
+ * changes on the return path.
+ */
+ host_data_ptr(vgic_v5_ppi_state)->pendr_entry[0] = cpu_if->vgic_ppi_pendr[0];
+ host_data_ptr(vgic_v5_ppi_state)->pendr_entry[1] = cpu_if->vgic_ppi_pendr[1];
+
+ /*
+ * Make sure that we can correctly detect "edges" in the PPI
+ * state. There's a path where we never actually enter the guest, and
+ * failure to do this risks losing pending state.
+ */
+ host_data_ptr(vgic_v5_ppi_state)->pendr_exit[0] = cpu_if->vgic_ppi_pendr[0];
+ host_data_ptr(vgic_v5_ppi_state)->pendr_exit[1] = cpu_if->vgic_ppi_pendr[1];
+}
+
/*
* Sets/clears the corresponding bit in the ICH_PPI_DVIR register.
*/
diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
index c465ff51cb073..1cdfa5224ead5 100644
--- a/arch/arm64/kvm/vgic/vgic.c
+++ b/arch/arm64/kvm/vgic/vgic.c
@@ -105,6 +105,18 @@ struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid)
if (WARN_ON(!vcpu))
return NULL;
+ if (vgic_is_v5(vcpu->kvm)) {
+ u32 int_num;
+
+ if (!__irq_is_ppi(KVM_DEV_TYPE_ARM_VGIC_V5, intid))
+ return NULL;
+
+ int_num = FIELD_GET(GICV5_HWIRQ_ID, intid);
+ int_num = array_index_nospec(int_num, VGIC_V5_NR_PRIVATE_IRQS);
+
+ return &vcpu->arch.vgic_cpu.private_irqs[int_num];
+ }
+
/* SGIs and PPIs */
if (intid < VGIC_NR_PRIVATE_IRQS) {
intid = array_index_nospec(intid, VGIC_NR_PRIVATE_IRQS);
@@ -828,9 +840,11 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
vgic_release_deleted_lpis(vcpu->kvm);
}
-static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu)
+static void vgic_fold_state(struct kvm_vcpu *vcpu)
{
- if (kvm_vgic_global_state.type == VGIC_V2)
+ if (vgic_is_v5(vcpu->kvm))
+ vgic_v5_fold_ppi_state(vcpu);
+ else if (kvm_vgic_global_state.type == VGIC_V2)
vgic_v2_fold_lr_state(vcpu);
else
vgic_v3_fold_lr_state(vcpu);
@@ -1037,8 +1051,10 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
if (can_access_vgic_from_kernel())
vgic_save_state(vcpu);
- vgic_fold_lr_state(vcpu);
- vgic_prune_ap_list(vcpu);
+ vgic_fold_state(vcpu);
+
+ if (!vgic_is_v5(vcpu->kvm))
+ vgic_prune_ap_list(vcpu);
}
/* Sync interrupts that were deactivated through a DIR trap */
@@ -1062,6 +1078,17 @@ static inline void vgic_restore_state(struct kvm_vcpu *vcpu)
__vgic_v3_restore_state(&vcpu->arch.vgic_cpu.vgic_v3);
}
+static void vgic_flush_state(struct kvm_vcpu *vcpu)
+{
+ if (vgic_is_v5(vcpu->kvm)) {
+ vgic_v5_flush_ppi_state(vcpu);
+ return;
+ }
+
+ scoped_guard(raw_spinlock, &vcpu->arch.vgic_cpu.ap_list_lock)
+ vgic_flush_lr_state(vcpu);
+}
+
/* Flush our emulation state into the GIC hardware before entering the guest. */
void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
{
@@ -1098,13 +1125,12 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
- scoped_guard(raw_spinlock, &vcpu->arch.vgic_cpu.ap_list_lock)
- vgic_flush_lr_state(vcpu);
+ vgic_flush_state(vcpu);
if (can_access_vgic_from_kernel())
vgic_restore_state(vcpu);
- if (vgic_supports_direct_irqs(vcpu->kvm))
+ if (vgic_supports_direct_irqs(vcpu->kvm) && kvm_vgic_global_state.has_gicv4)
vgic_v4_commit(vcpu);
}
diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
index d5d9264f0a1e9..c8f538e65303f 100644
--- a/arch/arm64/kvm/vgic/vgic.h
+++ b/arch/arm64/kvm/vgic/vgic.h
@@ -364,7 +364,10 @@ void vgic_debug_destroy(struct kvm *kvm);
int vgic_v5_probe(const struct gic_kvm_info *info);
void vgic_v5_get_implemented_ppis(void);
+void vgic_v5_set_ppi_ops(struct vgic_irq *irq);
int vgic_v5_set_ppi_dvi(struct kvm_vcpu *vcpu, u32 irq, bool dvi);
+void vgic_v5_flush_ppi_state(struct kvm_vcpu *vcpu);
+void vgic_v5_fold_ppi_state(struct kvm_vcpu *vcpu);
void vgic_v5_load(struct kvm_vcpu *vcpu);
void vgic_v5_put(struct kvm_vcpu *vcpu);
void vgic_v5_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
@@ -433,15 +436,6 @@ void vgic_its_invalidate_all_caches(struct kvm *kvm);
int vgic_its_inv_lpi(struct kvm *kvm, struct vgic_irq *irq);
int vgic_its_invall(struct kvm_vcpu *vcpu);
-bool system_supports_direct_sgis(void);
-bool vgic_supports_direct_msis(struct kvm *kvm);
-bool vgic_supports_direct_sgis(struct kvm *kvm);
-
-static inline bool vgic_supports_direct_irqs(struct kvm *kvm)
-{
- return vgic_supports_direct_msis(kvm) || vgic_supports_direct_sgis(kvm);
-}
-
int vgic_v4_init(struct kvm *kvm);
void vgic_v4_teardown(struct kvm *kvm);
void vgic_v4_configure_vsgis(struct kvm *kvm);
@@ -481,6 +475,19 @@ static inline bool vgic_is_v3(struct kvm *kvm)
return kvm_vgic_global_state.type == VGIC_V3 || vgic_is_v3_compat(kvm);
}
+bool system_supports_direct_sgis(void);
+bool vgic_supports_direct_msis(struct kvm *kvm);
+bool vgic_supports_direct_sgis(struct kvm *kvm);
+
+static inline bool vgic_supports_direct_irqs(struct kvm *kvm)
+{
+ /* GICv5 always supports direct IRQs */
+ if (vgic_is_v5(kvm))
+ return true;
+
+ return vgic_supports_direct_msis(kvm) || vgic_supports_direct_sgis(kvm);
+}
+
int vgic_its_debug_init(struct kvm_device *dev);
void vgic_its_debug_destroy(struct kvm_device *dev);
--
2.34.1