[PATCH 17/32] KVM: arm64: gic-v5: Implement PPI interrupt injection
Marc Zyngier
maz at kernel.org
Wed Dec 17 02:33:00 PST 2025
On Fri, 12 Dec 2025 15:22:41 +0000,
Sascha Bischoff <Sascha.Bischoff at arm.com> wrote:
>
> This change introduces interrupt injection for PPIs for GICv5-based
> guests.
>
> The lifecycle of PPIs is largely managed by the hardware for a GICv5
> system. The hypervisor injects pending state into the guest by using
> the ICH_PPI_PENDRx_EL2 registers. These are used by the hardware to
> pick a Highest Priority Pending Interrupt (HPPI) for the guest based
> on the enable state of each individual interrupt. The enable state and
> priority for each interrupt are provided by the guest itself (through
> writes to the PPI registers).
>
> When Direct Virtual Interrupt (DVI) is set for a particular PPI, the
> hypervisor is even able to skip the injection of the pending state
> altogether - it all happens in hardware.
>
> The result of the above is that no AP lists are required for GICv5,
> unlike for older GICs. Instead, for PPIs the ICH_PPI_* registers
> fulfil the same purpose for all 128 PPIs. Hence, as long as the
> ICH_PPI_* registers are populated prior to guest entry, and merged
> back into the KVM shadow state on exit, the PPI state is preserved,
> and interrupts can be injected.
>
> When injecting the state of a PPI, the state is merged into KVM's
> shadow state using the set_pending_state irq_op. This directly sets
> the relevant bit in the shadow ICH_PPI_PENDRx_EL2, which is presented
> to the guest (and GICv5 hardware) on next guest entry. The
> queue_irq_unlock irq_op is required to kick the vCPU to ensure that it
> sees the new state. The result is that no AP lists are used for
> private interrupts on GICv5.
>
> Prior to entering the guest, vgic_v5_flush_ppi_state is called from
> kvm_vgic_flush_hwstate. This effectively snapshots the shadow PPI
> pending state (twice - an entry and an exit copy) in order to track
> any changes. These changes can come from a guest consuming an
> interrupt or from a guest making an Edge-triggered interrupt pending.
>
> When returning from running a guest, the guest's PPI state is merged
> back into KVM's shadow state in vgic_v5_merge_ppi_state from
> kvm_vgic_sync_hwstate. The Enable and Active state is synced back for
> all PPIs, and the pending state is synced back for Edge PPIs (Level is
> driven directly by the devices generating said levels). The incoming
> pending state from the guest is merged with KVM's shadow state to
> avoid losing any incoming interrupts.
>
> Signed-off-by: Sascha Bischoff <sascha.bischoff at arm.com>
> ---
> arch/arm64/kvm/vgic/vgic-v5.c | 157 ++++++++++++++++++++++++++++++++++
> arch/arm64/kvm/vgic/vgic.c | 35 ++++++--
> arch/arm64/kvm/vgic/vgic.h | 49 ++++++++---
> include/kvm/arm_vgic.h | 3 +
> 4 files changed, 226 insertions(+), 18 deletions(-)
>
> diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c
> index 22558080711eb..d54595fbf4586 100644
> --- a/arch/arm64/kvm/vgic/vgic-v5.c
> +++ b/arch/arm64/kvm/vgic/vgic-v5.c
> @@ -54,6 +54,163 @@ int vgic_v5_probe(const struct gic_kvm_info *info)
> return 0;
> }
>
> +static bool vgic_v5_ppi_set_pending_state(struct kvm_vcpu *vcpu,
> + struct vgic_irq *irq)
> +{
> + struct vgic_v5_cpu_if *cpu_if;
> + const u32 id_bit = BIT_ULL(irq->intid % 64);
Try that with intid==32... BIT_ULL(32) doesn't fit in a u32, so id_bit
silently truncates to 0 (it wants to be a u64).
> + const u32 reg = FIELD_GET(GICV5_HWIRQ_ID, irq->intid) / 64;
> +
> + if (!vcpu || !irq)
> + return false;
> +
> + /* Skip injecting the state altogether */
> + if (irq->directly_injected)
> + return true;
It is unclear to me under which circumstances we could end up here.
Wouldn't that be a bug?
> +
> + cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
> +
> + if (irq_is_pending(irq))
> + cpu_if->vgic_ppi_pendr[reg] |= id_bit;
> + else
> + cpu_if->vgic_ppi_pendr[reg] &= ~id_bit;
> +
> + return true;
> +}
> +
> +/*
> + * For GICv5, the PPIs are mostly directly managed by the hardware. We
> + * (the hypervisor) handle the pending, active, enable state
> + * save/restore, but don't need the PPIs to be queued on a per-VCPU AP
> + * list. Therefore, sanity check the state, unlock, and return.
> + */
> +static bool vgic_v5_ppi_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
> + unsigned long flags)
> + __releases(&irq->irq_lock)
> +{
> + struct kvm_vcpu *vcpu;
> +
> + lockdep_assert_held(&irq->irq_lock);
> +
> + if (WARN_ON_ONCE(!irq_is_ppi_v5(irq->intid)))
> + return false;
> +
> + vcpu = irq->target_vcpu;
> + if (WARN_ON_ONCE(!vcpu))
> + return false;
Errr... You're returning with the lock held, making a bad bug even
worse!
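Note that the irq_is_ppi_v5() check above has exactly the same
problem. Both error paths need to drop the lock first, something like
(untested):

	vcpu = irq->target_vcpu;
	if (WARN_ON_ONCE(!irq_is_ppi_v5(irq->intid) || !vcpu)) {
		raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
		return false;
	}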
> +
> + raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
> +
> + /* Directly kick the target VCPU to make sure it sees the IRQ */
> + kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
> + kvm_vcpu_kick(vcpu);
> +
> + return true;
> +}
> +
> +static struct irq_ops vgic_v5_ppi_irq_ops = {
> + .set_pending_state = vgic_v5_ppi_set_pending_state,
> + .queue_irq_unlock = vgic_v5_ppi_queue_irq_unlock,
> +};
> +
> +void vgic_v5_set_ppi_ops(struct vgic_irq *irq)
> +{
> + if (WARN_ON(!irq) || WARN_ON(irq->ops))
> + return;
> +
> + irq->ops = &vgic_v5_ppi_irq_ops;
Is there any locking requirement here?
> +}
> +
> +/*
> + * Detect any PPI state changes, and propagate the state to KVM's
> + * shadow structures.
> + */
> +static void vgic_v5_merge_ppi_state(struct kvm_vcpu *vcpu)
Since this is only called from vgic_v5_fold_irq_state(), do yourself a
favour and call it that.
> +{
> + struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
> + unsigned long flags;
> + int i, reg;
> +
> + for (reg = 0; reg < 2; reg++) {
> + unsigned long changed_bits;
> + const unsigned long enabler = cpu_if->vgic_ich_ppi_enabler_exit[reg];
> + const unsigned long activer = cpu_if->vgic_ppi_activer_exit[reg];
> + const unsigned long pendr = cpu_if->vgic_ppi_pendr_exit[reg];
> +
> + /*
> + * Track what changed across enabler, activer, pendr, but mask
> + * with ~DVI.
> + */
> + changed_bits = cpu_if->vgic_ich_ppi_enabler_entry[reg] ^ enabler;
> + changed_bits |= cpu_if->vgic_ppi_activer_entry[reg] ^ activer;
> + changed_bits |= cpu_if->vgic_ppi_pendr_entry[reg] ^ pendr;
> + changed_bits &= ~cpu_if->vgic_ppi_dvir[reg];
> +
> + for_each_set_bit(i, &changed_bits, 64) {
> + struct vgic_irq *irq;
> + u32 intid;
> +
> + intid = FIELD_PREP(GICV5_HWIRQ_TYPE, GICV5_HWIRQ_TYPE_PPI);
> + intid |= FIELD_PREP(GICV5_HWIRQ_ID, reg * 64 + i);
> +
> + irq = vgic_get_vcpu_irq(vcpu, intid);
> +
> + raw_spin_lock_irqsave(&irq->irq_lock, flags);
Consider using a scoped_guard() for this.
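Something along the lines of (untested):

		scoped_guard(raw_spinlock_irqsave, &irq->irq_lock) {
			irq->enabled = !!(enabler & BIT(i));
			irq->active = !!(activer & BIT(i));
			/* OR to avoid losing incoming edges */
			if (irq->config == VGIC_CONFIG_EDGE)
				irq->pending_latch |= !!(pendr & BIT(i));
		}

which also lets you drop the 'flags' local.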
> + irq->enabled = !!(enabler & BIT(i));
> + irq->active = !!(activer & BIT(i));
> + /* This is an OR to avoid losing incoming edges! */
> + if (irq->config == VGIC_CONFIG_EDGE)
> + irq->pending_latch |= !!(pendr & BIT(i));
> + raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
> +
> + vgic_put_irq(vcpu->kvm, irq);
> + }
> +
> + /* Re-inject the exit state as entry state next time! */
> + cpu_if->vgic_ich_ppi_enabler_entry[reg] = enabler;
> + cpu_if->vgic_ppi_activer_entry[reg] = activer;
> +
> + /*
> + * Pending state is a bit different. We only propagate back
> + * pending state for Edge interrupts. Moreover, this is OR'd
> + * with the incoming state to make sure we don't lose incoming
> + * edges. Use the (inverse) HMR to mask off all Level bits, and
> + * OR.
> + */
> + cpu_if->vgic_ppi_pendr[reg] |= pendr & ~cpu_if->vgic_ppi_hmr[reg];
> + }
> +}
> +
> +void vgic_v5_flush_ppi_state(struct kvm_vcpu *vcpu)
> +{
> + struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
> +
> + /*
> + * We're about to enter the guest. Copy the shadow state to the pending
> + * reg that will be written to the ICH_PPI_PENDRx_EL2 regs. While the
> + * guest is running we track any incoming changes to the pending state in
> + * vgic_ppi_pendr. The incoming changes are merged with the outgoing
> + * changes on the return path.
> + */
> + cpu_if->vgic_ppi_pendr_entry[0] = cpu_if->vgic_ppi_pendr[0];
> + cpu_if->vgic_ppi_pendr_entry[1] = cpu_if->vgic_ppi_pendr[1];
> +
> + /*
> + * Make sure that we can correctly detect "edges" in the PPI
> + * state. There's a path where we never actually enter the guest, and
> + * failure to do this risks losing pending state
> + */
> + cpu_if->vgic_ppi_pendr_exit[0] = cpu_if->vgic_ppi_pendr[0];
> + cpu_if->vgic_ppi_pendr_exit[1] = cpu_if->vgic_ppi_pendr[1];
I find it a bit objectionable that this entry/exit stuff is kept on a
per vcpu basis. Given that we cannot be preempted between flush and
fold, the intermediate bookkeeping should probably live in a per-CPU
structure.
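Something like (a rough sketch, names invented):

	struct vgic_v5_ppi_snapshot {
		u64	pendr[2];
		u64	enabler[2];
		u64	activer[2];
	};

	static DEFINE_PER_CPU(struct vgic_v5_ppi_snapshot, vgic_v5_ppi_snap);

with flush and fold operating on this_cpu_ptr(&vgic_v5_ppi_snap)
instead of carrying the entry/exit copies in each vcpu.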
> +
> +}
> +
> +void vgic_v5_fold_irq_state(struct kvm_vcpu *vcpu)
> +{
> + /* Sync back the guest PPI state to the KVM shadow state */
> + vgic_v5_merge_ppi_state(vcpu);
> +}
> +
> /*
> * Sets/clears the corresponding bit in the ICH_PPI_DVIR register.
> */
> diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
> index fc01c6d07fe62..e534876656ca7 100644
> --- a/arch/arm64/kvm/vgic/vgic.c
> +++ b/arch/arm64/kvm/vgic/vgic.c
> @@ -105,6 +105,15 @@ struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid)
> if (WARN_ON(!vcpu))
> return NULL;
>
> + if (vcpu->kvm->arch.vgic.vgic_model == KVM_DEV_TYPE_ARM_VGIC_V5) {
> + u32 int_num = FIELD_GET(GICV5_HWIRQ_ID, intid);
> +
> + if (irq_is_ppi_v5(intid)) {
> + int_num = array_index_nospec(int_num, VGIC_V5_NR_PRIVATE_IRQS);
> + return &vcpu->arch.vgic_cpu.private_irqs[int_num];
> + }
> + }
> +
> /* SGIs and PPIs */
> if (intid < VGIC_NR_PRIVATE_IRQS) {
> intid = array_index_nospec(intid, VGIC_NR_PRIVATE_IRQS);
> @@ -258,10 +267,12 @@ struct kvm_vcpu *vgic_target_oracle(struct vgic_irq *irq)
> * If the distributor is disabled, pending interrupts shouldn't be
> * forwarded.
> */
> - if (irq->enabled && irq_is_pending(irq)) {
> - if (unlikely(irq->target_vcpu &&
> - !irq->target_vcpu->kvm->arch.vgic.enabled))
> - return NULL;
> + if (irq_is_enabled(irq) && irq_is_pending(irq)) {
> + if (irq->target_vcpu) {
> + if (!vgic_is_v5(irq->target_vcpu->kvm) &&
> + unlikely(!irq->target_vcpu->kvm->arch.vgic.enabled))
> + return NULL;
> + }
>
> return irq->target_vcpu;
> }
> @@ -1044,7 +1055,11 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
> if (can_access_vgic_from_kernel())
> vgic_save_state(vcpu);
>
> - vgic_fold_lr_state(vcpu);
> + if (!vgic_is_v5(vcpu->kvm))
> + vgic_fold_lr_state(vcpu);
> + else
> + vgic_v5_fold_irq_state(vcpu);
> +
This is gross. We already have this:
static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu)
{
	if (kvm_vgic_global_state.type == VGIC_V2)
		vgic_v2_fold_lr_state(vcpu);
	else
		vgic_v3_fold_lr_state(vcpu);
}
Rename it to vgic_fold_state(), drop the inline, make it a switch(),
and hook the v5 stuff in it.
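Something like this (untested; the v3-compat case still needs to take
the v3 path):

	static void vgic_fold_state(struct kvm_vcpu *vcpu)
	{
		switch (kvm_vgic_global_state.type) {
		case VGIC_V2:
			vgic_v2_fold_lr_state(vcpu);
			break;
		case VGIC_V3:
			vgic_v3_fold_lr_state(vcpu);
			break;
		case VGIC_V5:
			if (vgic_is_v5(vcpu->kvm))
				vgic_v5_fold_irq_state(vcpu);
			else
				vgic_v3_fold_lr_state(vcpu);
			break;
		}
	}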
> vgic_prune_ap_list(vcpu);
Are you actually pruning the ap_list() on v5? What is there in the first place?
> }
>
> @@ -1105,13 +1120,17 @@ void kvm_vgic_flush_hwstate(struct kvm_vcpu *vcpu)
>
> DEBUG_SPINLOCK_BUG_ON(!irqs_disabled());
>
> - scoped_guard(raw_spinlock, &vcpu->arch.vgic_cpu.ap_list_lock)
> - vgic_flush_lr_state(vcpu);
> + if (!vgic_is_v5(vcpu->kvm)) {
> + scoped_guard(raw_spinlock, &vcpu->arch.vgic_cpu.ap_list_lock)
> + vgic_flush_lr_state(vcpu);
> + } else {
> + vgic_v5_flush_ppi_state(vcpu);
> + }
Same thing here. Move everything into the common helper, including the
lock management.
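i.e. a vgic_flush_state() counterpart, something like (untested):

	static void vgic_flush_state(struct kvm_vcpu *vcpu)
	{
		if (vgic_is_v5(vcpu->kvm)) {
			vgic_v5_flush_ppi_state(vcpu);
			return;
		}

		scoped_guard(raw_spinlock, &vcpu->arch.vgic_cpu.ap_list_lock)
			vgic_flush_lr_state(vcpu);
	}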
>
> if (can_access_vgic_from_kernel())
> vgic_restore_state(vcpu);
>
> - if (vgic_supports_direct_irqs(vcpu->kvm))
> + if (vgic_supports_direct_irqs(vcpu->kvm) && !vgic_is_v5(vcpu->kvm))
> vgic_v4_commit(vcpu);
nit: swap the two terms of the conjunction, making it more readable
(for me...). Or move the conditions into vgic_v4_commit().
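i.e. simply:

	if (!vgic_is_v5(vcpu->kvm) && vgic_supports_direct_irqs(vcpu->kvm))
		vgic_v4_commit(vcpu);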
> }
>
> diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h
> index b6e3f5e3aba18..5a77318ddb87a 100644
> --- a/arch/arm64/kvm/vgic/vgic.h
> +++ b/arch/arm64/kvm/vgic/vgic.h
> @@ -132,6 +132,28 @@ static inline bool irq_is_pending(struct vgic_irq *irq)
> return irq->pending_latch || irq->line_level;
> }
>
> +/* Requires the irq_lock to be held by the caller. */
> +static inline bool irq_is_enabled(struct vgic_irq *irq)
> +{
> + if (irq->enabled)
> + return true;
> +
> + /*
> + * We always consider GICv5 interrupts as enabled as we can
> + * always inject them. The state is handled by the hardware,
> + * and the hardware will only signal the interrupt to the
> + * guest once the guest enables it.
> + */
> + if (irq->target_vcpu) {
Under which circumstances is target_vcpu NULL for PPIs?
> + u32 vgic_model = irq->target_vcpu->kvm->arch.vgic.vgic_model;
> +
> + if (vgic_model == KVM_DEV_TYPE_ARM_VGIC_V5)
> + return true;
> + }
> +
> + return false;
> +}
> +
> static inline bool vgic_irq_is_mapped_level(struct vgic_irq *irq)
> {
> return irq->config == VGIC_CONFIG_LEVEL && irq->hw;
> @@ -306,7 +328,7 @@ static inline bool vgic_try_get_irq_ref(struct vgic_irq *irq)
> if (!irq)
> return false;
>
> - if (irq->intid < VGIC_MIN_LPI)
> + if (irq->target_vcpu && !irq_is_lpi(irq->target_vcpu->kvm, irq->intid))
> return true;
This change is rather obscure, and doesn't do what you think it does.
What has target_vcpu to do with anything? For example, a GICv3 SPI
that has its target_vcpu set to NULL (just point its IROUTER register
to a non-existent vcpu) would end-up using the refcount path,
something that isn't expected at all.
You should make clear what the refcounting rules are for GICv5, and
use that, instead of using odd side-effects.
>
> return refcount_inc_not_zero(&irq->refcount);
> @@ -363,7 +385,10 @@ void vgic_debug_init(struct kvm *kvm);
> void vgic_debug_destroy(struct kvm *kvm);
>
> int vgic_v5_probe(const struct gic_kvm_info *info);
> +void vgic_v5_set_ppi_ops(struct vgic_irq *irq);
> int vgic_v5_set_ppi_dvi(struct kvm_vcpu *vcpu, u32 irq, bool dvi);
> +void vgic_v5_flush_ppi_state(struct kvm_vcpu *vcpu);
> +void vgic_v5_fold_irq_state(struct kvm_vcpu *vcpu);
> void vgic_v5_load(struct kvm_vcpu *vcpu);
> void vgic_v5_put(struct kvm_vcpu *vcpu);
> void vgic_v5_set_vmcr(struct kvm_vcpu *vcpu, struct vgic_vmcr *vmcr);
> @@ -432,15 +457,6 @@ void vgic_its_invalidate_all_caches(struct kvm *kvm);
> int vgic_its_inv_lpi(struct kvm *kvm, struct vgic_irq *irq);
> int vgic_its_invall(struct kvm_vcpu *vcpu);
>
> -bool system_supports_direct_sgis(void);
> -bool vgic_supports_direct_msis(struct kvm *kvm);
> -bool vgic_supports_direct_sgis(struct kvm *kvm);
> -
> -static inline bool vgic_supports_direct_irqs(struct kvm *kvm)
> -{
> - return vgic_supports_direct_msis(kvm) || vgic_supports_direct_sgis(kvm);
> -}
> -
> int vgic_v4_init(struct kvm *kvm);
> void vgic_v4_teardown(struct kvm *kvm);
> void vgic_v4_configure_vsgis(struct kvm *kvm);
> @@ -485,6 +501,19 @@ static inline bool vgic_is_v5(struct kvm *kvm)
> return kvm_vgic_global_state.type == VGIC_V5 && !vgic_is_v3_compat(kvm);
> }
>
> +bool system_supports_direct_sgis(void);
> +bool vgic_supports_direct_msis(struct kvm *kvm);
> +bool vgic_supports_direct_sgis(struct kvm *kvm);
> +
> +static inline bool vgic_supports_direct_irqs(struct kvm *kvm)
> +{
> + /* GICv5 always supports direct IRQs */
> + if (vgic_is_v5(kvm))
> + return true;
> +
> + return vgic_supports_direct_msis(kvm) || vgic_supports_direct_sgis(kvm);
> +}
> +
> int vgic_its_debug_init(struct kvm_device *dev);
> void vgic_its_debug_destroy(struct kvm_device *dev);
>
> diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
> index 20c908730fa00..5a46fe3c35b5c 100644
> --- a/include/kvm/arm_vgic.h
> +++ b/include/kvm/arm_vgic.h
> @@ -32,6 +32,9 @@
> #define VGIC_MIN_LPI 8192
> #define KVM_IRQCHIP_NUM_PINS (1020 - 32)
>
> +/* GICv5 constants */
> +#define VGIC_V5_NR_PRIVATE_IRQS 128
> +
> #define irq_is_ppi_legacy(irq) ((irq) >= VGIC_NR_SGIS && (irq) < VGIC_NR_PRIVATE_IRQS)
> #define irq_is_spi_legacy(irq) ((irq) >= VGIC_NR_PRIVATE_IRQS && \
> (irq) <= VGIC_MAX_SPI)
Thanks,
M.
--
Without deviation from the norm, progress is not possible.