[PATCH v5 19/36] KVM: arm64: gic-v5: Implement PPI interrupt injection

Marc Zyngier maz at kernel.org
Wed Mar 4 05:08:33 PST 2026


On Thu, 26 Feb 2026 16:00:21 +0000,
Sascha Bischoff <Sascha.Bischoff at arm.com> wrote:
> 
> This change introduces interrupt injection for PPIs for GICv5-based
> guests.
> 
> The lifecycle of PPIs is largely managed by the hardware for a GICv5
> system. The hypervisor injects pending state into the guest by using
> the ICH_PPI_PENDRx_EL2 registers. These are used by the hardware to
> pick a Highest Priority Pending Interrupt (HPPI) for the guest based
> on the enable state of each individual interrupt. The enable state and
> priority for each interrupt are provided by the guest itself (through
> writes to the PPI registers).
> 
> When Direct Virtual Interrupt (DVI) is set for a particular PPI, the
> hypervisor is even able to skip the injection of the pending state
> altogether - it all happens in hardware.
> 
> The result of the above is that no AP lists are required for GICv5,
> unlike for older GICs. Instead, for PPIs the ICH_PPI_* registers
> fulfil the same purpose for all 128 PPIs. Hence, as long as the
> ICH_PPI_* registers are populated prior to guest entry, and merged
> back into the KVM shadow state on exit, the PPI state is preserved,
> and interrupts can be injected.
> 
> When injecting the state of a PPI, the state is merged into the
> PPI-specific vgic_irq structure. The PPIs are made pending via the
> ICH_PPI_PENDRx_EL2 registers, the value of which is generated from the
> vgic_irq structures for each PPI exposed on guest entry. The
> queue_irq_unlock() irq_op is required to kick the vCPU to ensure that
> it sees the new state. The result is that no AP lists are used for
> private interrupts on GICv5.
> 
> Prior to entering the guest, vgic_v5_flush_ppi_state() is called from
> kvm_vgic_flush_hwstate(). This generates the pending state to inject
> into the guest, and snapshots it (twice - an entry and an exit copy)
> in order to track any changes. These changes can come from a guest
> consuming an interrupt or from a guest making an Edge-triggered
> interrupt pending.
> 
> When returning from running a guest, the guest's PPI state is merged
> back into KVM's vgic_irq state in vgic_v5_merge_ppi_state() from
> kvm_vgic_sync_hwstate(). The Enable and Active state is synced back for
> all PPIs, and the pending state is synced back for Edge PPIs (Level is
> driven directly by the devices generating said levels). The incoming
> pending state from the guest is merged with KVM's shadow state to
> avoid losing any incoming interrupts.
> 
> Signed-off-by: Sascha Bischoff <sascha.bischoff at arm.com>
> Reviewed-by: Jonathan Cameron <jonathan.cameron at huawei.com>
> ---
>  arch/arm64/kvm/vgic/vgic-v5.c | 160 ++++++++++++++++++++++++++++++++++
>  arch/arm64/kvm/vgic/vgic.c    |  40 +++++++--
>  arch/arm64/kvm/vgic/vgic.h    |  25 ++++--
>  3 files changed, 209 insertions(+), 16 deletions(-)
> 
> diff --git a/arch/arm64/kvm/vgic/vgic-v5.c b/arch/arm64/kvm/vgic/vgic-v5.c
> index db2225aefb130..a230c45db46ee 100644
> --- a/arch/arm64/kvm/vgic/vgic-v5.c
> +++ b/arch/arm64/kvm/vgic/vgic-v5.c
> @@ -132,6 +132,166 @@ int vgic_v5_finalize_ppi_state(struct kvm *kvm)
>  	return 0;
>  }
>  
> +/*
> + * For GICv5, the PPIs are mostly directly managed by the hardware. We (the
> + * hypervisor) handle the pending, active, enable state save/restore, but don't
> + * need the PPIs to be queued on a per-VCPU AP list. Therefore, sanity check the
> + * state, unlock, and return.
> + */
> +static bool vgic_v5_ppi_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq,
> +					 unsigned long flags)
> +	__releases(&irq->irq_lock)
> +{
> +	struct kvm_vcpu *vcpu;
> +
> +	lockdep_assert_held(&irq->irq_lock);
> +
> +	if (WARN_ON_ONCE(!__irq_is_ppi(KVM_DEV_TYPE_ARM_VGIC_V5, irq->intid)))
> +		goto out_unlock_fail;
> +
> +	vcpu = irq->target_vcpu;
> +	if (WARN_ON_ONCE(!vcpu))
> +		goto out_unlock_fail;
> +
> +	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
> +
> +	/* Directly kick the target VCPU to make sure it sees the IRQ */
> +	kvm_make_request(KVM_REQ_IRQ_PENDING, vcpu);
> +	kvm_vcpu_kick(vcpu);
> +
> +	return true;
> +
> +out_unlock_fail:
> +	raw_spin_unlock_irqrestore(&irq->irq_lock, flags);
> +
> +	return false;
> +}
> +
> +static struct irq_ops vgic_v5_ppi_irq_ops = {
> +	.queue_irq_unlock = vgic_v5_ppi_queue_irq_unlock,
> +};
> +
> +void vgic_v5_set_ppi_ops(struct vgic_irq *irq)
> +{
> +	if (WARN_ON(!irq))
> +		return;
> +
> +	guard(raw_spinlock_irqsave)(&irq->irq_lock);
> +
> +	if (!WARN_ON(irq->ops))
> +		irq->ops = &vgic_v5_ppi_irq_ops;
> +}
> +
> +/*
> + * Detect any PPI state changes, and propagate the state into KVM's
> + * shadow structures.
> + */
> +void vgic_v5_fold_ppi_state(struct kvm_vcpu *vcpu)
> +{
> +	struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
> +
> +	for (int reg = 0; reg < 2; reg++) {
> +		const u64 activer = host_data_ptr(vgic_v5_ppi_state)->activer_exit[reg];
> +		const u64 pendr = host_data_ptr(vgic_v5_ppi_state)->pendr_exit[reg];
> +		unsigned long changed_bits;
> +		int i;
> +
> +		/*
> +		 * Track what changed across activer, pendr, but mask with
> +		 * ~DVI.
> +		 */
> +		changed_bits = cpu_if->vgic_ppi_activer[reg] ^ activer;
> +		changed_bits |= host_data_ptr(vgic_v5_ppi_state)->pendr_entry[reg] ^ pendr;
> +		changed_bits &= ~cpu_if->vgic_ppi_dvir[reg];
> +
> +		for_each_set_bit(i, &changed_bits, 64) {
> +			struct vgic_irq *irq;
> +			u32 intid;
> +
> +			intid = FIELD_PREP(GICV5_HWIRQ_TYPE, GICV5_HWIRQ_TYPE_PPI);
> +			intid |= FIELD_PREP(GICV5_HWIRQ_ID, reg * 64 + i);
> +
> +			irq = vgic_get_vcpu_irq(vcpu, intid);
> +
> +			scoped_guard(raw_spinlock_irqsave, &irq->irq_lock) {
> +				irq->active = !!(activer & BIT(i));
> +
> +				/*
> +				 * This is an OR to avoid losing incoming
> +				 * edges!
> +				 */
> +				if (irq->config == VGIC_CONFIG_EDGE)
> +					irq->pending_latch |= !!(pendr & BIT(i));
> +			}
> +
> +			vgic_put_irq(vcpu->kvm, irq);
> +		}
> +
> +		/*
> +		 * Re-inject the exit state as entry state next time!
> +		 *
> +		 * Note that the write of the Enable state is trapped, and hence
> +		 * there is nothing to explicitly sync back here as we already
> +		 * have the latest copy by definition.
> +		 */
> +		cpu_if->vgic_ppi_activer[reg] = activer;
> +	}

I think this whole thing would benefit from using bitmap operations
rather than these nested loops. I wrote the following, which isn't
very nice either (too many casts), but could be improved by either
changing the underlying types to be actual bitmaps or using
bitmap_from_arr64()...

void vgic_v5_fold_ppi_state(struct kvm_vcpu *vcpu)
{
	struct vgic_v5_cpu_if *cpu_if = &vcpu->arch.vgic_cpu.vgic_v5;
	DECLARE_BITMAP(changed_pending, 128);
	DECLARE_BITMAP(changed_active, 128);
	DECLARE_BITMAP(changed_bits, 128);
	unsigned long *activer, *pendr;
	int i;

	activer = (unsigned long *)&host_data_ptr(vgic_v5_ppi_state)->activer_exit;
	pendr = (unsigned long *)&host_data_ptr(vgic_v5_ppi_state)->pendr_exit;

	bitmap_xor(changed_active, (unsigned long *)cpu_if->vgic_ppi_activer, activer, 128);
	bitmap_xor(changed_pending, (unsigned long *)host_data_ptr(vgic_v5_ppi_state)->pendr_entry, pendr, 128);
	bitmap_or(changed_bits, changed_active, changed_pending, 128);

	for_each_set_bit(i, changed_bits, 128) {
		struct vgic_irq *irq;
		bool active;
		u32 intid;

		intid = FIELD_PREP(GICV5_HWIRQ_TYPE, GICV5_HWIRQ_TYPE_PPI);
		intid |= FIELD_PREP(GICV5_HWIRQ_ID, i);

		irq = vgic_get_vcpu_irq(vcpu, intid);
		active = test_bit(i, activer);

		scoped_guard(raw_spinlock_irqsave, &irq->irq_lock) {
			irq->active = active;

			/*
			 * This is an OR to avoid losing incoming
			 * edges!
			 */
			if (irq->config == VGIC_CONFIG_EDGE)
				irq->pending_latch |= test_bit(i, pendr);
		}

		/*
		 * Re-inject the exit state as entry state next time!
		 *
		 * Note that the write of the Enable state is trapped, and
		 * hence there is nothing to explicitly sync back here as we
		 * already have the latest copy by definition.
		 */
		__assign_bit(i, (unsigned long *)cpu_if->vgic_ppi_activer, active);

		vgic_put_irq(vcpu->kvm, irq);
	}
}
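
To illustrate the bitmap_from_arr64() route (field names as in the patch,
the local bitmap names are invented), the conversion and comparison step
could look like this, keeping the per-bit loop as above:

	DECLARE_BITMAP(changed_active, 128);
	DECLARE_BITMAP(changed_pending, 128);
	DECLARE_BITMAP(changed_bits, 128);
	DECLARE_BITMAP(activer, 128);
	DECLARE_BITMAP(pendr, 128);
	DECLARE_BITMAP(pendr_entry, 128);
	DECLARE_BITMAP(shadow_activer, 128);
	DECLARE_BITMAP(dvir, 128);

	/* Pull the u64[2] register copies into real bitmaps - no casts */
	bitmap_from_arr64(activer, host_data_ptr(vgic_v5_ppi_state)->activer_exit, 128);
	bitmap_from_arr64(pendr, host_data_ptr(vgic_v5_ppi_state)->pendr_exit, 128);
	bitmap_from_arr64(pendr_entry, host_data_ptr(vgic_v5_ppi_state)->pendr_entry, 128);
	bitmap_from_arr64(shadow_activer, cpu_if->vgic_ppi_activer, 128);
	bitmap_from_arr64(dvir, cpu_if->vgic_ppi_dvir, 128);

	/* Track active/pending changes, keeping the ~DVI masking of the patch */
	bitmap_xor(changed_active, shadow_activer, activer, 128);
	bitmap_xor(changed_pending, pendr_entry, pendr, 128);
	bitmap_or(changed_bits, changed_active, changed_pending, 128);
	bitmap_andnot(changed_bits, changed_bits, dvir, 128);

	/* ... per-bit loop over changed_bits, as in the function above ... */

	/* Re-inject the exit Active state as the next entry state */
	bitmap_to_arr64(cpu_if->vgic_ppi_activer, activer, 128);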


> +}
> +
> +void vgic_v5_flush_ppi_state(struct kvm_vcpu *vcpu)
> +{
> +	unsigned long pendr[2];
> +
> +	/*
> +	 * Time to enter the guest - we first need to build the guest's
> +	 * ICC_PPI_PENDRx_EL1, however.
> +	 */
> +	pendr[0] = 0;
> +	pendr[1] = 0;
> +	for (int reg = 0; reg < 2; reg++) {
> +		u64 mask = vcpu->kvm->arch.vgic.gicv5_vm.vgic_ppi_mask[reg];
> +		unsigned long bm_p = 0;
> +		int i;
> +
> +		bitmap_from_arr64(&bm_p, &mask, 64);

Given that you are already converting a 64bit quantity, you could bite
the bullet and do all 128 bits at once.
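
Something along these lines, sketched with pendr turned into a u64[2] and
invented names for the 128-bit bitmaps, would build the whole pending
state in one pass:

	DECLARE_BITMAP(mask, 128);
	DECLARE_BITMAP(pend, 128);
	u64 pendr[2];
	int i;

	bitmap_from_arr64(mask, vcpu->kvm->arch.vgic.gicv5_vm.vgic_ppi_mask, 128);
	bitmap_zero(pend, 128);

	for_each_set_bit(i, mask, 128) {
		struct vgic_irq *irq;
		u32 intid;

		intid = FIELD_PREP(GICV5_HWIRQ_TYPE, GICV5_HWIRQ_TYPE_PPI);
		intid |= FIELD_PREP(GICV5_HWIRQ_ID, i);

		irq = vgic_get_vcpu_irq(vcpu, intid);

		scoped_guard(raw_spinlock_irqsave, &irq->irq_lock) {
			if (irq_is_pending(irq))
				__set_bit(i, pend);
		}

		vgic_put_irq(vcpu->kvm, irq);
	}

	/* Back to the two u64 words that feed ICH_PPI_PENDRx_EL2 */
	bitmap_to_arr64(pendr, pend, 128);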

> +
> +		for_each_set_bit(i, &bm_p, 64) {
> +			struct vgic_irq *irq;
> +			u32 intid;
> +
> +			intid = FIELD_PREP(GICV5_HWIRQ_TYPE, GICV5_HWIRQ_TYPE_PPI);
> +			intid |= FIELD_PREP(GICV5_HWIRQ_ID, reg * 64 + i);
> +
> +			irq = vgic_get_vcpu_irq(vcpu, intid);
> +
> +			scoped_guard(raw_spinlock_irqsave, &irq->irq_lock) {
> +				if (irq_is_pending(irq))
> +					__assign_bit(i % 64, &pendr[reg], 1);
> +			}
> +
> +			vgic_put_irq(vcpu->kvm, irq);
> +		}
> +	}
> +
> +	/*
> +	 * Copy the shadow state to the pending reg that will be written to the
> +	 * ICH_PPI_PENDRx_EL2 regs. While the guest is running we track any
> +	 * incoming changes to the pending state in the vgic_irq structures. The
> +	 * incoming changes are merged with the outgoing changes on the return
> +	 * path.
> +	 */
> +	host_data_ptr(vgic_v5_ppi_state)->pendr_entry[0] = pendr[0];
> +	host_data_ptr(vgic_v5_ppi_state)->pendr_entry[1] = pendr[1];
> +
> +	/*
> +	 * Make sure that we can correctly detect "edges" in the PPI
> +	 * state. There's a path where we never actually enter the guest, and
> +	 * failure to do this risks losing pending state.
> +	 */
> +	host_data_ptr(vgic_v5_ppi_state)->pendr_exit[0] = pendr[0];
> +	host_data_ptr(vgic_v5_ppi_state)->pendr_exit[1] = pendr[1];
> +}
> +
>  /*
>   * Sets/clears the corresponding bit in the ICH_PPI_DVIR register.
>   */
> diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c
> index 49d65e8cc742b..69bfa0f81624c 100644
> --- a/arch/arm64/kvm/vgic/vgic.c
> +++ b/arch/arm64/kvm/vgic/vgic.c
> @@ -105,6 +105,18 @@ struct vgic_irq *vgic_get_vcpu_irq(struct kvm_vcpu *vcpu, u32 intid)
>  	if (WARN_ON(!vcpu))
>  		return NULL;
>  
> +	if (vgic_is_v5(vcpu->kvm)) {
> +		u32 int_num, hwirq_id;
> +
> +		if (!__irq_is_ppi(KVM_DEV_TYPE_ARM_VGIC_V5, intid))
> +			return NULL;
> +
> +		hwirq_id = FIELD_GET(GICV5_HWIRQ_ID, intid);
> +		int_num = array_index_nospec(hwirq_id, VGIC_V5_NR_PRIVATE_IRQS);
> +
> +		return &vcpu->arch.vgic_cpu.private_irqs[int_num];
> +	}
> +
>  	/* SGIs and PPIs */
>  	if (intid < VGIC_NR_PRIVATE_IRQS) {
>  		intid = array_index_nospec(intid, VGIC_NR_PRIVATE_IRQS);
> @@ -825,9 +837,11 @@ static void vgic_prune_ap_list(struct kvm_vcpu *vcpu)
>  		vgic_release_deleted_lpis(vcpu->kvm);
>  }
>  
> -static inline void vgic_fold_lr_state(struct kvm_vcpu *vcpu)
> +static void vgic_fold_state(struct kvm_vcpu *vcpu)
>  {
> -	if (kvm_vgic_global_state.type == VGIC_V2)
> +	if (vgic_is_v5(vcpu->kvm))
> +		vgic_v5_fold_ppi_state(vcpu);
> +	else if (kvm_vgic_global_state.type == VGIC_V2)
>  		vgic_v2_fold_lr_state(vcpu);
>  	else
>  		vgic_v3_fold_lr_state(vcpu);
> @@ -1034,8 +1048,10 @@ void kvm_vgic_sync_hwstate(struct kvm_vcpu *vcpu)
>  	if (can_access_vgic_from_kernel())
>  		vgic_save_state(vcpu);
>  
> -	vgic_fold_lr_state(vcpu);
> -	vgic_prune_ap_list(vcpu);
> +	vgic_fold_state(vcpu);
> +
> +	if (!vgic_is_v5(vcpu->kvm))
> +		vgic_prune_ap_list(vcpu);

I'm starting to think we should have per-GIC implementations of these
things. This is becoming very tortuous.
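
Purely as an illustration of that direction (none of these names exist in
the tree), the common flush/sync paths could dispatch through a small
per-model ops struct instead of growing more vgic_is_v5() checks:

	struct vgic_model_ops {
		void (*fold_state)(struct kvm_vcpu *vcpu);
		void (*flush_state)(struct kvm_vcpu *vcpu);
		void (*prune_ap_list)(struct kvm_vcpu *vcpu);	/* NULL for GICv5 */
	};

	static const struct vgic_model_ops vgic_v5_model_ops = {
		.fold_state	= vgic_v5_fold_ppi_state,
		.flush_state	= vgic_v5_flush_ppi_state,
	};

	/* kvm_vgic_sync_hwstate() would then reduce to something like: */
	dist->model_ops->fold_state(vcpu);
	if (dist->model_ops->prune_ap_list)
		dist->model_ops->prune_ap_list(vcpu);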

Thanks,

	M.

-- 
Without deviation from the norm, progress is not possible.


