[PATCH v5 10/36] KVM: arm64: gic-v5: Sanitize ID_AA64PFR2_EL1.GCIE

Tue Mar 3 09:49:47 PST 2026

On Tue, 2026-03-03 at 15:54 +0000, Marc Zyngier wrote:
> On Thu, 26 Feb 2026 15:58:00 +0000,
> Sascha Bischoff <Sascha.Bischoff at arm.com> wrote:
> > 
> > Add in a sanitization function for ID_AA64PFR2_EL1, preserving the
> > already-present behaviour for the FPMR, MTEFAR, and MTESTOREONLY
> > fields. Add sanitisation for the GCIE field, which is set to IMP if
> > the host supports a GICv5 guest and NI, otherwise.
> > 
> > Extend the sanitisation that takes place in kvm_vgic_create() to
> > zero
> > the ID_AA64PFR2.GCIE field when a non-GICv5 GIC is created. More
> > importantly, move this sanitisation to a separate function,
> > kvm_vgic_finalize_sysregs(), and call it from
> > kvm_finalize_sys_regs().
> > 
> > We are required to finalize the GIC and GCIE fields a second time
> > in
> > kvm_finalize_sys_regs() due to how QEMU blindly reads out then
> > verbatim restores the system register state. This avoids the issue
> > where both the GCIE and GIC features are marked as present (an
> > architecturally invalid combination), and hence guests fall over.
> > See
> > the comment in kvm_finalize_sys_regs() for more details.
> > 
> > Overall, the following happens:
> > 
> > * Before an irqchip is created, FEAT_GCIE is presented if the host
> >   supports GICv5-based guests.
> > * Once an irqchip is created, all other supported irqchips are
> > hidden
> >   from the guest; system register state reflects the guest's
> > irqchip.
> > * Userspace is allowed to set invalid irqchip feature combinations
> > in
> >   the system registers, but...
> > * ...invalid combinations are removed a second time prior to the
> > first
> >   run of the guest, and things hopefully just work.
> > 
> > All of this extra work is required to make sure that "legacy" GICv3
> > guests based on QEMU transparently work on compatible GICv5 hosts
> > without modification.
> > 
> > Signed-off-by: Sascha Bischoff <sascha.bischoff at arm.com>
> > ---
> >  arch/arm64/kvm/sys_regs.c       | 70
> > +++++++++++++++++++++++++++++----
> >  arch/arm64/kvm/vgic/vgic-init.c | 43 +++++++++++++-------
> >  include/kvm/arm_vgic.h          |  1 +
> >  3 files changed, 92 insertions(+), 22 deletions(-)
> > 
> > diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
> > index 11e75f2522f95..1039150716d43 100644
> > --- a/arch/arm64/kvm/sys_regs.c
> > +++ b/arch/arm64/kvm/sys_regs.c
> > @@ -1758,6 +1758,7 @@ static u8 pmuver_to_perfmon(u8 pmuver)
> >  
> >  static u64 sanitise_id_aa64pfr0_el1(const struct kvm_vcpu *vcpu,
> > u64 val);
> >  static u64 sanitise_id_aa64pfr1_el1(const struct kvm_vcpu *vcpu,
> > u64 val);
> > +static u64 sanitise_id_aa64pfr2_el1(const struct kvm_vcpu *vcpu,
> > u64 val);
> >  static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu,
> > u64 val);
> >  
> >  /* Read a sanitised cpufeature ID register by sys_reg_desc */
> > @@ -1783,10 +1784,7 @@ static u64 __kvm_read_sanitised_id_reg(const
> > struct kvm_vcpu *vcpu,
> >  		val = sanitise_id_aa64pfr1_el1(vcpu, val);
> >  		break;
> >  	case SYS_ID_AA64PFR2_EL1:
> > -		val &= ID_AA64PFR2_EL1_FPMR |
> > -			(kvm_has_mte(vcpu->kvm) ?
> > -			 ID_AA64PFR2_EL1_MTEFAR |
> > ID_AA64PFR2_EL1_MTESTOREONLY :
> > -			 0);
> > +		val = sanitise_id_aa64pfr2_el1(vcpu, val);
> >  		break;
> >  	case SYS_ID_AA64ISAR1_EL1:
> >  		if (!vcpu_has_ptrauth(vcpu))
> > @@ -2024,6 +2022,23 @@ static u64 sanitise_id_aa64pfr1_el1(const
> > struct kvm_vcpu *vcpu, u64 val)
> >  	return val;
> >  }
> >  
> > +static u64 sanitise_id_aa64pfr2_el1(const struct kvm_vcpu *vcpu,
> > u64 val)
> > +{
> > +	val &= ID_AA64PFR2_EL1_FPMR |
> > +	       ID_AA64PFR2_EL1_MTEFAR |
> > +	       ID_AA64PFR2_EL1_MTESTOREONLY;
> > +
> > +	if (!kvm_has_mte(vcpu->kvm)) {
> > +		val &= ~ID_AA64PFR2_EL1_MTEFAR;
> > +		val &= ~ID_AA64PFR2_EL1_MTESTOREONLY;
> > +	}
> > +
> > +	if (vgic_host_has_gicv5())
> > +		val |= SYS_FIELD_PREP_ENUM(ID_AA64PFR2_EL1, GCIE,
> > IMP);
> > +
> > +	return val;
> > +}
> > +
> >  static u64 sanitise_id_aa64dfr0_el1(const struct kvm_vcpu *vcpu,
> > u64 val)
> >  {
> >  	val = ID_REG_LIMIT_FIELD_ENUM(val, ID_AA64DFR0_EL1,
> > DebugVer, V8P8);
> > @@ -2213,6 +2228,12 @@ static int set_id_aa64pfr1_el1(struct
> > kvm_vcpu *vcpu,
> >  	return set_id_reg(vcpu, rd, user_val);
> >  }
> >  
> > +static int set_id_aa64pfr2_el1(struct kvm_vcpu *vcpu,
> > +			       const struct sys_reg_desc *rd, u64
> > user_val)
> > +{
> > +	return set_id_reg(vcpu, rd, user_val);
> > +}
> > +
> >  /*
> >   * Allow userspace to de-feature a stage-2 translation granule but
> > prevent it
> >   * from claiming the impossible.
> > @@ -3194,10 +3215,11 @@ static const struct sys_reg_desc
> > sys_reg_descs[] = {
> >  				       ID_AA64PFR1_EL1_RES0 |
> >  				       ID_AA64PFR1_EL1_MPAM_frac |
> >  				       ID_AA64PFR1_EL1_MTE)),
> > -	ID_WRITABLE(ID_AA64PFR2_EL1,
> > -		    ID_AA64PFR2_EL1_FPMR |
> > -		    ID_AA64PFR2_EL1_MTEFAR |
> > -		    ID_AA64PFR2_EL1_MTESTOREONLY),
> > +	ID_FILTERED(ID_AA64PFR2_EL1, id_aa64pfr2_el1,
> > +		    ~(ID_AA64PFR2_EL1_FPMR |
> > +		      ID_AA64PFR2_EL1_MTEFAR |
> > +		      ID_AA64PFR2_EL1_MTESTOREONLY |
> > +		      ID_AA64PFR2_EL1_GCIE)),
> >  	ID_UNALLOCATED(4,3),
> >  	ID_WRITABLE(ID_AA64ZFR0_EL1, ~ID_AA64ZFR0_EL1_RES0),
> >  	ID_HIDDEN(ID_AA64SMFR0_EL1),
> > @@ -5668,8 +5690,40 @@ int kvm_finalize_sys_regs(struct kvm_vcpu
> > *vcpu)
> >  
> >  		val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1)
> > & ~ID_AA64PFR0_EL1_GIC;
> >  		kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, val);
> > +		val = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR2_EL1)
> > & ~ID_AA64PFR2_EL1_GCIE;
> > +		kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR2_EL1, val);
> >  		val = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) &
> > ~ID_PFR1_EL1_GIC;
> >  		kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, val);
> > +	} else {
> > +		/*
> > +		 * Certain userspace software - QEMU - samples the
> > system
> > +		 * register state without creating an irqchip,
> > then blindly
> > +		 * restores the state prior to running the final
> > guest. This
> > +		 * means that it restores the virtualization &
> > emulation
> > +		 * capabilities of the host system, rather than
> > something that
> > +		 * reflects the final guest state. Moreover, it
> > checks that the
> > +		 * state was "correctly" restored (i.e.,
> > verbatim), bailing if
> > +		 * it isn't, so masking off invalid state isn't an
> > option.
> > +		 *
> > +		 * On GICv5 hardware that supports
> > FEAT_GCIE_LEGACY we can run
> > +		 * both GICv3- and GICv5-based guests. Therefore,
> > we initially
> > +		 * present both ID_AA64PFR0.GIC and
> > ID_AA64PFR2.GCIE as IMP to
> > +		 * reflect that userspace can create EITHER a
> > vGICv3 or a
> > +		 * vGICv5. This is an architecturally invalid
> > combination, of
> > +		 * course. Once an in-kernel GIC is created, the
> > sysreg state is
> > +		 * updated to reflect the actual, valid
> > configuration.
> > +		 *
> > +		 * Setting both the GIC and GCIE features to IMP
> > unsurprisingly
> > +		 * results in guests falling over, and hence we
> > need to fix up
> > +		 * this mess in KVM. Before running for the first
> > time we yet
> > +		 * again ensure that the GIC and GCIE fields
> > accurately reflect
> > +		 * the actual hardware the guest should see.
> > +		 *
> > +		 * This hack allows legacy QEMU-based GICv3 guests
> > to run
> > +		 * unmodified on compatible GICv5 hosts, and
> > avoids the inverse
> > +		 * problem for GICv5-based guests in the future.
> > +		 */
> > +		kvm_vgic_finalize_sysregs(kvm);
> 
> An alternative to this sorry hack would be to have a separate view of
> the idregs for luserspace to get whatever expected. But you then need
> to invalidate that copy at some point so that you can migrate the
> guest safely, and you'd probably end-up doing a similar thing.
> 
> I appreciate that you are doing this for the sake of preserving SW
> compatibility, but do you foresee a way out of this mess that does
> not
> involve asking the QEMU folks to fix their stuff? I don't think we
> can
> paper over their over-simplistic design forever.

Regrettably, I've not been able to come up with a clean solution to
this issue. I don't like doing this fixing up of state either, but if
we want existing QEMU-based guests using GICv3 irqchips to work on
(future) GICv5 hardware, then this sort of fix-up needs to happen
somewhere in KVM. One way or another, we need to (re-)sanitise whatever
userspace has done to give us something that is architecturally valid,
or we end up with guests falling over.

As you said, we could provide userspace a different view of the system
registers to make sure that KVM's internal state at the very least
remains valid, but would need to collapse that state on migration. I
suspect that this isn't much cleaner in the grand scheme of things, but
I'm happy to re-work things to do that if you prefer.

In my view, the best thing would be for userspace, so in this case
QEMU, to understand what it is writing to the system registers, rather
than blindly setting the state. This way, it can avoid setting things
that don't make sense, and we ideally avoid these sorts of issues.
Until that happens, we're always going to hit cases where it tries to
set combinations of state that simply cannot be combined and remain
valid. GICv3/5 is one case, but I'm sure that other such issues will
come up in time.

As I said above in my in-code comment, we're going to hit the same
issue with GICv5-based guests if the approach doesn't change. This,
again, would by fixed up by this code before running the guest, but
ideally we should try and make sure that we don't need to do that in
the first place for GICv5.

> 
> >  	}
> >  
> >  	if (vcpu_has_nv(vcpu)) {
> > diff --git a/arch/arm64/kvm/vgic/vgic-init.c
> > b/arch/arm64/kvm/vgic/vgic-init.c
> > index 9b3091ad868cf..d1db384698238 100644
> > --- a/arch/arm64/kvm/vgic/vgic-init.c
> > +++ b/arch/arm64/kvm/vgic/vgic-init.c
> > @@ -71,7 +71,6 @@ static int
> > vgic_allocate_private_irqs_locked(struct kvm_vcpu *vcpu, u32 type);
> >  int kvm_vgic_create(struct kvm *kvm, u32 type)
> >  {
> >  	struct kvm_vcpu *vcpu;
> > -	u64 aa64pfr0, pfr1;
> >  	unsigned long i;
> >  	int ret;
> >  
> > @@ -162,19 +161,11 @@ int kvm_vgic_create(struct kvm *kvm, u32
> > type)
> >  
> >  	kvm->arch.vgic.vgic_dist_base = VGIC_ADDR_UNDEF;
> >  
> > -	aa64pfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) &
> > ~ID_AA64PFR0_EL1_GIC;
> > -	pfr1 = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) &
> > ~ID_PFR1_EL1_GIC;
> > -
> > -	if (type == KVM_DEV_TYPE_ARM_VGIC_V2) {
> > -		kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
> > -	} else {
> > -		INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions);
> > -		aa64pfr0 |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1,
> > GIC, IMP);
> > -		pfr1 |= SYS_FIELD_PREP_ENUM(ID_PFR1_EL1, GIC,
> > GICv3);
> > -	}
> > -
> > -	kvm_set_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1, aa64pfr0);
> > -	kvm_set_vm_id_reg(kvm, SYS_ID_PFR1_EL1, pfr1);
> > +	/*
> > +	 * We've now created the GIC. Update the system register
> > state
> > +	 * to accurately reflect what we've created.
> > +	 */
> > +	kvm_vgic_finalize_sysregs(kvm);
> 
> As pointed out f2f, this will conflict with the patch posted at
> https://patch.msgid.link/20260228164559.936268-1-maz@kernel.org

Thanks, and noted!

> 
> >  
> >  	if (type == KVM_DEV_TYPE_ARM_VGIC_V3)
> >  		kvm->arch.vgic.nassgicap =
> > system_supports_direct_sgis();
> > @@ -617,6 +608,30 @@ int kvm_vgic_map_resources(struct kvm *kvm)
> >  	return ret;
> >  }
> >  
> > +void kvm_vgic_finalize_sysregs(struct kvm *kvm)
> 
> nit: could you rename this to kvm_vgic_finalize_idregs()?

Done.

> 
> > +{
> > +	u32 type = kvm->arch.vgic.vgic_model;
> > +	u64 aa64pfr0, aa64pfr2, pfr1;
> > +
> > +	aa64pfr0 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR0_EL1) &
> > ~ID_AA64PFR0_EL1_GIC;
> > +	aa64pfr2 = kvm_read_vm_id_reg(kvm, SYS_ID_AA64PFR2_EL1) &
> > ~ID_AA64PFR2_EL1_GCIE;
> > +	pfr1 = kvm_read_vm_id_reg(kvm, SYS_ID_PFR1_EL1) &
> > ~ID_PFR1_EL1_GIC;
> > +
> > +	if (type == KVM_DEV_TYPE_ARM_VGIC_V2) {
> > +		kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
> > +	} else if (type == KVM_DEV_TYPE_ARM_VGIC_V3) {
> > +		INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions);
> > +		aa64pfr0 |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1,
> > GIC, IMP);
> > +		pfr1 |= SYS_FIELD_PREP_ENUM(ID_PFR1_EL1, GIC,
> > GICv3);
> > +	} else {
> > +		aa64pfr2 |= SYS_FIELD_PREP_ENUM(ID_AA64PFR2_EL1,
> > GCIE, IMP);
> > +	}
> 
> I'd rather see this written as:
> 
> 	switch (kvm->arch.vgic.vgic_model) {
> 	case KVM_DEV_TYPE_ARM_VGIC_V2:
> 		kvm->arch.vgic.vgic_cpu_base = VGIC_ADDR_UNDEF;
> 		break;
> 	case KVM_DEV_TYPE_ARM_VGIC_V3:
> 		INIT_LIST_HEAD(&kvm->arch.vgic.rd_regions);
> 		aa64pfr0 |= SYS_FIELD_PREP_ENUM(ID_AA64PFR0_EL1,
> GIC, IMP);
> 		pfr1 |= SYS_FIELD_PREP_ENUM(ID_PFR1_EL1, GIC,
> GICv3);
> 		break;
> 	case KVM_DEV_TYPE_ARM_VGIC_V5:
> 		aa64pfr2 |= SYS_FIELD_PREP_ENUM(ID_AA64PFR2_EL1,
> GCIE, IMP);
> 		break;
> 	default:
> 		WARN_ONCE(1, "WTF???\n");
> 	}
> 
> which I find more readable than the if/else cascade.

Done.

Thanks,
Sascha

> 
> Thanks,
> 
> 	M.
>