[PATCH 2/5] KVM: arm64: Build MPIDR to vcpu index cache at runtime

Joey Gouly joey.gouly at arm.com
Thu Sep 7 08:29:18 PDT 2023


On Thu, Sep 07, 2023 at 11:09:28AM +0100, Marc Zyngier wrote:
> The MPIDR_EL1 register contains a unique value that identifies
> the CPU. The only problem with it is that it is stupidly large
> (32 bits, once the useless stuff is removed).
> 
> Trying to obtain a vcpu from an MPIDR value is a fairly common,
> yet costly operation: we iterate over all the vcpus until we
> find the correct one. While this is cheap for small VMs, it is
> pretty expensive on large ones, specially if you are trying to
> get to the one that's at the end of the list...
> 
> In order to help with this, it is important to realise that
> the MPIDR values are actually structured, and that implementations
> tend to use a small number of significant bits in the 32bit space.
> 
> We can use this fact to our advantage by computing a small hash
> table that uses the "compression" of the significant MPIDR bits
> as an index, giving us the vcpu index as a result.
> 
> Given that the MPIDR values can be supplied by userspace, and
> that an evil VMM could decide to make *all* bits significant,
> resulting in a 4G-entry table, we only use this method if the
> resulting table fits in a single page. Otherwise, we fallback
> to the good old iterative method.
> 
> Nothing uses that table just yet, but keep your eyes peeled.
> 
> Signed-off-by: Marc Zyngier <maz at kernel.org>
> ---
>  arch/arm64/include/asm/kvm_host.h | 28 ++++++++++++++++
>  arch/arm64/kvm/arm.c              | 54 +++++++++++++++++++++++++++++++
>  2 files changed, 82 insertions(+)
> 
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index af06ccb7ee34..361aaf66ac32 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -202,6 +202,31 @@ struct kvm_protected_vm {
>  	struct kvm_hyp_memcache teardown_mc;
>  };
>  
> +struct kvm_mpidr_data {
> +	u64			mpidr_mask;
> +	DECLARE_FLEX_ARRAY(u16, cmpidr_to_idx);
> +};
> +
> +static inline u16 kvm_mpidr_index(struct kvm_mpidr_data *data, u64 mpidr)
> +{
> +	unsigned long mask = data->mpidr_mask;
> +	u64 aff = mpidr & MPIDR_HWID_BITMASK;
> +	int nbits, bit, bit_idx = 0;
> +	u16 vcpu_idx = 0;
> +
> +	/*
> +	 * If this looks like RISC-V's BEXT or x86's PEXT
> +	 * instructions, it isn't by accident.
> +	 */
> +	nbits = fls(mask);
> +	for_each_set_bit(bit, &mask, nbits) {
> +		vcpu_idx |= (aff & BIT(bit)) >> (bit - bit_idx);
> +		bit_idx++;
> +	}
> +
> +	return vcpu_idx;
> +}
> +
>  struct kvm_arch {
>  	struct kvm_s2_mmu mmu;
>  
> @@ -248,6 +273,9 @@ struct kvm_arch {
>  	/* VM-wide vCPU feature set */
>  	DECLARE_BITMAP(vcpu_features, KVM_VCPU_MAX_FEATURES);
>  
> +	/* MPIDR to vcpu index mapping, optional */
> +	struct kvm_mpidr_data *mpidr_data;
> +
>  	/*
>  	 * VM-wide PMU filter, implemented as a bitmap and big enough for
>  	 * up to 2^10 events (ARMv8.0) or 2^16 events (ARMv8.1+).
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index 4866b3f7b4ea..30ce394c09d4 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -205,6 +205,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
>  	if (is_protected_kvm_enabled())
>  		pkvm_destroy_hyp_vm(kvm);
>  
> +	kfree(kvm->arch.mpidr_data);
>  	kvm_destroy_vcpus(kvm);
>  
>  	kvm_unshare_hyp(kvm, kvm + 1);
> @@ -578,6 +579,57 @@ static int kvm_vcpu_initialized(struct kvm_vcpu *vcpu)
>  	return vcpu_get_flag(vcpu, VCPU_INITIALIZED);
>  }
>  
> +static void kvm_init_mpidr_data(struct kvm *kvm)
> +{
> +	struct kvm_mpidr_data *data = NULL;
> +	unsigned long c, mask, nr_entries;
> +	u64 aff_set = 0, aff_clr = ~0UL;
> +	struct kvm_vcpu *vcpu;
> +
> +	mutex_lock(&kvm->arch.config_lock);
> +
> +	if (kvm->arch.mpidr_data || atomic_read(&kvm->online_vcpus) == 1)
> +		goto out;
> +
> +	kvm_for_each_vcpu(c, vcpu, kvm) {
> +		u64 aff = kvm_vcpu_get_mpidr_aff(vcpu);
> +		aff_set |= aff;
> +		aff_clr &= aff;
> +	}
> +
> +	/*
> +	 * A significant bit can be either 0 or 1, and will only appear in
> +	 * aff_set. Use aff_clr to weed out the useless stuff.
> +	 */
> +	mask = aff_set ^ aff_clr;
> +	nr_entries = BIT_ULL(hweight_long(mask));
> +
> +	/*
> +	 * Don't let userspace fool us. If we need more than a single page
> +	 * to describe the compressed MPIDR array, just fall back to the
> +	 * iterative method. Single vcpu VMs do not need this either.
> +	 */
> +	if (struct_size(data, cmpidr_to_idx, nr_entries) <= PAGE_SIZE)
> +		data = kzalloc(struct_size(data, cmpidr_to_idx, nr_entries),
> +			       GFP_KERNEL_ACCOUNT);
> +
> +	if (!data)
> +		goto out;

Probably not a big deal, but if the data doesn't fit, every vCPU will run this
function up until this point (if the data fits or there's only 1 vCPU we bail
out earlier)

> +
> +	data->mpidr_mask = mask;
> +
> +	kvm_for_each_vcpu(c, vcpu, kvm) {
> +		u64 aff = kvm_vcpu_get_mpidr_aff(vcpu);
> +		u16 vcpu_idx = kvm_mpidr_index(data, aff);
> +
> +		data->cmpidr_to_idx[vcpu_idx] = c;
> +	}
> +
> +	kvm->arch.mpidr_data = data;
> +out:
> +	mutex_unlock(&kvm->arch.config_lock);
> +}
> +
>  /*
>   * Handle both the initialisation that is being done when the vcpu is
>   * run for the first time, as well as the updates that must be
> @@ -601,6 +653,8 @@ int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu)
>  	if (likely(vcpu_has_run_once(vcpu)))
>  		return 0;
>  
> +	kvm_init_mpidr_data(kvm);
> +
>  	kvm_arm_vcpu_init_debug(vcpu);
>  
>  	if (likely(irqchip_in_kernel(kvm))) {

Reviewed-by: Joey Gouly <joey.gouly at arm.com>



More information about the linux-arm-kernel mailing list