[PATCH v2 1/4] KVM: arm64: Avoid vcpu->mutex v. kvm->lock inversion in CPU_ON
Marc Zyngier
maz at kernel.org
Wed Mar 22 05:02:24 PDT 2023
On Thu, 16 Mar 2023 21:14:09 +0000,
Oliver Upton <oliver.upton at linux.dev> wrote:
>
> KVM/arm64 had the lock ordering backwards on vcpu->mutex and kvm->lock
> from the very beginning. One such example is the way vCPU resets are
> handled: the kvm->lock is acquired while handling a guest CPU_ON PSCI
> call.
>
> Add a dedicated lock to serialize writes to kvm_vcpu_arch::mp_state.
> Hold the lock in kvm_psci_vcpu_on() to protect against changes while
> the reset state is being configured. Ensure that readers read mp_state
> exactly once. While at it, plug yet another race by taking the
> mp_state_lock in the KVM_SET_MP_STATE ioctl handler.
>
> As changes to MP state are now guarded with a dedicated lock, drop the
> kvm->lock acquisition from the PSCI CPU_ON path. Similarly, move the
> reader of reset_state outside of the kvm->lock and insert a barrier to
> ensure reset_state is read before dropping the pending reset state.
>
> While the kvm->lock inversion still exists in kvm_reset_vcpu(), at least
> now PSCI CPU_ON no longer depends on it for serializing vCPU reset.
>
> Signed-off-by: Oliver Upton <oliver.upton at linux.dev>
> ---
> arch/arm64/include/asm/kvm_host.h | 1 +
> arch/arm64/kvm/arm.c | 23 ++++++++++++++++++-----
> arch/arm64/kvm/psci.c | 19 ++++++++++---------
> arch/arm64/kvm/reset.c | 10 ++++++----
> 4 files changed, 35 insertions(+), 18 deletions(-)
>
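For the record, the inversion here is that generic KVM documents kvm->lock
as being taken *outside* vcpu->mutex (see Documentation/virt/kvm/locking.rst),
while the arm64 vCPU ioctl path does the opposite. Schematically (illustrative
only, not actual call sites):

	/* documented ordering */
	mutex_lock(&kvm->lock);
	mutex_lock(&vcpu->mutex);

	/* arm64 vCPU ioctl path, e.g. kvm_reset_vcpu() */
	mutex_lock(&vcpu->mutex);	/* taken in kvm_vcpu_ioctl() */
	mutex_lock(&kvm->lock);		/* nested inside -> inversion */
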
> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h
> index bcd774d74f34..917586237a4d 100644
> --- a/arch/arm64/include/asm/kvm_host.h
> +++ b/arch/arm64/include/asm/kvm_host.h
> @@ -522,6 +522,7 @@ struct kvm_vcpu_arch {
>
> /* vcpu power state */
> struct kvm_mp_state mp_state;
> + spinlock_t mp_state_lock;
>
> /* Cache some mmu pages needed inside spinlock regions */
> struct kvm_mmu_memory_cache mmu_page_cache;
> diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c
> index 3bd732eaf087..731a78f85915 100644
> --- a/arch/arm64/kvm/arm.c
> +++ b/arch/arm64/kvm/arm.c
> @@ -326,6 +326,8 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu)
> {
> int err;
>
> + spin_lock_init(&vcpu->arch.mp_state_lock);
> +
> /* Force users to call KVM_ARM_VCPU_INIT */
> vcpu->arch.target = -1;
> bitmap_zero(vcpu->arch.features, KVM_VCPU_MAX_FEATURES);
> @@ -443,16 +445,23 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
> vcpu->cpu = -1;
> }
>
> -void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
> +static void __kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
> {
> vcpu->arch.mp_state.mp_state = KVM_MP_STATE_STOPPED;
> kvm_make_request(KVM_REQ_SLEEP, vcpu);
> kvm_vcpu_kick(vcpu);
> }
>
> +void kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
> +{
> + spin_lock(&vcpu->arch.mp_state_lock);
> + __kvm_arm_vcpu_power_off(vcpu);
> + spin_unlock(&vcpu->arch.mp_state_lock);
> +}
> +
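Not a big deal, but since __kvm_arm_vcpu_power_off() now relies on the
caller holding the lock, an assertion would make that contract explicit.
Untested sketch:

	static void __kvm_arm_vcpu_power_off(struct kvm_vcpu *vcpu)
	{
		lockdep_assert_held(&vcpu->arch.mp_state_lock);

		vcpu->arch.mp_state.mp_state = KVM_MP_STATE_STOPPED;
		kvm_make_request(KVM_REQ_SLEEP, vcpu);
		kvm_vcpu_kick(vcpu);
	}
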
> bool kvm_arm_vcpu_stopped(struct kvm_vcpu *vcpu)
> {
> - return vcpu->arch.mp_state.mp_state == KVM_MP_STATE_STOPPED;
> + return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_STOPPED;
> }
>
> static void kvm_arm_vcpu_suspend(struct kvm_vcpu *vcpu)
> @@ -464,13 +473,13 @@ static void kvm_arm_vcpu_suspend(struct kvm_vcpu *vcpu)
>
> static bool kvm_arm_vcpu_suspended(struct kvm_vcpu *vcpu)
> {
> - return vcpu->arch.mp_state.mp_state == KVM_MP_STATE_SUSPENDED;
> + return READ_ONCE(vcpu->arch.mp_state.mp_state) == KVM_MP_STATE_SUSPENDED;
> }
>
> int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
> struct kvm_mp_state *mp_state)
> {
> - *mp_state = vcpu->arch.mp_state;
> + *mp_state = READ_ONCE(vcpu->arch.mp_state);
>
> return 0;
> }
> @@ -480,12 +489,14 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
> {
> int ret = 0;
>
> + spin_lock(&vcpu->arch.mp_state_lock);
> +
> switch (mp_state->mp_state) {
> case KVM_MP_STATE_RUNNABLE:
> vcpu->arch.mp_state = *mp_state;
Given that the above helpers snapshot mp_state without holding the
lock, it'd be better to at least turn this into a WRITE_ONCE().
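i.e. something like (untested):

	case KVM_MP_STATE_RUNNABLE:
		WRITE_ONCE(vcpu->arch.mp_state, *mp_state);
		break;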
> break;
> case KVM_MP_STATE_STOPPED:
> - kvm_arm_vcpu_power_off(vcpu);
> + __kvm_arm_vcpu_power_off(vcpu);
> break;
> case KVM_MP_STATE_SUSPENDED:
> kvm_arm_vcpu_suspend(vcpu);
> @@ -494,6 +505,8 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
> ret = -EINVAL;
> }
>
> + spin_unlock(&vcpu->arch.mp_state_lock);
> +
> return ret;
> }
>
> diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c
> index 7fbc4c1b9df0..7f1bff1cd670 100644
> --- a/arch/arm64/kvm/psci.c
> +++ b/arch/arm64/kvm/psci.c
> @@ -62,6 +62,7 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
> struct vcpu_reset_state *reset_state;
> struct kvm *kvm = source_vcpu->kvm;
> struct kvm_vcpu *vcpu = NULL;
> + int ret = PSCI_RET_SUCCESS;
> unsigned long cpu_id;
>
> cpu_id = smccc_get_arg1(source_vcpu);
> @@ -76,11 +77,15 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
> */
> if (!vcpu)
> return PSCI_RET_INVALID_PARAMS;
> +
> + spin_lock(&vcpu->arch.mp_state_lock);
> if (!kvm_arm_vcpu_stopped(vcpu)) {
> if (kvm_psci_version(source_vcpu) != KVM_ARM_PSCI_0_1)
> - return PSCI_RET_ALREADY_ON;
> + ret = PSCI_RET_ALREADY_ON;
> else
> - return PSCI_RET_INVALID_PARAMS;
> + ret = PSCI_RET_INVALID_PARAMS;
> +
> + goto out_unlock;
> }
>
> reset_state = &vcpu->arch.reset_state;
> @@ -108,7 +113,9 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu)
> vcpu->arch.mp_state.mp_state = KVM_MP_STATE_RUNNABLE;
> kvm_vcpu_wake_up(vcpu);
>
> - return PSCI_RET_SUCCESS;
> +out_unlock:
> + spin_unlock(&vcpu->arch.mp_state_lock);
> + return ret;
> }
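The same remark applies to the KVM_MP_STATE_RUNNABLE store just above:
it is now done under the lock, but it still races with the lockless
READ_ONCE() readers, so:

	WRITE_ONCE(vcpu->arch.mp_state.mp_state, KVM_MP_STATE_RUNNABLE);
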
>
> static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu)
> @@ -229,7 +236,6 @@ static unsigned long kvm_psci_check_allowed_function(struct kvm_vcpu *vcpu, u32
>
> static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
> {
> - struct kvm *kvm = vcpu->kvm;
> u32 psci_fn = smccc_get_function(vcpu);
> unsigned long val;
> int ret = 1;
> @@ -254,9 +260,7 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu)
> kvm_psci_narrow_to_32bit(vcpu);
> fallthrough;
> case PSCI_0_2_FN64_CPU_ON:
> - mutex_lock(&kvm->lock);
> val = kvm_psci_vcpu_on(vcpu);
> - mutex_unlock(&kvm->lock);
> break;
> case PSCI_0_2_FN_AFFINITY_INFO:
> kvm_psci_narrow_to_32bit(vcpu);
> @@ -395,7 +399,6 @@ static int kvm_psci_1_x_call(struct kvm_vcpu *vcpu, u32 minor)
>
> static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
> {
> - struct kvm *kvm = vcpu->kvm;
> u32 psci_fn = smccc_get_function(vcpu);
> unsigned long val;
>
> @@ -405,9 +408,7 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu)
> val = PSCI_RET_SUCCESS;
> break;
> case KVM_PSCI_FN_CPU_ON:
> - mutex_lock(&kvm->lock);
> val = kvm_psci_vcpu_on(vcpu);
> - mutex_unlock(&kvm->lock);
> break;
> default:
> val = PSCI_RET_NOT_SUPPORTED;
> diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
> index 49a3257dec46..b874ec6a37c1 100644
> --- a/arch/arm64/kvm/reset.c
> +++ b/arch/arm64/kvm/reset.c
> @@ -264,15 +264,17 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu)
>
> mutex_lock(&vcpu->kvm->lock);
> ret = kvm_set_vm_width(vcpu);
> - if (!ret) {
> - reset_state = vcpu->arch.reset_state;
> - WRITE_ONCE(vcpu->arch.reset_state.reset, false);
> - }
> mutex_unlock(&vcpu->kvm->lock);
>
> if (ret)
> return ret;
>
> + reset_state = vcpu->arch.reset_state;
> +
> + /* Ensure reset_state is read before clearing the pending state */
> + smp_rmb();
> + WRITE_ONCE(vcpu->arch.reset_state.reset, false);
What prevents a concurrent PSCI call from messing with this state? I'd
have expected this to be done while holding the mp_state lock... There
must be something, somewhere, but I fail to spot it right now.
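If it turns out there is nothing, I'd expect the snapshot to look
something like this (untested), which would also make the smp_rmb()
redundant, the lock providing the required ordering:

	spin_lock(&vcpu->arch.mp_state_lock);
	reset_state = vcpu->arch.reset_state;
	vcpu->arch.reset_state.reset = false;
	spin_unlock(&vcpu->arch.mp_state_lock);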
Thanks,
M.
--
Without deviation from the norm, progress is not possible.