[PATCH v5 01/13] KVM: x86: Report host tsc and realtime values in KVM_GET_CLOCK

Sean Christopherson seanjc at google.com
Fri Jul 30 10:48:46 PDT 2021


On Thu, Jul 29, 2021, Oliver Upton wrote:
> diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
> index 916c976e99ab..e052c7afaac4 100644
> --- a/arch/x86/kvm/x86.c
> +++ b/arch/x86/kvm/x86.c
> @@ -2782,17 +2782,24 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
>  #endif
>  }
>  
> -u64 get_kvmclock_ns(struct kvm *kvm)
> +/*
> + * Returns true if realtime and TSC values were written back to the caller.
> + * Returns false if a clock triplet cannot be obtained, such as if the host's
> + * realtime clock is not based on the TSC.
> + */
> +static bool get_kvmclock_and_realtime(struct kvm *kvm, u64 *kvmclock_ns,
> +				      u64 *realtime_ns, u64 *tsc)
>  {
>  	struct kvm_arch *ka = &kvm->arch;
>  	struct pvclock_vcpu_time_info hv_clock;
>  	unsigned long flags;
> -	u64 ret;
> +	bool ret = false;
>  
>  	spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
>  	if (!ka->use_master_clock) {
>  		spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
> -		return get_kvmclock_base_ns() + ka->kvmclock_offset;
> +		*kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset;
> +		return false;
>  	}
>  
>  	hv_clock.tsc_timestamp = ka->master_cycle_now;
> @@ -2803,18 +2810,36 @@ u64 get_kvmclock_ns(struct kvm *kvm)
>       get_cpu();
>
>       if (__this_cpu_read(cpu_tsc_khz)) {
> +             struct timespec64 ts;
> +             u64 tsc_val;
> +
>               kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
>                                  &hv_clock.tsc_shift,
>                                  &hv_clock.tsc_to_system_mul);
> -             ret = __pvclock_read_cycles(&hv_clock, rdtsc());
> +
> +             if (kvm_get_walltime_and_clockread(&ts, &tsc_val)) {
> +                     *realtime_ns = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
> +                     *tsc = tsc_val;
> +                     ret = true;
> +             }
> +
> +             *kvmclock_ns = __pvclock_read_cycles(&hv_clock, tsc_val);

This is buggy, as tsc_val is not initialized if kvm_get_walltime_and_clockread()
returns false.  This also straight up fails to compile on 32-bit kernels because
kvm_get_walltime_and_clockread() is wrapped with CONFIG_X86_64.

A gross way to resolve both issues would be (see below regarding 'data'):

	if (__this_cpu_read(cpu_tsc_khz)) {
#ifdef CONFIG_X86_64
		struct timespec64 ts;

		if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
			data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
			data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
		} else
#endif
		data->host_tsc = rdtsc();

		kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
				   &hv_clock.tsc_shift,
				   &hv_clock.tsc_to_system_mul);

		data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
	} else {
		data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
	}


>       } else

Not your code, but this needs braces.

> -             ret = get_kvmclock_base_ns() + ka->kvmclock_offset;
> +             *kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset;
>
>       put_cpu();
>
>       return ret;
>  } 

...

> @@ -5820,6 +5845,68 @@ int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state)
>  }
>  #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
>  
> +static int kvm_vm_ioctl_get_clock(struct kvm *kvm,
> +				  void __user *argp)
> +{
> +	struct kvm_clock_data data;
> +
> +	memset(&data, 0, sizeof(data));
> +
> +	if (get_kvmclock_and_realtime(kvm, &data.clock, &data.realtime,
> +				      &data.host_tsc))
> +		data.flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
> +
> +	if (kvm->arch.use_master_clock)
> +		data.flags |= KVM_CLOCK_TSC_STABLE;

I know nothing about the actual behavior, but this appears to have a potential
race (though it came from the existing code).  get_kvmclock_and_realtime() checks
arch.use_master_clock under pvclock_gtod_sync_lock, but this does not.  Even if
that's functionally ok, it's a needless complication.

Fixing that weirdness would also provide an opportunity to clean up the API,
e.g. the boolean return is not exactly straightforward.  The simplest approach
would be to take "struct kvm_clock_data *data" in get_kvmclock_and_realtime()
instead of multiple params.  Then that helper can set the flags accordingly, thus
avoiding the question of whether or not there's a race and eliminating the boolean
return.  E.g.

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index e052c7afaac4..872a53a7c467 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -2782,25 +2782,20 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 #endif
 }
 
-/*
- * Returns true if realtime and TSC values were written back to the caller.
- * Returns false if a clock triplet cannot be obtained, such as if the host's
- * realtime clock is not based on the TSC.
- */
-static bool get_kvmclock_and_realtime(struct kvm *kvm, u64 *kvmclock_ns,
-                                     u64 *realtime_ns, u64 *tsc)
+static void get_kvmclock_and_realtime(struct kvm *kvm,
+                                     struct kvm_clock_data *data)
 {
        struct kvm_arch *ka = &kvm->arch;
        struct pvclock_vcpu_time_info hv_clock;
        unsigned long flags;
-       bool ret = false;
 
        spin_lock_irqsave(&ka->pvclock_gtod_sync_lock, flags);
        if (!ka->use_master_clock) {
                spin_unlock_irqrestore(&ka->pvclock_gtod_sync_lock, flags);
-               *kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset;
-               return false;
+               data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
+               return;
        }
+       data->flags |= KVM_CLOCK_TSC_STABLE;
 
        hv_clock.tsc_timestamp = ka->master_cycle_now;
        hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
@@ -2810,34 +2805,40 @@ static bool get_kvmclock_and_realtime(struct kvm *kvm, u64 *kvmclock_ns,
        get_cpu();
 
        if (__this_cpu_read(cpu_tsc_khz)) {
+#ifdef CONFIG_X86_64
                struct timespec64 ts;
-               u64 tsc_val;
+
+               if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
+                       data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
+                       data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
+               } else
+#endif
+               data->host_tsc = rdtsc();
 
                kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
                                   &hv_clock.tsc_shift,
                                   &hv_clock.tsc_to_system_mul);
 
-               if (kvm_get_walltime_and_clockread(&ts, &tsc_val)) {
-                       *realtime_ns = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec;
-                       *tsc = tsc_val;
-                       ret = true;
-               }
-
-               *kvmclock_ns = __pvclock_read_cycles(&hv_clock, tsc_val);
-       } else
-               *kvmclock_ns = get_kvmclock_base_ns() + ka->kvmclock_offset;
+               data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
+       } else {
+               data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset;
+       }
 
        put_cpu();
-
-       return ret;
 }
 
 u64 get_kvmclock_ns(struct kvm *kvm)
 {
-       u64 kvmclock_ns, realtime_ns, tsc;
+       struct kvm_clock_data data;
 
-       get_kvmclock_and_realtime(kvm, &kvmclock_ns, &realtime_ns, &tsc);
-       return kvmclock_ns;
+       /*
+        * Zero flags as it's accessed RMW, leave everything else uninitialized
+        * as clock is always written and no other fields are consumed.
+        */
+       data.flags = 0;
+
+       get_kvmclock_and_realtime(kvm, &data);
+       return data.clock;
 }
 
 static void kvm_setup_pvclock_page(struct kvm_vcpu *v,
@@ -5852,12 +5853,7 @@ static int kvm_vm_ioctl_get_clock(struct kvm *kvm,
 
        memset(&data, 0, sizeof(data));
 
-       if (get_kvmclock_and_realtime(kvm, &data.clock, &data.realtime,
-                                     &data.host_tsc))
-               data.flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC;
-
-       if (kvm->arch.use_master_clock)
-               data.flags |= KVM_CLOCK_TSC_STABLE;
+       get_kvmclock_and_realtime(kvm, &data);
 
        if (copy_to_user(argp, &data, sizeof(data)))
                return -EFAULT;

> +
> +	if (copy_to_user(argp, &data, sizeof(data)))
> +		return -EFAULT;
> +
> +	return 0;
> +}
> +
> +static int kvm_vm_ioctl_set_clock(struct kvm *kvm,
> +				  void __user *argp)
> +{
> +	struct kvm_arch *ka = &kvm->arch;
> +	struct kvm_clock_data data;
> +	u64 now_raw_ns;
> +
> +	if (copy_from_user(&data, argp, sizeof(data)))
> +		return -EFAULT;
> +
> +	if (data.flags & ~KVM_CLOCK_REALTIME)
> +		return -EINVAL;

Huh, this an odd ABI, the output of KVM_GET_CLOCK isn't legal input to KVM_SET_CLOCK.
The existing code has the same behavior, so apparently it's ok, just odd.

> +
> +	/*
> +	 * TODO: userspace has to take care of races with VCPU_RUN, so
> +	 * kvm_gen_update_masterclock() can be cut down to locked
> +	 * pvclock_update_vm_gtod_copy().
> +	 */
> +	kvm_gen_update_masterclock(kvm);
> +
> +	spin_lock_irq(&ka->pvclock_gtod_sync_lock);
> +	if (data.flags & KVM_CLOCK_REALTIME) {
> +		u64 now_real_ns = ktime_get_real_ns();
> +
> +		/*
> +		 * Avoid stepping the kvmclock backwards.
> +		 */
> +		if (now_real_ns > data.realtime)
> +			data.clock += now_real_ns - data.realtime;
> +	}
> +
> +	if (ka->use_master_clock)
> +		now_raw_ns = ka->master_kernel_ns;
> +	else
> +		now_raw_ns = get_kvmclock_base_ns();
> +	ka->kvmclock_offset = data.clock - now_raw_ns;
> +	spin_unlock_irq(&ka->pvclock_gtod_sync_lock);
> +
> +	kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
> +	return 0;
> +}
> +
>  long kvm_arch_vm_ioctl(struct file *filp,
>  		       unsigned int ioctl, unsigned long arg)
>  {
> @@ -6064,57 +6151,11 @@ long kvm_arch_vm_ioctl(struct file *filp,
>  	}
>  #endif
>  	case KVM_SET_CLOCK: {

The curly braces can be dropped, both here and in KVM_GET_CLOCK.

>  	}
>  	case KVM_GET_CLOCK: {

...

>  	}



More information about the linux-arm-kernel mailing list