KVM: x86/xen: improve accuracy of Xen timers

A test program such as http://david.woodhou.se/timerlat.c confirms user reports that timers are increasingly inaccurate as the lifetime of a guest increases. Reporting the actual delay observed when asking for 100µs of sleep, it starts off OK on a newly-launched guest but gets worse over time, giving incorrect sleep times: root@ip-10-0-193-21:~# ./timerlat -c -n 5 00000000 latency 103243/100000 (3.2430%) 00000001 latency 103243/100000 (3.2430%) 00000002 latency 103242/100000 (3.2420%) 00000003 latency 103245/100000 (3.2450%) 00000004 latency 103245/100000 (3.2450%) The biggest problem is that get_kvmclock_ns() returns inaccurate values when the guest TSC is scaled. The guest sees a TSC value scaled from the host TSC by a mul/shift conversion (hopefully done in hardware). The guest then converts that guest TSC value into nanoseconds using the mul/shift conversion given to it by the KVM pvclock information. But get_kvmclock_ns() performs only a single conversion directly from host TSC to nanoseconds, giving a different result. A test program at http://david.woodhou.se/tsdrift.c demonstrates the cumulative error over a day. It's non-trivial to fix get_kvmclock_ns(), although I'll come back to that. The actual guest hv_clock is per-CPU, and *theoretically* each vCPU could be running at a *different* frequency. But this patch is needed anyway because... The other issue with Xen timers was that the code would snapshot the host CLOCK_MONOTONIC at some point in time, and then... after a few interrupts may have occurred, some preemption perhaps... would also read the guest's kvmclock. Then it would proceed under the false assumption that those two happened at the *same* time. Any time which *actually* elapsed between reading the two clocks was introduced as inaccuracies in the time at which the timer fired. Fix it to use a variant of kvm_get_time_and_clockread(), which reads the host TSC just *once*, then use the returned TSC value to calculate the kvmclock (making sure to do that the way the guest would instead of making the same mistake get_kvmclock_ns() does). Sadly, hrtimers based on CLOCK_MONOTONIC_RAW are not supported, so Xen timers still have to use CLOCK_MONOTONIC. In practice the difference between the two won't matter over the timescales involved, as the *absolute* values don't matter; just the delta. This does mean a new variant of kvm_get_time_and_clockread() is needed; called kvm_get_monotonic_and_clockread() because that's what it does. Fixes: 5363952605 ("KVM: x86/xen: handle PV timers oneshot mode") Signed-off-by: David Woodhouse <dwmw@amazon.co.uk> Reviewed-by: Paul Durrant <paul@xen.org> Link: https://lore.kernel.org/r/20240227115648.3104-2-dwmw2@infradead.org [sean: massage moved comment, tweak if statement formatting] Signed-off-by: Sean Christopherson <seanjc@google.com>
2025-08-29 02:59:13 +00:00 · 2024-02-27 11:49:15 +00:00 · 2024-02-27 11:49:15 +00:00 · 451a707813
commit 451a707813
parent 003d914220
3 changed files with 152 additions and 40 deletions
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@ -2862,7 +2862,11 @@ static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp,
 	return v * clock->mult;
 }
-static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
+/*
 * As with get_kvmclock_base_ns(), this counts from boot time, at the
 * frequency of CLOCK_MONOTONIC_RAW (hence adding gtos->offs_boot).
 */
 static int do_kvmclock_base(s64 *t, u64 *tsc_timestamp)
 {
 	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
 	unsigned long seq;
@ -2881,6 +2885,29 @@ static int do_monotonic_raw(s64 *t, u64 *tsc_timestamp)
 	return mode;
 }
 /*
 * This calculates CLOCK_MONOTONIC at the time of the TSC snapshot, with
 * no boot time offset.
 */
 static int do_monotonic(s64 *t, u64 *tsc_timestamp)
 {
 	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
 	unsigned long seq;
 	int mode;
 	u64 ns;
 	do {
 		seq = read_seqcount_begin(&gtod->seq);
 		ns = gtod->clock.base_cycles;
 		ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
 		ns >>= gtod->clock.shift;
 		ns += ktime_to_ns(gtod->clock.offset);
 	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
 	*t = ns;
 	return mode;
 }
 static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
 {
 	struct pvclock_gtod_data *gtod = &pvclock_gtod_data;
@ -2902,18 +2929,42 @@ static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp)
 	return mode;
 }
-/* returns true if host is using TSC based clocksource */
+/*
 * Calculates the kvmclock_base_ns (CLOCK_MONOTONIC_RAW + boot time) and
 * reports the TSC value from which it do so. Returns true if host is
 * using TSC based clocksource.
 */
 static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
 {
 	/* checked again under seqlock below */
 	if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
 		return false;
-	return gtod_is_based_on_tsc(do_monotonic_raw(kernel_ns,
+	return gtod_is_based_on_tsc(do_kvmclock_base(kernel_ns,
-						      tsc_timestamp));
+						     tsc_timestamp));
 }
-/* returns true if host is using TSC based clocksource */
+/*
 * Calculates CLOCK_MONOTONIC and reports the TSC value from which it did
 * so. Returns true if host is using TSC based clocksource.
 */
 bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp)
 {
 	/* checked again under seqlock below */
 	if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
 		return false;
 	return gtod_is_based_on_tsc(do_monotonic(kernel_ns,
 						 tsc_timestamp));
 }
 /*
 * Calculates CLOCK_REALTIME and reports the TSC value from which it did
 * so. Returns true if host is using TSC based clocksource.
 *
 * DO NOT USE this for anything related to migration. You want CLOCK_TAI
 * for that.
 */
 static bool kvm_get_walltime_and_clockread(struct timespec64 *ts,
 					   u64 *tsc_timestamp)
 {
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@ -294,6 +294,7 @@ void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
 u64 get_kvmclock_ns(struct kvm *kvm);
 uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm);
 bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp);
 int kvm_read_guest_virt(struct kvm_vcpu *vcpu,
 	gva_t addr, void *val, unsigned int bytes,
--- a/arch/x86/kvm/xen.c
+++ b/arch/x86/kvm/xen.c
@ -24,6 +24,7 @@
 #include <xen/interface/sched.h>
 #include <asm/xen/cpuid.h>
 #include <asm/pvclock.h>
 #include "cpuid.h"
 #include "trace.h"
@ -149,8 +150,93 @@ static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
-static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_ns)
+static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs,
 				bool linux_wa)
 {
 	int64_t kernel_now, delta;
 	uint64_t guest_now;
 	/*
 	 * The guest provides the requested timeout in absolute nanoseconds
 	 * of the KVM clock — as *it* sees it, based on the scaled TSC and
 	 * the pvclock information provided by KVM.
 	 *
 	 * The kernel doesn't support hrtimers based on CLOCK_MONOTONIC_RAW
 	 * so use CLOCK_MONOTONIC. In the timescales covered by timers, the
 	 * difference won't matter much as there is no cumulative effect.
 	 *
 	 * Calculate the time for some arbitrary point in time around "now"
 	 * in terms of both kvmclock and CLOCK_MONOTONIC. Calculate the
 	 * delta between the kvmclock "now" value and the guest's requested
 	 * timeout, apply the "Linux workaround" described below, and add
 	 * the resulting delta to the CLOCK_MONOTONIC "now" value, to get
 	 * the absolute CLOCK_MONOTONIC time at which the timer should
 	 * fire.
 	 */
 	if (vcpu->arch.hv_clock.version && vcpu->kvm->arch.use_master_clock &&
 	    static_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
 		uint64_t host_tsc, guest_tsc;
 		if (!IS_ENABLED(CONFIG_64BIT) ||
 		    !kvm_get_monotonic_and_clockread(&kernel_now, &host_tsc)) {
 			/*
 			 * Don't fall back to get_kvmclock_ns() because it's
 			 * broken; it has a systemic error in its results
 			 * because it scales directly from host TSC to
 			 * nanoseconds, and doesn't scale first to guest TSC
 			 * and *then* to nanoseconds as the guest does.
 			 *
 			 * There is a small error introduced here because time
 			 * continues to elapse between the ktime_get() and the
 			 * subsequent rdtsc(). But not the systemic drift due
 			 * to get_kvmclock_ns().
 			 */
 			kernel_now = ktime_get(); /* This is CLOCK_MONOTONIC */
 			host_tsc = rdtsc();
 		}
 		/* Calculate the guest kvmclock as the guest would do it. */
 		guest_tsc = kvm_read_l1_tsc(vcpu, host_tsc);
 		guest_now = __pvclock_read_cycles(&vcpu->arch.hv_clock,
 						  guest_tsc);
 	} else {
 		/*
 		 * Without CONSTANT_TSC, get_kvmclock_ns() is the only option.
 		 *
 		 * Also if the guest PV clock hasn't been set up yet, as is
 		 * likely to be the case during migration when the vCPU has
 		 * not been run yet. It would be possible to calculate the
 		 * scaling factors properly in that case but there's not much
 		 * point in doing so. The get_kvmclock_ns() drift accumulates
 		 * over time, so it's OK to use it at startup. Besides, on
 		 * migration there's going to be a little bit of skew in the
 		 * precise moment at which timers fire anyway. Often they'll
 		 * be in the "past" by the time the VM is running again after
 		 * migration.
 		 */
 		guest_now = get_kvmclock_ns(vcpu->kvm);
 		kernel_now = ktime_get();
 	}
 	delta = guest_abs - guest_now;
 	/*
 	 * Xen has a 'Linux workaround' in do_set_timer_op() which checks for
 	 * negative absolute timeout values (caused by integer overflow), and
 	 * for values about 13 days in the future (2^50ns) which would be
 	 * caused by jiffies overflow. For those cases, Xen sets the timeout
 	 * 100ms in the future (not *too* soon, since if a guest really did
 	 * set a long timeout on purpose we don't want to keep churning CPU
 	 * time by waking it up).  Emulate Xen's workaround when starting the
 	 * timer in response to __HYPERVISOR_set_timer_op.
 	 */
 	if (linux_wa &&
 	    unlikely((int64_t)guest_abs < 0 ||
 		     (delta > 0 && (uint32_t) (delta >> 50) != 0))) {
 		delta = 100 * NSEC_PER_MSEC;
 		guest_abs = guest_now + delta;
 	}
 	/*
 	 * Avoid races with the old timer firing. Checking timer_expires
 	 * to avoid calling hrtimer_cancel() will only have false positives
@ -162,14 +248,12 @@ static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs, s64 delta_
 	atomic_set(&vcpu->arch.xen.timer_pending, 0);
 	vcpu->arch.xen.timer_expires = guest_abs;
-	if (delta_ns <= 0) {
+	if (delta <= 0)
 		xen_timer_callback(&vcpu->arch.xen.timer);
-	} else {
+	else
 		ktime_t ktime_now = ktime_get();
 		hrtimer_start(&vcpu->arch.xen.timer,
-			      ktime_add_ns(ktime_now, delta_ns),
+			      ktime_add_ns(kernel_now, delta),
 			      HRTIMER_MODE_ABS_HARD);
 	}
 }
 static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu)
@ -997,9 +1081,7 @@ int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
 		/* Start the timer if the new value has a valid vector+expiry. */
 		if (data->u.timer.port && data->u.timer.expires_ns)
-			kvm_xen_start_timer(vcpu, data->u.timer.expires_ns,
+			kvm_xen_start_timer(vcpu, data->u.timer.expires_ns, false);
 					    data->u.timer.expires_ns -
 					    get_kvmclock_ns(vcpu->kvm));
 		r = 0;
 		break;
@ -1472,7 +1554,6 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
 {
 	struct vcpu_set_singleshot_timer oneshot;
 	struct x86_exception e;
 	s64 delta;
 	if (!kvm_xen_timer_enabled(vcpu))
 		return false;
@ -1506,9 +1587,7 @@ static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd,
 			return true;
 		}
-		/* A delta <= 0 results in an immediate callback, which is what we want */
+		kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, false);
 		delta = oneshot.timeout_abs_ns - get_kvmclock_ns(vcpu->kvm);
 		kvm_xen_start_timer(vcpu, oneshot.timeout_abs_ns, delta);
 		*r = 0;
 		return true;
@ -1531,29 +1610,10 @@ static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout,
 	if (!kvm_xen_timer_enabled(vcpu))
 		return false;
-	if (timeout) {
+	if (timeout)
-		uint64_t guest_now = get_kvmclock_ns(vcpu->kvm);
+		kvm_xen_start_timer(vcpu, timeout, true);
-		int64_t delta = timeout - guest_now;
+	else
 		/* Xen has a 'Linux workaround' in do_set_timer_op() which
 		 * checks for negative absolute timeout values (caused by
 		 * integer overflow), and for values about 13 days in the
 		 * future (2^50ns) which would be caused by jiffies
 		 * overflow. For those cases, it sets the timeout 100ms in
 		 * the future (not *too* soon, since if a guest really did
 		 * set a long timeout on purpose we don't want to keep
 		 * churning CPU time by waking it up).
 		 */
 		if (unlikely((int64_t)timeout < 0 ||
 			     (delta > 0 && (uint32_t) (delta >> 50) != 0))) {
 			delta = 100 * NSEC_PER_MSEC;
 			timeout = guest_now + delta;
 		}
 		kvm_xen_start_timer(vcpu, timeout, delta);
 	} else {
 		kvm_xen_stop_timer(vcpu);
 	}
 	*r = 0;
 	return true;