Commit 3ebcbd22 authored by Anton Romanov, committed by Sean Christopherson

KVM: x86: Use current rather than snapshotted TSC frequency if it is constant

Don't snapshot tsc_khz into the per-CPU cpu_tsc_khz if the host TSC is
constant, in which case the actual TSC frequency will never change; capturing
the TSC during initialization is unnecessary and KVM can simply use tsc_khz
directly.  The snapshot is currently taken via
kvm_timer_init() -> kvmclock_cpu_online() -> tsc_khz_changed(NULL).
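
For reference, a simplified sketch of that pre-patch snapshot path, reconstructed
here for illustration (the real definitions live in arch/x86/kvm/x86.c; see the
tsc_khz_changed() hunk in the diff below):

    /* Pre-patch: capture the kernel's current tsc_khz into KVM's per-CPU copy. */
    static void tsc_khz_changed(void *data)
    {
            struct cpufreq_freqs *freq = data;
            unsigned long khz = 0;

            if (data)
                    khz = freq->new;
            else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
                    khz = cpufreq_quick_get(raw_smp_processor_id());
            if (!khz)
                    khz = tsc_khz;  /* whatever the kernel has calibrated so far */

            __this_cpu_write(cpu_tsc_khz, khz);
    }

    /* CPU online callback: the snapshot happens here during kvm_timer_init(). */
    static int kvmclock_cpu_online(unsigned int cpu)
    {
            tsc_khz_changed(NULL);
            return 0;
    }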

On CPUs with a constant TSC but no hardware-specified TSC frequency,
snapshotting cpu_tsc_khz and using that to set a VM's target TSC frequency
can lead the VM to think its TSC frequency is not what it actually is if
TSC refinement completes after KVM snapshots tsc_khz.  The actual frequency
never changes; only the kernel's calculation of what that frequency is
changes.

Ideally, KVM would not be able to race with TSC refinement, or would have
a hook into tsc_refine_calibration_work() to get an alert when refinement
is complete.  Avoiding the race altogether isn't practical as refinement
takes a relative eternity; it's deliberately put on a work queue outside of
the normal boot sequence to avoid unnecessarily delaying boot.

Adding a hook is doable, but somewhat gross due to KVM's ability to be
built as a module.  And if the TSC is constant, which is likely the case
for every VMX/SVM-capable CPU produced in the last decade, the race can be
hit if and only if userspace is able to create a VM before TSC refinement
completes; refinement is slow, but not that slow.

For now, punt on a proper fix, as not taking a snapshot can help some use
cases and not taking a snapshot is arguably correct irrespective of the race
with refinement.
Signed-off-by: Anton Romanov <romanton@google.com>
Reviewed-by: Sean Christopherson <seanjc@google.com>
Link: https://lore.kernel.org/r/20220608183525.1143682-1-romanton@google.com
Signed-off-by: Sean Christopherson <seanjc@google.com>
parent b80732fd
@@ -2974,6 +2974,22 @@ static void kvm_update_masterclock(struct kvm *kvm)
 	kvm_end_pvclock_update(kvm);
 }
 
+/*
+ * Use the kernel's tsc_khz directly if the TSC is constant, otherwise use KVM's
+ * per-CPU value (which may be zero if a CPU is going offline).  Note, tsc_khz
+ * can change during boot even if the TSC is constant, as it's possible for KVM
+ * to be loaded before TSC calibration completes.  Ideally, KVM would get a
+ * notification when calibration completes, but practically speaking calibration
+ * will complete before userspace is alive enough to create VMs.
+ */
+static unsigned long get_cpu_tsc_khz(void)
+{
+	if (static_cpu_has(X86_FEATURE_CONSTANT_TSC))
+		return tsc_khz;
+	else
+		return __this_cpu_read(cpu_tsc_khz);
+}
+
 /* Called within read_seqcount_begin/retry for kvm->pvclock_sc.  */
 static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
 {
@@ -2984,7 +3000,8 @@ static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
 	get_cpu();
 
 	data->flags = 0;
-	if (ka->use_master_clock && __this_cpu_read(cpu_tsc_khz)) {
+	if (ka->use_master_clock &&
+	    (static_cpu_has(X86_FEATURE_CONSTANT_TSC) || __this_cpu_read(cpu_tsc_khz))) {
 #ifdef CONFIG_X86_64
 		struct timespec64 ts;
 
@@ -2998,7 +3015,7 @@ static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data)
 		data->flags |= KVM_CLOCK_TSC_STABLE;
 		hv_clock.tsc_timestamp = ka->master_cycle_now;
 		hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset;
-		kvm_get_time_scale(NSEC_PER_SEC, __this_cpu_read(cpu_tsc_khz) * 1000LL,
+		kvm_get_time_scale(NSEC_PER_SEC, get_cpu_tsc_khz() * 1000LL,
 				   &hv_clock.tsc_shift,
 				   &hv_clock.tsc_to_system_mul);
 		data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
@@ -3108,7 +3125,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
 	/* Keep irq disabled to prevent changes to the clock */
 	local_irq_save(flags);
-	tgt_tsc_khz = __this_cpu_read(cpu_tsc_khz);
+	tgt_tsc_khz = get_cpu_tsc_khz();
 	if (unlikely(tgt_tsc_khz == 0)) {
 		local_irq_restore(flags);
 		kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
@@ -9038,9 +9055,11 @@ static void tsc_khz_changed(void *data)
 	struct cpufreq_freqs *freq = data;
 	unsigned long khz = 0;
 
+	WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_CONSTANT_TSC));
+
 	if (data)
 		khz = freq->new;
-	else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+	else
 		khz = cpufreq_quick_get(raw_smp_processor_id());
 	if (!khz)
 		khz = tsc_khz;
@@ -9061,8 +9080,10 @@ static void kvm_hyperv_tsc_notifier(void)
 		hyperv_stop_tsc_emulation();
 
 	/* TSC frequency always matches when on Hyper-V */
-	for_each_present_cpu(cpu)
-		per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
+		for_each_present_cpu(cpu)
+			per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
+	}
 	kvm_caps.max_guest_tsc_khz = tsc_khz;
 
 	list_for_each_entry(kvm, &vm_list, vm_list) {
@@ -9199,10 +9220,10 @@ static void kvm_timer_init(void)
 		}
 		cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
 					  CPUFREQ_TRANSITION_NOTIFIER);
-	}
 
-	cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
-			  kvmclock_cpu_online, kvmclock_cpu_down_prep);
+		cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
+				  kvmclock_cpu_online, kvmclock_cpu_down_prep);
+	}
 }
 
 #ifdef CONFIG_X86_64
@@ -9362,10 +9383,11 @@ void kvm_arch_exit(void)
 #endif
 	kvm_lapic_exit();
 
-	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
+	if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
 		cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
 					    CPUFREQ_TRANSITION_NOTIFIER);
-	cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
+		cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
+	}
 #ifdef CONFIG_X86_64
 	pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
 	irq_work_sync(&pvclock_irq_work);