// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * KVM paravirt_ops implementation
 *
 * Copyright (C) 2007, Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 * Copyright IBM Corporation, 2007
 *   Authors: Anthony Liguori <aliguori@us.ibm.com>
 */

#include <linux/context_tracking.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/kvm_para.h>
#include <linux/cpu.h>
#include <linux/mm.h>
#include <linux/highmem.h>
#include <linux/hardirq.h>
#include <linux/notifier.h>
#include <linux/reboot.h>
#include <linux/hash.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/kprobes.h>
#include <linux/debugfs.h>
#include <linux/nmi.h>
#include <linux/swait.h>
#include <asm/timer.h>
#include <asm/cpu.h>
#include <asm/traps.h>
#include <asm/desc.h>
#include <asm/tlbflush.h>
#include <asm/apic.h>
#include <asm/apicdef.h>
#include <asm/hypervisor.h>
#include <asm/tlb.h>
#include <asm/cpuidle_haltpoll.h>

static int kvmapf = 1;

static int __init parse_no_kvmapf(char *arg)
{
	kvmapf = 0;
	return 0;
}

early_param("no-kvmapf", parse_no_kvmapf);

static int steal_acc = 1;
static int __init parse_no_stealacc(char *arg)
{
	steal_acc = 0;
	return 0;
}

early_param("no-steal-acc", parse_no_stealacc);

static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64) __visible;
static int has_steal_clock = 0;

/*
 * No need for any "IO delay" on KVM
 */
static void kvm_io_delay(void)
{
}

#define KVM_TASK_SLEEP_HASHBITS 8
#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)

struct kvm_task_sleep_node {
	struct hlist_node link;
	struct swait_queue_head wq;
	u32 token;
	int cpu;
	bool halted;
};

static struct kvm_task_sleep_head {
	raw_spinlock_t lock;
	struct hlist_head list;
} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];

static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
						  u32 token)
{
	struct hlist_node *p;

	hlist_for_each(p, &b->list) {
		struct kvm_task_sleep_node *n =
			hlist_entry(p, typeof(*n), link);
		if (n->token == token)
			return n;
	}

	return NULL;
}

/*
 * @interrupt_kernel: Is this called from a routine which interrupts the kernel
 * 		      (other than user space)?
 */
void kvm_async_pf_task_wait(u32 token, int interrupt_kernel)
{
	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
	struct kvm_task_sleep_node n, *e;
	DECLARE_SWAITQUEUE(wait);

	rcu_irq_enter();

	raw_spin_lock(&b->lock);
	e = _find_apf_task(b, token);
	if (e) {
		/* dummy entry exists -> the wakeup was delivered ahead of the #PF */
		hlist_del(&e->link);
		kfree(e);
		raw_spin_unlock(&b->lock);

		rcu_irq_exit();
		return;
	}

	n.token = token;
	n.cpu = smp_processor_id();
	n.halted = is_idle_task(current) ||
		   (IS_ENABLED(CONFIG_PREEMPT_COUNT)
		    ? preempt_count() > 1 || rcu_preempt_depth()
		    : interrupt_kernel);
	init_swait_queue_head(&n.wq);
	hlist_add_head(&n.link, &b->list);
	raw_spin_unlock(&b->lock);

	for (;;) {
		if (!n.halted)
			prepare_to_swait_exclusive(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
		if (hlist_unhashed(&n.link))
			break;

		rcu_irq_exit();

		if (!n.halted) {
			local_irq_enable();
			schedule();
			local_irq_disable();
		} else {
			/*
			 * We cannot reschedule. So halt.
			 */
			native_safe_halt();
			local_irq_disable();
		}

		rcu_irq_enter();
	}
	if (!n.halted)
		finish_swait(&n.wq, &wait);

	rcu_irq_exit();
	return;
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);

static void apf_task_wake_one(struct kvm_task_sleep_node *n)
{
	hlist_del_init(&n->link);
	if (n->halted)
		smp_send_reschedule(n->cpu);
	else if (swq_has_sleeper(&n->wq))
		swake_up_one(&n->wq);
}

static void apf_task_wake_all(void)
{
	int i;

	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
		struct hlist_node *p, *next;
		struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
		raw_spin_lock(&b->lock);
		hlist_for_each_safe(p, next, &b->list) {
			struct kvm_task_sleep_node *n =
				hlist_entry(p, typeof(*n), link);
			if (n->cpu == smp_processor_id())
				apf_task_wake_one(n);
		}
		raw_spin_unlock(&b->lock);
	}
}

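/*
 * Wake up the task sleeping on @token.  If the wakeup arrives before the
 * corresponding fault has been handled, a dummy entry is queued instead so
 * that kvm_async_pf_task_wait() can return immediately.  A token of ~0
 * wakes every task queued on this CPU.
 */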
void kvm_async_pf_task_wake(u32 token)
{
	u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
	struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
	struct kvm_task_sleep_node *n;

	if (token == ~0) {
		apf_task_wake_all();
		return;
	}

again:
	raw_spin_lock(&b->lock);
	n = _find_apf_task(b, token);
	if (!n) {
		/*
		 * Async PF was not yet handled.
		 * Add a dummy entry for the token.
		 */
		n = kzalloc(sizeof(*n), GFP_ATOMIC);
		if (!n) {
			/*
			 * Allocation failed! Busy wait while another CPU
			 * handles the async PF.
			 */
			raw_spin_unlock(&b->lock);
			cpu_relax();
			goto again;
		}
		n->token = token;
		n->cpu = smp_processor_id();
		init_swait_queue_head(&n->wq);
		hlist_add_head(&n->link, &b->list);
	} else
		apf_task_wake_one(n);
	raw_spin_unlock(&b->lock);
	return;
}
EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);

u32 kvm_read_and_reset_pf_reason(void)
{
	u32 reason = 0;

	if (__this_cpu_read(apf_reason.enabled)) {
		reason = __this_cpu_read(apf_reason.reason);
		__this_cpu_write(apf_reason.reason, 0);
	}

	return reason;
}
EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
NOKPROBE_SYMBOL(kvm_read_and_reset_pf_reason);

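/*
 * Paravirtual #PF entry point.  Ordinary faults are passed on to
 * do_page_fault(); KVM_PV_REASON_PAGE_NOT_PRESENT puts the task to sleep
 * until the host has brought the page in, and KVM_PV_REASON_PAGE_READY
 * wakes the corresponding waiter.
 */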
dotraplinkage void
do_async_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
	enum ctx_state prev_state;

	switch (kvm_read_and_reset_pf_reason()) {
	default:
		do_page_fault(regs, error_code, address);
		break;
	case KVM_PV_REASON_PAGE_NOT_PRESENT:
		/* page is swapped out by the host. */
		prev_state = exception_enter();
		kvm_async_pf_task_wait((u32)address, !user_mode(regs));
		exception_exit(prev_state);
		break;
	case KVM_PV_REASON_PAGE_READY:
		rcu_irq_enter();
		kvm_async_pf_task_wake((u32)address);
		rcu_irq_exit();
		break;
	}
}
NOKPROBE_SYMBOL(do_async_page_fault);

static void __init paravirt_ops_setup(void)
{
	pv_info.name = "KVM";

	if (kvm_para_has_feature(KVM_FEATURE_NOP_IO_DELAY))
		pv_ops.cpu.io_delay = kvm_io_delay;

#ifdef CONFIG_X86_IO_APIC
	no_timer_check = 1;
#endif
}

static void kvm_register_steal_time(void)
{
	int cpu = smp_processor_id();
	struct kvm_steal_time *st = &per_cpu(steal_time, cpu);

	if (!has_steal_clock)
		return;

	wrmsrl(MSR_KVM_STEAL_TIME, (slow_virt_to_phys(st) | KVM_MSR_ENABLED));
	pr_info("kvm-stealtime: cpu %d, msr %llx\n",
		cpu, (unsigned long long) slow_virt_to_phys(st));
}

static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;

static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
{
	/*
	 * This relies on __test_and_clear_bit() to modify the memory
	 * in a way that is atomic with respect to the local CPU.
	 * The hypervisor only accesses this memory from the local CPU, so
	 * there's no need for locks or memory barriers.
	 * An optimization barrier is implied in apic write.
	 */
	if (__test_and_clear_bit(KVM_PV_EOI_BIT, this_cpu_ptr(&kvm_apic_eoi)))
		return;
	apic->native_eoi_write(APIC_EOI, APIC_EOI_ACK);
}

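/*
 * Per-CPU setup of the paravirtual features: registers the async page
 * fault shared area, the PV EOI word and the steal time record with the
 * hypervisor through their respective MSRs.
 */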
static void kvm_guest_cpu_init(void)
{
	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
		u64 pa = slow_virt_to_phys(this_cpu_ptr(&apf_reason));

#ifdef CONFIG_PREEMPTION
		pa |= KVM_ASYNC_PF_SEND_ALWAYS;
#endif
		pa |= KVM_ASYNC_PF_ENABLED;

		if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF_VMEXIT))
			pa |= KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT;

		wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);
		__this_cpu_write(apf_reason.enabled, 1);
		printk(KERN_INFO"KVM setup async PF for cpu %d\n",
		       smp_processor_id());
	}

	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) {
		unsigned long pa;
		/* Size alignment is implied but just to make it explicit. */
		BUILD_BUG_ON(__alignof__(kvm_apic_eoi) < 4);
		__this_cpu_write(kvm_apic_eoi, 0);
		pa = slow_virt_to_phys(this_cpu_ptr(&kvm_apic_eoi))
			| KVM_MSR_ENABLED;
		wrmsrl(MSR_KVM_PV_EOI_EN, pa);
	}

	if (has_steal_clock)
		kvm_register_steal_time();
}

static void kvm_pv_disable_apf(void)
{
	if (!__this_cpu_read(apf_reason.enabled))
		return;

	wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
	__this_cpu_write(apf_reason.enabled, 0);

	printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
	       smp_processor_id());
}

static void kvm_pv_guest_cpu_reboot(void *unused)
{
	/*
	 * We disable PV EOI before we load a new kernel by kexec,
	 * since MSR_KVM_PV_EOI_EN stores a pointer into the old kernel's
	 * memory.  The new kernel can re-enable it when it boots.
	 */
	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
		wrmsrl(MSR_KVM_PV_EOI_EN, 0);
	kvm_pv_disable_apf();
	kvm_disable_steal_time();
}

static int kvm_pv_reboot_notify(struct notifier_block *nb,
				unsigned long code, void *unused)
{
	if (code == SYS_RESTART)
		on_each_cpu(kvm_pv_guest_cpu_reboot, NULL, 1);
	return NOTIFY_DONE;
}

static struct notifier_block kvm_pv_reboot_nb = {
	.notifier_call = kvm_pv_reboot_notify,
};

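/*
 * Read the steal time accumulated for @cpu.  The version field works like a
 * seqcount: the host makes it odd before updating the record and even again
 * afterwards, so retry while it is odd or changes under us.
 */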
static u64 kvm_steal_clock(int cpu)
{
	u64 steal;
	struct kvm_steal_time *src;
	int version;

	src = &per_cpu(steal_time, cpu);
	do {
		version = src->version;
		virt_rmb();
		steal = src->steal;
		virt_rmb();
	} while ((version & 1) || (version != src->version));

	return steal;
}

void kvm_disable_steal_time(void)
{
	if (!has_steal_clock)
		return;

	wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
}

static inline void __set_percpu_decrypted(void *ptr, unsigned long size)
{
	early_set_memory_decrypted((unsigned long) ptr, size);
}

/*
 * Iterate through all possible CPUs and map the memory regions pointed to
 * by apf_reason, steal_time and kvm_apic_eoi as decrypted at once.
 *
 * Note: we iterate through all possible CPUs to ensure that CPUs
 * hotplugged will have their per-cpu variable already mapped as
 * decrypted.
 */
static void __init sev_map_percpu_data(void)
{
	int cpu;

	if (!sev_active())
		return;

	for_each_possible_cpu(cpu) {
		__set_percpu_decrypted(&per_cpu(apf_reason, cpu), sizeof(apf_reason));
		__set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time));
		__set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi));
	}
}

#ifdef CONFIG_SMP
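/*
 * PV IPIs: one KVM_HC_SEND_IPI hypercall covers a "cluster" of up to
 * 2 * BITS_PER_LONG APIC IDs, passed as a bitmap relative to the lowest
 * APIC ID in the cluster.  __send_ipi_mask() flushes the pending bitmap and
 * starts a new cluster whenever a destination does not fit.
 */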
#define KVM_IPI_CLUSTER_SIZE	(2 * BITS_PER_LONG)

static void __send_ipi_mask(const struct cpumask *mask, int vector)
{
	unsigned long flags;
	int cpu, apic_id, icr;
	int min = 0, max = 0;
#ifdef CONFIG_X86_64
	__uint128_t ipi_bitmap = 0;
#else
	u64 ipi_bitmap = 0;
#endif
	long ret;

	if (cpumask_empty(mask))
		return;

	local_irq_save(flags);

	switch (vector) {
	default:
		icr = APIC_DM_FIXED | vector;
		break;
	case NMI_VECTOR:
		icr = APIC_DM_NMI;
		break;
	}

	for_each_cpu(cpu, mask) {
		apic_id = per_cpu(x86_cpu_to_apicid, cpu);
		if (!ipi_bitmap) {
			min = max = apic_id;
		} else if (apic_id < min && max - apic_id < KVM_IPI_CLUSTER_SIZE) {
			ipi_bitmap <<= min - apic_id;
			min = apic_id;
		} else if (apic_id < min + KVM_IPI_CLUSTER_SIZE) {
			max = apic_id < max ? max : apic_id;
		} else {
			ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
				(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
			WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret);
			min = max = apic_id;
			ipi_bitmap = 0;
		}
		__set_bit(apic_id - min, (unsigned long *)&ipi_bitmap);
	}

	if (ipi_bitmap) {
		ret = kvm_hypercall4(KVM_HC_SEND_IPI, (unsigned long)ipi_bitmap,
			(unsigned long)(ipi_bitmap >> BITS_PER_LONG), min, icr);
		WARN_ONCE(ret < 0, "KVM: failed to send PV IPI: %ld", ret);
	}

	local_irq_restore(flags);
}

static void kvm_send_ipi_mask(const struct cpumask *mask, int vector)
{
	__send_ipi_mask(mask, vector);
}

static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector)
{
	unsigned int this_cpu = smp_processor_id();
	struct cpumask new_mask;
	const struct cpumask *local_mask;

	cpumask_copy(&new_mask, mask);
	cpumask_clear_cpu(this_cpu, &new_mask);
	local_mask = &new_mask;
	__send_ipi_mask(local_mask, vector);
}

/*
 * Set the IPI entry points
 */
static void kvm_setup_pv_ipi(void)
{
	apic->send_IPI_mask = kvm_send_ipi_mask;
	apic->send_IPI_mask_allbutself = kvm_send_ipi_mask_allbutself;
	pr_info("KVM setup pv IPIs\n");
}

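/*
 * PV variant of the call-function IPI: after sending the IPIs, yield to the
 * host if any target vCPU is currently preempted, so it gets a chance to
 * run and process the request.
 */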
static void kvm_smp_send_call_func_ipi(const struct cpumask *mask)
{
	int cpu;

	native_send_call_func_ipi(mask);

	/* Make sure other vCPUs get a chance to run if they need to. */
	for_each_cpu(cpu, mask) {
		if (vcpu_is_preempted(cpu)) {
			kvm_hypercall1(KVM_HC_SCHED_YIELD, per_cpu(x86_cpu_to_apicid, cpu));
			break;
		}
	}
}

static void __init kvm_smp_prepare_cpus(unsigned int max_cpus)
{
	native_smp_prepare_cpus(max_cpus);
	if (kvm_para_has_hint(KVM_HINTS_REALTIME))
		static_branch_disable(&virt_spin_lock_key);
}

static void __init kvm_smp_prepare_boot_cpu(void)
{
	/*
	 * Map the per-cpu variables as decrypted before kvm_guest_cpu_init()
	 * shares the guest physical address with the hypervisor.
	 */
	sev_map_percpu_data();

	kvm_guest_cpu_init();
	native_smp_prepare_boot_cpu();
	kvm_spinlock_init();
}

static void kvm_guest_cpu_offline(void)
{
	kvm_disable_steal_time();
	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
		wrmsrl(MSR_KVM_PV_EOI_EN, 0);
	kvm_pv_disable_apf();
	apf_task_wake_all();
}

static int kvm_cpu_online(unsigned int cpu)
{
	local_irq_disable();
	kvm_guest_cpu_init();
	local_irq_enable();
	return 0;
}

static int kvm_cpu_down_prepare(unsigned int cpu)
{
	local_irq_disable();
	kvm_guest_cpu_offline();
	local_irq_enable();
	return 0;
}
#endif

static void __init kvm_apf_trap_init(void)
{
	update_intr_gate(X86_TRAP_PF, async_page_fault);
}

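/*
 * Paravirtual remote TLB flush.  __pv_tlb_mask is the per-CPU scratch
 * cpumask; preempted vCPUs are removed from it and flagged with
 * KVM_VCPU_FLUSH_TLB instead, so the hypervisor flushes their TLB before
 * running them again.
 */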
static DEFINE_PER_CPU(cpumask_var_t, __pv_tlb_mask);

static void kvm_flush_tlb_others(const struct cpumask *cpumask,
			const struct flush_tlb_info *info)
{
	u8 state;
	int cpu;
	struct kvm_steal_time *src;
	struct cpumask *flushmask = this_cpu_cpumask_var_ptr(__pv_tlb_mask);

	cpumask_copy(flushmask, cpumask);
	/*
	 * We have to flush only the online vCPUs, and queue a flush-on-enter
	 * for preempted vCPUs instead.
	 */
	for_each_cpu(cpu, flushmask) {
		src = &per_cpu(steal_time, cpu);
		state = READ_ONCE(src->preempted);
		if ((state & KVM_VCPU_PREEMPTED)) {
			if (try_cmpxchg(&src->preempted, &state,
					state | KVM_VCPU_FLUSH_TLB))
				__cpumask_clear_cpu(cpu, flushmask);
		}
	}

	native_flush_tlb_others(flushmask, info);
}

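/*
 * Late guest init: wires up the paravirt hooks (async page faults, steal
 * time, PV EOI, PV TLB flush) and the SMP/CPU-hotplug callbacks according
 * to the features advertised by the host.
 */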
static void __init kvm_guest_init(void)
{
	int i;

	paravirt_ops_setup();
	register_reboot_notifier(&kvm_pv_reboot_nb);
	for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
		raw_spin_lock_init(&async_pf_sleepers[i].lock);
	if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
		x86_init.irqs.trap_init = kvm_apf_trap_init;

	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
		has_steal_clock = 1;
		pv_ops.time.steal_clock = kvm_steal_clock;
	}

	if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
	    !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
	    kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
		pv_ops.mmu.flush_tlb_others = kvm_flush_tlb_others;
		pv_ops.mmu.tlb_remove_table = tlb_remove_table;
	}

	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
		apic_set_eoi_write(kvm_guest_apic_eoi_write);

#ifdef CONFIG_SMP
	smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus;
	smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
	if (kvm_para_has_feature(KVM_FEATURE_PV_SCHED_YIELD) &&
	    !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
	    kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
		smp_ops.send_call_func_ipi = kvm_smp_send_call_func_ipi;
		pr_info("KVM setup pv sched yield\n");
	}
	if (cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "x86/kvm:online",
				      kvm_cpu_online, kvm_cpu_down_prepare) < 0)
		pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n");
#else
	sev_map_percpu_data();
	kvm_guest_cpu_init();
#endif

	/*
	 * Hard lockup detection is enabled by default. Disable it, as guests
	 * can get false positives too easily, for example if the host is
	 * overcommitted.
	 */
	hardlockup_detector_disable();
}

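/*
 * Locate KVM's CPUID leaf range by probing for the "KVMKVMKVM" hypervisor
 * signature; returns 0 when not running on KVM.  kvm_cpuid_base() below
 * caches the result.
 */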
static noinline uint32_t __kvm_cpuid_base(void)
{
	if (boot_cpu_data.cpuid_level < 0)
		return 0;	/* So we don't blow up on old processors */

	if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
		return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0);

	return 0;
}

static inline uint32_t kvm_cpuid_base(void)
{
	static int kvm_cpuid_base = -1;

	if (kvm_cpuid_base == -1)
		kvm_cpuid_base = __kvm_cpuid_base();

	return kvm_cpuid_base;
}

bool kvm_para_available(void)
{
	return kvm_cpuid_base() != 0;
}
EXPORT_SYMBOL_GPL(kvm_para_available);

unsigned int kvm_arch_para_features(void)
{
	return cpuid_eax(kvm_cpuid_base() | KVM_CPUID_FEATURES);
}

unsigned int kvm_arch_para_hints(void)
{
	return cpuid_edx(kvm_cpuid_base() | KVM_CPUID_FEATURES);
}
EXPORT_SYMBOL_GPL(kvm_arch_para_hints);

static uint32_t __init kvm_detect(void)
{
	return kvm_cpuid_base();
}

static void __init kvm_apic_init(void)
{
#if defined(CONFIG_SMP)
	if (kvm_para_has_feature(KVM_FEATURE_PV_SEND_IPI))
		kvm_setup_pv_ipi();
#endif
}

static void __init kvm_init_platform(void)
{
	kvmclock_init();
	x86_platform.apic_post_init = kvm_apic_init;
}

const __initconst struct hypervisor_x86 x86_hyper_kvm = {
	.name			= "KVM",
	.detect			= kvm_detect,
	.type			= X86_HYPER_KVM,
	.init.guest_late_init	= kvm_guest_init,
	.init.x2apic_available	= kvm_para_available,
	.init.init_platform	= kvm_init_platform,
};

static __init int activate_jump_labels(void)
{
	if (has_steal_clock) {
		static_key_slow_inc(&paravirt_steal_enabled);
		if (steal_acc)
			static_key_slow_inc(&paravirt_steal_rq_enabled);
	}

	return 0;
}
arch_initcall(activate_jump_labels);

static __init int kvm_setup_pv_tlb_flush(void)
{
	int cpu;

	if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
	    !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
	    kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
		for_each_possible_cpu(cpu) {
			zalloc_cpumask_var_node(per_cpu_ptr(&__pv_tlb_mask, cpu),
				GFP_KERNEL, cpu_to_node(cpu));
		}
		pr_info("KVM setup pv remote TLB flush\n");
	}

	return 0;
}
arch_initcall(kvm_setup_pv_tlb_flush);

#ifdef CONFIG_PARAVIRT_SPINLOCKS

/* Kick a cpu by its apicid. Used to wake up a halted vcpu */
static void kvm_kick_cpu(int cpu)
{
	int apicid;
	unsigned long flags = 0;

	apicid = per_cpu(x86_cpu_to_apicid, cpu);
	kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
}

#include <asm/qspinlock.h>

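/*
 * Wait primitive for the paravirtual qspinlock: halt the vCPU until it is
 * kicked, re-checking the lock byte with interrupts disabled so that a
 * wakeup cannot be lost between the check and the halt.
 */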
static void kvm_wait(u8 *ptr, u8 val)
{
	unsigned long flags;

	if (in_nmi())
		return;

	local_irq_save(flags);

	if (READ_ONCE(*ptr) != val)
		goto out;

	/*
	 * Halt until it's our turn and we've been kicked. Do a safe halt in
	 * the interrupts-enabled case to avoid hanging if the lock data is
	 * overwritten in the IRQ spinlock slowpath and no spurious interrupt
	 * occurs to save us.
	 */
	if (arch_irqs_disabled_flags(flags))
		halt();
	else
		safe_halt();

out:
	local_irq_restore(flags);
}

#ifdef CONFIG_X86_32
__visible bool __kvm_vcpu_is_preempted(long cpu)
{
	struct kvm_steal_time *src = &per_cpu(steal_time, cpu);

	return !!(src->preempted & KVM_VCPU_PREEMPTED);
}
PV_CALLEE_SAVE_REGS_THUNK(__kvm_vcpu_is_preempted);

#else

#include <asm/asm-offsets.h>

extern bool __raw_callee_save___kvm_vcpu_is_preempted(long);

/*
 * Hand-optimized version for x86-64, avoiding the saving and restoring of
 * eight 64-bit registers to/from the stack.
 */
asm(
".pushsection .text;"
".global __raw_callee_save___kvm_vcpu_is_preempted;"
".type __raw_callee_save___kvm_vcpu_is_preempted, @function;"
"__raw_callee_save___kvm_vcpu_is_preempted:"
"movq	__per_cpu_offset(,%rdi,8), %rax;"
"cmpb	$0, " __stringify(KVM_STEAL_TIME_preempted) "+steal_time(%rax);"
"setne	%al;"
"ret;"
".size __raw_callee_save___kvm_vcpu_is_preempted, .-__raw_callee_save___kvm_vcpu_is_preempted;"
".popsection");

#endif

/*
 * Set up the pv_ops.lock callbacks to exploit KVM_FEATURE_PV_UNHALT if present.
 */
void __init kvm_spinlock_init(void)
{
	/* Does host kernel support KVM_FEATURE_PV_UNHALT? */
	if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT))
		return;

	if (kvm_para_has_hint(KVM_HINTS_REALTIME))
		return;

	/* Don't use the pvqspinlock code if there is only 1 vCPU. */
	if (num_possible_cpus() == 1)
		return;

	__pv_init_lock_hash();
	pv_ops.lock.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath;
	pv_ops.lock.queued_spin_unlock =
		PV_CALLEE_SAVE(__pv_queued_spin_unlock);
	pv_ops.lock.wait = kvm_wait;
	pv_ops.lock.kick = kvm_kick_cpu;

	if (kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
		pv_ops.lock.vcpu_is_preempted =
			PV_CALLEE_SAVE(__kvm_vcpu_is_preempted);
	}
}

#endif	/* CONFIG_PARAVIRT_SPINLOCKS */

#ifdef CONFIG_ARCH_CPUIDLE_HALTPOLL

static void kvm_disable_host_haltpoll(void *i)
{
	wrmsrl(MSR_KVM_POLL_CONTROL, 0);
}

static void kvm_enable_host_haltpoll(void *i)
{
	wrmsrl(MSR_KVM_POLL_CONTROL, 1);
}

void arch_haltpoll_enable(unsigned int cpu)
{
	if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL)) {
		pr_err_once("kvm: host does not support poll control\n");
		pr_err_once("kvm: host upgrade recommended\n");
		return;
	}

	/* Enabling guest halt poll disables host halt poll on this CPU. */
	smp_call_function_single(cpu, kvm_disable_host_haltpoll, NULL, 1);
}
EXPORT_SYMBOL_GPL(arch_haltpoll_enable);

void arch_haltpoll_disable(unsigned int cpu)
{
	if (!kvm_para_has_feature(KVM_FEATURE_POLL_CONTROL))
		return;

	/* Disabling guest halt poll re-enables host halt poll on this CPU. */
	smp_call_function_single(cpu, kvm_enable_host_haltpoll, NULL, 1);
}
EXPORT_SYMBOL_GPL(arch_haltpoll_disable);
#endif