Commit 8a53e7e5 authored by Paolo Bonzini

Merge branch 'kvm-ppc-next' of git://git.kernel.org/pub/scm/linux/kernel/git/paulus/powerpc into HEAD

- Better machine check handling for HV KVM
- Ability to support guests with threads=2, 4 or 8 on POWER9
- Fix for a race that could cause delayed recognition of signals
- Fix for a bug where POWER9 guests could sleep with interrupts
  pending.
parents 00c14757 8b24e69f
......@@ -4131,6 +4131,34 @@ Parameters: none
Allow use of adapter-interruption suppression.
Returns: 0 on success; -EBUSY if a VCPU has already been created.
7.11 KVM_CAP_PPC_SMT
Architectures: ppc
Parameters: vsmt_mode, flags
Enabling this capability on a VM provides userspace with a way to set
the desired virtual SMT mode (i.e. the number of virtual CPUs per
virtual core). The virtual SMT mode, vsmt_mode, must be a power of 2
between 1 and 8. On POWER8, vsmt_mode must also be no greater than
the number of threads per subcore for the host. Currently flags must
be 0. A successful call to enable this capability will result in
vsmt_mode being returned when the KVM_CAP_PPC_SMT capability is
subsequently queried for the VM. This capability is only supported by
HV KVM, and can only be set before any VCPUs have been created.
The KVM_CAP_PPC_SMT_POSSIBLE capability indicates which virtual SMT
modes are available.
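
As a rough illustration (not part of this patch), userspace could
enable the capability along these lines; vm_fd is assumed to be a
file descriptor returned by KVM_CREATE_VM, the mode of 4 is an
arbitrary choice, and <linux/kvm.h> plus <sys/ioctl.h> are needed:

  struct kvm_enable_cap cap = { 0 };

  cap.cap = KVM_CAP_PPC_SMT;
  cap.args[0] = 4;      /* desired vsmt_mode: power of 2 between 1 and 8 */
  cap.args[1] = 0;      /* flags, currently must be 0 */
  if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0)
          perror("KVM_ENABLE_CAP(KVM_CAP_PPC_SMT)");

  /* querying the VM afterwards returns the mode that was set */
  int vsmt = ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_SMT);
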
7.12 KVM_CAP_PPC_FWNMI
Architectures: ppc
Parameters: none
With this capability, a machine check exception in the guest address
space will cause KVM to exit the guest with the KVM_EXIT_NMI exit
reason. This enables QEMU to build an error log and branch to the
guest kernel's registered machine check handling routine. Without
this capability, KVM will branch to the guest's 0x200 interrupt
vector.
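
As a sketch only, a FWNMI-aware userspace might enable the capability
per vCPU and act on the new exit roughly as follows; vcpu_fd and run
are assumed to be the vCPU file descriptor and its mmap'ed struct
kvm_run area, and the RTAS error-log handling is only hinted at:

  struct kvm_enable_cap cap = { .cap = KVM_CAP_PPC_FWNMI };

  ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);   /* supported by HV KVM only */

  ioctl(vcpu_fd, KVM_RUN, 0);
  if (run->exit_reason == KVM_EXIT_NMI) {
          switch (run->flags & KVM_RUN_PPC_NMI_DISP_MASK) {
          case KVM_RUN_PPC_NMI_DISP_FULLY_RECOV:
                  /* error recovered: log a corrected event to the guest */
                  break;
          default:
                  /* not (fully) recovered: report a fatal machine check */
                  break;
          }
  }
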
8. Other capabilities.
----------------------
......@@ -4292,3 +4320,12 @@ Currently the following bits are defined for the device_irq_level bitmap:
Future versions of kvm may implement additional events. These will get
indicated by returning a higher number from KVM_CHECK_EXTENSION and will be
listed above.
8.10 KVM_CAP_PPC_SMT_POSSIBLE
Architectures: ppc
Querying this capability returns a bitmap indicating the possible
virtual SMT modes that can be set using KVM_CAP_PPC_SMT. If bit N
(counting from the right) is set, then a virtual SMT mode of 2^N is
available.
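
For example, a return value of 0x0f means that vsmt_mode 1, 2, 4 and 8
can all be set. A minimal sketch of decoding the bitmap (kvm_fd is
assumed to be an open /dev/kvm descriptor; needs <stdio.h>):

  int modes = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_SMT_POSSIBLE);
  int n;

  for (n = 0; n < 4; n++)       /* bits 0..3 correspond to modes 1, 2, 4, 8 */
          if (modes & (1 << n))
                  printf("virtual SMT mode %d is available\n", 1 << n);
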
......@@ -86,7 +86,6 @@ struct kvmppc_vcore {
u16 last_cpu;
u8 vcore_state;
u8 in_guest;
struct kvmppc_vcore *master_vcore;
struct kvm_vcpu *runnable_threads[MAX_SMT_THREADS];
struct list_head preempt_list;
spinlock_t lock;
......@@ -81,7 +81,7 @@ struct kvm_split_mode {
u8 subcore_size;
u8 do_nap;
u8 napped[MAX_SMT_THREADS];
struct kvmppc_vcore *master_vcs[MAX_SUBCORES];
struct kvmppc_vcore *vc[MAX_SUBCORES];
};
/*
......@@ -35,6 +35,7 @@
#include <asm/page.h>
#include <asm/cacheflush.h>
#include <asm/hvcall.h>
#include <asm/mce.h>
#define KVM_MAX_VCPUS NR_CPUS
#define KVM_MAX_VCORES NR_CPUS
......@@ -267,6 +268,8 @@ struct kvm_resize_hpt;
struct kvm_arch {
unsigned int lpid;
unsigned int smt_mode; /* # vcpus per virtual core */
unsigned int emul_smt_mode; /* emulated SMT mode, on P9 */
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
unsigned int tlb_sets;
struct kvm_hpt_info hpt;
......@@ -285,6 +288,7 @@ struct kvm_arch {
cpumask_t need_tlb_flush;
cpumask_t cpu_in_guest;
u8 radix;
u8 fwnmi_enabled;
pgd_t *pgtable;
u64 process_table;
struct dentry *debugfs_dir;
......@@ -566,6 +570,7 @@ struct kvm_vcpu_arch {
ulong wort;
ulong tid;
ulong psscr;
ulong hfscr;
ulong shadow_srr1;
#endif
u32 vrsave; /* also USPRG0 */
......@@ -579,7 +584,7 @@ struct kvm_vcpu_arch {
ulong mcsrr0;
ulong mcsrr1;
ulong mcsr;
u32 dec;
ulong dec;
#ifdef CONFIG_BOOKE
u32 decar;
#endif
......@@ -710,6 +715,7 @@ struct kvm_vcpu_arch {
unsigned long pending_exceptions;
u8 ceded;
u8 prodded;
u8 doorbell_request;
u32 last_inst;
struct swait_queue_head *wqp;
......@@ -722,6 +728,7 @@ struct kvm_vcpu_arch {
int prev_cpu;
bool timer_running;
wait_queue_head_t cpu_run;
struct machine_check_event mce_evt; /* Valid if trap == 0x200 */
struct kvm_vcpu_arch_shared *shared;
#if defined(CONFIG_PPC_BOOK3S_64) && defined(CONFIG_KVM_BOOK3S_PR_POSSIBLE)
......@@ -315,6 +315,8 @@ struct kvmppc_ops {
struct irq_bypass_producer *);
int (*configure_mmu)(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg);
int (*get_rmmu_info)(struct kvm *kvm, struct kvm_ppc_rmmu_info *info);
int (*set_smt_mode)(struct kvm *kvm, unsigned long mode,
unsigned long flags);
};
extern struct kvmppc_ops *kvmppc_hv_ops;
......@@ -103,6 +103,8 @@
#define OP_31_XOP_STBUX 247
#define OP_31_XOP_LHZX 279
#define OP_31_XOP_LHZUX 311
#define OP_31_XOP_MSGSNDP 142
#define OP_31_XOP_MSGCLRP 174
#define OP_31_XOP_MFSPR 339
#define OP_31_XOP_LWAX 341
#define OP_31_XOP_LHAX 343
......@@ -60,6 +60,12 @@ struct kvm_regs {
#define KVM_SREGS_E_FSL_PIDn (1 << 0) /* PID1/PID2 */
/* flags for kvm_run.flags */
#define KVM_RUN_PPC_NMI_DISP_MASK (3 << 0)
#define KVM_RUN_PPC_NMI_DISP_FULLY_RECOV (1 << 0)
#define KVM_RUN_PPC_NMI_DISP_LIMITED_RECOV (2 << 0)
#define KVM_RUN_PPC_NMI_DISP_NOT_RECOV (3 << 0)
/*
* Feature bits indicate which sections of the sregs struct are valid,
* both in KVM_GET_SREGS and KVM_SET_SREGS. On KVM_SET_SREGS, registers
......@@ -485,6 +485,7 @@ int main(void)
OFFSET(KVM_ENABLED_HCALLS, kvm, arch.enabled_hcalls);
OFFSET(KVM_VRMA_SLB_V, kvm, arch.vrma_slb_v);
OFFSET(KVM_RADIX, kvm, arch.radix);
OFFSET(KVM_FWNMI, kvm, arch.fwnmi_enabled);
OFFSET(VCPU_DSISR, kvm_vcpu, arch.shregs.dsisr);
OFFSET(VCPU_DAR, kvm_vcpu, arch.shregs.dar);
OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr);
......@@ -513,6 +514,7 @@ int main(void)
OFFSET(VCPU_PENDING_EXC, kvm_vcpu, arch.pending_exceptions);
OFFSET(VCPU_CEDED, kvm_vcpu, arch.ceded);
OFFSET(VCPU_PRODDED, kvm_vcpu, arch.prodded);
OFFSET(VCPU_DBELL_REQ, kvm_vcpu, arch.doorbell_request);
OFFSET(VCPU_MMCR, kvm_vcpu, arch.mmcr);
OFFSET(VCPU_PMC, kvm_vcpu, arch.pmc);
OFFSET(VCPU_SPMC, kvm_vcpu, arch.spmc);
......@@ -542,6 +544,7 @@ int main(void)
OFFSET(VCPU_WORT, kvm_vcpu, arch.wort);
OFFSET(VCPU_TID, kvm_vcpu, arch.tid);
OFFSET(VCPU_PSSCR, kvm_vcpu, arch.psscr);
OFFSET(VCPU_HFSCR, kvm_vcpu, arch.hfscr);
OFFSET(VCORE_ENTRY_EXIT, kvmppc_vcore, entry_exit_map);
OFFSET(VCORE_IN_GUEST, kvmppc_vcore, in_guest);
OFFSET(VCORE_NAPPING_THREADS, kvmppc_vcore, napping_threads);
......@@ -405,6 +405,7 @@ void machine_check_print_event_info(struct machine_check_event *evt,
break;
}
}
EXPORT_SYMBOL_GPL(machine_check_print_event_info);
uint64_t get_mce_fault_addr(struct machine_check_event *evt)
{
......@@ -46,6 +46,8 @@
#include <linux/of.h>
#include <asm/reg.h>
#include <asm/ppc-opcode.h>
#include <asm/disassemble.h>
#include <asm/cputable.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
......@@ -645,6 +647,7 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
unsigned long stolen;
unsigned long core_stolen;
u64 now;
unsigned long flags;
dt = vcpu->arch.dtl_ptr;
vpa = vcpu->arch.vpa.pinned_addr;
......@@ -652,10 +655,10 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
core_stolen = vcore_stolen_time(vc, now);
stolen = core_stolen - vcpu->arch.stolen_logged;
vcpu->arch.stolen_logged = core_stolen;
spin_lock_irq(&vcpu->arch.tbacct_lock);
spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
stolen += vcpu->arch.busy_stolen;
vcpu->arch.busy_stolen = 0;
spin_unlock_irq(&vcpu->arch.tbacct_lock);
spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
if (!dt || !vpa)
return;
memset(dt, 0, sizeof(struct dtl_entry));
......@@ -675,6 +678,26 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
vcpu->arch.dtl.dirty = true;
}
/* See if there is a doorbell interrupt pending for a vcpu */
static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu)
{
int thr;
struct kvmppc_vcore *vc;
if (vcpu->arch.doorbell_request)
return true;
/*
* Ensure that the read of vcore->dpdes comes after the read
* of vcpu->doorbell_request. This barrier matches the
* lwsync in book3s_hv_rmhandlers.S just before the
* fast_guest_return label.
*/
smp_rmb();
vc = vcpu->arch.vcore;
thr = vcpu->vcpu_id - vc->first_vcpuid;
return !!(vc->dpdes & (1 << thr));
}
static bool kvmppc_power8_compatible(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.vcore->arch_compat >= PVR_ARCH_207)
......@@ -926,6 +949,101 @@ static int kvmppc_emulate_debug_inst(struct kvm_run *run,
}
}
static void do_nothing(void *x)
{
}
static unsigned long kvmppc_read_dpdes(struct kvm_vcpu *vcpu)
{
int thr, cpu, pcpu, nthreads;
struct kvm_vcpu *v;
unsigned long dpdes;
nthreads = vcpu->kvm->arch.emul_smt_mode;
dpdes = 0;
cpu = vcpu->vcpu_id & ~(nthreads - 1);
for (thr = 0; thr < nthreads; ++thr, ++cpu) {
v = kvmppc_find_vcpu(vcpu->kvm, cpu);
if (!v)
continue;
/*
* If the vcpu is currently running on a physical cpu thread,
* interrupt it in order to pull it out of the guest briefly,
* which will update its vcore->dpdes value.
*/
pcpu = READ_ONCE(v->cpu);
if (pcpu >= 0)
smp_call_function_single(pcpu, do_nothing, NULL, 1);
if (kvmppc_doorbell_pending(v))
dpdes |= 1 << thr;
}
return dpdes;
}
/*
* On POWER9, emulate doorbell-related instructions in order to
* give the guest the illusion of running on a multi-threaded core.
* The instructions emulated are msgsndp, msgclrp, mfspr TIR,
* and mfspr DPDES.
*/
static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu)
{
u32 inst, rb, thr;
unsigned long arg;
struct kvm *kvm = vcpu->kvm;
struct kvm_vcpu *tvcpu;
if (!cpu_has_feature(CPU_FTR_ARCH_300))
return EMULATE_FAIL;
if (kvmppc_get_last_inst(vcpu, INST_GENERIC, &inst) != EMULATE_DONE)
return RESUME_GUEST;
if (get_op(inst) != 31)
return EMULATE_FAIL;
rb = get_rb(inst);
thr = vcpu->vcpu_id & (kvm->arch.emul_smt_mode - 1);
switch (get_xop(inst)) {
case OP_31_XOP_MSGSNDP:
arg = kvmppc_get_gpr(vcpu, rb);
if (((arg >> 27) & 0xf) != PPC_DBELL_SERVER)
break;
arg &= 0x3f;
if (arg >= kvm->arch.emul_smt_mode)
break;
tvcpu = kvmppc_find_vcpu(kvm, vcpu->vcpu_id - thr + arg);
if (!tvcpu)
break;
if (!tvcpu->arch.doorbell_request) {
tvcpu->arch.doorbell_request = 1;
kvmppc_fast_vcpu_kick_hv(tvcpu);
}
break;
case OP_31_XOP_MSGCLRP:
arg = kvmppc_get_gpr(vcpu, rb);
if (((arg >> 27) & 0xf) != PPC_DBELL_SERVER)
break;
vcpu->arch.vcore->dpdes = 0;
vcpu->arch.doorbell_request = 0;
break;
case OP_31_XOP_MFSPR:
switch (get_sprn(inst)) {
case SPRN_TIR:
arg = thr;
break;
case SPRN_DPDES:
arg = kvmppc_read_dpdes(vcpu);
break;
default:
return EMULATE_FAIL;
}
kvmppc_set_gpr(vcpu, get_rt(inst), arg);
break;
default:
return EMULATE_FAIL;
}
kvmppc_set_pc(vcpu, kvmppc_get_pc(vcpu) + 4);
return RESUME_GUEST;
}
static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
struct task_struct *tsk)
{
......@@ -971,15 +1089,20 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
r = RESUME_GUEST;
break;
case BOOK3S_INTERRUPT_MACHINE_CHECK:
/*
* Deliver a machine check interrupt to the guest.
* We have to do this, even if the host has handled the
* machine check, because machine checks use SRR0/1 and
* the interrupt might have trashed guest state in them.
*/
kvmppc_book3s_queue_irqprio(vcpu,
BOOK3S_INTERRUPT_MACHINE_CHECK);
r = RESUME_GUEST;
/* Exit to guest with KVM_EXIT_NMI as exit reason */
run->exit_reason = KVM_EXIT_NMI;
run->hw.hardware_exit_reason = vcpu->arch.trap;
/* Clear out the old NMI status from run->flags */
run->flags &= ~KVM_RUN_PPC_NMI_DISP_MASK;
/* Now set the NMI status */
if (vcpu->arch.mce_evt.disposition == MCE_DISPOSITION_RECOVERED)
run->flags |= KVM_RUN_PPC_NMI_DISP_FULLY_RECOV;
else
run->flags |= KVM_RUN_PPC_NMI_DISP_NOT_RECOV;
r = RESUME_HOST;
/* Print the MCE event to host console. */
machine_check_print_event_info(&vcpu->arch.mce_evt, false);
break;
case BOOK3S_INTERRUPT_PROGRAM:
{
......@@ -1048,12 +1171,19 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
break;
/*
* This occurs if the guest (kernel or userspace) does something that
* is prohibited by HFSCR. We just generate a program interrupt to
* the guest.
* is prohibited by HFSCR.
* On POWER9, this could be a doorbell instruction that we need
* to emulate.
* Otherwise, we just generate a program interrupt to the guest.
*/
case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
r = RESUME_GUEST;
r = EMULATE_FAIL;
if ((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG)
r = kvmppc_emulate_doorbell_instr(vcpu);
if (r == EMULATE_FAIL) {
kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
r = RESUME_GUEST;
}
break;
case BOOK3S_INTERRUPT_HV_RM_HARD:
r = RESUME_PASSTHROUGH;
......@@ -1143,6 +1273,12 @@ static void kvmppc_set_lpcr(struct kvm_vcpu *vcpu, u64 new_lpcr,
mask = LPCR_DPFD | LPCR_ILE | LPCR_TC;
if (cpu_has_feature(CPU_FTR_ARCH_207S))
mask |= LPCR_AIL;
/*
* On POWER9, allow userspace to enable large decrementer for the
* guest, whether or not the host has it enabled.
*/
if (cpu_has_feature(CPU_FTR_ARCH_300))
mask |= LPCR_LD;
/* Broken 32-bit version of LPCR must not clear top bits */
if (preserve_top32)
......@@ -1486,6 +1622,14 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len);
break;
case KVM_REG_PPC_TB_OFFSET:
/*
* POWER9 DD1 has an erratum where writing TBU40 causes
* the timebase to lose ticks. So we don't let the
* timebase offset be changed on P9 DD1. (It is
* initialized to zero.)
*/
if (cpu_has_feature(CPU_FTR_POWER9_DD1))
break;
/* round up to multiple of 2^24 */
vcpu->arch.vcore->tb_offset =
ALIGN(set_reg_val(id, *val), 1UL << 24);
......@@ -1603,7 +1747,7 @@ static struct kvmppc_vcore *kvmppc_vcore_create(struct kvm *kvm, int core)
init_swait_queue_head(&vcore->wq);
vcore->preempt_tb = TB_NIL;
vcore->lpcr = kvm->arch.lpcr;
vcore->first_vcpuid = core * threads_per_vcore();
vcore->first_vcpuid = core * kvm->arch.smt_mode;
vcore->kvm = kvm;
INIT_LIST_HEAD(&vcore->preempt_list);
......@@ -1762,14 +1906,10 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
unsigned int id)
{
struct kvm_vcpu *vcpu;
int err = -EINVAL;
int err;
int core;
struct kvmppc_vcore *vcore;
core = id / threads_per_vcore();
if (core >= KVM_MAX_VCORES)
goto out;
err = -ENOMEM;
vcpu = kmem_cache_zalloc(kvm_vcpu_cache, GFP_KERNEL);
if (!vcpu)
......@@ -1800,6 +1940,20 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
vcpu->arch.busy_preempt = TB_NIL;
vcpu->arch.intr_msr = MSR_SF | MSR_ME;
/*
* Set the default HFSCR for the guest from the host value.
* This value is only used on POWER9.
* On POWER9 DD1, TM doesn't work, so we make sure to
* prevent the guest from using it.
* On POWER9, we want to virtualize the doorbell facility, so we
* turn off the HFSCR_MSGP bit, which causes those instructions to trap.
*/
vcpu->arch.hfscr = mfspr(SPRN_HFSCR);
if (!cpu_has_feature(CPU_FTR_TM))
vcpu->arch.hfscr &= ~HFSCR_TM;
if (cpu_has_feature(CPU_FTR_ARCH_300))
vcpu->arch.hfscr &= ~HFSCR_MSGP;
kvmppc_mmu_book3s_hv_init(vcpu);
vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
......@@ -1807,11 +1961,17 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
init_waitqueue_head(&vcpu->arch.cpu_run);
mutex_lock(&kvm->lock);
vcore = kvm->arch.vcores[core];
if (!vcore) {
vcore = kvmppc_vcore_create(kvm, core);
kvm->arch.vcores[core] = vcore;
kvm->arch.online_vcores++;
vcore = NULL;
err = -EINVAL;
core = id / kvm->arch.smt_mode;
if (core < KVM_MAX_VCORES) {
vcore = kvm->arch.vcores[core];
if (!vcore) {
err = -ENOMEM;
vcore = kvmppc_vcore_create(kvm, core);
kvm->arch.vcores[core] = vcore;
kvm->arch.online_vcores++;
}
}
mutex_unlock(&kvm->lock);
......@@ -1839,6 +1999,43 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
return ERR_PTR(err);
}
static int kvmhv_set_smt_mode(struct kvm *kvm, unsigned long smt_mode,
unsigned long flags)
{
int err;
int esmt = 0;
if (flags)
return -EINVAL;
if (smt_mode > MAX_SMT_THREADS || !is_power_of_2(smt_mode))
return -EINVAL;
if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
/*
* On POWER8 (or POWER7), the threading mode is "strict",
* so we pack smt_mode vcpus per vcore.
*/
if (smt_mode > threads_per_subcore)
return -EINVAL;
} else {
/*
* On POWER9, the threading mode is "loose",
* so each vcpu gets its own vcore.
*/
esmt = smt_mode;
smt_mode = 1;
}
mutex_lock(&kvm->lock);
err = -EBUSY;
if (!kvm->arch.online_vcores) {
kvm->arch.smt_mode = smt_mode;
kvm->arch.emul_smt_mode = esmt;
err = 0;
}
mutex_unlock(&kvm->lock);
return err;
}
static void unpin_vpa(struct kvm *kvm, struct kvmppc_vpa *vpa)
{
if (vpa->pinned_addr)
......@@ -1889,7 +2086,7 @@ static void kvmppc_end_cede(struct kvm_vcpu *vcpu)
}
}
extern void __kvmppc_vcore_entry(void);
extern int __kvmppc_vcore_entry(void);
static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
struct kvm_vcpu *vcpu)
......@@ -1954,10 +2151,6 @@ static void kvmppc_release_hwthread(int cpu)
tpaca->kvm_hstate.kvm_split_mode = NULL;
}
static void do_nothing(void *x)
{
}
static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
{
int i;
......@@ -1975,11 +2168,35 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
smp_call_function_single(cpu + i, do_nothing, NULL, 1);
}
static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
{
struct kvm *kvm = vcpu->kvm;
/*
* With radix, the guest can do TLB invalidations itself,
* and it could choose to use the local form (tlbiel) if
* it is invalidating a translation that has only ever been
* used on one vcpu. However, that doesn't mean it has
* only ever been used on one physical cpu, since vcpus
* can move around between pcpus. To cope with this, when
* a vcpu moves from one pcpu to another, we need to tell
* any vcpus running on the same core as this vcpu previously
* ran to flush the TLB. The TLB is shared between threads,
* so we use a single bit in .need_tlb_flush for all 4 threads.
*/
if (vcpu->arch.prev_cpu != pcpu) {
if (vcpu->arch.prev_cpu >= 0 &&
cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
cpu_first_thread_sibling(pcpu))
radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
vcpu->arch.prev_cpu = pcpu;
}
}
static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
{
int cpu;
struct paca_struct *tpaca;
struct kvmppc_vcore *mvc = vc->master_vcore;
struct kvm *kvm = vc->kvm;
cpu = vc->pcpu;
......@@ -1989,36 +2206,16 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
vcpu->arch.timer_running = 0;
}
cpu += vcpu->arch.ptid;
vcpu->cpu = mvc->pcpu;
vcpu->cpu = vc->pcpu;
vcpu->arch.thread_cpu = cpu;
/*
* With radix, the guest can do TLB invalidations itself,
* and it could choose to use the local form (tlbiel) if
* it is invalidating a translation that has only ever been
* used on one vcpu. However, that doesn't mean it has
* only ever been used on one physical cpu, since vcpus
* can move around between pcpus. To cope with this, when
* a vcpu moves from one pcpu to another, we need to tell
* any vcpus running on the same core as this vcpu previously
* ran to flush the TLB. The TLB is shared between threads,
* so we use a single bit in .need_tlb_flush for all 4 threads.
*/
if (kvm_is_radix(kvm) && vcpu->arch.prev_cpu != cpu) {
if (vcpu->arch.prev_cpu >= 0 &&
cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
cpu_first_thread_sibling(cpu))
radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
vcpu->arch.prev_cpu = cpu;
}
cpumask_set_cpu(cpu, &kvm->arch.cpu_in_guest);
}
tpaca = &paca[cpu];
tpaca->kvm_hstate.kvm_vcpu = vcpu;
tpaca->kvm_hstate.ptid = cpu - mvc->pcpu;
tpaca->kvm_hstate.ptid = cpu - vc->pcpu;
/* Order stores to hstate.kvm_vcpu etc. before store to kvm_vcore */
smp_wmb();
tpaca->kvm_hstate.kvm_vcore = mvc;
tpaca->kvm_hstate.kvm_vcore = vc;
if (cpu != smp_processor_id())
kvmppc_ipi_thread(cpu);
}
......@@ -2147,8 +2344,7 @@ struct core_info {
int max_subcore_threads;
int total_threads;
int subcore_threads[MAX_SUBCORES];
struct kvm *subcore_vm[MAX_SUBCORES];
struct list_head vcs[MAX_SUBCORES];
struct kvmppc_vcore *vc[MAX_SUBCORES];
};
/*
......@@ -2159,17 +2355,12 @@ static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 };
static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc)
{
int sub;
memset(cip, 0, sizeof(*cip));
cip->n_subcores = 1;
cip->max_subcore_threads = vc->num_threads;
cip->total_threads = vc->num_threads;
cip->subcore_threads[0] = vc->num_threads;
cip->subcore_vm[0] = vc->kvm;
for (sub = 0; sub < MAX_SUBCORES; ++sub)
INIT_LIST_HEAD(&cip->vcs[sub]);
list_add_tail(&vc->preempt_list, &cip->vcs[0]);
cip->vc[0] = vc;
}
static bool subcore_config_ok(int n_subcores, int n_threads)
......@@ -2189,9 +2380,8 @@ static bool subcore_config_ok(int n_subcores, int n_threads)
return n_subcores * roundup_pow_of_two(n_threads) <= MAX_SMT_THREADS;
}
static void init_master_vcore(struct kvmppc_vcore *vc)
static void init_vcore_to_run(struct kvmppc_vcore *vc)
{
vc->master_vcore = vc;
vc->entry_exit_map = 0;
vc->in_guest = 0;
vc->napping_threads = 0;
......@@ -2216,9 +2406,9 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
++cip->n_subcores;
cip->total_threads += vc->num_threads;
cip->subcore_threads[sub] = vc->num_threads;
cip->subcore_vm[sub] = vc->kvm;
init_master_vcore(vc);
list_move_tail(&vc->preempt_list, &cip->vcs[sub]);
cip->vc[sub] = vc;
init_vcore_to_run(vc);
list_del_init(&vc->preempt_list);
return true;
}
......@@ -2286,6 +2476,18 @@ static void collect_piggybacks(struct core_info *cip, int target_threads)
spin_unlock(&lp->lock);
}
static bool recheck_signals(struct core_info *cip)
{
int sub, i;
struct kvm_vcpu *vcpu;
for (sub = 0; sub < cip->n_subcores; ++sub)
for_each_runnable_thread(i, vcpu, cip->vc[sub])
if (signal_pending(vcpu->arch.run_task))
return true;
return false;
}
static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
{
int still_running = 0, i;
......@@ -2323,7 +2525,6 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
wake_up(&vcpu->arch.cpu_run);
}
}
list_del_init(&vc->preempt_list);
if (!is_master) {
if (still_running > 0) {
kvmppc_vcore_preempt(vc);
......@@ -2385,6 +2586,21 @@ static inline int kvmppc_set_host_core(unsigned int cpu)
return 0;
}
static void set_irq_happened(int trap)
{
switch (trap) {
case BOOK3S_INTERRUPT_EXTERNAL:
local_paca->irq_happened |= PACA_IRQ_EE;
break;
case BOOK3S_INTERRUPT_H_DOORBELL:
local_paca->irq_happened |= PACA_IRQ_DBELL;
break;
case BOOK3S_INTERRUPT_HMI:
local_paca->irq_happened |= PACA_IRQ_HMI;
break;
}
}
/*
* Run a set of guest threads on a physical core.
* Called with vc->lock held.
......@@ -2395,7 +2611,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
int i;
int srcu_idx;
struct core_info core_info;
struct kvmppc_vcore *pvc, *vcnext;
struct kvmppc_vcore *pvc;
struct kvm_split_mode split_info, *sip;
int split, subcore_size, active;
int sub;
......@@ -2404,6 +2620,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
int pcpu, thr;
int target_threads;
int controlled_threads;
int trap;
/*
* Remove from the list any threads that have a signal pending
......@@ -2418,7 +2635,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
/*
* Initialize *vc.
*/
init_master_vcore(vc);
init_vcore_to_run(vc);
vc->preempt_tb = TB_NIL;
/*
......@@ -2455,6 +2672,43 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
if (vc->num_threads < target_threads)
collect_piggybacks(&core_info, target_threads);
/*
* On radix, arrange for TLB flushing if necessary.
* This has to be done before disabling interrupts since
* it uses smp_call_function().
*/
pcpu = smp_processor_id();
if (kvm_is_radix(vc->kvm)) {
for (sub = 0; sub < core_info.n_subcores; ++sub)
for_each_runnable_thread(i, vcpu, core_info.vc[sub])
kvmppc_prepare_radix_vcpu(vcpu, pcpu);
}
/*
* Hard-disable interrupts, and check resched flag and signals.
* If we need to reschedule or deliver a signal, clean up
* and return without going into the guest(s).
*/
local_irq_disable();
hard_irq_disable();
if (lazy_irq_pending() || need_resched() ||
recheck_signals(&core_info)) {
local_irq_enable();
vc->vcore_state = VCORE_INACTIVE;
/* Unlock all except the primary vcore */
for (sub = 1; sub < core_info.n_subcores; ++sub) {
pvc = core_info.vc[sub];
/* Put back on to the preempted vcores list */
kvmppc_vcore_preempt(pvc);
spin_unlock(&pvc->lock);
}
for (i = 0; i < controlled_threads; ++i)
kvmppc_release_hwthread(pcpu + i);
return;
}
kvmppc_clear_host_core(pcpu);
/* Decide on micro-threading (split-core) mode */
subcore_size = threads_per_subcore;
cmd_bit = stat_bit = 0;
......@@ -2478,13 +2732,10 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
split_info.ldbar = mfspr(SPRN_LDBAR);
split_info.subcore_size = subcore_size;
for (sub = 0; sub < core_info.n_subcores; ++sub)
split_info.master_vcs[sub] =
list_first_entry(&core_info.vcs[sub],
struct kvmppc_vcore, preempt_list);
split_info.vc[sub] = core_info.vc[sub];
/* order writes to split_info before kvm_split_mode pointer */
smp_wmb();
}
pcpu = smp_processor_id();
for (thr = 0; thr < controlled_threads; ++thr)
paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip;
......@@ -2504,32 +2755,29 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
}
}
kvmppc_clear_host_core(pcpu);
/* Start all the threads */
active = 0;
for (sub = 0; sub < core_info.n_subcores; ++sub) {
thr = subcore_thread_map[sub];
thr0_done = false;
active |= 1 << thr;
list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list) {
pvc->pcpu = pcpu + thr;
for_each_runnable_thread(i, vcpu, pvc) {
kvmppc_start_thread(vcpu, pvc);
kvmppc_create_dtl_entry(vcpu, pvc);
trace_kvm_guest_enter(vcpu);
if (!vcpu->arch.ptid)
thr0_done = true;
active |= 1 << (thr + vcpu->arch.ptid);
}
/*
* We need to start the first thread of each subcore
* even if it doesn't have a vcpu.
*/
if (pvc->master_vcore == pvc && !thr0_done)
kvmppc_start_thread(NULL, pvc);
thr += pvc->num_threads;
pvc = core_info.vc[sub];
pvc->pcpu = pcpu + thr;
for_each_runnable_thread(i, vcpu, pvc) {
kvmppc_start_thread(vcpu, pvc);
kvmppc_create_dtl_entry(vcpu, pvc);
trace_kvm_guest_enter(vcpu);
if (!vcpu->arch.ptid)
thr0_done = true;
active |= 1 << (thr + vcpu->arch.ptid);
}
/*
* We need to start the first thread of each subcore
* even if it doesn't have a vcpu.
*/
if (!thr0_done)
kvmppc_start_thread(NULL, pvc);
thr += pvc->num_threads;
}
/*
......@@ -2556,17 +2804,27 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
trace_kvmppc_run_core(vc, 0);
for (sub = 0; sub < core_info.n_subcores; ++sub)
list_for_each_entry(pvc, &core_info.vcs[sub], preempt_list)
spin_unlock(&pvc->lock);
spin_unlock(&core_info.vc[sub]->lock);
/*
* Interrupts will be enabled once we get into the guest,
* so tell lockdep that we're about to enable interrupts.
*/
trace_hardirqs_on();
guest_enter();
srcu_idx = srcu_read_lock(&vc->kvm->srcu);
__kvmppc_vcore_entry();
trap = __kvmppc_vcore_entry();
srcu_read_unlock(&vc->kvm->srcu, srcu_idx);
guest_exit();
trace_hardirqs_off();
set_irq_happened(trap);
spin_lock(&vc->lock);
/* prevent other vcpu threads from doing kvmppc_start_thread() now */
vc->vcore_state = VCORE_EXITING;
......@@ -2594,6 +2852,10 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
split_info.do_nap = 0;
}
kvmppc_set_host_core(pcpu);
local_irq_enable();
/* Let secondaries go back to the offline loop */
for (i = 0; i < controlled_threads; ++i) {
kvmppc_release_hwthread(pcpu + i);
......@@ -2602,18 +2864,15 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
cpumask_clear_cpu(pcpu + i, &vc->kvm->arch.cpu_in_guest);
}
kvmppc_set_host_core(pcpu);
spin_unlock(&vc->lock);
/* make sure updates to secondary vcpu structs are visible now */
smp_mb();
guest_exit();
for (sub = 0; sub < core_info.n_subcores; ++sub)
list_for_each_entry_safe(pvc, vcnext, &core_info.vcs[sub],
preempt_list)
post_guest_process(pvc, pvc == vc);
for (sub = 0; sub < core_info.n_subcores; ++sub) {
pvc = core_info.vc[sub];
post_guest_process(pvc, pvc == vc);
}
spin_lock(&vc->lock);
preempt_enable();
......@@ -2658,6 +2917,30 @@ static void shrink_halt_poll_ns(struct kvmppc_vcore *vc)
vc->halt_poll_ns /= halt_poll_ns_shrink;
}
#ifdef CONFIG_KVM_XICS
static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
{
if (!xive_enabled())
return false;
return vcpu->arch.xive_saved_state.pipr <
vcpu->arch.xive_saved_state.cppr;
}
#else
static inline bool xive_interrupt_pending(struct kvm_vcpu *vcpu)
{
return false;
}
#endif /* CONFIG_KVM_XICS */
static bool kvmppc_vcpu_woken(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.pending_exceptions || vcpu->arch.prodded ||
kvmppc_doorbell_pending(vcpu) || xive_interrupt_pending(vcpu))
return true;
return false;
}
/*
* Check to see if any of the runnable vcpus on the vcore have pending
* exceptions or are no longer ceded
......@@ -2668,8 +2951,7 @@ static int kvmppc_vcore_check_block(struct kvmppc_vcore *vc)
int i;
for_each_runnable_thread(i, vcpu, vc) {
if (vcpu->arch.pending_exceptions || !vcpu->arch.ceded ||
vcpu->arch.prodded)
if (!vcpu->arch.ceded || kvmppc_vcpu_woken(vcpu))
return 1;
}
......@@ -2811,15 +3093,14 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
*/
if (!signal_pending(current)) {
if (vc->vcore_state == VCORE_PIGGYBACK) {
struct kvmppc_vcore *mvc = vc->master_vcore;
if (spin_trylock(&mvc->lock)) {
if (mvc->vcore_state == VCORE_RUNNING &&
!VCORE_IS_EXITING(mvc)) {
if (spin_trylock(&vc->lock)) {
if (vc->vcore_state == VCORE_RUNNING &&
!VCORE_IS_EXITING(vc)) {
kvmppc_create_dtl_entry(vcpu, vc);
kvmppc_start_thread(vcpu, vc);
trace_kvm_guest_enter(vcpu);
}
spin_unlock(&mvc->lock);
spin_unlock(&vc->lock);
}
} else if (vc->vcore_state == VCORE_RUNNING &&
!VCORE_IS_EXITING(vc)) {
......@@ -2855,7 +3136,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
break;
n_ceded = 0;
for_each_runnable_thread(i, v, vc) {
if (!v->arch.pending_exceptions && !v->arch.prodded)
if (!kvmppc_vcpu_woken(v))
n_ceded += v->arch.ceded;
else
v->arch.ceded = 0;
......@@ -2907,12 +3188,36 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
{
int r;
int srcu_idx;
unsigned long ebb_regs[3] = {}; /* shut up GCC */
unsigned long user_tar = 0;
unsigned int user_vrsave;
if (!vcpu->arch.sane) {
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
return -EINVAL;
}
/*
* Don't allow entry with a suspended transaction, because
* the guest entry/exit code will lose it.
* If the guest has TM enabled, save away their TM-related SPRs
* (they will get restored by the TM unavailable interrupt).
*/
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
if (cpu_has_feature(CPU_FTR_TM) && current->thread.regs &&
(current->thread.regs->msr & MSR_TM)) {
if (MSR_TM_ACTIVE(current->thread.regs->msr)) {
run->exit_reason = KVM_EXIT_FAIL_ENTRY;
run->fail_entry.hardware_entry_failure_reason = 0;
return -EINVAL;
}
current->thread.tm_tfhar = mfspr(SPRN_TFHAR);
current->thread.tm_tfiar = mfspr(SPRN_TFIAR);
current->thread.tm_texasr = mfspr(SPRN_TEXASR);
current->thread.regs->msr &= ~MSR_TM;
}
#endif
kvmppc_core_prepare_to_enter(vcpu);
/* No need to go into the guest when all we'll do is come back out */
......@@ -2934,6 +3239,15 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
flush_all_to_thread(current);
/* Save userspace EBB and other register values */
if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
ebb_regs[0] = mfspr(SPRN_EBBHR);
ebb_regs[1] = mfspr(SPRN_EBBRR);
ebb_regs[2] = mfspr(SPRN_BESCR);
user_tar = mfspr(SPRN_TAR);
}
user_vrsave = mfspr(SPRN_VRSAVE);
vcpu->arch.wqp = &vcpu->arch.vcore->wq;
vcpu->arch.pgdir = current->mm->pgd;
vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
......@@ -2960,6 +3274,16 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
}
} while (is_kvmppc_resume_guest(r));
/* Restore userspace EBB and other register values */
if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
mtspr(SPRN_EBBHR, ebb_regs[0]);
mtspr(SPRN_EBBRR, ebb_regs[1]);
mtspr(SPRN_BESCR, ebb_regs[2]);
mtspr(SPRN_TAR, user_tar);
mtspr(SPRN_FSCR, current->thread.fscr);
}
mtspr(SPRN_VRSAVE, user_vrsave);
out:
vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
atomic_dec(&vcpu->kvm->arch.vcpus_running);
......@@ -3467,6 +3791,19 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
if (!cpu_has_feature(CPU_FTR_ARCH_300))
kvm_hv_vm_activated();
/*
* Initialize smt_mode depending on processor.
* POWER8 and earlier have to use "strict" threading, where
* all vCPUs in a vcore have to run on the same (sub)core,
* whereas on POWER9 the threads can each run a different
* guest.
*/
if (!cpu_has_feature(CPU_FTR_ARCH_300))
kvm->arch.smt_mode = threads_per_subcore;
else
kvm->arch.smt_mode = 1;
kvm->arch.emul_smt_mode = 1;
/*
* Create a debugfs directory for the VM
*/
......@@ -3896,6 +4233,7 @@ static struct kvmppc_ops kvm_ops_hv = {
#endif
.configure_mmu = kvmhv_configure_mmu,
.get_rmmu_info = kvmhv_get_rmmu_info,
.set_smt_mode = kvmhv_set_smt_mode,
};
static int kvm_init_subcore_bitmap(void)
......@@ -307,7 +307,7 @@ void kvmhv_commence_exit(int trap)
return;
for (i = 0; i < MAX_SUBCORES; ++i) {
vc = sip->master_vcs[i];
vc = sip->vc[i];
if (!vc)
break;
do {
......@@ -61,13 +61,6 @@ BEGIN_FTR_SECTION
std r3, HSTATE_DABR(r13)
END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
/* Hard-disable interrupts */
mfmsr r10
std r10, HSTATE_HOST_MSR(r13)
rldicl r10,r10,48,1
rotldi r10,r10,16
mtmsrd r10,1
/* Save host PMU registers */
BEGIN_FTR_SECTION
/* Work around P8 PMAE bug */
......@@ -121,10 +114,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
* Put whatever is in the decrementer into the
* hypervisor decrementer.
*/
BEGIN_FTR_SECTION
ld r5, HSTATE_KVM_VCORE(r13)
ld r6, VCORE_KVM(r5)
ld r9, KVM_HOST_LPCR(r6)
andis. r9, r9, LPCR_LD@h
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
mfspr r8,SPRN_DEC
mftb r7
mtspr SPRN_HDEC,r8
BEGIN_FTR_SECTION
/* On POWER9, don't sign-extend if host LPCR[LD] bit is set */
bne 32f
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
extsw r8,r8
32: mtspr SPRN_HDEC,r8
add r8,r8,r7
std r8,HSTATE_DECEXP(r13)
......@@ -143,6 +146,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
*
* R1 = host R1
* R2 = host R2
* R3 = trap number on this thread
* R12 = exit handler id
* R13 = PACA
*/
......@@ -130,12 +130,28 @@ static long kvmppc_realmode_mc_power7(struct kvm_vcpu *vcpu)
out:
/*
* For a guest that supports the FWNMI capability, hook the MCE event
* into the vcpu structure. We are going to exit the guest with the
* KVM_EXIT_NMI exit reason. On our way out we will pull this event
* from the vcpu structure and print it from thread 0 of the
* core/subcore.
*
* For a guest that does not support the FWNMI capability (old QEMU):
* We are now going to enter the guest either through the machine
* check interrupt (for unhandled errors) or continue from the current
* HSRR0 (for handled errors) in the guest. Hence queue up the event
* so that we can log it from the host console later.
*/
machine_check_queue_event();
if (vcpu->kvm->arch.fwnmi_enabled) {
/*
* Hook up the mce event on to vcpu structure.
* First clear the old event.
*/
memset(&vcpu->arch.mce_evt, 0, sizeof(vcpu->arch.mce_evt));
if (get_mce_event(&mce_evt, MCE_EVENT_RELEASE)) {
vcpu->arch.mce_evt = mce_evt;
}
} else
machine_check_queue_event();
return handled;
}
......@@ -32,12 +32,30 @@
#include <asm/opal.h>
#include <asm/xive-regs.h>
/* Sign-extend HDEC if not on POWER9 */
#define EXTEND_HDEC(reg) \
BEGIN_FTR_SECTION; \
extsw reg, reg; \
END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
#define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM)
/* Values in HSTATE_NAPPING(r13) */
#define NAPPING_CEDE 1
#define NAPPING_NOVCPU 2
/* Stack frame offsets for kvmppc_hv_entry */
#define SFS 160
#define STACK_SLOT_TRAP (SFS-4)
#define STACK_SLOT_TID (SFS-16)
#define STACK_SLOT_PSSCR (SFS-24)
#define STACK_SLOT_PID (SFS-32)
#define STACK_SLOT_IAMR (SFS-40)
#define STACK_SLOT_CIABR (SFS-48)
#define STACK_SLOT_DAWR (SFS-56)
#define STACK_SLOT_DAWRX (SFS-64)
#define STACK_SLOT_HFSCR (SFS-72)
/*
* Call kvmppc_hv_entry in real mode.
* Must be called with interrupts hard-disabled.
......@@ -51,6 +69,7 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline)
std r0, PPC_LR_STKOFF(r1)
stdu r1, -112(r1)
mfmsr r10
std r10, HSTATE_HOST_MSR(r13)
LOAD_REG_ADDR(r5, kvmppc_call_hv_entry)
li r0,MSR_RI
andc r0,r10,r0
......@@ -135,20 +154,21 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
stb r0, HSTATE_HWTHREAD_REQ(r13)
/*
* For external and machine check interrupts, we need
* to call the Linux handler to process the interrupt.
* We do that by jumping to absolute address 0x500 for
* external interrupts, or the machine_check_fwnmi label
* for machine checks (since firmware might have patched
* the vector area at 0x200). The [h]rfid at the end of the
* handler will return to the book3s_hv_interrupts.S code.
* For other interrupts we do the rfid to get back
* to the book3s_hv_interrupts.S code here.
* For external interrupts we need to call the Linux
* handler to process the interrupt. We do that by jumping
* to absolute address 0x500 for external interrupts.
* The [h]rfid at the end of the handler will return to
* the book3s_hv_interrupts.S code. For other interrupts
* we do the rfid to get back to the book3s_hv_interrupts.S
* code here.
*/
ld r8, 112+PPC_LR_STKOFF(r1)
addi r1, r1, 112
ld r7, HSTATE_HOST_MSR(r13)
/* Return the trap number on this thread as the return value */
mr r3, r12
/*
* If we came back from the guest via a relocation-on interrupt,
* we will be in virtual mode at this point, which makes it a
......@@ -158,62 +178,25 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
andi. r0, r0, MSR_IR /* in real mode? */
bne .Lvirt_return
cmpwi cr1, r12, BOOK3S_INTERRUPT_MACHINE_CHECK
cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
beq 11f
cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL
beq 15f /* Invoke the H_DOORBELL handler */
cmpwi cr2, r12, BOOK3S_INTERRUPT_HMI
beq cr2, 14f /* HMI check */
/* RFI into the highmem handler, or branch to interrupt handler */
/* RFI into the highmem handler */
mfmsr r6
li r0, MSR_RI
andc r6, r6, r0
mtmsrd r6, 1 /* Clear RI in MSR */
mtsrr0 r8
mtsrr1 r7
beq cr1, 13f /* machine check */
RFI
/* On POWER7, we have external interrupts set to use HSRR0/1 */
11: mtspr SPRN_HSRR0, r8
mtspr SPRN_HSRR1, r7
ba 0x500
13: b machine_check_fwnmi
14: mtspr SPRN_HSRR0, r8
mtspr SPRN_HSRR1, r7
b hmi_exception_after_realmode
15: mtspr SPRN_HSRR0, r8
mtspr SPRN_HSRR1, r7
ba 0xe80
/* Virtual-mode return - can't get here for HMI or machine check */
/* Virtual-mode return */
.Lvirt_return:
cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
beq 16f
cmpwi r12, BOOK3S_INTERRUPT_H_DOORBELL
beq 17f
andi. r0, r7, MSR_EE /* were interrupts hard-enabled? */
beq 18f
mtmsrd r7, 1 /* if so then re-enable them */
18: mtlr r8
mtlr r8
blr
16: mtspr SPRN_HSRR0, r8 /* jump to reloc-on external vector */
mtspr SPRN_HSRR1, r7
b exc_virt_0x4500_hardware_interrupt
17: mtspr SPRN_HSRR0, r8
mtspr SPRN_HSRR1, r7
b exc_virt_0x4e80_h_doorbell
kvmppc_primary_no_guest:
/* We handle this much like a ceded vcpu */
/* put the HDEC into the DEC, since HDEC interrupts don't wake us */
/* HDEC may be larger than DEC for arch >= v3.00, but since the */
/* HDEC value came from DEC in the first place, it will fit */
mfspr r3, SPRN_HDEC
mtspr SPRN_DEC, r3
/*
......@@ -295,8 +278,9 @@ kvm_novcpu_wakeup:
/* See if our timeslice has expired (HDEC is negative) */
mfspr r0, SPRN_HDEC
EXTEND_HDEC(r0)
li r12, BOOK3S_INTERRUPT_HV_DECREMENTER
cmpwi r0, 0
cmpdi r0, 0
blt kvm_novcpu_exit
/* Got an IPI but other vcpus aren't yet exiting, must be a latecomer */
......@@ -319,10 +303,10 @@ kvm_novcpu_exit:
bl kvmhv_accumulate_time
#endif
13: mr r3, r12
stw r12, 112-4(r1)
stw r12, STACK_SLOT_TRAP(r1)
bl kvmhv_commence_exit
nop
lwz r12, 112-4(r1)
lwz r12, STACK_SLOT_TRAP(r1)
b kvmhv_switch_to_host
/*
......@@ -390,8 +374,8 @@ kvm_secondary_got_guest:
lbz r4, HSTATE_PTID(r13)
cmpwi r4, 0
bne 63f
lis r6, 0x7fff
ori r6, r6, 0xffff
LOAD_REG_ADDR(r6, decrementer_max)
ld r6, 0(r6)
mtspr SPRN_HDEC, r6
/* and set per-LPAR registers, if doing dynamic micro-threading */
ld r6, HSTATE_SPLIT_MODE(r13)
......@@ -545,11 +529,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
* *
*****************************************************************************/
/* Stack frame offsets */
#define STACK_SLOT_TID (112-16)
#define STACK_SLOT_PSSCR (112-24)
#define STACK_SLOT_PID (112-32)
.global kvmppc_hv_entry
kvmppc_hv_entry:
......@@ -565,7 +544,7 @@ kvmppc_hv_entry:
*/
mflr r0
std r0, PPC_LR_STKOFF(r1)
stdu r1, -112(r1)
stdu r1, -SFS(r1)
/* Save R1 in the PACA */
std r1, HSTATE_HOST_R1(r13)
......@@ -749,10 +728,22 @@ BEGIN_FTR_SECTION
mfspr r5, SPRN_TIDR
mfspr r6, SPRN_PSSCR
mfspr r7, SPRN_PID
mfspr r8, SPRN_IAMR
std r5, STACK_SLOT_TID(r1)
std r6, STACK_SLOT_PSSCR(r1)
std r7, STACK_SLOT_PID(r1)
std r8, STACK_SLOT_IAMR(r1)
mfspr r5, SPRN_HFSCR
std r5, STACK_SLOT_HFSCR(r1)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
BEGIN_FTR_SECTION
mfspr r5, SPRN_CIABR
mfspr r6, SPRN_DAWR
mfspr r7, SPRN_DAWRX
std r5, STACK_SLOT_CIABR(r1)
std r6, STACK_SLOT_DAWR(r1)
std r7, STACK_SLOT_DAWRX(r1)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
BEGIN_FTR_SECTION
/* Set partition DABR */
......@@ -895,8 +886,10 @@ FTR_SECTION_ELSE
ld r5, VCPU_TID(r4)
ld r6, VCPU_PSSCR(r4)
oris r6, r6, PSSCR_EC@h /* This makes stop trap to HV */
ld r7, VCPU_HFSCR(r4)
mtspr SPRN_TIDR, r5
mtspr SPRN_PSSCR, r6
mtspr SPRN_HFSCR, r7
ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
8:
......@@ -911,7 +904,7 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
mftb r7
subf r3,r7,r8
mtspr SPRN_DEC,r3
stw r3,VCPU_DEC(r4)
std r3,VCPU_DEC(r4)
ld r5, VCPU_SPRG0(r4)
ld r6, VCPU_SPRG1(r4)
......@@ -968,7 +961,8 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
/* Check if HDEC expires soon */
mfspr r3, SPRN_HDEC
cmpwi r3, 512 /* 1 microsecond */
EXTEND_HDEC(r3)
cmpdi r3, 512 /* 1 microsecond */
blt hdec_soon
#ifdef CONFIG_KVM_XICS
......@@ -1022,7 +1016,13 @@ kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */
li r0, BOOK3S_INTERRUPT_EXTERNAL
bne cr1, 12f
mfspr r0, SPRN_DEC
cmpwi r0, 0
BEGIN_FTR_SECTION
/* On POWER9 check whether the guest has large decrementer enabled */
andis. r8, r8, LPCR_LD@h
bne 15f
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
extsw r0, r0
15: cmpdi r0, 0
li r0, BOOK3S_INTERRUPT_DECREMENTER
bge 5f
......@@ -1032,6 +1032,23 @@ kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */
mr r9, r4
bl kvmppc_msr_interrupt
5:
BEGIN_FTR_SECTION
b fast_guest_return
END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
/* On POWER9, check for pending doorbell requests */
lbz r0, VCPU_DBELL_REQ(r4)
cmpwi r0, 0
beq fast_guest_return
ld r5, HSTATE_KVM_VCORE(r13)
/* Set DPDES register so the CPU will take a doorbell interrupt */
li r0, 1
mtspr SPRN_DPDES, r0
std r0, VCORE_DPDES(r5)
/* Make sure other cpus see vcore->dpdes set before dbell req clear */
lwsync
/* Clear the pending doorbell request */
li r0, 0
stb r0, VCPU_DBELL_REQ(r4)
/*
* Required state:
......@@ -1206,6 +1223,15 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
stw r12,VCPU_TRAP(r9)
/*
* Now that we have saved away SRR0/1 and HSRR0/1,
* interrupts are recoverable in principle, so set MSR_RI.
* This becomes important for relocation-on interrupts from
* the guest, which we can get in radix mode on POWER9.
*/
li r0, MSR_RI
mtmsrd r0, 1
#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
addi r3, r9, VCPU_TB_RMINTR
mr r4, r9
......@@ -1262,6 +1288,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
beq 4f
b guest_exit_cont
3:
/* If it's a hypervisor facility unavailable interrupt, save HFSCR */
cmpwi r12, BOOK3S_INTERRUPT_H_FAC_UNAVAIL
bne 14f
mfspr r3, SPRN_HFSCR
std r3, VCPU_HFSCR(r9)
b guest_exit_cont
14:
/* External interrupt ? */
cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
bne+ guest_exit_cont
......@@ -1449,12 +1482,18 @@ mc_cont:
mtspr SPRN_SPURR,r4
/* Save DEC */
ld r3, HSTATE_KVM_VCORE(r13)
mfspr r5,SPRN_DEC
mftb r6
/* On P9, if the guest has large decr enabled, don't sign extend */
BEGIN_FTR_SECTION
ld r4, VCORE_LPCR(r3)
andis. r4, r4, LPCR_LD@h
bne 16f
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
extsw r5,r5
add r5,r5,r6
16: add r5,r5,r6
/* r5 is a guest timebase value here, convert to host TB */
ld r3,HSTATE_KVM_VCORE(r13)
ld r4,VCORE_TB_OFFSET(r3)
subf r5,r4,r5
std r5,VCPU_DEC_EXPIRES(r9)
......@@ -1499,17 +1538,19 @@ FTR_SECTION_ELSE
rldicl r6, r6, 4, 50 /* r6 &= PSSCR_GUEST_VIS */
rotldi r6, r6, 60
std r6, VCPU_PSSCR(r9)
/* Restore host HFSCR value */
ld r7, STACK_SLOT_HFSCR(r1)
mtspr SPRN_HFSCR, r7
ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
/*
* Restore various registers to 0, where non-zero values
* set by the guest could disrupt the host.
*/
li r0, 0
mtspr SPRN_IAMR, r0
mtspr SPRN_CIABR, r0
mtspr SPRN_DAWRX, r0
mtspr SPRN_PSPB, r0
mtspr SPRN_WORT, r0
BEGIN_FTR_SECTION
mtspr SPRN_IAMR, r0
mtspr SPRN_TCSCR, r0
/* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */
li r0, 1
......@@ -1525,6 +1566,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
std r6,VCPU_UAMOR(r9)
li r6,0
mtspr SPRN_AMR,r6
mtspr SPRN_UAMOR, r6
/* Switch DSCR back to host value */
mfspr r8, SPRN_DSCR
......@@ -1669,13 +1711,23 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
ptesync
/* Restore host values of some registers */
BEGIN_FTR_SECTION
ld r5, STACK_SLOT_CIABR(r1)
ld r6, STACK_SLOT_DAWR(r1)
ld r7, STACK_SLOT_DAWRX(r1)
mtspr SPRN_CIABR, r5
mtspr SPRN_DAWR, r6
mtspr SPRN_DAWRX, r7
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
BEGIN_FTR_SECTION
ld r5, STACK_SLOT_TID(r1)
ld r6, STACK_SLOT_PSSCR(r1)
ld r7, STACK_SLOT_PID(r1)
ld r8, STACK_SLOT_IAMR(r1)
mtspr SPRN_TIDR, r5
mtspr SPRN_PSSCR, r6
mtspr SPRN_PID, r7
mtspr SPRN_IAMR, r8
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
BEGIN_FTR_SECTION
PPC_INVALIDATE_ERAT
......@@ -1819,8 +1871,8 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
li r0, KVM_GUEST_MODE_NONE
stb r0, HSTATE_IN_GUEST(r13)
ld r0, 112+PPC_LR_STKOFF(r1)
addi r1, r1, 112
ld r0, SFS+PPC_LR_STKOFF(r1)
addi r1, r1, SFS
mtlr r0
blr
......@@ -2366,12 +2418,20 @@ END_FTR_SECTION_IFSET(CPU_FTR_TM)
mfspr r3, SPRN_DEC
mfspr r4, SPRN_HDEC
mftb r5
cmpw r3, r4
BEGIN_FTR_SECTION
/* On P9 check whether the guest has large decrementer mode enabled */
ld r6, HSTATE_KVM_VCORE(r13)
ld r6, VCORE_LPCR(r6)
andis. r6, r6, LPCR_LD@h
bne 68f
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
extsw r3, r3
68: EXTEND_HDEC(r4)
cmpd r3, r4
ble 67f
mtspr SPRN_DEC, r4
67:
/* save expiry time of guest decrementer */
extsw r3, r3
add r3, r3, r5
ld r4, HSTATE_KVM_VCPU(r13)
ld r5, HSTATE_KVM_VCORE(r13)
......@@ -2552,22 +2612,32 @@ machine_check_realmode:
ld r9, HSTATE_KVM_VCPU(r13)
li r12, BOOK3S_INTERRUPT_MACHINE_CHECK
/*
* Deliver unhandled/fatal (e.g. UE) MCE errors to guest through
* machine check interrupt (set HSRR0 to 0x200). And for handled
* errors (no-fatal), just go back to guest execution with current
* HSRR0 instead of exiting guest. This new approach will inject
* machine check to guest for fatal error causing guest to crash.
*
* The old code used to return to host for unhandled errors which
* was causing guest to hang with soft lockups inside guest and
* makes it difficult to recover guest instance.
* For a guest that is FWNMI capable, deliver all MCE errors (handled
* and unhandled) by exiting the guest with the KVM_EXIT_NMI exit
* reason. This new approach injects machine check errors into the
* guest address space with additional information in the form of an
* RTAS event, enabling the guest kernel to handle such errors
* suitably.
*
* For a guest that is not FWNMI capable (old QEMU), fall back to the
* old behaviour for backward compatibility:
* Deliver unhandled/fatal (e.g. UE) MCE errors to the guest through
* the machine check interrupt (set HSRR0 to 0x200).
* For handled (non-fatal) errors, just go back to guest execution
* with the current HSRR0.
* If we receive a machine check with MSR(RI)=0, deliver it to the
* guest as a machine check, causing the guest to crash.
*/
ld r11, VCPU_MSR(r9)
rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */
bne mc_cont /* if so, exit to host */
/* Check if guest is capable of handling NMI exit */
ld r10, VCPU_KVM(r9)
lbz r10, KVM_FWNMI(r10)
cmpdi r10, 1 /* FWNMI capable? */
beq mc_cont /* if so, exit with KVM_EXIT_NMI. */
/* if not, fall through for backward compatibility. */
andi. r10, r11, MSR_RI /* check for unrecoverable exception */
beq 1f /* Deliver a machine check to guest */
ld r10, VCPU_PC(r9)
......@@ -39,7 +39,7 @@ void kvmppc_emulate_dec(struct kvm_vcpu *vcpu)
unsigned long dec_nsec;
unsigned long long dec_time;
pr_debug("mtDEC: %x\n", vcpu->arch.dec);
pr_debug("mtDEC: %lx\n", vcpu->arch.dec);
hrtimer_try_to_cancel(&vcpu->arch.dec_timer);
#ifdef CONFIG_PPC_BOOK3S
......@@ -109,7 +109,7 @@ static int kvmppc_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, int rs)
case SPRN_TBWU: break;
case SPRN_DEC:
vcpu->arch.dec = spr_val;
vcpu->arch.dec = (u32) spr_val;
kvmppc_emulate_dec(vcpu);
break;
......@@ -553,13 +553,28 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
case KVM_CAP_PPC_SMT:
r = 0;
if (hv_enabled) {
if (kvm) {
if (kvm->arch.emul_smt_mode > 1)
r = kvm->arch.emul_smt_mode;
else
r = kvm->arch.smt_mode;
} else if (hv_enabled) {
if (cpu_has_feature(CPU_FTR_ARCH_300))
r = 1;
else
r = threads_per_subcore;
}
break;
case KVM_CAP_PPC_SMT_POSSIBLE:
r = 1;
if (hv_enabled) {
if (!cpu_has_feature(CPU_FTR_ARCH_300))
r = ((threads_per_subcore << 1) - 1);
else
/* P9 can emulate dbells, so allow any mode */
r = 8 | 4 | 2 | 1;
}
break;
case KVM_CAP_PPC_RMA:
r = 0;
break;
......@@ -617,6 +632,11 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
/* Disable this on POWER9 until code handles new HPTE format */
r = !!hv_enabled && !cpu_has_feature(CPU_FTR_ARCH_300);
break;
#endif
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
case KVM_CAP_PPC_FWNMI:
r = hv_enabled;
break;
#endif
case KVM_CAP_PPC_HTM:
r = cpu_has_feature(CPU_FTR_TM_COMP) &&
......@@ -1537,6 +1557,15 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
break;
}
#endif /* CONFIG_KVM_XICS */
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
case KVM_CAP_PPC_FWNMI:
r = -EINVAL;
if (!is_kvmppc_hv_enabled(vcpu->kvm))
break;
r = 0;
vcpu->kvm->arch.fwnmi_enabled = true;
break;
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
default:
r = -EINVAL;
break;
......@@ -1711,6 +1740,15 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
r = 0;
break;
}
case KVM_CAP_PPC_SMT: {
unsigned long mode = cap->args[0];
unsigned long flags = cap->args[1];
r = -EINVAL;
if (kvm->arch.kvm_ops->set_smt_mode)
r = kvm->arch.kvm_ops->set_smt_mode(kvm, mode, flags);
break;
}
#endif
default:
r = -EINVAL;
......@@ -925,6 +925,8 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_X86_GUEST_MWAIT 143
#define KVM_CAP_ARM_USER_IRQ 144
#define KVM_CAP_S390_CMMA_MIGRATION 145
#define KVM_CAP_PPC_FWNMI 146
#define KVM_CAP_PPC_SMT_POSSIBLE 147
#ifdef KVM_CAP_IRQ_ROUTING