Commit 9d67121a authored by Paul Mackerras

Merge remote-tracking branch 'remotes/powerpc/topic/ppc-kvm' into kvm-ppc-next

This merges in the "ppc-kvm" topic branch of the powerpc tree to get a
series of commits that touch both general arch/powerpc code and KVM
code.  These commits will be merged both via the KVM tree and the
powerpc tree.
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
parents aa227864 83a05510
......@@ -1922,6 +1922,7 @@ registers, find a list below:
PPC | KVM_REG_PPC_TIDR | 64
PPC | KVM_REG_PPC_PSSCR | 64
PPC | KVM_REG_PPC_DEC_EXPIRY | 64
PPC | KVM_REG_PPC_PTCR | 64
PPC | KVM_REG_PPC_TM_GPR0 | 64
...
PPC | KVM_REG_PPC_TM_GPR31 | 64
......
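Like the other entries in this table, the new KVM_REG_PPC_PTCR register is read and written with the KVM_GET_ONE_REG/KVM_SET_ONE_REG vcpu ioctls. A hypothetical userspace sketch (set_guest_ptcr and vcpu_fd are illustrative, not part of this series):

#include <linux/kvm.h>
#include <sys/ioctl.h>

/* Hypothetical: point the guest's partition table at 'ptcr' */
static int set_guest_ptcr(int vcpu_fd, __u64 ptcr)
{
	struct kvm_one_reg reg = {
		.id   = KVM_REG_PPC_PTCR,
		.addr = (__u64)(unsigned long)&ptcr,
	};

	return ioctl(vcpu_fd, KVM_SET_ONE_REG, &reg);
}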
......@@ -150,4 +150,25 @@ extern s32 patch__memset_nocache, patch__memcpy_nocache;
extern long flush_count_cache;
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
void kvmppc_save_tm_hv(struct kvm_vcpu *vcpu, u64 msr, bool preserve_nv);
void kvmppc_restore_tm_hv(struct kvm_vcpu *vcpu, u64 msr, bool preserve_nv);
#else
static inline void kvmppc_save_tm_hv(struct kvm_vcpu *vcpu, u64 msr,
bool preserve_nv) { }
static inline void kvmppc_restore_tm_hv(struct kvm_vcpu *vcpu, u64 msr,
bool preserve_nv) { }
#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
void kvmhv_save_host_pmu(void);
void kvmhv_load_host_pmu(void);
void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use);
void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu);
int __kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu);
long kvmppc_h_set_dabr(struct kvm_vcpu *vcpu, unsigned long dabr);
long kvmppc_h_set_xdabr(struct kvm_vcpu *vcpu, unsigned long dabr,
unsigned long dabrx);
#endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */
......@@ -203,6 +203,18 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
BUG();
}
static inline int ap_to_shift(unsigned long ap)
{
int psize;
for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
if (mmu_psize_defs[psize].ap == ap)
return mmu_psize_defs[psize].shift;
}
return -1;
}
static inline unsigned long get_sllp_encoding(int psize)
{
unsigned long sllp;
......
......@@ -53,6 +53,7 @@ extern void radix__flush_tlb_lpid_page(unsigned int lpid,
unsigned long addr,
unsigned long page_size);
extern void radix__flush_pwc_lpid(unsigned int lpid);
extern void radix__flush_tlb_lpid(unsigned int lpid);
extern void radix__local_flush_tlb_lpid(unsigned int lpid);
extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid);
......
......@@ -322,6 +322,11 @@
#define H_GET_24X7_DATA 0xF07C
#define H_GET_PERF_COUNTER_INFO 0xF080
/* Platform-specific hcalls used for nested HV KVM */
#define H_SET_PARTITION_TABLE 0xF800
#define H_ENTER_NESTED 0xF804
#define H_TLB_INVALIDATE 0xF808
/* Values for 2nd argument to H_SET_MODE */
#define H_SET_MODE_RESOURCE_SET_CIABR 1
#define H_SET_MODE_RESOURCE_SET_DAWR 2
......@@ -461,6 +466,42 @@ struct h_cpu_char_result {
u64 behaviour;
};
/* Register state for entering a nested guest with H_ENTER_NESTED */
struct hv_guest_state {
u64 version; /* version of this structure layout */
u32 lpid;
u32 vcpu_token;
/* These registers are hypervisor privileged (at least for writing) */
u64 lpcr;
u64 pcr;
u64 amor;
u64 dpdes;
u64 hfscr;
s64 tb_offset;
u64 dawr0;
u64 dawrx0;
u64 ciabr;
u64 hdec_expiry;
u64 purr;
u64 spurr;
u64 ic;
u64 vtb;
u64 hdar;
u64 hdsisr;
u64 heir;
u64 asdr;
/* These are OS privileged but need to be set late in guest entry */
u64 srr0;
u64 srr1;
u64 sprg[4];
u64 pidr;
u64 cfar;
u64 ppr;
};
/* Latest version of hv_guest_state structure */
#define HV_GUEST_STATE_VERSION 1
#endif /* __ASSEMBLY__ */
#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_HVCALL_H */
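For orientation: with this interface, the L1 hypervisor fills in a struct hv_guest_state and a struct pt_regs image for the nested (L2) vcpu and passes their physical addresses to L0 via H_ENTER_NESTED, which returns the interrupt vector on which L2 exited. A minimal sketch, simplified from the kvmhv_run_single_vcpu() path this series adds (most register fields and all error handling are elided):

	struct hv_guest_state hvregs;
	long trap;

	memset(&hvregs, 0, sizeof(hvregs));
	hvregs.version = HV_GUEST_STATE_VERSION;
	hvregs.lpid = vcpu->kvm->arch.lpid;
	hvregs.vcpu_token = vcpu->vcpu_id;
	hvregs.hdec_expiry = time_limit;
	/* ... lpcr, pcr, amor, srr0/1, sprg[] etc. from vcpu state ... */

	trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
				  __pa(&vcpu->arch.regs));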
......@@ -84,7 +84,6 @@
#define BOOK3S_INTERRUPT_INST_STORAGE 0x400
#define BOOK3S_INTERRUPT_INST_SEGMENT 0x480
#define BOOK3S_INTERRUPT_EXTERNAL 0x500
#define BOOK3S_INTERRUPT_EXTERNAL_LEVEL 0x501
#define BOOK3S_INTERRUPT_EXTERNAL_HV 0x502
#define BOOK3S_INTERRUPT_ALIGNMENT 0x600
#define BOOK3S_INTERRUPT_PROGRAM 0x700
......@@ -134,8 +133,7 @@
#define BOOK3S_IRQPRIO_EXTERNAL 14
#define BOOK3S_IRQPRIO_DECREMENTER 15
#define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR 16
#define BOOK3S_IRQPRIO_EXTERNAL_LEVEL 17
#define BOOK3S_IRQPRIO_MAX 18
#define BOOK3S_IRQPRIO_MAX 17
#define BOOK3S_HFLAG_DCBZ32 0x1
#define BOOK3S_HFLAG_SLB 0x2
......
......@@ -188,14 +188,37 @@ extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hc);
extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run,
struct kvm_vcpu *vcpu,
unsigned long ea, unsigned long dsisr);
extern int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
struct kvmppc_pte *gpte, u64 root,
u64 *pte_ret_p);
extern int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
struct kvmppc_pte *gpte, u64 table,
int table_index, u64 *pte_ret_p);
extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
struct kvmppc_pte *gpte, bool data, bool iswrite);
extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
unsigned int shift, struct kvm_memory_slot *memslot,
unsigned int lpid);
extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable,
bool writing, unsigned long gpa,
unsigned int lpid);
extern int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
unsigned long gpa,
struct kvm_memory_slot *memslot,
bool writing, bool kvm_ro,
pte_t *inserted_pte, unsigned int *levelp);
extern int kvmppc_init_vm_radix(struct kvm *kvm);
extern void kvmppc_free_radix(struct kvm *kvm);
extern void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd,
unsigned int lpid);
extern int kvmppc_radix_init(void);
extern void kvmppc_radix_exit(void);
extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
unsigned long gfn);
extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
unsigned long gpa, unsigned int shift,
struct kvm_memory_slot *memslot,
unsigned int lpid);
extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
unsigned long gfn);
extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
......@@ -271,6 +294,21 @@ static inline void kvmppc_save_tm_sprs(struct kvm_vcpu *vcpu) {}
static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu) {}
#endif
long kvmhv_nested_init(void);
void kvmhv_nested_exit(void);
void kvmhv_vm_nested_init(struct kvm *kvm);
long kvmhv_set_partition_table(struct kvm_vcpu *vcpu);
void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1);
void kvmhv_release_all_nested(struct kvm *kvm);
long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu);
long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu);
int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu,
u64 time_limit, unsigned long lpcr);
void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr);
void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
struct hv_guest_state *hr);
long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu);
void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
extern int kvm_irq_bypass;
......@@ -301,12 +339,12 @@ static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
{
vcpu->arch.cr = val;
vcpu->arch.regs.ccr = val;
}
static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
{
return vcpu->arch.cr;
return vcpu->arch.regs.ccr;
}
static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val)
......@@ -384,9 +422,6 @@ extern int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu);
/* TO = 31 for unconditional trap */
#define INS_TW 0x7fe00008
/* LPIDs we support with this build -- runtime limit may be lower */
#define KVMPPC_NR_LPIDS (LPID_RSVD + 1)
#define SPLIT_HACK_MASK 0xff000000
#define SPLIT_HACK_OFFS 0xfb000000
......
......@@ -23,6 +23,108 @@
#include <linux/string.h>
#include <asm/bitops.h>
#include <asm/book3s/64/mmu-hash.h>
#include <asm/cpu_has_feature.h>
#include <asm/ppc-opcode.h>
#ifdef CONFIG_PPC_PSERIES
static inline bool kvmhv_on_pseries(void)
{
return !cpu_has_feature(CPU_FTR_HVMODE);
}
#else
static inline bool kvmhv_on_pseries(void)
{
return false;
}
#endif
/*
* Structure for a nested guest, that is, for a guest that is managed by
* one of our guests.
*/
struct kvm_nested_guest {
struct kvm *l1_host; /* L1 VM that owns this nested guest */
int l1_lpid; /* lpid L1 guest thinks this guest is */
int shadow_lpid; /* real lpid of this nested guest */
pgd_t *shadow_pgtable; /* our page table for this guest */
u64 l1_gr_to_hr; /* L1's addr of part'n-scoped table */
u64 process_table; /* process table entry for this guest */
long refcnt; /* number of pointers to this struct */
struct mutex tlb_lock; /* serialize page faults and tlbies */
struct kvm_nested_guest *next;
cpumask_t need_tlb_flush;
cpumask_t cpu_in_guest;
short prev_cpu[NR_CPUS];
};
/*
* We define a nested rmap entry as a single 64-bit quantity
* 0xFFF0000000000000 12-bit lpid field
* 0x000FFFFFFFFFF000 40-bit guest 4k page frame number
* 0x0000000000000001 1-bit single entry flag
*/
#define RMAP_NESTED_LPID_MASK 0xFFF0000000000000UL
#define RMAP_NESTED_LPID_SHIFT (52)
#define RMAP_NESTED_GPA_MASK 0x000FFFFFFFFFF000UL
#define RMAP_NESTED_IS_SINGLE_ENTRY 0x0000000000000001UL
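A hypothetical pair of helpers making the packing above concrete (the series open-codes the equivalent in book3s_hv_nested.c):

/* Hypothetical helpers showing the bit layout described above */
static inline u64 nest_rmap_encode(unsigned int lpid, unsigned long gpa)
{
	return (((u64)lpid << RMAP_NESTED_LPID_SHIFT) & RMAP_NESTED_LPID_MASK) |
	       (gpa & RMAP_NESTED_GPA_MASK);
}

static inline unsigned int nest_rmap_lpid(u64 rmap)
{
	return (rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;
}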
/* Structure for a nested guest rmap entry */
struct rmap_nested {
struct llist_node list;
u64 rmap;
};
/*
* for_each_nest_rmap_safe - iterate over the list of nested rmap entries
* safe against removal of the list entry or NULL list
* @pos: a (struct rmap_nested *) to use as a loop cursor
* @node: pointer to the first entry
* NOTE: this can be NULL
* @rmapp: an (unsigned long *) in which to return the rmap entries on each
* iteration
* NOTE: this must point to already allocated memory
*
* The nested_rmap is a llist of (struct rmap_nested) entries pointed to by the
* rmap entry in the memslot. The list is always terminated by a "single entry"
* stored in the list element of the final entry of the llist. If there is ONLY
* a single entry then this is itself in the rmap entry of the memslot, not a
* llist head pointer.
*
* Note that the iterator below assumes that a nested rmap entry is always
* non-zero. This is true for our usage because the LPID field is always
* non-zero (zero is reserved for the host).
*
* This should be used to iterate over the list of rmap_nested entries with
* processing done on the u64 rmap value given by each iteration. This is safe
* against removal of list entries and it is always safe to call free on (pos).
*
* e.g.
* struct rmap_nested *cursor;
* struct llist_node *first;
* unsigned long rmap;
* for_each_nest_rmap_safe(cursor, first, &rmap) {
* do_something(rmap);
* free(cursor);
* }
*/
#define for_each_nest_rmap_safe(pos, node, rmapp) \
for ((pos) = llist_entry((node), typeof(*(pos)), list); \
(node) && \
(*(rmapp) = ((RMAP_NESTED_IS_SINGLE_ENTRY & ((u64) (node))) ? \
((u64) (node)) : ((pos)->rmap))) && \
(((node) = ((RMAP_NESTED_IS_SINGLE_ENTRY & ((u64) (node))) ? \
((struct llist_node *) ((pos) = NULL)) : \
(pos)->list.next)), true); \
(pos) = llist_entry((node), typeof(*(pos)), list))
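Fleshing out the free() example from the comment above in kernel idiom, a hedged sketch (nest_rmap_free_list is hypothetical):

/* Hypothetical example: free every entry on a nested rmap list */
static void nest_rmap_free_list(struct llist_node *first)
{
	struct rmap_nested *cursor;
	unsigned long rmap;

	for_each_nest_rmap_safe(cursor, first, &rmap) {
		/* rmap holds the packed lpid/gpa value for this entry */
		kfree(cursor);	/* no-op for the single-entry case: cursor is NULL */
	}
}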
struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
bool create);
void kvmhv_put_nested(struct kvm_nested_guest *gp);
int kvmhv_nested_next_lpid(struct kvm *kvm, int lpid);
/* Encoding of first parameter for H_TLB_INVALIDATE */
#define H_TLBIE_P1_ENC(ric, prs, r) (___PPC_RIC(ric) | ___PPC_PRS(prs) | \
___PPC_R(r))
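Here ric selects the invalidation type (0 = TLB entry, 1 = page-walk cache, 2 = both), prs chooses process-scoped (1) or partition-scoped (0), and r = 1 selects radix. So the flush helpers later in this series invalidate a single partition-scoped radix page with:

	long rc;

	rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
				lpid, rb);	/* rb encodes address and page size */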
/* Power architecture requires HPT is at least 256kiB, at most 64TiB */
#define PPC_MIN_HPT_ORDER 18
......@@ -435,6 +537,7 @@ static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm)
}
extern void kvmppc_mmu_debugfs_init(struct kvm *kvm);
extern void kvmhv_radix_debugfs_init(struct kvm *kvm);
extern void kvmhv_rm_send_ipi(int cpu);
......@@ -482,7 +585,7 @@ static inline u64 sanitize_msr(u64 msr)
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
static inline void copy_from_checkpoint(struct kvm_vcpu *vcpu)
{
vcpu->arch.cr = vcpu->arch.cr_tm;
vcpu->arch.regs.ccr = vcpu->arch.cr_tm;
vcpu->arch.regs.xer = vcpu->arch.xer_tm;
vcpu->arch.regs.link = vcpu->arch.lr_tm;
vcpu->arch.regs.ctr = vcpu->arch.ctr_tm;
......@@ -499,7 +602,7 @@ static inline void copy_from_checkpoint(struct kvm_vcpu *vcpu)
static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu)
{
vcpu->arch.cr_tm = vcpu->arch.cr;
vcpu->arch.cr_tm = vcpu->arch.regs.ccr;
vcpu->arch.xer_tm = vcpu->arch.regs.xer;
vcpu->arch.lr_tm = vcpu->arch.regs.link;
vcpu->arch.ctr_tm = vcpu->arch.regs.ctr;
......@@ -515,6 +618,17 @@ static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu)
}
#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
extern int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
unsigned long gpa, unsigned int level,
unsigned long mmu_seq, unsigned int lpid,
unsigned long *rmapp, struct rmap_nested **n_rmap);
extern void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp,
struct rmap_nested **n_rmap);
extern void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
struct kvm_memory_slot *memslot,
unsigned long gpa, unsigned long hpa,
unsigned long nbytes);
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
#endif /* __ASM_KVM_BOOK3S_64_H__ */
......@@ -25,6 +25,9 @@
#define XICS_MFRR 0xc
#define XICS_IPI 2 /* interrupt source # for IPIs */
/* LPIDs we support with this build -- runtime limit may be lower */
#define KVMPPC_NR_LPIDS (LPID_RSVD + 1)
/* Maximum number of threads per physical core */
#define MAX_SMT_THREADS 8
......
......@@ -46,12 +46,12 @@ static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
{
vcpu->arch.cr = val;
vcpu->arch.regs.ccr = val;
}
static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
{
return vcpu->arch.cr;
return vcpu->arch.regs.ccr;
}
static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val)
......
......@@ -46,6 +46,7 @@
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
#include <asm/kvm_book3s_asm.h> /* for MAX_SMT_THREADS */
#define KVM_MAX_VCPU_ID (MAX_SMT_THREADS * KVM_MAX_VCORES)
#define KVM_MAX_NESTED_GUESTS KVMPPC_NR_LPIDS
#else
#define KVM_MAX_VCPU_ID KVM_MAX_VCPUS
......@@ -94,6 +95,7 @@ struct dtl_entry;
struct kvmppc_vcpu_book3s;
struct kvmppc_book3s_shadow_vcpu;
struct kvm_nested_guest;
struct kvm_vm_stat {
ulong remote_tlb_flush;
......@@ -287,10 +289,12 @@ struct kvm_arch {
u8 radix;
u8 fwnmi_enabled;
bool threads_indep;
bool nested_enable;
pgd_t *pgtable;
u64 process_table;
struct dentry *debugfs_dir;
struct dentry *htab_dentry;
struct dentry *radix_dentry;
struct kvm_resize_hpt *resize_hpt; /* protected by kvm->lock */
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
......@@ -311,6 +315,9 @@ struct kvm_arch {
#endif
struct kvmppc_ops *kvm_ops;
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
u64 l1_ptcr;
int max_nested_lpid;
struct kvm_nested_guest *nested_guests[KVM_MAX_NESTED_GUESTS];
/* This array can grow quite large, keep it at the end */
struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
#endif
......@@ -360,7 +367,9 @@ struct kvmppc_pte {
bool may_write : 1;
bool may_execute : 1;
unsigned long wimg;
unsigned long rc;
u8 page_size; /* MMU_PAGE_xxx */
u8 page_shift;
};
struct kvmppc_mmu {
......@@ -537,8 +546,6 @@ struct kvm_vcpu_arch {
ulong tar;
#endif
u32 cr;
#ifdef CONFIG_PPC_BOOK3S
ulong hflags;
ulong guest_owned_ext;
......@@ -707,6 +714,7 @@ struct kvm_vcpu_arch {
u8 hcall_needed;
u8 epr_flags; /* KVMPPC_EPR_xxx */
u8 epr_needed;
u8 external_oneshot; /* clear external irq after delivery */
u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
......@@ -781,6 +789,10 @@ struct kvm_vcpu_arch {
u32 emul_inst;
u32 online;
/* For support of nested guests */
struct kvm_nested_guest *nested;
u32 nested_vcpu_id;
#endif
#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
......
......@@ -194,9 +194,7 @@ extern struct kvmppc_spapr_tce_table *kvmppc_find_table(
(iommu_tce_check_ioba((stt)->page_shift, (stt)->offset, \
(stt)->size, (ioba), (npages)) ? \
H_PARAMETER : H_SUCCESS)
extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt,
unsigned long tce);
extern long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
extern long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce,
unsigned long *ua, unsigned long **prmap);
extern void kvmppc_tce_put(struct kvmppc_spapr_tce_table *tt,
unsigned long idx, unsigned long tce);
......@@ -585,6 +583,7 @@ extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
int level, bool line_status);
extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu);
#else
static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
u32 priority) { return -1; }
......@@ -607,6 +606,7 @@ static inline int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) { retur
static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
int level, bool line_status) { return -ENODEV; }
static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { }
#endif /* CONFIG_KVM_XIVE */
/*
......@@ -652,6 +652,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
unsigned long mfrr);
int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr);
int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr);
void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu);
/*
* Host-side operations we want to set up while running in real
......
......@@ -104,6 +104,7 @@
#define OP_31_XOP_LHZUX 311
#define OP_31_XOP_MSGSNDP 142
#define OP_31_XOP_MSGCLRP 174
#define OP_31_XOP_TLBIE 306
#define OP_31_XOP_MFSPR 339
#define OP_31_XOP_LWAX 341
#define OP_31_XOP_LHAX 343
......
......@@ -415,6 +415,7 @@
#define HFSCR_DSCR __MASK(FSCR_DSCR_LG)
#define HFSCR_VECVSX __MASK(FSCR_VECVSX_LG)
#define HFSCR_FP __MASK(FSCR_FP_LG)
#define HFSCR_INTR_CAUSE (ASM_CONST(0xFF) << 56) /* interrupt cause */
#define SPRN_TAR 0x32f /* Target Address Register */
#define SPRN_LPCR 0x13E /* LPAR Control Register */
#define LPCR_VPM0 ASM_CONST(0x8000000000000000)
......@@ -766,6 +767,7 @@
#define SPRN_HSRR0 0x13A /* Save/Restore Register 0 */
#define SPRN_HSRR1 0x13B /* Save/Restore Register 1 */
#define HSRR1_DENORM 0x00100000 /* Denorm exception */
#define HSRR1_HISI_WRITE 0x00010000 /* HISI bcs couldn't update mem */
#define SPRN_TBCTL 0x35f /* PA6T Timebase control register */
#define TBCTL_FREEZE 0x0000000000000000ull /* Freeze all tbs */
......
......@@ -634,6 +634,7 @@ struct kvm_ppc_cpu_char {
#define KVM_REG_PPC_DEC_EXPIRY (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbe)
#define KVM_REG_PPC_ONLINE (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xbf)
#define KVM_REG_PPC_PTCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc0)
/* Transactional Memory checkpointed state:
* This is all GPRs, all VSX regs and a subset of SPRs
......
......@@ -438,7 +438,7 @@ int main(void)
#ifdef CONFIG_PPC_BOOK3S
OFFSET(VCPU_TAR, kvm_vcpu, arch.tar);
#endif
OFFSET(VCPU_CR, kvm_vcpu, arch.cr);
OFFSET(VCPU_CR, kvm_vcpu, arch.regs.ccr);
OFFSET(VCPU_PC, kvm_vcpu, arch.regs.nip);
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
OFFSET(VCPU_MSR, kvm_vcpu, arch.shregs.msr);
......@@ -503,6 +503,7 @@ int main(void)
OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr);
OFFSET(VCPU_VPA_DIRTY, kvm_vcpu, arch.vpa.dirty);
OFFSET(VCPU_HEIR, kvm_vcpu, arch.emul_inst);
OFFSET(VCPU_NESTED, kvm_vcpu, arch.nested);
OFFSET(VCPU_CPU, kvm_vcpu, cpu);
OFFSET(VCPU_THREAD_CPU, kvm_vcpu, arch.thread_cpu);
#endif
......@@ -695,7 +696,7 @@ int main(void)
#endif /* CONFIG_PPC_BOOK3S_64 */
#else /* CONFIG_PPC_BOOK3S */
OFFSET(VCPU_CR, kvm_vcpu, arch.cr);
OFFSET(VCPU_CR, kvm_vcpu, arch.regs.ccr);
OFFSET(VCPU_XER, kvm_vcpu, arch.regs.xer);
OFFSET(VCPU_LR, kvm_vcpu, arch.regs.link);
OFFSET(VCPU_CTR, kvm_vcpu, arch.regs.ctr);
......
......@@ -147,8 +147,8 @@ __init_hvmode_206:
rldicl. r0,r3,4,63
bnelr
ld r5,CPU_SPEC_FEATURES(r4)
LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE)
xor r5,r5,r6
LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE | CPU_FTR_P9_TM_HV_ASSIST)
andc r5,r5,r6
std r5,CPU_SPEC_FEATURES(r4)
blr
......
......@@ -75,7 +75,8 @@ kvm-hv-y += \
book3s_hv.o \
book3s_hv_interrupts.o \
book3s_64_mmu_hv.o \
book3s_64_mmu_radix.o
book3s_64_mmu_radix.o \
book3s_hv_nested.o
kvm-hv-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \
book3s_hv_tm.o
......
......@@ -153,7 +153,6 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec)
case 0x400: prio = BOOK3S_IRQPRIO_INST_STORAGE; break;
case 0x480: prio = BOOK3S_IRQPRIO_INST_SEGMENT; break;
case 0x500: prio = BOOK3S_IRQPRIO_EXTERNAL; break;
case 0x501: prio = BOOK3S_IRQPRIO_EXTERNAL_LEVEL; break;
case 0x600: prio = BOOK3S_IRQPRIO_ALIGNMENT; break;
case 0x700: prio = BOOK3S_IRQPRIO_PROGRAM; break;
case 0x800: prio = BOOK3S_IRQPRIO_FP_UNAVAIL; break;
......@@ -239,18 +238,35 @@ EXPORT_SYMBOL_GPL(kvmppc_core_dequeue_dec);
void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
struct kvm_interrupt *irq)
{
unsigned int vec = BOOK3S_INTERRUPT_EXTERNAL;
if (irq->irq == KVM_INTERRUPT_SET_LEVEL)
vec = BOOK3S_INTERRUPT_EXTERNAL_LEVEL;
/*
* This case (KVM_INTERRUPT_SET) should never actually arise for
* a pseries guest (because pseries guests expect their interrupt
* controllers to continue asserting an external interrupt request
* until it is acknowledged at the interrupt controller), but is
* included to avoid ABI breakage and potentially for other
* sorts of guest.
*
* There is a subtlety here: HV KVM does not test the
* external_oneshot flag in the code that synthesizes
* external interrupts for the guest just before entering
* the guest. That is OK even if userspace did do a
* KVM_INTERRUPT_SET on a pseries guest vcpu, because the
* caller (kvm_vcpu_ioctl_interrupt) does a kvm_vcpu_kick()
* which ends up doing a smp_send_reschedule(), which will
* pull the guest all the way out to the host, meaning that
* we will call kvmppc_core_prepare_to_enter() before entering
* the guest again, and that will handle the external_oneshot
* flag correctly.
*/
if (irq->irq == KVM_INTERRUPT_SET)
vcpu->arch.external_oneshot = 1;
kvmppc_book3s_queue_irqprio(vcpu, vec);
kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
}
void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu)
{
kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
}
void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, ulong dar,
......@@ -281,7 +297,6 @@ static int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu,
vec = BOOK3S_INTERRUPT_DECREMENTER;
break;
case BOOK3S_IRQPRIO_EXTERNAL:
case BOOK3S_IRQPRIO_EXTERNAL_LEVEL:
deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit;
vec = BOOK3S_INTERRUPT_EXTERNAL;
break;
......@@ -355,8 +370,16 @@ static bool clear_irqprio(struct kvm_vcpu *vcpu, unsigned int priority)
case BOOK3S_IRQPRIO_DECREMENTER:
/* DEC interrupts get cleared by mtdec */
return false;
case BOOK3S_IRQPRIO_EXTERNAL_LEVEL:
/* External interrupts get cleared by userspace */
case BOOK3S_IRQPRIO_EXTERNAL:
/*
* External interrupts get cleared by userspace
* except when set by the KVM_INTERRUPT ioctl with
* KVM_INTERRUPT_SET (not KVM_INTERRUPT_SET_LEVEL).
*/
if (vcpu->arch.external_oneshot) {
vcpu->arch.external_oneshot = 0;
return true;
}
return false;
}
......
......@@ -268,14 +268,13 @@ int kvmppc_mmu_hv_init(void)
{
unsigned long host_lpid, rsvd_lpid;
if (!cpu_has_feature(CPU_FTR_HVMODE))
return -EINVAL;
if (!mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE))
return -EINVAL;
/* POWER7 has 10-bit LPIDs (12-bit in POWER8) */
host_lpid = mfspr(SPRN_LPID);
host_lpid = 0;
if (cpu_has_feature(CPU_FTR_HVMODE))
host_lpid = mfspr(SPRN_LPID);
rsvd_lpid = LPID_RSVD;
kvmppc_init_lpid(rsvd_lpid + 1);
......
......@@ -10,6 +10,9 @@
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/debugfs.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
......@@ -26,87 +29,74 @@
*/
static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
struct kvmppc_pte *gpte, bool data, bool iswrite)
int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
struct kvmppc_pte *gpte, u64 root,
u64 *pte_ret_p)
{
struct kvm *kvm = vcpu->kvm;
u32 pid;
int ret, level, ps;
__be64 prte, rpte;
unsigned long ptbl;
unsigned long root, pte, index;
unsigned long rts, bits, offset;
unsigned long gpa;
unsigned long proc_tbl_size;
/* Work out effective PID */
switch (eaddr >> 62) {
case 0:
pid = vcpu->arch.pid;
break;
case 3:
pid = 0;
break;
default:
return -EINVAL;
}
proc_tbl_size = 1 << ((kvm->arch.process_table & PRTS_MASK) + 12);
if (pid * 16 >= proc_tbl_size)
return -EINVAL;
/* Read partition table to find root of tree for effective PID */
ptbl = (kvm->arch.process_table & PRTB_MASK) + (pid * 16);
ret = kvm_read_guest(kvm, ptbl, &prte, sizeof(prte));
if (ret)
return ret;
unsigned long rts, bits, offset, index;
u64 pte, base, gpa;
__be64 rpte;
root = be64_to_cpu(prte);
rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
((root & RTS2_MASK) >> RTS2_SHIFT);
bits = root & RPDS_MASK;
root = root & RPDB_MASK;
base = root & RPDB_MASK;
offset = rts + 31;
/* current implementations only support 52-bit space */
/* Current implementations only support 52-bit space */
if (offset != 52)
return -EINVAL;
/* Walk each level of the radix tree */
for (level = 3; level >= 0; --level) {
u64 addr;
/* Check a valid size */
if (level && bits != p9_supported_radix_bits[level])
return -EINVAL;
if (level == 0 && !(bits == 5 || bits == 9))
return -EINVAL;
offset -= bits;
index = (eaddr >> offset) & ((1UL << bits) - 1);
/* check that low bits of page table base are zero */
if (root & ((1UL << (bits + 3)) - 1))
/* Check that low bits of page table base are zero */
if (base & ((1UL << (bits + 3)) - 1))
return -EINVAL;
ret = kvm_read_guest(kvm, root + index * 8,
&rpte, sizeof(rpte));
if (ret)
/* Read the entry from guest memory */
addr = base + (index * sizeof(rpte));
ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
if (ret) {
if (pte_ret_p)
*pte_ret_p = addr;
return ret;
}
pte = __be64_to_cpu(rpte);
if (!(pte & _PAGE_PRESENT))
return -ENOENT;
/* Check if a leaf entry */
if (pte & _PAGE_PTE)
break;
bits = pte & 0x1f;
root = pte & 0x0fffffffffffff00ul;
/* Get ready to walk the next level */
base = pte & RPDB_MASK;
bits = pte & RPDS_MASK;
}
/* need a leaf at lowest level; 512GB pages not supported */
/* Need a leaf at lowest level; 512GB pages not supported */
if (level < 0 || level == 3)
return -EINVAL;
/* offset is now log base 2 of the page size */
/* We found a valid leaf PTE */
/* Offset is now log base 2 of the page size */
gpa = pte & 0x01fffffffffff000ul;
if (gpa & ((1ul << offset) - 1))
return -EINVAL;
gpa += eaddr & ((1ul << offset) - 1);
gpa |= eaddr & ((1ul << offset) - 1);
for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
if (offset == mmu_psize_defs[ps].shift)
break;
gpte->page_size = ps;
gpte->page_shift = offset;
gpte->eaddr = eaddr;
gpte->raddr = gpa;
......@@ -115,6 +105,77 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
gpte->may_read = !!(pte & _PAGE_READ);
gpte->may_write = !!(pte & _PAGE_WRITE);
gpte->may_execute = !!(pte & _PAGE_EXEC);
gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);
if (pte_ret_p)
*pte_ret_p = pte;
return 0;
}
/*
* Used to walk a partition or process table radix tree in guest memory
* Note: We exploit the fact that a partition table and a process
* table have the same layout, a partition-scoped page table and a
* process-scoped page table have the same layout, and the 2nd
* doubleword of a partition table entry has the same layout as
* the PTCR register.
*/
int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
struct kvmppc_pte *gpte, u64 table,
int table_index, u64 *pte_ret_p)
{
struct kvm *kvm = vcpu->kvm;
int ret;
unsigned long size, ptbl, root;
struct prtb_entry entry;
if ((table & PRTS_MASK) > 24)
return -EINVAL;
size = 1ul << ((table & PRTS_MASK) + 12);
/* Is the table big enough to contain this entry? */
if ((table_index * sizeof(entry)) >= size)
return -EINVAL;
/* Read the table to find the root of the radix tree */
ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
if (ret)
return ret;
/* Root is stored in the first double word */
root = be64_to_cpu(entry.prtb0);
return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
}
int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
struct kvmppc_pte *gpte, bool data, bool iswrite)
{
u32 pid;
u64 pte;
int ret;
/* Work out effective PID */
switch (eaddr >> 62) {
case 0:
pid = vcpu->arch.pid;
break;
case 3:
pid = 0;
break;
default:
return -EINVAL;
}
ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
vcpu->kvm->arch.process_table, pid, &pte);
if (ret)
return ret;
/* Check privilege (applies only to process scoped translations) */
if (kvmppc_get_msr(vcpu) & MSR_PR) {
if (pte & _PAGE_PRIVILEGED) {
gpte->may_read = 0;
......@@ -137,20 +198,46 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
}
static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
unsigned int pshift)
unsigned int pshift, unsigned int lpid)
{
unsigned long psize = PAGE_SIZE;
int psi;
long rc;
unsigned long rb;
if (pshift)
psize = 1UL << pshift;
else
pshift = PAGE_SHIFT;
addr &= ~(psize - 1);
radix__flush_tlb_lpid_page(kvm->arch.lpid, addr, psize);
if (!kvmhv_on_pseries()) {
radix__flush_tlb_lpid_page(lpid, addr, psize);
return;
}
psi = shift_to_mmu_psize(pshift);
rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
lpid, rb);
if (rc)
pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
}
static void kvmppc_radix_flush_pwc(struct kvm *kvm)
static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
{
radix__flush_pwc_lpid(kvm->arch.lpid);
long rc;
if (!kvmhv_on_pseries()) {
radix__flush_pwc_lpid(lpid);
return;
}
rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
lpid, TLBIEL_INVAL_SET_LPID);
if (rc)
pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
}
static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
......@@ -195,23 +282,38 @@ static void kvmppc_pmd_free(pmd_t *pmdp)
kmem_cache_free(kvm_pmd_cache, pmdp);
}
static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
unsigned long gpa, unsigned int shift)
/* Called with kvm->mmu_lock held */
void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
unsigned int shift, struct kvm_memory_slot *memslot,
unsigned int lpid)
{
unsigned long page_size = 1ul << shift;
unsigned long old;
unsigned long gfn = gpa >> PAGE_SHIFT;
unsigned long page_size = PAGE_SIZE;
unsigned long hpa;
old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
kvmppc_radix_tlbie_page(kvm, gpa, shift);
if (old & _PAGE_DIRTY) {
unsigned long gfn = gpa >> PAGE_SHIFT;
struct kvm_memory_slot *memslot;
kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
/* The following only applies to L1 entries */
if (lpid != kvm->arch.lpid)
return;
if (!memslot) {
memslot = gfn_to_memslot(kvm, gfn);
if (memslot && memslot->dirty_bitmap)
kvmppc_update_dirty_map(memslot, gfn, page_size);
if (!memslot)
return;
}
if (shift)
page_size = 1ul << shift;
gpa &= ~(page_size - 1);
hpa = old & PTE_RPN_MASK;
kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);
if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
kvmppc_update_dirty_map(memslot, gfn, page_size);
}
/*
......@@ -224,7 +326,8 @@ static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
* and emit a warning if encountered, but there may already be data
* corruption due to the unexpected mappings.
*/
static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full)
static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
unsigned int lpid)
{
if (full) {
memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE);
......@@ -238,14 +341,15 @@ static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full)
WARN_ON_ONCE(1);
kvmppc_unmap_pte(kvm, p,
pte_pfn(*p) << PAGE_SHIFT,
PAGE_SHIFT);
PAGE_SHIFT, NULL, lpid);
}
}
kvmppc_pte_free(pte);
}
static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full)
static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
unsigned int lpid)
{
unsigned long im;
pmd_t *p = pmd;
......@@ -260,20 +364,21 @@ static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full)
WARN_ON_ONCE(1);
kvmppc_unmap_pte(kvm, (pte_t *)p,
pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
PMD_SHIFT);
PMD_SHIFT, NULL, lpid);
}
} else {
pte_t *pte;
pte = pte_offset_map(p, 0);
kvmppc_unmap_free_pte(kvm, pte, full);
kvmppc_unmap_free_pte(kvm, pte, full, lpid);
pmd_clear(p);
}
}
kvmppc_pmd_free(pmd);
}
static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud)
static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
unsigned int lpid)
{
unsigned long iu;
pud_t *p = pud;
......@@ -287,36 +392,40 @@ static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud)
pmd_t *pmd;
pmd = pmd_offset(p, 0);
kvmppc_unmap_free_pmd(kvm, pmd, true);
kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
pud_clear(p);
}
}
pud_free(kvm->mm, pud);
}
void kvmppc_free_radix(struct kvm *kvm)
void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
{
unsigned long ig;
pgd_t *pgd;
if (!kvm->arch.pgtable)
return;
pgd = kvm->arch.pgtable;
for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
pud_t *pud;
if (!pgd_present(*pgd))
continue;
pud = pud_offset(pgd, 0);
kvmppc_unmap_free_pud(kvm, pud);
kvmppc_unmap_free_pud(kvm, pud, lpid);
pgd_clear(pgd);
}
pgd_free(kvm->mm, kvm->arch.pgtable);
kvm->arch.pgtable = NULL;
}
void kvmppc_free_radix(struct kvm *kvm)
{
if (kvm->arch.pgtable) {
kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
kvm->arch.lpid);
pgd_free(kvm->mm, kvm->arch.pgtable);
kvm->arch.pgtable = NULL;
}
}
static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
unsigned long gpa)
unsigned long gpa, unsigned int lpid)
{
pte_t *pte = pte_offset_kernel(pmd, 0);
......@@ -326,13 +435,13 @@ static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
* flushing the PWC again.
*/
pmd_clear(pmd);
kvmppc_radix_flush_pwc(kvm);
kvmppc_radix_flush_pwc(kvm, lpid);
kvmppc_unmap_free_pte(kvm, pte, false);
kvmppc_unmap_free_pte(kvm, pte, false, lpid);
}
static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
unsigned long gpa)
unsigned long gpa, unsigned int lpid)
{
pmd_t *pmd = pmd_offset(pud, 0);
......@@ -342,9 +451,9 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
* so can be freed without flushing the PWC again.
*/
pud_clear(pud);
kvmppc_radix_flush_pwc(kvm);
kvmppc_radix_flush_pwc(kvm, lpid);
kvmppc_unmap_free_pmd(kvm, pmd, false);
kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
}
/*
......@@ -356,8 +465,10 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
*/
#define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))
static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
unsigned int level, unsigned long mmu_seq)
int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
unsigned long gpa, unsigned int level,
unsigned long mmu_seq, unsigned int lpid,
unsigned long *rmapp, struct rmap_nested **n_rmap)
{
pgd_t *pgd;
pud_t *pud, *new_pud = NULL;
......@@ -366,7 +477,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
int ret;
/* Traverse the guest's 2nd-level tree, allocate new levels needed */
pgd = kvm->arch.pgtable + pgd_index(gpa);
pgd = pgtable + pgd_index(gpa);
pud = NULL;
if (pgd_present(*pgd))
pud = pud_offset(pgd, gpa);
......@@ -423,7 +534,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
goto out_unlock;
}
/* Valid 1GB page here already, remove it */
kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT);
kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
lpid);
}
if (level == 2) {
if (!pud_none(*pud)) {
......@@ -432,9 +544,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
* install a large page, so remove and free the page
* table page.
*/
kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa);
kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
}
kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
if (rmapp && n_rmap)
kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
ret = 0;
goto out_unlock;
}
......@@ -458,7 +572,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
PTE_BITS_MUST_MATCH);
kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
0, pte_val(pte), lgpa, PMD_SHIFT);
0, pte_val(pte), lgpa, PMD_SHIFT);
ret = 0;
goto out_unlock;
}
......@@ -472,7 +586,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
goto out_unlock;
}
/* Valid 2MB page here already, remove it */
kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT);
kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
lpid);
}
if (level == 1) {
if (!pmd_none(*pmd)) {
......@@ -481,9 +596,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
* install a large page, so remove and free the page
* table page.
*/
kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa);
kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
}
kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
if (rmapp && n_rmap)
kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
ret = 0;
goto out_unlock;
}
......@@ -508,6 +625,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
goto out_unlock;
}
kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
if (rmapp && n_rmap)
kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
ret = 0;
out_unlock:
......@@ -521,95 +640,49 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
return ret;
}
int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
unsigned long ea, unsigned long dsisr)
bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing,
unsigned long gpa, unsigned int lpid)
{
unsigned long pgflags;
unsigned int shift;
pte_t *ptep;
/*
* Need to set an R or C bit in the 2nd-level tables;
* since we are just helping out the hardware here,
* it is sufficient to do what the hardware does.
*/
pgflags = _PAGE_ACCESSED;
if (writing)
pgflags |= _PAGE_DIRTY;
/*
* We are walking the secondary (partition-scoped) page table here.
* We can do this without disabling irq because the Linux MM
* subsystem doesn't do THP splits and collapses on this tree.
*/
ptep = __find_linux_pte(pgtable, gpa, NULL, &shift);
if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
return true;
}
return false;
}
int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
unsigned long gpa,
struct kvm_memory_slot *memslot,
bool writing, bool kvm_ro,
pte_t *inserted_pte, unsigned int *levelp)
{
struct kvm *kvm = vcpu->kvm;
unsigned long mmu_seq;
unsigned long gpa, gfn, hva;
struct kvm_memory_slot *memslot;
struct page *page = NULL;
long ret;
bool writing;
unsigned long mmu_seq;
unsigned long hva, gfn = gpa >> PAGE_SHIFT;
bool upgrade_write = false;
bool *upgrade_p = &upgrade_write;
pte_t pte, *ptep;
unsigned long pgflags;
unsigned int shift, level;
/* Check for unusual errors */
if (dsisr & DSISR_UNSUPP_MMU) {
pr_err("KVM: Got unsupported MMU fault\n");
return -EFAULT;
}
if (dsisr & DSISR_BADACCESS) {
/* Reflect to the guest as DSI */
pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
return RESUME_GUEST;
}
/* Translate the logical address and get the page */
gpa = vcpu->arch.fault_gpa & ~0xfffUL;
gpa &= ~0xF000000000000000ul;
gfn = gpa >> PAGE_SHIFT;
if (!(dsisr & DSISR_PRTABLE_FAULT))
gpa |= ea & 0xfff;
memslot = gfn_to_memslot(kvm, gfn);
/* No memslot means it's an emulated MMIO region */
if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
DSISR_SET_RC)) {
/*
* Bad address in guest page table tree, or other
* unusual error - reflect it to the guest as DSI.
*/
kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
return RESUME_GUEST;
}
return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
dsisr & DSISR_ISSTORE);
}
writing = (dsisr & DSISR_ISSTORE) != 0;
if (memslot->flags & KVM_MEM_READONLY) {
if (writing) {
/* give the guest a DSI */
dsisr = DSISR_ISSTORE | DSISR_PROTFAULT;
kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
return RESUME_GUEST;
}
upgrade_p = NULL;
}
if (dsisr & DSISR_SET_RC) {
/*
* Need to set an R or C bit in the 2nd-level tables;
* since we are just helping out the hardware here,
* it is sufficient to do what the hardware does.
*/
pgflags = _PAGE_ACCESSED;
if (writing)
pgflags |= _PAGE_DIRTY;
/*
* We are walking the secondary page table here. We can do this
* without disabling irq.
*/
spin_lock(&kvm->mmu_lock);
ptep = __find_linux_pte(kvm->arch.pgtable,
gpa, NULL, &shift);
if (ptep && pte_present(*ptep) &&
(!writing || pte_write(*ptep))) {
kvmppc_radix_update_pte(kvm, ptep, 0, pgflags,
gpa, shift);
dsisr &= ~DSISR_SET_RC;
}
spin_unlock(&kvm->mmu_lock);
if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
DSISR_PROTFAULT | DSISR_SET_RC)))
return RESUME_GUEST;
}
int ret;
/* used to check for invalidations in progress */
mmu_seq = kvm->mmu_notifier_seq;
......@@ -622,7 +695,7 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
* is that the page is writable.
*/
hva = gfn_to_hva_memslot(memslot, gfn);
if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
if (!kvm_ro && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
upgrade_write = true;
} else {
unsigned long pfn;
......@@ -680,7 +753,12 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
}
/* Allocate space in the tree and write the PTE */
ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq);
ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
mmu_seq, kvm->arch.lpid, NULL, NULL);
if (inserted_pte)
*inserted_pte = pte;
if (levelp)
*levelp = level;
if (page) {
if (!ret && (pte_val(pte) & _PAGE_WRITE))
......@@ -688,6 +766,82 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
put_page(page);
}
return ret;
}
int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
unsigned long ea, unsigned long dsisr)
{
struct kvm *kvm = vcpu->kvm;
unsigned long gpa, gfn;
struct kvm_memory_slot *memslot;
long ret;
bool writing = !!(dsisr & DSISR_ISSTORE);
bool kvm_ro = false;
/* Check for unusual errors */
if (dsisr & DSISR_UNSUPP_MMU) {
pr_err("KVM: Got unsupported MMU fault\n");
return -EFAULT;
}
if (dsisr & DSISR_BADACCESS) {
/* Reflect to the guest as DSI */
pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
return RESUME_GUEST;
}
/* Translate the logical address */
gpa = vcpu->arch.fault_gpa & ~0xfffUL;
gpa &= ~0xF000000000000000ul;
gfn = gpa >> PAGE_SHIFT;
if (!(dsisr & DSISR_PRTABLE_FAULT))
gpa |= ea & 0xfff;
/* Get the corresponding memslot */
memslot = gfn_to_memslot(kvm, gfn);
/* No memslot means it's an emulated MMIO region */
if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
DSISR_SET_RC)) {
/*
* Bad address in guest page table tree, or other
* unusual error - reflect it to the guest as DSI.
*/
kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
return RESUME_GUEST;
}
return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, writing);
}
if (memslot->flags & KVM_MEM_READONLY) {
if (writing) {
/* give the guest a DSI */
kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE |
DSISR_PROTFAULT);
return RESUME_GUEST;
}
kvm_ro = true;
}
/* Failed to set the reference/change bits */
if (dsisr & DSISR_SET_RC) {
spin_lock(&kvm->mmu_lock);
if (kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable,
writing, gpa, kvm->arch.lpid))
dsisr &= ~DSISR_SET_RC;
spin_unlock(&kvm->mmu_lock);
if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
DSISR_PROTFAULT | DSISR_SET_RC)))
return RESUME_GUEST;
}
/* Try to insert a pte */
ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
kvm_ro, NULL, NULL);
if (ret == 0 || ret == -EAGAIN)
ret = RESUME_GUEST;
return ret;
......@@ -700,20 +854,11 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
pte_t *ptep;
unsigned long gpa = gfn << PAGE_SHIFT;
unsigned int shift;
unsigned long old;
ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
if (ptep && pte_present(*ptep)) {
old = kvmppc_radix_update_pte(kvm, ptep, ~0UL, 0,
gpa, shift);
kvmppc_radix_tlbie_page(kvm, gpa, shift);
if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) {
unsigned long psize = PAGE_SIZE;
if (shift)
psize = 1ul << shift;
kvmppc_update_dirty_map(memslot, gfn, psize);
}
}
if (ptep && pte_present(*ptep))
kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
kvm->arch.lpid);
return 0;
}
......@@ -768,7 +913,7 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm,
ret = 1 << (shift - PAGE_SHIFT);
kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
gpa, shift);
kvmppc_radix_tlbie_page(kvm, gpa, shift);
kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
}
return ret;
}
......@@ -853,6 +998,215 @@ static void pmd_ctor(void *addr)
memset(addr, 0, RADIX_PMD_TABLE_SIZE);
}
struct debugfs_radix_state {
struct kvm *kvm;
struct mutex mutex;
unsigned long gpa;
int lpid;
int chars_left;
int buf_index;
char buf[128];
u8 hdr;
};
static int debugfs_radix_open(struct inode *inode, struct file *file)
{
struct kvm *kvm = inode->i_private;
struct debugfs_radix_state *p;
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (!p)
return -ENOMEM;
kvm_get_kvm(kvm);
p->kvm = kvm;
mutex_init(&p->mutex);
file->private_data = p;
return nonseekable_open(inode, file);
}
static int debugfs_radix_release(struct inode *inode, struct file *file)
{
struct debugfs_radix_state *p = file->private_data;
kvm_put_kvm(p->kvm);
kfree(p);
return 0;
}
static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
size_t len, loff_t *ppos)
{
struct debugfs_radix_state *p = file->private_data;
ssize_t ret, r;
unsigned long n;
struct kvm *kvm;
unsigned long gpa;
pgd_t *pgt;
struct kvm_nested_guest *nested;
pgd_t pgd, *pgdp;
pud_t pud, *pudp;
pmd_t pmd, *pmdp;
pte_t *ptep;
int shift;
unsigned long pte;
kvm = p->kvm;
if (!kvm_is_radix(kvm))
return 0;
ret = mutex_lock_interruptible(&p->mutex);
if (ret)
return ret;
if (p->chars_left) {
n = p->chars_left;
if (n > len)
n = len;
r = copy_to_user(buf, p->buf + p->buf_index, n);
n -= r;
p->chars_left -= n;
p->buf_index += n;
buf += n;
len -= n;
ret = n;
if (r) {
if (!n)
ret = -EFAULT;
goto out;
}
}
gpa = p->gpa;
nested = NULL;
pgt = NULL;
while (len != 0 && p->lpid >= 0) {
if (gpa >= RADIX_PGTABLE_RANGE) {
gpa = 0;
pgt = NULL;
if (nested) {
kvmhv_put_nested(nested);
nested = NULL;
}
p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
p->hdr = 0;
if (p->lpid < 0)
break;
}
if (!pgt) {
if (p->lpid == 0) {
pgt = kvm->arch.pgtable;
} else {
nested = kvmhv_get_nested(kvm, p->lpid, false);
if (!nested) {
gpa = RADIX_PGTABLE_RANGE;
continue;
}
pgt = nested->shadow_pgtable;
}
}
n = 0;
if (!p->hdr) {
if (p->lpid > 0)
n = scnprintf(p->buf, sizeof(p->buf),
"\nNested LPID %d: ", p->lpid);
n += scnprintf(p->buf + n, sizeof(p->buf) - n,
"pgdir: %lx\n", (unsigned long)pgt);
p->hdr = 1;
goto copy;
}
pgdp = pgt + pgd_index(gpa);
pgd = READ_ONCE(*pgdp);
if (!(pgd_val(pgd) & _PAGE_PRESENT)) {
gpa = (gpa & PGDIR_MASK) + PGDIR_SIZE;
continue;
}
pudp = pud_offset(&pgd, gpa);
pud = READ_ONCE(*pudp);
if (!(pud_val(pud) & _PAGE_PRESENT)) {
gpa = (gpa & PUD_MASK) + PUD_SIZE;
continue;
}
if (pud_val(pud) & _PAGE_PTE) {
pte = pud_val(pud);
shift = PUD_SHIFT;
goto leaf;
}
pmdp = pmd_offset(&pud, gpa);
pmd = READ_ONCE(*pmdp);
if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
gpa = (gpa & PMD_MASK) + PMD_SIZE;
continue;
}
if (pmd_val(pmd) & _PAGE_PTE) {
pte = pmd_val(pmd);
shift = PMD_SHIFT;
goto leaf;
}
ptep = pte_offset_kernel(&pmd, gpa);
pte = pte_val(READ_ONCE(*ptep));
if (!(pte & _PAGE_PRESENT)) {
gpa += PAGE_SIZE;
continue;
}
shift = PAGE_SHIFT;
leaf:
n = scnprintf(p->buf, sizeof(p->buf),
" %lx: %lx %d\n", gpa, pte, shift);
gpa += 1ul << shift;
copy:
p->chars_left = n;
if (n > len)
n = len;
r = copy_to_user(buf, p->buf, n);
n -= r;
p->chars_left -= n;
p->buf_index = n;
buf += n;
len -= n;
ret += n;
if (r) {
if (!ret)
ret = -EFAULT;
break;
}
}
p->gpa = gpa;
if (nested)
kvmhv_put_nested(nested);
out:
mutex_unlock(&p->mutex);
return ret;
}
static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
size_t len, loff_t *ppos)
{
return -EACCES;
}
static const struct file_operations debugfs_radix_fops = {
.owner = THIS_MODULE,
.open = debugfs_radix_open,
.release = debugfs_radix_release,
.read = debugfs_radix_read,
.write = debugfs_radix_write,
.llseek = generic_file_llseek,
};
void kvmhv_radix_debugfs_init(struct kvm *kvm)
{
kvm->arch.radix_dentry = debugfs_create_file("radix", 0400,
kvm->arch.debugfs_dir, kvm,
&debugfs_radix_fops);
}
int kvmppc_radix_init(void)
{
unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;
......
......@@ -363,6 +363,40 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
return ret;
}
static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt,
unsigned long tce)
{
unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
enum dma_data_direction dir = iommu_tce_direction(tce);
struct kvmppc_spapr_tce_iommu_table *stit;
unsigned long ua = 0;
/* Allow userspace to poison TCE table */
if (dir == DMA_NONE)
return H_SUCCESS;
if (iommu_tce_check_gpa(stt->page_shift, gpa))
return H_TOO_HARD;
if (kvmppc_tce_to_ua(stt->kvm, tce, &ua, NULL))
return H_TOO_HARD;
list_for_each_entry_rcu(stit, &stt->iommu_tables, next) {
unsigned long hpa = 0;
struct mm_iommu_table_group_mem_t *mem;
long shift = stit->tbl->it_page_shift;
mem = mm_iommu_lookup(stt->kvm->mm, ua, 1ULL << shift);
if (!mem)
return H_TOO_HARD;
if (mm_iommu_ua_to_hpa(mem, ua, shift, &hpa))
return H_TOO_HARD;
}
return H_SUCCESS;
}
static void kvmppc_clear_tce(struct iommu_table *tbl, unsigned long entry)
{
unsigned long hpa = 0;
......@@ -401,7 +435,7 @@ static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm,
long ret;
if (WARN_ON_ONCE(iommu_tce_xchg(tbl, entry, &hpa, &dir)))
return H_HARDWARE;
return H_TOO_HARD;
if (dir == DMA_NONE)
return H_SUCCESS;
......@@ -449,15 +483,15 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
return H_TOO_HARD;
if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, tbl->it_page_shift, &hpa)))
return H_HARDWARE;
return H_TOO_HARD;
if (mm_iommu_mapped_inc(mem))
return H_CLOSED;
return H_TOO_HARD;
ret = iommu_tce_xchg(tbl, entry, &hpa, &dir);
if (WARN_ON_ONCE(ret)) {
mm_iommu_mapped_dec(mem);
return H_HARDWARE;
return H_TOO_HARD;
}
if (dir != DMA_NONE)
......@@ -517,8 +551,7 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
idx = srcu_read_lock(&vcpu->kvm->srcu);
if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm,
tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) {
if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) {
ret = H_PARAMETER;
goto unlock_exit;
}
......@@ -533,14 +566,10 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl,
entry, ua, dir);
if (ret == H_SUCCESS)
continue;
if (ret == H_TOO_HARD)
if (ret != H_SUCCESS) {
kvmppc_clear_tce(stit->tbl, entry);
goto unlock_exit;
WARN_ON_ONCE(1);
kvmppc_clear_tce(stit->tbl, entry);
}
}
kvmppc_tce_put(stt, entry, tce);
......@@ -583,7 +612,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
return ret;
idx = srcu_read_lock(&vcpu->kvm->srcu);
if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) {
if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL)) {
ret = H_TOO_HARD;
goto unlock_exit;
}
......@@ -599,10 +628,26 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
ret = kvmppc_tce_validate(stt, tce);
if (ret != H_SUCCESS)
goto unlock_exit;
}
for (i = 0; i < npages; ++i) {
/*
* This looks unsafe, because we validate, then regrab
* the TCE from userspace which could have been changed by
* another thread.
*
* But it actually is safe, because the relevant checks will be
* re-executed in the following code. If userspace tries to
* change this dodgily it will result in a messier failure mode
* but won't threaten the host.
*/
if (get_user(tce, tces + i)) {
ret = H_TOO_HARD;
goto unlock_exit;
}
tce = be64_to_cpu(tce);
if (kvmppc_gpa_to_ua(vcpu->kvm,
tce & ~(TCE_PCI_READ | TCE_PCI_WRITE),
&ua, NULL))
if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
return H_PARAMETER;
list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
......@@ -610,14 +655,10 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
stit->tbl, entry + i, ua,
iommu_tce_direction(tce));
if (ret == H_SUCCESS)
continue;
if (ret == H_TOO_HARD)
if (ret != H_SUCCESS) {
kvmppc_clear_tce(stit->tbl, entry);
goto unlock_exit;
WARN_ON_ONCE(1);
kvmppc_clear_tce(stit->tbl, entry);
}
}
kvmppc_tce_put(stt, entry + i, tce);
......
......@@ -87,6 +87,7 @@ struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm,
}
EXPORT_SYMBOL_GPL(kvmppc_find_table);
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
/*
* Validates TCE address.
* At the moment flags and page mask are validated.
......@@ -94,14 +95,14 @@ EXPORT_SYMBOL_GPL(kvmppc_find_table);
* to the table and user space is supposed to process them), we can skip
* checking other things (such as TCE is a guest RAM address or the page
* was actually allocated).
*
* WARNING: This will be called in real-mode on HV KVM and virtual
* mode on PR KVM
*/
long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce)
static long kvmppc_rm_tce_validate(struct kvmppc_spapr_tce_table *stt,
unsigned long tce)
{
unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
enum dma_data_direction dir = iommu_tce_direction(tce);
struct kvmppc_spapr_tce_iommu_table *stit;
unsigned long ua = 0;
/* Allow userspace to poison TCE table */
if (dir == DMA_NONE)
......@@ -110,9 +111,25 @@ long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce)
if (iommu_tce_check_gpa(stt->page_shift, gpa))
return H_PARAMETER;
if (kvmppc_tce_to_ua(stt->kvm, tce, &ua, NULL))
return H_TOO_HARD;
list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
unsigned long hpa = 0;
struct mm_iommu_table_group_mem_t *mem;
long shift = stit->tbl->it_page_shift;
mem = mm_iommu_lookup_rm(stt->kvm->mm, ua, 1ULL << shift);
if (!mem)
return H_TOO_HARD;
if (mm_iommu_ua_to_hpa_rm(mem, ua, shift, &hpa))
return H_TOO_HARD;
}
return H_SUCCESS;
}
EXPORT_SYMBOL_GPL(kvmppc_tce_validate);
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
/* Note on the use of page_address() in real mode,
*
......@@ -164,10 +181,10 @@ void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
}
EXPORT_SYMBOL_GPL(kvmppc_tce_put);
long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce,
unsigned long *ua, unsigned long **prmap)
{
unsigned long gfn = gpa >> PAGE_SHIFT;
unsigned long gfn = tce >> PAGE_SHIFT;
struct kvm_memory_slot *memslot;
memslot = search_memslots(kvm_memslots(kvm), gfn);
......@@ -175,7 +192,7 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
return -EINVAL;
*ua = __gfn_to_hva_memslot(memslot, gfn) |
(gpa & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
(tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
if (prmap)
......@@ -184,7 +201,7 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
return 0;
}
EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua);
EXPORT_SYMBOL_GPL(kvmppc_tce_to_ua);
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl,
......@@ -300,10 +317,10 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, tbl->it_page_shift,
&hpa)))
return H_HARDWARE;
return H_TOO_HARD;
if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem)))
return H_CLOSED;
return H_TOO_HARD;
ret = iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir);
if (ret) {
......@@ -368,13 +385,12 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
if (ret != H_SUCCESS)
return ret;
ret = kvmppc_tce_validate(stt, tce);
ret = kvmppc_rm_tce_validate(stt, tce);
if (ret != H_SUCCESS)
return ret;
dir = iommu_tce_direction(tce);
if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm,
tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL))
if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
return H_PARAMETER;
entry = ioba >> stt->page_shift;
......@@ -387,14 +403,10 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt,
stit->tbl, entry, ua, dir);
if (ret == H_SUCCESS)
continue;
if (ret == H_TOO_HARD)
if (ret != H_SUCCESS) {
kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
return ret;
WARN_ON_ONCE_RM(1);
kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
}
}
kvmppc_tce_put(stt, entry, tce);
......@@ -480,7 +492,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
*/
struct mm_iommu_table_group_mem_t *mem;
if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL))
if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL))
return H_TOO_HARD;
mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K);
......@@ -496,12 +508,12 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
* We do not require memory to be preregistered in this case
* so lock rmap and do __find_linux_pte_or_hugepte().
*/
if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
return H_TOO_HARD;
rmap = (void *) vmalloc_to_phys(rmap);
if (WARN_ON_ONCE_RM(!rmap))
return H_HARDWARE;
return H_TOO_HARD;
/*
* Synchronize with the MMU notifier callbacks in
......@@ -521,14 +533,16 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
for (i = 0; i < npages; ++i) {
unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);
ret = kvmppc_tce_validate(stt, tce);
ret = kvmppc_rm_tce_validate(stt, tce);
if (ret != H_SUCCESS)
goto unlock_exit;
}
for (i = 0; i < npages; ++i) {
unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);
ua = 0;
if (kvmppc_gpa_to_ua(vcpu->kvm,
tce & ~(TCE_PCI_READ | TCE_PCI_WRITE),
&ua, NULL))
if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
return H_PARAMETER;
list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
......@@ -536,14 +550,11 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
stit->tbl, entry + i, ua,
iommu_tce_direction(tce));
if (ret == H_SUCCESS)
continue;
if (ret == H_TOO_HARD)
if (ret != H_SUCCESS) {
kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl,
entry);
goto unlock_exit;
WARN_ON_ONCE_RM(1);
kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
}
}
kvmppc_tce_put(stt, entry + i, tce);
......
......@@ -36,7 +36,6 @@
#define OP_31_XOP_MTSR 210
#define OP_31_XOP_MTSRIN 242
#define OP_31_XOP_TLBIEL 274
#define OP_31_XOP_TLBIE 306
/* Opcode is officially reserved, reuse it as sc 1 when sc 1 doesn't trap */
#define OP_31_XOP_FAKE_SC1 308
#define OP_31_XOP_SLBMTE 402
......@@ -110,7 +109,7 @@ static inline void kvmppc_copyto_vcpu_tm(struct kvm_vcpu *vcpu)
vcpu->arch.ctr_tm = vcpu->arch.regs.ctr;
vcpu->arch.tar_tm = vcpu->arch.tar;
vcpu->arch.lr_tm = vcpu->arch.regs.link;
vcpu->arch.cr_tm = vcpu->arch.cr;
vcpu->arch.cr_tm = vcpu->arch.regs.ccr;
vcpu->arch.xer_tm = vcpu->arch.regs.xer;
vcpu->arch.vrsave_tm = vcpu->arch.vrsave;
}
......@@ -129,7 +128,7 @@ static inline void kvmppc_copyfrom_vcpu_tm(struct kvm_vcpu *vcpu)
vcpu->arch.regs.ctr = vcpu->arch.ctr_tm;
vcpu->arch.tar = vcpu->arch.tar_tm;
vcpu->arch.regs.link = vcpu->arch.lr_tm;
vcpu->arch.cr = vcpu->arch.cr_tm;
vcpu->arch.regs.ccr = vcpu->arch.cr_tm;
vcpu->arch.regs.xer = vcpu->arch.xer_tm;
vcpu->arch.vrsave = vcpu->arch.vrsave_tm;
}
......@@ -141,7 +140,7 @@ static void kvmppc_emulate_treclaim(struct kvm_vcpu *vcpu, int ra_val)
uint64_t texasr;
/* CR0 = 0 | MSR[TS] | 0 */
vcpu->arch.cr = (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)) |
vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)) |
(((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1))
<< CR0_SHIFT);
......@@ -220,7 +219,7 @@ void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val)
tm_abort(ra_val);
/* CR0 = 0 | MSR[TS] | 0 */
vcpu->arch.cr = (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)) |
vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)) |
(((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1))
<< CR0_SHIFT);
......@@ -494,8 +493,8 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
if (!(kvmppc_get_msr(vcpu) & MSR_PR)) {
preempt_disable();
vcpu->arch.cr = (CR0_TBEGIN_FAILURE |
(vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)));
vcpu->arch.regs.ccr = (CR0_TBEGIN_FAILURE |
(vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)));
vcpu->arch.texasr = (TEXASR_FS | TEXASR_EXACT |
(((u64)(TM_CAUSE_EMULATE | TM_CAUSE_PERSISTENT))
......
......@@ -50,6 +50,7 @@
#include <asm/reg.h>
#include <asm/ppc-opcode.h>
#include <asm/asm-prototypes.h>
#include <asm/archrandom.h>
#include <asm/debug.h>
#include <asm/disassemble.h>
#include <asm/cputable.h>
......@@ -177,6 +178,10 @@ static bool kvmppc_ipi_thread(int cpu)
{
unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
/* If we're a nested hypervisor, fall back to ordinary IPIs for now */
if (kvmhv_on_pseries())
return false;
/* On POWER9 we can use msgsnd to IPI any cpu */
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
msg |= get_hard_smp_processor_id(cpu);
......@@ -414,8 +419,8 @@ static void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
pr_err("cr = %.8x xer = %.16lx dsisr = %.8x\n",
vcpu->arch.cr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr);
pr_err("cr = %.8lx xer = %.16lx dsisr = %.8x\n",
vcpu->arch.regs.ccr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr);
pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
pr_err("fault dar = %.16lx dsisr = %.8x\n",
vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
......@@ -734,8 +739,7 @@ static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu)
/*
* Ensure that the read of vcore->dpdes comes after the read
* of vcpu->doorbell_request. This barrier matches the
* lwsync in book3s_hv_rmhandlers.S just before the
* fast_guest_return label.
* smp_wmb() in kvmppc_guest_entry_inject_int().
*/
smp_rmb();
vc = vcpu->arch.vcore;
......@@ -916,6 +920,19 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
break;
}
return RESUME_HOST;
case H_SET_DABR:
ret = kvmppc_h_set_dabr(vcpu, kvmppc_get_gpr(vcpu, 4));
break;
case H_SET_XDABR:
ret = kvmppc_h_set_xdabr(vcpu, kvmppc_get_gpr(vcpu, 4),
kvmppc_get_gpr(vcpu, 5));
break;
case H_GET_TCE:
ret = kvmppc_h_get_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
kvmppc_get_gpr(vcpu, 5));
if (ret == H_TOO_HARD)
return RESUME_HOST;
break;
case H_PUT_TCE:
ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
kvmppc_get_gpr(vcpu, 5),
......@@ -939,6 +956,33 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
if (ret == H_TOO_HARD)
return RESUME_HOST;
break;
case H_RANDOM:
if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4]))
ret = H_HARDWARE;
break;
case H_SET_PARTITION_TABLE:
ret = H_FUNCTION;
if (vcpu->kvm->arch.nested_enable)
ret = kvmhv_set_partition_table(vcpu);
break;
case H_ENTER_NESTED:
ret = H_FUNCTION;
if (!vcpu->kvm->arch.nested_enable)
break;
ret = kvmhv_enter_nested_guest(vcpu);
if (ret == H_INTERRUPT) {
kvmppc_set_gpr(vcpu, 3, 0);
return -EINTR;
}
break;
case H_TLB_INVALIDATE:
ret = H_FUNCTION;
if (!vcpu->kvm->arch.nested_enable)
break;
ret = kvmhv_do_nested_tlbie(vcpu);
break;
default:
return RESUME_HOST;
}
......@@ -947,6 +991,24 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
return RESUME_GUEST;
}
/*
* Handle H_CEDE in the nested virtualization case where we haven't
* called the real-mode hcall handlers in book3s_hv_rmhandlers.S.
* This has to be done early, not in kvmppc_pseries_do_hcall(), so
* that the cede logic in kvmppc_run_single_vcpu() works properly.
*/
static void kvmppc_nested_cede(struct kvm_vcpu *vcpu)
{
vcpu->arch.shregs.msr |= MSR_EE;
vcpu->arch.ceded = 1;
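/* make sure the store to ceded is visible before we test prodded */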
smp_mb();
if (vcpu->arch.prodded) {
vcpu->arch.prodded = 0;
smp_mb();
vcpu->arch.ceded = 0;
}
}
static int kvmppc_hcall_impl_hv(unsigned long cmd)
{
switch (cmd) {
......@@ -1089,7 +1151,6 @@ static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu)
return RESUME_GUEST;
}
/* Called with vcpu->arch.vcore->lock held */
static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
struct task_struct *tsk)
{
......@@ -1194,7 +1255,10 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
break;
case BOOK3S_INTERRUPT_H_INST_STORAGE:
vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
vcpu->arch.fault_dsisr = 0;
vcpu->arch.fault_dsisr = vcpu->arch.shregs.msr &
DSISR_SRR1_MATCH_64S;
if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
r = RESUME_PAGE_FAULT;
break;
/*
......@@ -1210,10 +1274,7 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
swab32(vcpu->arch.emul_inst) :
vcpu->arch.emul_inst;
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) {
/* Need vcore unlocked to call kvmppc_get_last_inst */
spin_unlock(&vcpu->arch.vcore->lock);
r = kvmppc_emulate_debug_inst(run, vcpu);
spin_lock(&vcpu->arch.vcore->lock);
} else {
kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
r = RESUME_GUEST;
......@@ -1229,12 +1290,8 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
r = EMULATE_FAIL;
if (((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG) &&
cpu_has_feature(CPU_FTR_ARCH_300)) {
/* Need vcore unlocked to call kvmppc_get_last_inst */
spin_unlock(&vcpu->arch.vcore->lock);
cpu_has_feature(CPU_FTR_ARCH_300))
r = kvmppc_emulate_doorbell_instr(vcpu);
spin_lock(&vcpu->arch.vcore->lock);
}
if (r == EMULATE_FAIL) {
kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
r = RESUME_GUEST;
......@@ -1269,6 +1326,104 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
return r;
}
static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
{
int r;
int srcu_idx;
vcpu->stat.sum_exits++;
/*
* This can happen if an interrupt occurs in the last stages
* of guest entry or the first stages of guest exit (i.e. after
* setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
* and before setting it to KVM_GUEST_MODE_HOST_HV).
* That can happen due to a bug, or due to a machine check
* occurring at just the wrong time.
*/
if (vcpu->arch.shregs.msr & MSR_HV) {
pr_emerg("KVM trap in HV mode while nested!\n");
pr_emerg("trap=0x%x | pc=0x%lx | msr=0x%llx\n",
vcpu->arch.trap, kvmppc_get_pc(vcpu),
vcpu->arch.shregs.msr);
kvmppc_dump_regs(vcpu);
return RESUME_HOST;
}
switch (vcpu->arch.trap) {
/* We're good on these - the host merely wanted to get our attention */
case BOOK3S_INTERRUPT_HV_DECREMENTER:
vcpu->stat.dec_exits++;
r = RESUME_GUEST;
break;
case BOOK3S_INTERRUPT_EXTERNAL:
vcpu->stat.ext_intr_exits++;
r = RESUME_HOST;
break;
case BOOK3S_INTERRUPT_H_DOORBELL:
case BOOK3S_INTERRUPT_H_VIRT:
vcpu->stat.ext_intr_exits++;
r = RESUME_GUEST;
break;
/* SR/HMI/PMI are HV interrupts that the host has handled. Resume guest. */
case BOOK3S_INTERRUPT_HMI:
case BOOK3S_INTERRUPT_PERFMON:
case BOOK3S_INTERRUPT_SYSTEM_RESET:
r = RESUME_GUEST;
break;
case BOOK3S_INTERRUPT_MACHINE_CHECK:
/* Pass the machine check to the L1 guest */
r = RESUME_HOST;
/* Print the MCE event to host console. */
machine_check_print_event_info(&vcpu->arch.mce_evt, false);
break;
/*
* We get these next two if the guest accesses a page which it thinks
* it has mapped but which is not actually present, either because
* it is for an emulated I/O device or because the corresponding
* host page has been paged out.
*/
case BOOK3S_INTERRUPT_H_DATA_STORAGE:
srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
r = kvmhv_nested_page_fault(vcpu);
srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
break;
case BOOK3S_INTERRUPT_H_INST_STORAGE:
vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
vcpu->arch.fault_dsisr = kvmppc_get_msr(vcpu) &
DSISR_SRR1_MATCH_64S;
if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
r = kvmhv_nested_page_fault(vcpu);
srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
break;
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
case BOOK3S_INTERRUPT_HV_SOFTPATCH:
/*
* This occurs for various TM-related instructions that
* we need to emulate on POWER9 DD2.2. We have already
* handled the cases where the guest was in real-suspend
* mode and was transitioning to transactional state.
*/
r = kvmhv_p9_tm_emulation(vcpu);
break;
#endif
case BOOK3S_INTERRUPT_HV_RM_HARD:
vcpu->arch.trap = 0;
r = RESUME_GUEST;
if (!xive_enabled())
kvmppc_xics_rm_complete(vcpu, 0);
break;
default:
r = RESUME_HOST;
break;
}
return r;
}
static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs)
{
......@@ -1559,6 +1714,9 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
case KVM_REG_PPC_ONLINE:
*val = get_reg_val(id, vcpu->arch.online);
break;
case KVM_REG_PPC_PTCR:
*val = get_reg_val(id, vcpu->kvm->arch.l1_ptcr);
break;
default:
r = -EINVAL;
break;
......@@ -1790,6 +1948,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
atomic_dec(&vcpu->arch.vcore->online_count);
vcpu->arch.online = i;
break;
case KVM_REG_PPC_PTCR:
vcpu->kvm->arch.l1_ptcr = set_reg_val(id, *val);
break;
default:
r = -EINVAL;
break;
......@@ -2023,15 +2184,18 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
* Set the default HFSCR for the guest from the host value.
* This value is only used on POWER9.
* On POWER9, we want to virtualize the doorbell facility, so we
* turn off the HFSCR bit, which causes those instructions to trap.
* don't set the HFSCR_MSGP bit, and that causes those instructions
* to trap and then we emulate them.
*/
vcpu->arch.hfscr = mfspr(SPRN_HFSCR);
if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
vcpu->arch.hfscr = HFSCR_TAR | HFSCR_EBB | HFSCR_PM | HFSCR_BHRB |
HFSCR_DSCR | HFSCR_VECVSX | HFSCR_FP;
if (cpu_has_feature(CPU_FTR_HVMODE)) {
vcpu->arch.hfscr &= mfspr(SPRN_HFSCR);
if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
vcpu->arch.hfscr |= HFSCR_TM;
}
if (cpu_has_feature(CPU_FTR_TM_COMP))
vcpu->arch.hfscr |= HFSCR_TM;
else if (!cpu_has_feature(CPU_FTR_TM_COMP))
vcpu->arch.hfscr &= ~HFSCR_TM;
if (cpu_has_feature(CPU_FTR_ARCH_300))
vcpu->arch.hfscr &= ~HFSCR_MSGP;
kvmppc_mmu_book3s_hv_init(vcpu);
......@@ -2246,10 +2410,18 @@ static void kvmppc_release_hwthread(int cpu)
static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
{
struct kvm_nested_guest *nested = vcpu->arch.nested;
cpumask_t *cpu_in_guest;
int i;
cpu = cpu_first_thread_sibling(cpu);
cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
if (nested) {
cpumask_set_cpu(cpu, &nested->need_tlb_flush);
cpu_in_guest = &nested->cpu_in_guest;
} else {
cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
cpu_in_guest = &kvm->arch.cpu_in_guest;
}
/*
* Make sure setting of bit in need_tlb_flush precedes
* testing of cpu_in_guest bits. The matching barrier on
......@@ -2257,13 +2429,23 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
*/
smp_mb();
for (i = 0; i < threads_per_core; ++i)
if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest))
if (cpumask_test_cpu(cpu + i, cpu_in_guest))
smp_call_function_single(cpu + i, do_nothing, NULL, 1);
}
static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
{
struct kvm_nested_guest *nested = vcpu->arch.nested;
struct kvm *kvm = vcpu->kvm;
int prev_cpu;
if (!cpu_has_feature(CPU_FTR_HVMODE))
return;
if (nested)
prev_cpu = nested->prev_cpu[vcpu->arch.nested_vcpu_id];
else
prev_cpu = vcpu->arch.prev_cpu;
/*
* With radix, the guest can do TLB invalidations itself,
......@@ -2277,12 +2459,46 @@ static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
* ran to flush the TLB. The TLB is shared between threads,
* so we use a single bit in .need_tlb_flush for all 4 threads.
*/
if (vcpu->arch.prev_cpu != pcpu) {
if (vcpu->arch.prev_cpu >= 0 &&
cpu_first_thread_sibling(vcpu->arch.prev_cpu) !=
if (prev_cpu != pcpu) {
if (prev_cpu >= 0 &&
cpu_first_thread_sibling(prev_cpu) !=
cpu_first_thread_sibling(pcpu))
radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu);
vcpu->arch.prev_cpu = pcpu;
radix_flush_cpu(kvm, prev_cpu, vcpu);
if (nested)
nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu;
else
vcpu->arch.prev_cpu = pcpu;
}
}
static void kvmppc_radix_check_need_tlb_flush(struct kvm *kvm, int pcpu,
struct kvm_nested_guest *nested)
{
cpumask_t *need_tlb_flush;
int lpid;
if (!cpu_has_feature(CPU_FTR_HVMODE))
return;
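/* On POWER9 the 4 threads of a core share the TLB, so track the flush per core */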
if (cpu_has_feature(CPU_FTR_ARCH_300))
pcpu &= ~0x3UL;
if (nested) {
lpid = nested->shadow_lpid;
need_tlb_flush = &nested->need_tlb_flush;
} else {
lpid = kvm->arch.lpid;
need_tlb_flush = &kvm->arch.need_tlb_flush;
}
mtspr(SPRN_LPID, lpid);
isync();
smp_mb();
if (cpumask_test_cpu(pcpu, need_tlb_flush)) {
radix__local_flush_tlb_lpid_guest(lpid);
/* Clear the bit after the TLB flush */
cpumask_clear_cpu(pcpu, need_tlb_flush);
}
}
......@@ -2608,6 +2824,14 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
spin_lock(&vc->lock);
now = get_tb();
for_each_runnable_thread(i, vcpu, vc) {
/*
* It's safe to unlock the vcore in the loop here, because
* for_each_runnable_thread() is safe against removal of
* the vcpu, and the vcore state is VCORE_EXITING here,
* so any vcpus becoming runnable will have their arch.trap
* set to zero and can't actually run in the guest.
*/
spin_unlock(&vc->lock);
/* cancel pending dec exception if dec is positive */
if (now < vcpu->arch.dec_expires &&
kvmppc_core_pending_dec(vcpu))
......@@ -2623,6 +2847,7 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
vcpu->arch.ret = ret;
vcpu->arch.trap = 0;
spin_lock(&vc->lock);
if (is_kvmppc_resume_guest(vcpu->arch.ret)) {
if (vcpu->arch.pending_exceptions)
kvmppc_core_prepare_to_enter(vcpu);
......@@ -2971,8 +3196,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
spin_unlock(&core_info.vc[sub]->lock);
if (kvm_is_radix(vc->kvm)) {
int tmp = pcpu;
/*
* Do we need to flush the process scoped TLB for the LPAR?
*
......@@ -2983,17 +3206,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
*
* Hash must be flushed in realmode in order to use tlbiel.
*/
mtspr(SPRN_LPID, vc->kvm->arch.lpid);
isync();
if (cpu_has_feature(CPU_FTR_ARCH_300))
tmp &= ~0x3UL;
if (cpumask_test_cpu(tmp, &vc->kvm->arch.need_tlb_flush)) {
radix__local_flush_tlb_lpid_guest(vc->kvm->arch.lpid);
/* Clear the bit after the TLB flush */
cpumask_clear_cpu(tmp, &vc->kvm->arch.need_tlb_flush);
}
kvmppc_radix_check_need_tlb_flush(vc->kvm, pcpu, NULL);
}
/*
......@@ -3087,6 +3300,300 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
trace_kvmppc_run_core(vc, 1);
}
/*
* Load up hypervisor-mode registers on P9.
*/
static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
unsigned long lpcr)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
s64 hdec;
u64 tb, purr, spurr;
int trap;
unsigned long host_hfscr = mfspr(SPRN_HFSCR);
unsigned long host_ciabr = mfspr(SPRN_CIABR);
unsigned long host_dawr = mfspr(SPRN_DAWR);
unsigned long host_dawrx = mfspr(SPRN_DAWRX);
unsigned long host_psscr = mfspr(SPRN_PSSCR);
unsigned long host_pidr = mfspr(SPRN_PID);
hdec = time_limit - mftb();
if (hdec < 0)
return BOOK3S_INTERRUPT_HV_DECREMENTER;
mtspr(SPRN_HDEC, hdec);
if (vc->tb_offset) {
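/*
 * Writing TBU40 sets only the upper 40 bits of the timebase; if the
 * low 24 bits have since wrapped past those of new_tb, a carry into
 * bit 24 was lost, so add 2^24 to compensate.
 */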
u64 new_tb = mftb() + vc->tb_offset;
mtspr(SPRN_TBU40, new_tb);
tb = mftb();
if ((tb & 0xffffff) < (new_tb & 0xffffff))
mtspr(SPRN_TBU40, new_tb + 0x1000000);
vc->tb_offset_applied = vc->tb_offset;
}
if (vc->pcr)
mtspr(SPRN_PCR, vc->pcr);
mtspr(SPRN_DPDES, vc->dpdes);
mtspr(SPRN_VTB, vc->vtb);
local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR);
local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR);
mtspr(SPRN_PURR, vcpu->arch.purr);
mtspr(SPRN_SPURR, vcpu->arch.spurr);
if (cpu_has_feature(CPU_FTR_DAWR)) {
mtspr(SPRN_DAWR, vcpu->arch.dawr);
mtspr(SPRN_DAWRX, vcpu->arch.dawrx);
}
mtspr(SPRN_CIABR, vcpu->arch.ciabr);
mtspr(SPRN_IC, vcpu->arch.ic);
mtspr(SPRN_PID, vcpu->arch.pid);
mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC |
(local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
mtspr(SPRN_HFSCR, vcpu->arch.hfscr);
mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0);
mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1);
mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2);
mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3);
mtspr(SPRN_AMOR, ~0UL);
mtspr(SPRN_LPCR, lpcr);
isync();
kvmppc_xive_push_vcpu(vcpu);
mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0);
mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1);
trap = __kvmhv_vcpu_entry_p9(vcpu);
/* Advance host PURR/SPURR by the amount used by guest */
purr = mfspr(SPRN_PURR);
spurr = mfspr(SPRN_SPURR);
mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr +
purr - vcpu->arch.purr);
mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr +
spurr - vcpu->arch.spurr);
vcpu->arch.purr = purr;
vcpu->arch.spurr = spurr;
vcpu->arch.ic = mfspr(SPRN_IC);
vcpu->arch.pid = mfspr(SPRN_PID);
vcpu->arch.psscr = mfspr(SPRN_PSSCR) & PSSCR_GUEST_VIS;
vcpu->arch.shregs.sprg0 = mfspr(SPRN_SPRG0);
vcpu->arch.shregs.sprg1 = mfspr(SPRN_SPRG1);
vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2);
vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3);
mtspr(SPRN_PSSCR, host_psscr);
mtspr(SPRN_HFSCR, host_hfscr);
mtspr(SPRN_CIABR, host_ciabr);
mtspr(SPRN_DAWR, host_dawr);
mtspr(SPRN_DAWRX, host_dawrx);
mtspr(SPRN_PID, host_pidr);
/*
* Since this is radix, do an eieio; tlbsync; ptesync sequence in
* case we interrupted the guest between a tlbie and a ptesync.
*/
asm volatile("eieio; tlbsync; ptesync");
mtspr(SPRN_LPID, vcpu->kvm->arch.host_lpid); /* restore host LPID */
isync();
vc->dpdes = mfspr(SPRN_DPDES);
vc->vtb = mfspr(SPRN_VTB);
mtspr(SPRN_DPDES, 0);
if (vc->pcr)
mtspr(SPRN_PCR, 0);
if (vc->tb_offset_applied) {
u64 new_tb = mftb() - vc->tb_offset_applied;
mtspr(SPRN_TBU40, new_tb);
tb = mftb();
if ((tb & 0xffffff) < (new_tb & 0xffffff))
mtspr(SPRN_TBU40, new_tb + 0x1000000);
vc->tb_offset_applied = 0;
}
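/* reload HDEC with a large value so we don't take another HDEC interrupt straight away */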
mtspr(SPRN_HDEC, 0x7fffffff);
mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr);
return trap;
}
/*
* Virtual-mode guest entry for POWER9 and later when the host and
* guest are both using the radix MMU. The LPIDR has already been set.
*/
int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
unsigned long lpcr)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
unsigned long host_dscr = mfspr(SPRN_DSCR);
unsigned long host_tidr = mfspr(SPRN_TIDR);
unsigned long host_iamr = mfspr(SPRN_IAMR);
s64 dec;
u64 tb;
int trap, save_pmu;
dec = mfspr(SPRN_DEC);
tb = mftb();
if (dec < 512)
return BOOK3S_INTERRUPT_HV_DECREMENTER;
local_paca->kvm_hstate.dec_expires = dec + tb;
if (local_paca->kvm_hstate.dec_expires < time_limit)
time_limit = local_paca->kvm_hstate.dec_expires;
vcpu->arch.ceded = 0;
kvmhv_save_host_pmu(); /* saves it to PACA kvm_hstate */
kvmppc_subcore_enter_guest();
vc->entry_exit_map = 1;
vc->in_guest = 1;
if (vcpu->arch.vpa.pinned_addr) {
struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
lp->yield_count = cpu_to_be32(yield_count);
vcpu->arch.vpa.dirty = 1;
}
if (cpu_has_feature(CPU_FTR_TM) ||
cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
kvmhv_load_guest_pmu(vcpu);
msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
load_fp_state(&vcpu->arch.fp);
#ifdef CONFIG_ALTIVEC
load_vr_state(&vcpu->arch.vr);
#endif
mtspr(SPRN_DSCR, vcpu->arch.dscr);
mtspr(SPRN_IAMR, vcpu->arch.iamr);
mtspr(SPRN_PSPB, vcpu->arch.pspb);
mtspr(SPRN_FSCR, vcpu->arch.fscr);
mtspr(SPRN_TAR, vcpu->arch.tar);
mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
mtspr(SPRN_BESCR, vcpu->arch.bescr);
mtspr(SPRN_WORT, vcpu->arch.wort);
mtspr(SPRN_TIDR, vcpu->arch.tid);
mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
mtspr(SPRN_AMR, vcpu->arch.amr);
mtspr(SPRN_UAMOR, vcpu->arch.uamor);
if (!(vcpu->arch.ctrl & 1))
mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1);
mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb());
if (kvmhv_on_pseries()) {
/* call our hypervisor to load up HV regs and go */
struct hv_guest_state hvregs;
kvmhv_save_hv_regs(vcpu, &hvregs);
hvregs.lpcr = lpcr;
vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
hvregs.version = HV_GUEST_STATE_VERSION;
if (vcpu->arch.nested) {
hvregs.lpid = vcpu->arch.nested->shadow_lpid;
hvregs.vcpu_token = vcpu->arch.nested_vcpu_id;
} else {
hvregs.lpid = vcpu->kvm->arch.lpid;
hvregs.vcpu_token = vcpu->vcpu_id;
}
hvregs.hdec_expiry = time_limit;
trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
__pa(&vcpu->arch.regs));
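/* the return value is the trap that caused the L2 exit, or an hcall error such as H_PARAMETER */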
kvmhv_restore_hv_return_state(vcpu, &hvregs);
vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
/* H_CEDE has to be handled now, not later */
if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
kvmppc_nested_cede(vcpu);
trap = 0;
}
} else {
trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr);
}
vcpu->arch.slb_max = 0;
dec = mfspr(SPRN_DEC);
tb = mftb();
vcpu->arch.dec_expires = dec + tb;
vcpu->cpu = -1;
vcpu->arch.thread_cpu = -1;
vcpu->arch.ctrl = mfspr(SPRN_CTRLF);
vcpu->arch.iamr = mfspr(SPRN_IAMR);
vcpu->arch.pspb = mfspr(SPRN_PSPB);
vcpu->arch.fscr = mfspr(SPRN_FSCR);
vcpu->arch.tar = mfspr(SPRN_TAR);
vcpu->arch.ebbhr = mfspr(SPRN_EBBHR);
vcpu->arch.ebbrr = mfspr(SPRN_EBBRR);
vcpu->arch.bescr = mfspr(SPRN_BESCR);
vcpu->arch.wort = mfspr(SPRN_WORT);
vcpu->arch.tid = mfspr(SPRN_TIDR);
vcpu->arch.amr = mfspr(SPRN_AMR);
vcpu->arch.uamor = mfspr(SPRN_UAMOR);
vcpu->arch.dscr = mfspr(SPRN_DSCR);
mtspr(SPRN_PSPB, 0);
mtspr(SPRN_WORT, 0);
mtspr(SPRN_AMR, 0);
mtspr(SPRN_UAMOR, 0);
mtspr(SPRN_DSCR, host_dscr);
mtspr(SPRN_TIDR, host_tidr);
mtspr(SPRN_IAMR, host_iamr);
mtspr(SPRN_PSPB, 0);
msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
store_fp_state(&vcpu->arch.fp);
#ifdef CONFIG_ALTIVEC
store_vr_state(&vcpu->arch.vr);
#endif
if (cpu_has_feature(CPU_FTR_TM) ||
cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
save_pmu = 1;
if (vcpu->arch.vpa.pinned_addr) {
struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
lp->yield_count = cpu_to_be32(yield_count);
vcpu->arch.vpa.dirty = 1;
save_pmu = lp->pmcregs_in_use;
}
kvmhv_save_guest_pmu(vcpu, save_pmu);
vc->entry_exit_map = 0x101;
vc->in_guest = 0;
mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb());
kvmhv_load_host_pmu();
kvmppc_subcore_exit_guest();
return trap;
}
/*
* Wait for some other vcpu thread to execute us, and
* wake us up when we need to handle something in the host.
......@@ -3264,6 +3771,11 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
}
/*
* This never fails for a radix guest, as none of the operations it does
* for a radix guest can fail or have a way to report failure.
* kvmhv_run_single_vcpu() relies on this fact.
*/
static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
{
int r = 0;
......@@ -3413,6 +3925,171 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
return vcpu->arch.ret;
}
int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
struct kvm_vcpu *vcpu, u64 time_limit,
unsigned long lpcr)
{
int trap, r, pcpu;
int srcu_idx;
struct kvmppc_vcore *vc;
struct kvm *kvm = vcpu->kvm;
struct kvm_nested_guest *nested = vcpu->arch.nested;
trace_kvmppc_run_vcpu_enter(vcpu);
kvm_run->exit_reason = 0;
vcpu->arch.ret = RESUME_GUEST;
vcpu->arch.trap = 0;
vc = vcpu->arch.vcore;
vcpu->arch.ceded = 0;
vcpu->arch.run_task = current;
vcpu->arch.kvm_run = kvm_run;
vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
vcpu->arch.busy_preempt = TB_NIL;
vcpu->arch.last_inst = KVM_INST_FETCH_FAILED;
vc->runnable_threads[0] = vcpu;
vc->n_runnable = 1;
vc->runner = vcpu;
/* See if the MMU is ready to go */
if (!kvm->arch.mmu_ready)
kvmhv_setup_mmu(vcpu);
if (need_resched())
cond_resched();
kvmppc_update_vpas(vcpu);
init_vcore_to_run(vc);
vc->preempt_tb = TB_NIL;
preempt_disable();
pcpu = smp_processor_id();
vc->pcpu = pcpu;
kvmppc_prepare_radix_vcpu(vcpu, pcpu);
local_irq_disable();
hard_irq_disable();
if (signal_pending(current))
goto sigpend;
if (lazy_irq_pending() || need_resched() || !kvm->arch.mmu_ready)
goto out;
if (!nested) {
kvmppc_core_prepare_to_enter(vcpu);
if (vcpu->arch.doorbell_request) {
vc->dpdes = 1;
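/*
 * Make sure dpdes is set before doorbell_request is cleared;
 * this pairs with the smp_rmb() in kvmppc_doorbell_pending().
 */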
smp_wmb();
vcpu->arch.doorbell_request = 0;
}
if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
&vcpu->arch.pending_exceptions))
lpcr |= LPCR_MER;
} else if (vcpu->arch.pending_exceptions ||
vcpu->arch.doorbell_request ||
xive_interrupt_pending(vcpu)) {
vcpu->arch.ret = RESUME_HOST;
goto out;
}
kvmppc_clear_host_core(pcpu);
local_paca->kvm_hstate.tid = 0;
local_paca->kvm_hstate.napping = 0;
local_paca->kvm_hstate.kvm_split_mode = NULL;
kvmppc_start_thread(vcpu, vc);
kvmppc_create_dtl_entry(vcpu, vc);
trace_kvm_guest_enter(vcpu);
vc->vcore_state = VCORE_RUNNING;
trace_kvmppc_run_core(vc, 0);
if (cpu_has_feature(CPU_FTR_HVMODE))
kvmppc_radix_check_need_tlb_flush(kvm, pcpu, nested);
trace_hardirqs_on();
guest_enter_irqoff();
srcu_idx = srcu_read_lock(&kvm->srcu);
this_cpu_disable_ftrace();
trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr);
vcpu->arch.trap = trap;
this_cpu_enable_ftrace();
srcu_read_unlock(&kvm->srcu, srcu_idx);
if (cpu_has_feature(CPU_FTR_HVMODE)) {
mtspr(SPRN_LPID, kvm->arch.host_lpid);
isync();
}
trace_hardirqs_off();
set_irq_happened(trap);
kvmppc_set_host_core(pcpu);
local_irq_enable();
guest_exit();
cpumask_clear_cpu(pcpu, &kvm->arch.cpu_in_guest);
preempt_enable();
/* cancel pending decrementer exception if DEC is now positive */
if (get_tb() < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu))
kvmppc_core_dequeue_dec(vcpu);
trace_kvm_guest_exit(vcpu);
r = RESUME_GUEST;
if (trap) {
if (!nested)
r = kvmppc_handle_exit_hv(kvm_run, vcpu, current);
else
r = kvmppc_handle_nested_exit(vcpu);
}
vcpu->arch.ret = r;
if (is_kvmppc_resume_guest(r) && vcpu->arch.ceded &&
!kvmppc_vcpu_woken(vcpu)) {
kvmppc_set_timer(vcpu);
while (vcpu->arch.ceded && !kvmppc_vcpu_woken(vcpu)) {
if (signal_pending(current)) {
vcpu->stat.signal_exits++;
kvm_run->exit_reason = KVM_EXIT_INTR;
vcpu->arch.ret = -EINTR;
break;
}
spin_lock(&vc->lock);
kvmppc_vcore_blocked(vc);
spin_unlock(&vc->lock);
}
}
vcpu->arch.ceded = 0;
vc->vcore_state = VCORE_INACTIVE;
trace_kvmppc_run_core(vc, 1);
done:
kvmppc_remove_runnable(vc, vcpu);
trace_kvmppc_run_vcpu_exit(vcpu, kvm_run);
return vcpu->arch.ret;
sigpend:
vcpu->stat.signal_exits++;
kvm_run->exit_reason = KVM_EXIT_INTR;
vcpu->arch.ret = -EINTR;
out:
local_irq_enable();
preempt_enable();
goto done;
}
static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
{
int r;
......@@ -3488,7 +4165,11 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
do {
r = kvmppc_run_vcpu(run, vcpu);
if (kvm->arch.threads_indep && kvm_is_radix(kvm))
r = kvmhv_run_single_vcpu(run, vcpu, ~(u64)0,
vcpu->arch.vcore->lpcr);
else
r = kvmppc_run_vcpu(run, vcpu);
if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
!(vcpu->arch.shregs.msr & MSR_PR)) {
......@@ -3731,8 +4412,7 @@ void kvmppc_setup_partition_table(struct kvm *kvm)
__pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE;
dw1 = PATB_GR | kvm->arch.process_table;
}
mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
kvmhv_set_ptbl_entry(kvm->arch.lpid, dw0, dw1);
}
/*
......@@ -3828,6 +4508,10 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
{
if (kvm->arch.nested_enable) {
kvm->arch.nested_enable = false;
kvmhv_release_all_nested(kvm);
}
kvmppc_free_radix(kvm);
kvmppc_update_lpcr(kvm, LPCR_VPM1,
LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
......@@ -3849,6 +4533,7 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
kvmppc_free_hpt(&kvm->arch.hpt);
kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR,
LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
kvmppc_rmap_reset(kvm);
kvm->arch.radix = 1;
return 0;
}
......@@ -3948,6 +4633,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
kvmppc_alloc_host_rm_ops();
kvmhv_vm_nested_init(kvm);
/*
* Since we don't flush the TLB when tearing down a VM,
* and this lpid might have previously been used,
......@@ -3966,9 +4653,13 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
/* Init LPCR for virtual RMA mode */
kvm->arch.host_lpid = mfspr(SPRN_LPID);
kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
lpcr &= LPCR_PECE | LPCR_LPES;
if (cpu_has_feature(CPU_FTR_HVMODE)) {
kvm->arch.host_lpid = mfspr(SPRN_LPID);
kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
lpcr &= LPCR_PECE | LPCR_LPES;
} else {
lpcr = 0;
}
lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
LPCR_VPM0 | LPCR_VPM1;
kvm->arch.vrma_slb_v = SLB_VSID_B_1T |
......@@ -4035,8 +4726,14 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
* On POWER9, we only need to do this if the "indep_threads_mode"
* module parameter has been set to N.
*/
if (cpu_has_feature(CPU_FTR_ARCH_300))
kvm->arch.threads_indep = indep_threads_mode;
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
if (!indep_threads_mode && !cpu_has_feature(CPU_FTR_HVMODE)) {
pr_warn("KVM: Ignoring indep_threads_mode=N in nested hypervisor\n");
kvm->arch.threads_indep = true;
} else {
kvm->arch.threads_indep = indep_threads_mode;
}
}
if (!kvm->arch.threads_indep)
kvm_hv_vm_activated();
......@@ -4059,6 +4756,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
snprintf(buf, sizeof(buf), "vm%d", current->pid);
kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir);
kvmppc_mmu_debugfs_init(kvm);
if (radix_enabled())
kvmhv_radix_debugfs_init(kvm);
return 0;
}
......@@ -4081,13 +4780,21 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
kvmppc_free_vcores(kvm);
kvmppc_free_lpid(kvm->arch.lpid);
if (kvm_is_radix(kvm))
kvmppc_free_radix(kvm);
else
kvmppc_free_hpt(&kvm->arch.hpt);
/* Perform global invalidation and return lpid to the pool */
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
if (kvm->arch.nested_enable)
kvmhv_release_all_nested(kvm);
kvm->arch.process_table = 0;
kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0);
}
kvmppc_free_lpid(kvm->arch.lpid);
kvmppc_free_pimap(kvm);
}
......@@ -4112,11 +4819,15 @@ static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn,
static int kvmppc_core_check_processor_compat_hv(void)
{
if (!cpu_has_feature(CPU_FTR_HVMODE) ||
!cpu_has_feature(CPU_FTR_ARCH_206))
return -EIO;
if (cpu_has_feature(CPU_FTR_HVMODE) &&
cpu_has_feature(CPU_FTR_ARCH_206))
return 0;
return 0;
/* POWER9 in radix mode is capable of being a nested hypervisor. */
if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled())
return 0;
return -EIO;
}
#ifdef CONFIG_KVM_XICS
......@@ -4434,6 +5145,10 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
if (radix && !radix_enabled())
return -EINVAL;
/* If we're a nested hypervisor, we currently only support radix */
if (kvmhv_on_pseries() && !radix)
return -EINVAL;
mutex_lock(&kvm->lock);
if (radix != kvm_is_radix(kvm)) {
if (kvm->arch.mmu_ready) {
......@@ -4555,6 +5270,10 @@ static int kvmppc_book3s_init_hv(void)
if (r < 0)
return -ENODEV;
r = kvmhv_nested_init();
if (r)
return r;
r = kvm_init_subcore_bitmap();
if (r)
return r;
......@@ -4565,7 +5284,8 @@ static int kvmppc_book3s_init_hv(void)
* indirectly, via OPAL.
*/
#ifdef CONFIG_SMP
if (!xive_enabled() && !local_paca->kvm_hstate.xics_phys) {
if (!xive_enabled() && !kvmhv_on_pseries() &&
!local_paca->kvm_hstate.xics_phys) {
struct device_node *np;
np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc");
......@@ -4613,6 +5333,7 @@ static void kvmppc_book3s_exit_hv(void)
if (kvmppc_radix_possible())
kvmppc_radix_exit();
kvmppc_hv_ops = NULL;
kvmhv_nested_exit();
}
module_init(kvmppc_book3s_init_hv);
......
......@@ -231,6 +231,15 @@ void kvmhv_rm_send_ipi(int cpu)
void __iomem *xics_phys;
unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
/* For a nested hypervisor, use the XICS via hcall */
if (kvmhv_on_pseries()) {
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
plpar_hcall_raw(H_IPI, retbuf, get_hard_smp_processor_id(cpu),
IPI_PRIORITY);
return;
}
/* On POWER9 we can use msgsnd for any destination cpu. */
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
msg |= get_hard_smp_processor_id(cpu);
......@@ -460,12 +469,19 @@ static long kvmppc_read_one_intr(bool *again)
return 1;
/* Now read the interrupt from the ICP */
xics_phys = local_paca->kvm_hstate.xics_phys;
rc = 0;
if (!xics_phys)
rc = opal_int_get_xirr(&xirr, false);
else
xirr = __raw_rm_readl(xics_phys + XICS_XIRR);
if (kvmhv_on_pseries()) {
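/* As a nested hypervisor, read the XIRR from the L0 via the H_XIRR hcall */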
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
rc = plpar_hcall_raw(H_XIRR, retbuf, 0xFF);
xirr = cpu_to_be32(retbuf[0]);
} else {
xics_phys = local_paca->kvm_hstate.xics_phys;
rc = 0;
if (!xics_phys)
rc = opal_int_get_xirr(&xirr, false);
else
xirr = __raw_rm_readl(xics_phys + XICS_XIRR);
}
if (rc < 0)
return 1;
......@@ -494,7 +510,13 @@ static long kvmppc_read_one_intr(bool *again)
*/
if (xisr == XICS_IPI) {
rc = 0;
if (xics_phys) {
if (kvmhv_on_pseries()) {
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
plpar_hcall_raw(H_IPI, retbuf,
hard_smp_processor_id(), 0xff);
plpar_hcall_raw(H_EOI, retbuf, h_xirr);
} else if (xics_phys) {
__raw_rm_writeb(0xff, xics_phys + XICS_MFRR);
__raw_rm_writel(xirr, xics_phys + XICS_XIRR);
} else {
......@@ -520,7 +542,13 @@ static long kvmppc_read_one_intr(bool *again)
/* We raced with the host,
* we need to resend that IPI, bummer
*/
if (xics_phys)
if (kvmhv_on_pseries()) {
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
plpar_hcall_raw(H_IPI, retbuf,
hard_smp_processor_id(),
IPI_PRIORITY);
} else if (xics_phys)
__raw_rm_writeb(IPI_PRIORITY,
xics_phys + XICS_MFRR);
else
......@@ -729,3 +757,51 @@ void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip)
smp_mb();
local_paca->kvm_hstate.kvm_split_mode = NULL;
}
/*
* Is there a PRIV_DOORBELL pending for the guest (on POWER9)?
* Can we inject a Decrementer or an External interrupt?
*/
void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu)
{
int ext;
unsigned long vec = 0;
unsigned long lpcr;
/* Insert EXTERNAL bit into LPCR at the MER bit position */
ext = (vcpu->arch.pending_exceptions >> BOOK3S_IRQPRIO_EXTERNAL) & 1;
lpcr = mfspr(SPRN_LPCR);
lpcr |= ext << LPCR_MER_SH;
mtspr(SPRN_LPCR, lpcr);
isync();
if (vcpu->arch.shregs.msr & MSR_EE) {
if (ext) {
vec = BOOK3S_INTERRUPT_EXTERNAL;
} else {
long int dec = mfspr(SPRN_DEC);
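/* without the large decrementer (LPCR_LD), DEC is 32 bits and must be sign-extended */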
if (!(lpcr & LPCR_LD))
dec = (int) dec;
if (dec < 0)
vec = BOOK3S_INTERRUPT_DECREMENTER;
}
}
if (vec) {
unsigned long msr, old_msr = vcpu->arch.shregs.msr;
kvmppc_set_srr0(vcpu, kvmppc_get_pc(vcpu));
kvmppc_set_srr1(vcpu, old_msr);
kvmppc_set_pc(vcpu, vec);
msr = vcpu->arch.intr_msr;
if (MSR_TM_ACTIVE(old_msr))
msr |= MSR_TS_S;
vcpu->arch.shregs.msr = msr;
}
if (vcpu->arch.doorbell_request) {
mtspr(SPRN_DPDES, 1);
vcpu->arch.vcore->dpdes = 1;
smp_wmb();
vcpu->arch.doorbell_request = 0;
}
}
......@@ -64,52 +64,7 @@ BEGIN_FTR_SECTION
END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
/* Save host PMU registers */
BEGIN_FTR_SECTION
/* Work around P8 PMAE bug */
li r3, -1
clrrdi r3, r3, 10
mfspr r8, SPRN_MMCR2
mtspr SPRN_MMCR2, r3 /* freeze all counters using MMCR2 */
isync
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
li r3, 1
sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
mfspr r7, SPRN_MMCR0 /* save MMCR0 */
mtspr SPRN_MMCR0, r3 /* freeze all counters, disable interrupts */
mfspr r6, SPRN_MMCRA
/* Clear MMCRA in order to disable SDAR updates */
li r5, 0
mtspr SPRN_MMCRA, r5
isync
lbz r5, PACA_PMCINUSE(r13) /* is the host using the PMU? */
cmpwi r5, 0
beq 31f /* skip if not */
mfspr r5, SPRN_MMCR1
mfspr r9, SPRN_SIAR
mfspr r10, SPRN_SDAR
std r7, HSTATE_MMCR0(r13)
std r5, HSTATE_MMCR1(r13)
std r6, HSTATE_MMCRA(r13)
std r9, HSTATE_SIAR(r13)
std r10, HSTATE_SDAR(r13)
BEGIN_FTR_SECTION
mfspr r9, SPRN_SIER
std r8, HSTATE_MMCR2(r13)
std r9, HSTATE_SIER(r13)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
mfspr r3, SPRN_PMC1
mfspr r5, SPRN_PMC2
mfspr r6, SPRN_PMC3
mfspr r7, SPRN_PMC4
mfspr r8, SPRN_PMC5
mfspr r9, SPRN_PMC6
stw r3, HSTATE_PMC1(r13)
stw r5, HSTATE_PMC2(r13)
stw r6, HSTATE_PMC3(r13)
stw r7, HSTATE_PMC4(r13)
stw r8, HSTATE_PMC5(r13)
stw r9, HSTATE_PMC6(r13)
31:
bl kvmhv_save_host_pmu
/*
* Put whatever is in the decrementer into the
......@@ -161,3 +116,51 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
ld r0, PPC_LR_STKOFF(r1)
mtlr r0
blr
_GLOBAL(kvmhv_save_host_pmu)
BEGIN_FTR_SECTION
/* Work around P8 PMAE bug */
li r3, -1
clrrdi r3, r3, 10
mfspr r8, SPRN_MMCR2
mtspr SPRN_MMCR2, r3 /* freeze all counters using MMCR2 */
isync
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
li r3, 1
sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
mfspr r7, SPRN_MMCR0 /* save MMCR0 */
mtspr SPRN_MMCR0, r3 /* freeze all counters, disable interrupts */
mfspr r6, SPRN_MMCRA
/* Clear MMCRA in order to disable SDAR updates */
li r5, 0
mtspr SPRN_MMCRA, r5
isync
lbz r5, PACA_PMCINUSE(r13) /* is the host using the PMU? */
cmpwi r5, 0
beq 31f /* skip if not */
mfspr r5, SPRN_MMCR1
mfspr r9, SPRN_SIAR
mfspr r10, SPRN_SDAR
std r7, HSTATE_MMCR0(r13)
std r5, HSTATE_MMCR1(r13)
std r6, HSTATE_MMCRA(r13)
std r9, HSTATE_SIAR(r13)
std r10, HSTATE_SDAR(r13)
BEGIN_FTR_SECTION
mfspr r9, SPRN_SIER
std r8, HSTATE_MMCR2(r13)
std r9, HSTATE_SIER(r13)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
mfspr r3, SPRN_PMC1
mfspr r5, SPRN_PMC2
mfspr r6, SPRN_PMC3
mfspr r7, SPRN_PMC4
mfspr r8, SPRN_PMC5
mfspr r9, SPRN_PMC6
stw r3, HSTATE_PMC1(r13)
stw r5, HSTATE_PMC2(r13)
stw r6, HSTATE_PMC3(r13)
stw r7, HSTATE_PMC4(r13)
stw r8, HSTATE_PMC5(r13)
stw r9, HSTATE_PMC6(r13)
31: blr
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright IBM Corporation, 2018
* Authors Suraj Jitindar Singh <sjitindarsingh@gmail.com>
* Paul Mackerras <paulus@ozlabs.org>
*
* Description: KVM functions specific to running nested KVM-HV guests
* on Book3S processors (specifically POWER9 and later).
*/
#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <linux/llist.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/mmu.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/pte-walk.h>
#include <asm/reg.h>
static struct patb_entry *pseries_partition_tb;
static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp);
static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free);
void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
hr->pcr = vc->pcr;
hr->dpdes = vc->dpdes;
hr->hfscr = vcpu->arch.hfscr;
hr->tb_offset = vc->tb_offset;
hr->dawr0 = vcpu->arch.dawr;
hr->dawrx0 = vcpu->arch.dawrx;
hr->ciabr = vcpu->arch.ciabr;
hr->purr = vcpu->arch.purr;
hr->spurr = vcpu->arch.spurr;
hr->ic = vcpu->arch.ic;
hr->vtb = vc->vtb;
hr->srr0 = vcpu->arch.shregs.srr0;
hr->srr1 = vcpu->arch.shregs.srr1;
hr->sprg[0] = vcpu->arch.shregs.sprg0;
hr->sprg[1] = vcpu->arch.shregs.sprg1;
hr->sprg[2] = vcpu->arch.shregs.sprg2;
hr->sprg[3] = vcpu->arch.shregs.sprg3;
hr->pidr = vcpu->arch.pid;
hr->cfar = vcpu->arch.cfar;
hr->ppr = vcpu->arch.ppr;
}
static void byteswap_pt_regs(struct pt_regs *regs)
{
unsigned long *addr = (unsigned long *) regs;
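/* pt_regs is laid out as unsigned-long-sized fields, so swap them all in place */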
for (; addr < ((unsigned long *) (regs + 1)); addr++)
*addr = swab64(*addr);
}
static void byteswap_hv_regs(struct hv_guest_state *hr)
{
hr->version = swab64(hr->version);
hr->lpid = swab32(hr->lpid);
hr->vcpu_token = swab32(hr->vcpu_token);
hr->lpcr = swab64(hr->lpcr);
hr->pcr = swab64(hr->pcr);
hr->amor = swab64(hr->amor);
hr->dpdes = swab64(hr->dpdes);
hr->hfscr = swab64(hr->hfscr);
hr->tb_offset = swab64(hr->tb_offset);
hr->dawr0 = swab64(hr->dawr0);
hr->dawrx0 = swab64(hr->dawrx0);
hr->ciabr = swab64(hr->ciabr);
hr->hdec_expiry = swab64(hr->hdec_expiry);
hr->purr = swab64(hr->purr);
hr->spurr = swab64(hr->spurr);
hr->ic = swab64(hr->ic);
hr->vtb = swab64(hr->vtb);
hr->hdar = swab64(hr->hdar);
hr->hdsisr = swab64(hr->hdsisr);
hr->heir = swab64(hr->heir);
hr->asdr = swab64(hr->asdr);
hr->srr0 = swab64(hr->srr0);
hr->srr1 = swab64(hr->srr1);
hr->sprg[0] = swab64(hr->sprg[0]);
hr->sprg[1] = swab64(hr->sprg[1]);
hr->sprg[2] = swab64(hr->sprg[2]);
hr->sprg[3] = swab64(hr->sprg[3]);
hr->pidr = swab64(hr->pidr);
hr->cfar = swab64(hr->cfar);
hr->ppr = swab64(hr->ppr);
}
static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap,
struct hv_guest_state *hr)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
hr->dpdes = vc->dpdes;
hr->hfscr = vcpu->arch.hfscr;
hr->purr = vcpu->arch.purr;
hr->spurr = vcpu->arch.spurr;
hr->ic = vcpu->arch.ic;
hr->vtb = vc->vtb;
hr->srr0 = vcpu->arch.shregs.srr0;
hr->srr1 = vcpu->arch.shregs.srr1;
hr->sprg[0] = vcpu->arch.shregs.sprg0;
hr->sprg[1] = vcpu->arch.shregs.sprg1;
hr->sprg[2] = vcpu->arch.shregs.sprg2;
hr->sprg[3] = vcpu->arch.shregs.sprg3;
hr->pidr = vcpu->arch.pid;
hr->cfar = vcpu->arch.cfar;
hr->ppr = vcpu->arch.ppr;
switch (trap) {
case BOOK3S_INTERRUPT_H_DATA_STORAGE:
hr->hdar = vcpu->arch.fault_dar;
hr->hdsisr = vcpu->arch.fault_dsisr;
hr->asdr = vcpu->arch.fault_gpa;
break;
case BOOK3S_INTERRUPT_H_INST_STORAGE:
hr->asdr = vcpu->arch.fault_gpa;
break;
case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
hr->heir = vcpu->arch.emul_inst;
break;
}
}
static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
{
/*
* Don't let L1 enable features for L2 which we've disabled for L1,
* but preserve the interrupt cause field.
*/
hr->hfscr &= (HFSCR_INTR_CAUSE | vcpu->arch.hfscr);
/* Don't let data address watchpoint match in hypervisor state */
hr->dawrx0 &= ~DAWRX_HYP;
/* Don't let completed instruction address breakpt match in HV state */
if ((hr->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
hr->ciabr &= ~CIABR_PRIV;
}
static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
vc->pcr = hr->pcr;
vc->dpdes = hr->dpdes;
vcpu->arch.hfscr = hr->hfscr;
vcpu->arch.dawr = hr->dawr0;
vcpu->arch.dawrx = hr->dawrx0;
vcpu->arch.ciabr = hr->ciabr;
vcpu->arch.purr = hr->purr;
vcpu->arch.spurr = hr->spurr;
vcpu->arch.ic = hr->ic;
vc->vtb = hr->vtb;
vcpu->arch.shregs.srr0 = hr->srr0;
vcpu->arch.shregs.srr1 = hr->srr1;
vcpu->arch.shregs.sprg0 = hr->sprg[0];
vcpu->arch.shregs.sprg1 = hr->sprg[1];
vcpu->arch.shregs.sprg2 = hr->sprg[2];
vcpu->arch.shregs.sprg3 = hr->sprg[3];
vcpu->arch.pid = hr->pidr;
vcpu->arch.cfar = hr->cfar;
vcpu->arch.ppr = hr->ppr;
}
void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
struct hv_guest_state *hr)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
vc->dpdes = hr->dpdes;
vcpu->arch.hfscr = hr->hfscr;
vcpu->arch.purr = hr->purr;
vcpu->arch.spurr = hr->spurr;
vcpu->arch.ic = hr->ic;
vc->vtb = hr->vtb;
vcpu->arch.fault_dar = hr->hdar;
vcpu->arch.fault_dsisr = hr->hdsisr;
vcpu->arch.fault_gpa = hr->asdr;
vcpu->arch.emul_inst = hr->heir;
vcpu->arch.shregs.srr0 = hr->srr0;
vcpu->arch.shregs.srr1 = hr->srr1;
vcpu->arch.shregs.sprg0 = hr->sprg[0];
vcpu->arch.shregs.sprg1 = hr->sprg[1];
vcpu->arch.shregs.sprg2 = hr->sprg[2];
vcpu->arch.shregs.sprg3 = hr->sprg[3];
vcpu->arch.pid = hr->pidr;
vcpu->arch.cfar = hr->cfar;
vcpu->arch.ppr = hr->ppr;
}
long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
{
long int err, r;
struct kvm_nested_guest *l2;
struct pt_regs l2_regs, saved_l1_regs;
struct hv_guest_state l2_hv, saved_l1_hv;
struct kvmppc_vcore *vc = vcpu->arch.vcore;
u64 hv_ptr, regs_ptr;
u64 hdec_exp;
s64 delta_purr, delta_spurr, delta_ic, delta_vtb;
u64 mask;
unsigned long lpcr;
if (vcpu->kvm->arch.l1_ptcr == 0)
return H_NOT_AVAILABLE;
/* copy parameters in */
hv_ptr = kvmppc_get_gpr(vcpu, 4);
err = kvm_vcpu_read_guest(vcpu, hv_ptr, &l2_hv,
sizeof(struct hv_guest_state));
if (err)
return H_PARAMETER;
if (kvmppc_need_byteswap(vcpu))
byteswap_hv_regs(&l2_hv);
if (l2_hv.version != HV_GUEST_STATE_VERSION)
return H_P2;
regs_ptr = kvmppc_get_gpr(vcpu, 5);
err = kvm_vcpu_read_guest(vcpu, regs_ptr, &l2_regs,
sizeof(struct pt_regs));
if (err)
return H_PARAMETER;
if (kvmppc_need_byteswap(vcpu))
byteswap_pt_regs(&l2_regs);
if (l2_hv.vcpu_token >= NR_CPUS)
return H_PARAMETER;
/* translate lpid */
l2 = kvmhv_get_nested(vcpu->kvm, l2_hv.lpid, true);
if (!l2)
return H_PARAMETER;
if (!l2->l1_gr_to_hr) {
mutex_lock(&l2->tlb_lock);
kvmhv_update_ptbl_cache(l2);
mutex_unlock(&l2->tlb_lock);
}
/* save l1 values of things */
vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
saved_l1_regs = vcpu->arch.regs;
kvmhv_save_hv_regs(vcpu, &saved_l1_hv);
/* convert TB values/offsets to host (L0) values */
hdec_exp = l2_hv.hdec_expiry - vc->tb_offset;
vc->tb_offset += l2_hv.tb_offset;
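/*
 * hdec_expiry is in L1's timebase; subtracting L1's tb_offset gives
 * the L0 timebase value.  L2's tb_offset is relative to L1, so it
 * stacks on top of L1's offset while L2 runs.
 */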
/* set L1 state to L2 state */
vcpu->arch.nested = l2;
vcpu->arch.nested_vcpu_id = l2_hv.vcpu_token;
vcpu->arch.regs = l2_regs;
vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD |
LPCR_LPES | LPCR_MER;
lpcr = (vc->lpcr & ~mask) | (l2_hv.lpcr & mask);
sanitise_hv_regs(vcpu, &l2_hv);
restore_hv_regs(vcpu, &l2_hv);
vcpu->arch.ret = RESUME_GUEST;
vcpu->arch.trap = 0;
do {
if (mftb() >= hdec_exp) {
vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
r = RESUME_HOST;
break;
}
r = kvmhv_run_single_vcpu(vcpu->arch.kvm_run, vcpu, hdec_exp,
lpcr);
} while (is_kvmppc_resume_guest(r));
/* save L2 state for return */
l2_regs = vcpu->arch.regs;
l2_regs.msr = vcpu->arch.shregs.msr;
delta_purr = vcpu->arch.purr - l2_hv.purr;
delta_spurr = vcpu->arch.spurr - l2_hv.spurr;
delta_ic = vcpu->arch.ic - l2_hv.ic;
delta_vtb = vc->vtb - l2_hv.vtb;
save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv);
/* restore L1 state */
vcpu->arch.nested = NULL;
vcpu->arch.regs = saved_l1_regs;
vcpu->arch.shregs.msr = saved_l1_regs.msr & ~MSR_TS_MASK;
/* set L1 MSR TS field according to L2 transaction state */
if (l2_regs.msr & MSR_TS_MASK)
vcpu->arch.shregs.msr |= MSR_TS_S;
vc->tb_offset = saved_l1_hv.tb_offset;
restore_hv_regs(vcpu, &saved_l1_hv);
vcpu->arch.purr += delta_purr;
vcpu->arch.spurr += delta_spurr;
vcpu->arch.ic += delta_ic;
vc->vtb += delta_vtb;
kvmhv_put_nested(l2);
/* copy l2_hv_state and regs back to guest */
if (kvmppc_need_byteswap(vcpu)) {
byteswap_hv_regs(&l2_hv);
byteswap_pt_regs(&l2_regs);
}
err = kvm_vcpu_write_guest(vcpu, hv_ptr, &l2_hv,
sizeof(struct hv_guest_state));
if (err)
return H_AUTHORITY;
err = kvm_vcpu_write_guest(vcpu, regs_ptr, &l2_regs,
sizeof(struct pt_regs));
if (err)
return H_AUTHORITY;
if (r == -EINTR)
return H_INTERRUPT;
return vcpu->arch.trap;
}
long kvmhv_nested_init(void)
{
long int ptb_order;
unsigned long ptcr;
long rc;
if (!kvmhv_on_pseries())
return 0;
if (!radix_enabled())
return -ENODEV;
/* find log base 2 of KVMPPC_NR_LPIDS, rounding up */
ptb_order = __ilog2(KVMPPC_NR_LPIDS - 1) + 1;
if (ptb_order < 8)
ptb_order = 8;
pseries_partition_tb = kmalloc(sizeof(struct patb_entry) << ptb_order,
GFP_KERNEL);
if (!pseries_partition_tb) {
pr_err("kvm-hv: failed to allocated nested partition table\n");
return -ENOMEM;
}
ptcr = __pa(pseries_partition_tb) | (ptb_order - 8);
rc = plpar_hcall_norets(H_SET_PARTITION_TABLE, ptcr);
if (rc != H_SUCCESS) {
pr_err("kvm-hv: Parent hypervisor does not support nesting (rc=%ld)\n",
rc);
kfree(pseries_partition_tb);
pseries_partition_tb = NULL;
return -ENODEV;
}
return 0;
}
void kvmhv_nested_exit(void)
{
/*
* N.B. the kvmhv_on_pseries() test is there because it enables
* the compiler to remove the call to plpar_hcall_norets()
* when CONFIG_PPC_PSERIES=n.
*/
if (kvmhv_on_pseries() && pseries_partition_tb) {
plpar_hcall_norets(H_SET_PARTITION_TABLE, 0);
kfree(pseries_partition_tb);
pseries_partition_tb = NULL;
}
}
static void kvmhv_flush_lpid(unsigned int lpid)
{
long rc;
if (!kvmhv_on_pseries()) {
radix__flush_tlb_lpid(lpid);
return;
}
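/* H_TLBIE_P1_ENC(2, 0, 1): RIC=2 (flush TLB and page-walk cache), PRS=0 (partition-scoped), R=1 (radix) */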
rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(2, 0, 1),
lpid, TLBIEL_INVAL_SET_LPID);
if (rc)
pr_err("KVM: TLB LPID invalidation hcall failed, rc=%ld\n", rc);
}
void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1)
{
if (!kvmhv_on_pseries()) {
mmu_partition_table_set_entry(lpid, dw0, dw1);
return;
}
pseries_partition_tb[lpid].patb0 = cpu_to_be64(dw0);
pseries_partition_tb[lpid].patb1 = cpu_to_be64(dw1);
/* L0 will do the necessary barriers */
kvmhv_flush_lpid(lpid);
}
static void kvmhv_set_nested_ptbl(struct kvm_nested_guest *gp)
{
unsigned long dw0;
dw0 = PATB_HR | radix__get_tree_size() |
__pa(gp->shadow_pgtable) | RADIX_PGD_INDEX_SIZE;
kvmhv_set_ptbl_entry(gp->shadow_lpid, dw0, gp->process_table);
}
void kvmhv_vm_nested_init(struct kvm *kvm)
{
kvm->arch.max_nested_lpid = -1;
}
/*
* Handle the H_SET_PARTITION_TABLE hcall.
* r4 = guest real address of partition table + log_2(size) - 12
* (formatted as for the PTCR).
*/
long kvmhv_set_partition_table(struct kvm_vcpu *vcpu)
{
struct kvm *kvm = vcpu->kvm;
unsigned long ptcr = kvmppc_get_gpr(vcpu, 4);
int srcu_idx;
long ret = H_SUCCESS;
srcu_idx = srcu_read_lock(&kvm->srcu);
/*
* Limit the partition table to 4096 entries (because that's what
* hardware supports), and check the base address.
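* Each entry is 16 bytes, so 4096 entries is 64kB; PRTS encodes
* log2(size) - 12, hence the limit of 12 - 8 = 4 below.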
*/
if ((ptcr & PRTS_MASK) > 12 - 8 ||
!kvm_is_visible_gfn(vcpu->kvm, (ptcr & PRTB_MASK) >> PAGE_SHIFT))
ret = H_PARAMETER;
srcu_read_unlock(&kvm->srcu, srcu_idx);
if (ret == H_SUCCESS)
kvm->arch.l1_ptcr = ptcr;
return ret;
}
/*
* Reload the partition table entry for a guest.
* Caller must hold gp->tlb_lock.
*/
static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp)
{
int ret;
struct patb_entry ptbl_entry;
unsigned long ptbl_addr;
struct kvm *kvm = gp->l1_host;
ret = -EFAULT;
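/* each partition-table entry is 16 bytes; PRTS + 8 is log2 of the entry count */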
ptbl_addr = (kvm->arch.l1_ptcr & PRTB_MASK) + (gp->l1_lpid << 4);
if (gp->l1_lpid < (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 8)))
ret = kvm_read_guest(kvm, ptbl_addr,
&ptbl_entry, sizeof(ptbl_entry));
if (ret) {
gp->l1_gr_to_hr = 0;
gp->process_table = 0;
} else {
gp->l1_gr_to_hr = be64_to_cpu(ptbl_entry.patb0);
gp->process_table = be64_to_cpu(ptbl_entry.patb1);
}
kvmhv_set_nested_ptbl(gp);
}
struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
{
struct kvm_nested_guest *gp;
long shadow_lpid;
gp = kzalloc(sizeof(*gp), GFP_KERNEL);
if (!gp)
return NULL;
gp->l1_host = kvm;
gp->l1_lpid = lpid;
mutex_init(&gp->tlb_lock);
gp->shadow_pgtable = pgd_alloc(kvm->mm);
if (!gp->shadow_pgtable)
goto out_free;
shadow_lpid = kvmppc_alloc_lpid();
if (shadow_lpid < 0)
goto out_free2;
gp->shadow_lpid = shadow_lpid;
memset(gp->prev_cpu, -1, sizeof(gp->prev_cpu));
return gp;
out_free2:
pgd_free(kvm->mm, gp->shadow_pgtable);
out_free:
kfree(gp);
return NULL;
}
/*
* Free up any resources allocated for a nested guest.
*/
static void kvmhv_release_nested(struct kvm_nested_guest *gp)
{
struct kvm *kvm = gp->l1_host;
if (gp->shadow_pgtable) {
/*
* No vcpu is using this struct and no call to
* kvmhv_get_nested can find this struct,
* so we don't need to hold kvm->mmu_lock.
*/
kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable,
gp->shadow_lpid);
pgd_free(kvm->mm, gp->shadow_pgtable);
}
kvmhv_set_ptbl_entry(gp->shadow_lpid, 0, 0);
kvmppc_free_lpid(gp->shadow_lpid);
kfree(gp);
}
static void kvmhv_remove_nested(struct kvm_nested_guest *gp)
{
struct kvm *kvm = gp->l1_host;
int lpid = gp->l1_lpid;
long ref;
spin_lock(&kvm->mmu_lock);
if (gp == kvm->arch.nested_guests[lpid]) {
kvm->arch.nested_guests[lpid] = NULL;
if (lpid == kvm->arch.max_nested_lpid) {
while (--lpid >= 0 && !kvm->arch.nested_guests[lpid])
;
kvm->arch.max_nested_lpid = lpid;
}
--gp->refcnt;
}
ref = gp->refcnt;
spin_unlock(&kvm->mmu_lock);
if (ref == 0)
kvmhv_release_nested(gp);
}
/*
* Free up all nested resources allocated for this guest.
* This is called with no vcpus of the guest running, when
* switching the guest to HPT mode or when destroying the
* guest.
*/
void kvmhv_release_all_nested(struct kvm *kvm)
{
int i;
struct kvm_nested_guest *gp;
struct kvm_nested_guest *freelist = NULL;
struct kvm_memory_slot *memslot;
int srcu_idx;
spin_lock(&kvm->mmu_lock);
for (i = 0; i <= kvm->arch.max_nested_lpid; i++) {
gp = kvm->arch.nested_guests[i];
if (!gp)
continue;
kvm->arch.nested_guests[i] = NULL;
if (--gp->refcnt == 0) {
gp->next = freelist;
freelist = gp;
}
}
kvm->arch.max_nested_lpid = -1;
spin_unlock(&kvm->mmu_lock);
while ((gp = freelist) != NULL) {
freelist = gp->next;
kvmhv_release_nested(gp);
}
srcu_idx = srcu_read_lock(&kvm->srcu);
kvm_for_each_memslot(memslot, kvm_memslots(kvm))
kvmhv_free_memslot_nest_rmap(memslot);
srcu_read_unlock(&kvm->srcu, srcu_idx);
}
/* caller must hold gp->tlb_lock */
static void kvmhv_flush_nested(struct kvm_nested_guest *gp)
{
struct kvm *kvm = gp->l1_host;
spin_lock(&kvm->mmu_lock);
kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable, gp->shadow_lpid);
spin_unlock(&kvm->mmu_lock);
kvmhv_flush_lpid(gp->shadow_lpid);
kvmhv_update_ptbl_cache(gp);
if (gp->l1_gr_to_hr == 0)
kvmhv_remove_nested(gp);
}
struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
bool create)
{
struct kvm_nested_guest *gp, *newgp;
if (l1_lpid >= KVM_MAX_NESTED_GUESTS ||
l1_lpid >= (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4)))
return NULL;
spin_lock(&kvm->mmu_lock);
gp = kvm->arch.nested_guests[l1_lpid];
if (gp)
++gp->refcnt;
spin_unlock(&kvm->mmu_lock);
if (gp || !create)
return gp;
newgp = kvmhv_alloc_nested(kvm, l1_lpid);
if (!newgp)
return NULL;
spin_lock(&kvm->mmu_lock);
if (kvm->arch.nested_guests[l1_lpid]) {
/* someone else beat us to it */
gp = kvm->arch.nested_guests[l1_lpid];
} else {
kvm->arch.nested_guests[l1_lpid] = newgp;
++newgp->refcnt;
gp = newgp;
newgp = NULL;
if (l1_lpid > kvm->arch.max_nested_lpid)
kvm->arch.max_nested_lpid = l1_lpid;
}
++gp->refcnt;
spin_unlock(&kvm->mmu_lock);
if (newgp)
kvmhv_release_nested(newgp);
return gp;
}
void kvmhv_put_nested(struct kvm_nested_guest *gp)
{
struct kvm *kvm = gp->l1_host;
long ref;
spin_lock(&kvm->mmu_lock);
ref = --gp->refcnt;
spin_unlock(&kvm->mmu_lock);
if (ref == 0)
kvmhv_release_nested(gp);
}
static struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid)
{
if (lpid > kvm->arch.max_nested_lpid)
return NULL;
return kvm->arch.nested_guests[lpid];
}
static inline bool kvmhv_n_rmap_is_equal(u64 rmap_1, u64 rmap_2)
{
return !((rmap_1 ^ rmap_2) & (RMAP_NESTED_LPID_MASK |
RMAP_NESTED_GPA_MASK));
}
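/*
 * The nest rmap for a guest page lives in the memslot's rmap array.
 * A single (lpid, gpa) back-reference is encoded directly in the rmap
 * word with RMAP_NESTED_IS_SINGLE_ENTRY set; once a second entry is
 * needed, the word becomes the head of an llist of rmap_nested structs.
 */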
void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp,
struct rmap_nested **n_rmap)
{
struct llist_node *entry = ((struct llist_head *) rmapp)->first;
struct rmap_nested *cursor;
u64 rmap, new_rmap = (*n_rmap)->rmap;
/* Are there any existing entries? */
if (!(*rmapp)) {
/* No -> use the rmap as a single entry */
*rmapp = new_rmap | RMAP_NESTED_IS_SINGLE_ENTRY;
return;
}
/* Do any entries match what we're trying to insert? */
for_each_nest_rmap_safe(cursor, entry, &rmap) {
if (kvmhv_n_rmap_is_equal(rmap, new_rmap))
return;
}
/* Do we need to create a list or just add the new entry? */
rmap = *rmapp;
if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
*rmapp = 0UL;
llist_add(&((*n_rmap)->list), (struct llist_head *) rmapp);
if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
(*n_rmap)->list.next = (struct llist_node *) rmap;
/* Set NULL so not freed by caller */
*n_rmap = NULL;
}
static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap,
unsigned long hpa, unsigned long mask)
{
struct kvm_nested_guest *gp;
unsigned long gpa;
unsigned int shift, lpid;
pte_t *ptep;
gpa = n_rmap & RMAP_NESTED_GPA_MASK;
lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;
gp = kvmhv_find_nested(kvm, lpid);
if (!gp)
return;
/* Find and invalidate the pte */
ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
/* Don't spuriously invalidate ptes if the pfn has changed */
if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa))
kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
}
static void kvmhv_remove_nest_rmap_list(struct kvm *kvm, unsigned long *rmapp,
unsigned long hpa, unsigned long mask)
{
struct llist_node *entry = llist_del_all((struct llist_head *) rmapp);
struct rmap_nested *cursor;
unsigned long rmap;
for_each_nest_rmap_safe(cursor, entry, &rmap) {
kvmhv_remove_nest_rmap(kvm, rmap, hpa, mask);
kfree(cursor);
}
}
/* called with kvm->mmu_lock held */
void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
struct kvm_memory_slot *memslot,
unsigned long gpa, unsigned long hpa,
unsigned long nbytes)
{
unsigned long gfn, end_gfn;
unsigned long addr_mask;
if (!memslot)
return;
gfn = (gpa >> PAGE_SHIFT) - memslot->base_gfn;
end_gfn = gfn + (nbytes >> PAGE_SHIFT);
addr_mask = PTE_RPN_MASK & ~(nbytes - 1);
hpa &= addr_mask;
for (; gfn < end_gfn; gfn++) {
unsigned long *rmap = &memslot->arch.rmap[gfn];
kvmhv_remove_nest_rmap_list(kvm, rmap, hpa, addr_mask);
}
}
static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free)
{
unsigned long page;
for (page = 0; page < free->npages; page++) {
unsigned long rmap, *rmapp = &free->arch.rmap[page];
struct rmap_nested *cursor;
struct llist_node *entry;
entry = llist_del_all((struct llist_head *) rmapp);
for_each_nest_rmap_safe(cursor, entry, &rmap)
kfree(cursor);
}
}
static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu *vcpu,
struct kvm_nested_guest *gp,
long gpa, int *shift_ret)
{
struct kvm *kvm = vcpu->kvm;
bool ret = false;
pte_t *ptep;
int shift;
spin_lock(&kvm->mmu_lock);
ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
if (!shift)
shift = PAGE_SHIFT;
if (ptep && pte_present(*ptep)) {
kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
ret = true;
}
spin_unlock(&kvm->mmu_lock);
if (shift_ret)
*shift_ret = shift;
return ret;
}
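/*
 * Field extractors for the tlbie instruction image (RIC, PRS, R) and
 * for its RS/RB operand values (LPID from RS; IS, AP and EPN from RB),
 * matching the bit positions used by kvmhv_emulate_priv_tlbie() below.
 */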
static inline int get_ric(unsigned int instr)
{
return (instr >> 18) & 0x3;
}
static inline int get_prs(unsigned int instr)
{
return (instr >> 17) & 0x1;
}
static inline int get_r(unsigned int instr)
{
return (instr >> 16) & 0x1;
}
static inline int get_lpid(unsigned long r_val)
{
return r_val & 0xffffffff;
}
static inline int get_is(unsigned long r_val)
{
return (r_val >> 10) & 0x3;
}
static inline int get_ap(unsigned long r_val)
{
return (r_val >> 5) & 0x7;
}
static inline long get_epn(unsigned long r_val)
{
return r_val >> 12;
}
static int kvmhv_emulate_tlbie_tlb_addr(struct kvm_vcpu *vcpu, int lpid,
int ap, long epn)
{
struct kvm *kvm = vcpu->kvm;
struct kvm_nested_guest *gp;
long npages;
int shift, shadow_shift;
unsigned long addr;
shift = ap_to_shift(ap);
addr = epn << 12;
if (shift < 0)
/* Invalid ap encoding */
return -EINVAL;
addr &= ~((1UL << shift) - 1);
npages = 1UL << (shift - PAGE_SHIFT);
gp = kvmhv_get_nested(kvm, lpid, false);
if (!gp) /* No such guest -> nothing to do */
return 0;
mutex_lock(&gp->tlb_lock);
/* There may be more than one host page backing this single guest pte */
do {
kvmhv_invalidate_shadow_pte(vcpu, gp, addr, &shadow_shift);
npages -= 1UL << (shadow_shift - PAGE_SHIFT);
addr += 1UL << shadow_shift;
} while (npages > 0);
mutex_unlock(&gp->tlb_lock);
kvmhv_put_nested(gp);
return 0;
}
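/*
 * Emulate a per-LPID tlbie for one nested guest: RIC=0 flushes the
 * shadow page table and TLB, RIC=1 is a no-op (we don't cache PWC),
 * and RIC=2 additionally flushes our cache of its partition-table
 * entry.
 */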
static void kvmhv_emulate_tlbie_lpid(struct kvm_vcpu *vcpu,
struct kvm_nested_guest *gp, int ric)
{
struct kvm *kvm = vcpu->kvm;
mutex_lock(&gp->tlb_lock);
switch (ric) {
case 0:
/* Invalidate TLB */
spin_lock(&kvm->mmu_lock);
kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable,
gp->shadow_lpid);
kvmhv_flush_lpid(gp->shadow_lpid);
spin_unlock(&kvm->mmu_lock);
break;
case 1:
/*
* Invalidate PWC
* We don't cache this -> nothing to do
*/
break;
case 2:
/* Invalidate TLB, PWC and caching of partition table entries */
kvmhv_flush_nested(gp);
break;
default:
break;
}
mutex_unlock(&gp->tlb_lock);
}
static void kvmhv_emulate_tlbie_all_lpid(struct kvm_vcpu *vcpu, int ric)
{
struct kvm *kvm = vcpu->kvm;
struct kvm_nested_guest *gp;
int i;
spin_lock(&kvm->mmu_lock);
for (i = 0; i <= kvm->arch.max_nested_lpid; i++) {
gp = kvm->arch.nested_guests[i];
if (gp) {
spin_unlock(&kvm->mmu_lock);
kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
spin_lock(&kvm->mmu_lock);
}
}
spin_unlock(&kvm->mmu_lock);
}
static int kvmhv_emulate_priv_tlbie(struct kvm_vcpu *vcpu, unsigned int instr,
unsigned long rsval, unsigned long rbval)
{
struct kvm *kvm = vcpu->kvm;
struct kvm_nested_guest *gp;
int r, ric, prs, is, ap;
int lpid;
long epn;
int ret = 0;
ric = get_ric(instr);
prs = get_prs(instr);
r = get_r(instr);
lpid = get_lpid(rsval);
is = get_is(rbval);
/*
* These cases are invalid and are not handled:
* r != 1 -> Only radix supported
* prs == 1 -> Not HV privileged
* ric == 3 -> No cluster bombs for radix
* is == 1 -> Partition scoped translations not associated with pid
* (!is) && (ric == 1 || ric == 2) -> Not supported by ISA
*/
if ((!r) || (prs) || (ric == 3) || (is == 1) ||
((!is) && (ric == 1 || ric == 2)))
return -EINVAL;
switch (is) {
case 0:
/*
* We know ric == 0
* Invalidate TLB for a given target address
*/
epn = get_epn(rbval);
ap = get_ap(rbval);
ret = kvmhv_emulate_tlbie_tlb_addr(vcpu, lpid, ap, epn);
break;
case 2:
/* Invalidate matching LPID */
gp = kvmhv_get_nested(kvm, lpid, false);
if (gp) {
kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
kvmhv_put_nested(gp);
}
break;
case 3:
/* Invalidate ALL LPIDs */
kvmhv_emulate_tlbie_all_lpid(vcpu, ric);
break;
default:
ret = -EINVAL;
break;
}
return ret;
}
/*
* This handles the H_TLB_INVALIDATE hcall.
* Parameters are (r4) tlbie instruction code, (r5) rS contents,
* (r6) rB contents.
*/
long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu)
{
int ret;
ret = kvmhv_emulate_priv_tlbie(vcpu, kvmppc_get_gpr(vcpu, 4),
kvmppc_get_gpr(vcpu, 5), kvmppc_get_gpr(vcpu, 6));
if (ret)
return H_PARAMETER;
return H_SUCCESS;
}
/* Used to convert a nested guest real address to an L1 guest real address */
static int kvmhv_translate_addr_nested(struct kvm_vcpu *vcpu,
struct kvm_nested_guest *gp,
unsigned long n_gpa, unsigned long dsisr,
struct kvmppc_pte *gpte_p)
{
u64 fault_addr, flags = dsisr & DSISR_ISSTORE;
int ret;
ret = kvmppc_mmu_walk_radix_tree(vcpu, n_gpa, gpte_p, gp->l1_gr_to_hr,
&fault_addr);
if (ret) {
/* We didn't find a pte */
if (ret == -EINVAL) {
/* Unsupported mmu config */
flags |= DSISR_UNSUPP_MMU;
} else if (ret == -ENOENT) {
/* No translation found */
flags |= DSISR_NOHPTE;
} else if (ret == -EFAULT) {
/* Couldn't access L1 real address */
flags |= DSISR_PRTABLE_FAULT;
vcpu->arch.fault_gpa = fault_addr;
} else {
/* Unknown error */
return ret;
}
goto forward_to_l1;
} else {
/* We found a pte -> check permissions */
if (dsisr & DSISR_ISSTORE) {
/* Can we write? */
if (!gpte_p->may_write) {
flags |= DSISR_PROTFAULT;
goto forward_to_l1;
}
} else if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
/* Can we execute? */
if (!gpte_p->may_execute) {
flags |= SRR1_ISI_N_OR_G;
goto forward_to_l1;
}
} else {
/* Can we read? */
if (!gpte_p->may_read && !gpte_p->may_write) {
flags |= DSISR_PROTFAULT;
goto forward_to_l1;
}
}
}
return 0;
forward_to_l1:
vcpu->arch.fault_dsisr = flags;
if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
vcpu->arch.shregs.msr &= ~0x783f0000ul;
vcpu->arch.shregs.msr |= flags;
}
return RESUME_HOST;
}
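/*
 * Handle a fault that only requires setting a reference/change bit:
 * if L1's pte already grants the needed R/C bits, set them both in
 * our pte for the L1 address and in the shadow pte for the nested
 * address; otherwise return RESUME_HOST to reflect the fault to L1.
 */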
static long kvmhv_handle_nested_set_rc(struct kvm_vcpu *vcpu,
struct kvm_nested_guest *gp,
unsigned long n_gpa,
struct kvmppc_pte gpte,
unsigned long dsisr)
{
struct kvm *kvm = vcpu->kvm;
bool writing = !!(dsisr & DSISR_ISSTORE);
u64 pgflags;
bool ret;
/* Are the rc bits set in the L1 partition scoped pte? */
pgflags = _PAGE_ACCESSED;
if (writing)
pgflags |= _PAGE_DIRTY;
if (pgflags & ~gpte.rc)
return RESUME_HOST;
spin_lock(&kvm->mmu_lock);
/* Set the rc bit in the pte of our (L0) pgtable for the L1 guest */
ret = kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable, writing,
gpte.raddr, kvm->arch.lpid);
spin_unlock(&kvm->mmu_lock);
if (!ret)
return -EINVAL;
/* Set the rc bit in the pte of the shadow_pgtable for the nested guest */
ret = kvmppc_hv_handle_set_rc(kvm, gp->shadow_pgtable, writing, n_gpa,
gp->shadow_lpid);
if (!ret)
return -EINVAL;
return 0;
}
static inline int kvmppc_radix_level_to_shift(int level)
{
switch (level) {
case 2:
return PUD_SHIFT;
case 1:
return PMD_SHIFT;
default:
return PAGE_SHIFT;
}
}
static inline int kvmppc_radix_shift_to_level(int shift)
{
if (shift == PUD_SHIFT)
return 2;
if (shift == PMD_SHIFT)
return 1;
if (shift == PAGE_SHIFT)
return 0;
WARN_ON_ONCE(1);
return 0;
}
/* called with gp->tlb_lock held */
static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu,
struct kvm_nested_guest *gp)
{
struct kvm *kvm = vcpu->kvm;
struct kvm_memory_slot *memslot;
struct rmap_nested *n_rmap;
struct kvmppc_pte gpte;
pte_t pte, *pte_p;
unsigned long mmu_seq;
unsigned long dsisr = vcpu->arch.fault_dsisr;
unsigned long ea = vcpu->arch.fault_dar;
unsigned long *rmapp;
unsigned long n_gpa, gpa, gfn, perm = 0UL;
unsigned int shift, l1_shift, level;
bool writing = !!(dsisr & DSISR_ISSTORE);
bool kvm_ro = false;
long int ret;
if (!gp->l1_gr_to_hr) {
kvmhv_update_ptbl_cache(gp);
if (!gp->l1_gr_to_hr)
return RESUME_HOST;
}
/* Convert the nested guest real address into an L1 guest real address */
n_gpa = vcpu->arch.fault_gpa & ~0xF000000000000FFFULL;
if (!(dsisr & DSISR_PRTABLE_FAULT))
n_gpa |= ea & 0xFFF;
ret = kvmhv_translate_addr_nested(vcpu, gp, n_gpa, dsisr, &gpte);
/*
* If the hardware found a translation but we don't now have a usable
* translation in the l1 partition-scoped tree, remove the shadow pte
* and let the guest retry.
*/
if (ret == RESUME_HOST &&
(dsisr & (DSISR_PROTFAULT | DSISR_BADACCESS | DSISR_NOEXEC_OR_G |
DSISR_BAD_COPYPASTE)))
goto inval;
if (ret)
return ret;
/* Failed to set the reference/change bits */
if (dsisr & DSISR_SET_RC) {
ret = kvmhv_handle_nested_set_rc(vcpu, gp, n_gpa, gpte, dsisr);
if (ret == RESUME_HOST)
return ret;
if (ret)
goto inval;
dsisr &= ~DSISR_SET_RC;
if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
DSISR_PROTFAULT)))
return RESUME_GUEST;
}
/*
* We took an HISI or HDSI while we were running a nested guest which
* means we have no partition scoped translation for that. This means
* we need to insert a pte for the mapping into our shadow_pgtable.
*/
l1_shift = gpte.page_shift;
if (l1_shift < PAGE_SHIFT) {
/* We don't support l1 using a page size smaller than our own */
pr_err("KVM: L1 guest page shift (%d) less than our own (%d)\n",
l1_shift, PAGE_SHIFT);
return -EINVAL;
}
gpa = gpte.raddr;
gfn = gpa >> PAGE_SHIFT;
/* 1. Get the corresponding host memslot */
memslot = gfn_to_memslot(kvm, gfn);
if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS)) {
/* unusual error -> reflect to the guest as a DSI */
kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
return RESUME_GUEST;
}
/* passthrough of emulated MMIO case... */
pr_err("emulated MMIO passthrough?\n");
return -EINVAL;
}
if (memslot->flags & KVM_MEM_READONLY) {
if (writing) {
/* Give the guest a DSI */
kvmppc_core_queue_data_storage(vcpu, ea,
DSISR_ISSTORE | DSISR_PROTFAULT);
return RESUME_GUEST;
}
kvm_ro = true;
}
/* 2. Find the host pte for this L1 guest real address */
/* Used to check for invalidations in progress */
mmu_seq = kvm->mmu_notifier_seq;
smp_rmb();
/* See if we can find a translation in our partition-scoped tables for L1 */
pte = __pte(0);
spin_lock(&kvm->mmu_lock);
pte_p = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
if (!shift)
shift = PAGE_SHIFT;
if (pte_p)
pte = *pte_p;
spin_unlock(&kvm->mmu_lock);
if (!pte_present(pte) || (writing && !(pte_val(pte) & _PAGE_WRITE))) {
/* No suitable pte found -> try to insert a mapping */
ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot,
writing, kvm_ro, &pte, &level);
if (ret == -EAGAIN)
return RESUME_GUEST;
else if (ret)
return ret;
shift = kvmppc_radix_level_to_shift(level);
}
/* 3. Compute the pte we need to insert for nest_gpa -> host r_addr */
/* The permissions are the combination of the host and L1 guest ptes */
perm |= gpte.may_read ? 0UL : _PAGE_READ;
perm |= gpte.may_write ? 0UL : _PAGE_WRITE;
perm |= gpte.may_execute ? 0UL : _PAGE_EXEC;
pte = __pte(pte_val(pte) & ~perm);
/* What size pte can we insert? */
if (shift > l1_shift) {
u64 mask;
unsigned int actual_shift = PAGE_SHIFT;
if (PMD_SHIFT < l1_shift)
actual_shift = PMD_SHIFT;
mask = (1UL << shift) - (1UL << actual_shift);
pte = __pte(pte_val(pte) | (gpa & mask));
shift = actual_shift;
}
level = kvmppc_radix_shift_to_level(shift);
n_gpa &= ~((1UL << shift) - 1);
/* 4. Insert the pte into our shadow_pgtable */
n_rmap = kzalloc(sizeof(*n_rmap), GFP_KERNEL);
if (!n_rmap)
return RESUME_GUEST; /* Let the guest try again */
n_rmap->rmap = (n_gpa & RMAP_NESTED_GPA_MASK) |
(((unsigned long) gp->l1_lpid) << RMAP_NESTED_LPID_SHIFT);
rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
ret = kvmppc_create_pte(kvm, gp->shadow_pgtable, pte, n_gpa, level,
mmu_seq, gp->shadow_lpid, rmapp, &n_rmap);
if (n_rmap)
kfree(n_rmap);
if (ret == -EAGAIN)
ret = RESUME_GUEST; /* Let the guest try again */
return ret;
inval:
kvmhv_invalidate_shadow_pte(vcpu, gp, n_gpa, NULL);
return RESUME_GUEST;
}
long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu)
{
struct kvm_nested_guest *gp = vcpu->arch.nested;
long int ret;
mutex_lock(&gp->tlb_lock);
ret = __kvmhv_nested_page_fault(vcpu, gp);
mutex_unlock(&gp->tlb_lock);
return ret;
}
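/*
 * Return the next L1 lpid after 'lpid' that has a nested guest
 * structure, or -1 if there are no more; used to iterate over the
 * active nested guests.
 */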
int kvmhv_nested_next_lpid(struct kvm *kvm, int lpid)
{
int ret = -1;
spin_lock(&kvm->mmu_lock);
while (++lpid <= kvm->arch.max_nested_lpid) {
if (kvm->arch.nested_guests[lpid]) {
ret = lpid;
break;
}
}
spin_unlock(&kvm->mmu_lock);
return ret;
}
......@@ -177,6 +177,7 @@ void kvmppc_subcore_enter_guest(void)
local_paca->sibling_subcore_state->in_guest[subcore_id] = 1;
}
EXPORT_SYMBOL_GPL(kvmppc_subcore_enter_guest);
void kvmppc_subcore_exit_guest(void)
{
......@@ -187,6 +188,7 @@ void kvmppc_subcore_exit_guest(void)
local_paca->sibling_subcore_state->in_guest[subcore_id] = 0;
}
EXPORT_SYMBOL_GPL(kvmppc_subcore_exit_guest);
static bool kvmppc_tb_resync_required(void)
{
......@@ -331,5 +333,13 @@ long kvmppc_realmode_hmi_handler(void)
} else {
wait_for_tb_resync();
}
/*
* Reset tb_offset_applied so the guest exit code won't try
* to subtract the previous timebase offset from the timebase.
*/
if (local_paca->kvm_hstate.kvm_vcore)
local_paca->kvm_hstate.kvm_vcore->tb_offset_applied = 0;
return 0;
}
......@@ -136,7 +136,7 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
/* Mark the target VCPU as having an interrupt pending */
vcpu->stat.queue_intr++;
set_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions);
set_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);
/* Kick self ? Just set MER and return */
if (vcpu == this_vcpu) {
......@@ -170,8 +170,7 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu)
{
/* Note: Only called on self ! */
clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL,
&vcpu->arch.pending_exceptions);
clear_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);
mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_MER);
}
......@@ -768,6 +767,14 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
void __iomem *xics_phys;
int64_t rc;
if (kvmhv_on_pseries()) {
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
iosync();
plpar_hcall_raw(H_EOI, retbuf, hwirq);
return;
}
rc = pnv_opal_pci_msi_eoi(c, hwirq);
if (rc)
......
......@@ -28,6 +28,7 @@
#include <asm/exception-64s.h>
#include <asm/kvm_book3s_asm.h>
#include <asm/book3s/64/mmu-hash.h>
#include <asm/export.h>
#include <asm/tm.h>
#include <asm/opal.h>
#include <asm/xive-regs.h>
......@@ -46,8 +47,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
#define NAPPING_NOVCPU 2
/* Stack frame offsets for kvmppc_hv_entry */
#define SFS 160
#define SFS 208
#define STACK_SLOT_TRAP (SFS-4)
#define STACK_SLOT_SHORT_PATH (SFS-8)
#define STACK_SLOT_TID (SFS-16)
#define STACK_SLOT_PSSCR (SFS-24)
#define STACK_SLOT_PID (SFS-32)
......@@ -56,6 +58,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
#define STACK_SLOT_DAWR (SFS-56)
#define STACK_SLOT_DAWRX (SFS-64)
#define STACK_SLOT_HFSCR (SFS-72)
/* the following is used by the P9 short path */
#define STACK_SLOT_NVGPRS (SFS-152) /* 18 gprs */
/*
* Call kvmppc_hv_entry in real mode.
......@@ -113,45 +117,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
mtspr SPRN_SPRG_VDSO_WRITE,r3
/* Reload the host's PMU registers */
lbz r4, PACA_PMCINUSE(r13) /* is the host using the PMU? */
cmpwi r4, 0
beq 23f /* skip if not */
BEGIN_FTR_SECTION
ld r3, HSTATE_MMCR0(r13)
andi. r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
cmpwi r4, MMCR0_PMAO
beql kvmppc_fix_pmao
END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
lwz r3, HSTATE_PMC1(r13)
lwz r4, HSTATE_PMC2(r13)
lwz r5, HSTATE_PMC3(r13)
lwz r6, HSTATE_PMC4(r13)
lwz r8, HSTATE_PMC5(r13)
lwz r9, HSTATE_PMC6(r13)
mtspr SPRN_PMC1, r3
mtspr SPRN_PMC2, r4
mtspr SPRN_PMC3, r5
mtspr SPRN_PMC4, r6
mtspr SPRN_PMC5, r8
mtspr SPRN_PMC6, r9
ld r3, HSTATE_MMCR0(r13)
ld r4, HSTATE_MMCR1(r13)
ld r5, HSTATE_MMCRA(r13)
ld r6, HSTATE_SIAR(r13)
ld r7, HSTATE_SDAR(r13)
mtspr SPRN_MMCR1, r4
mtspr SPRN_MMCRA, r5
mtspr SPRN_SIAR, r6
mtspr SPRN_SDAR, r7
BEGIN_FTR_SECTION
ld r8, HSTATE_MMCR2(r13)
ld r9, HSTATE_SIER(r13)
mtspr SPRN_MMCR2, r8
mtspr SPRN_SIER, r9
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
mtspr SPRN_MMCR0, r3
isync
23:
bl kvmhv_load_host_pmu
/*
* Reload DEC. HDEC interrupts were disabled when
......@@ -796,66 +762,23 @@ BEGIN_FTR_SECTION
b 91f
END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
/*
* NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
* NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
*/
mr r3, r4
ld r4, VCPU_MSR(r3)
li r5, 0 /* don't preserve non-vol regs */
bl kvmppc_restore_tm_hv
nop
ld r4, HSTATE_KVM_VCPU(r13)
91:
#endif
/* Load guest PMU registers */
/* R4 is live here (vcpu pointer) */
li r3, 1
sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */
isync
BEGIN_FTR_SECTION
ld r3, VCPU_MMCR(r4)
andi. r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
cmpwi r5, MMCR0_PMAO
beql kvmppc_fix_pmao
END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
lwz r3, VCPU_PMC(r4) /* always load up guest PMU registers */
lwz r5, VCPU_PMC + 4(r4) /* to prevent information leak */
lwz r6, VCPU_PMC + 8(r4)
lwz r7, VCPU_PMC + 12(r4)
lwz r8, VCPU_PMC + 16(r4)
lwz r9, VCPU_PMC + 20(r4)
mtspr SPRN_PMC1, r3
mtspr SPRN_PMC2, r5
mtspr SPRN_PMC3, r6
mtspr SPRN_PMC4, r7
mtspr SPRN_PMC5, r8
mtspr SPRN_PMC6, r9
ld r3, VCPU_MMCR(r4)
ld r5, VCPU_MMCR + 8(r4)
ld r6, VCPU_MMCR + 16(r4)
ld r7, VCPU_SIAR(r4)
ld r8, VCPU_SDAR(r4)
mtspr SPRN_MMCR1, r5
mtspr SPRN_MMCRA, r6
mtspr SPRN_SIAR, r7
mtspr SPRN_SDAR, r8
BEGIN_FTR_SECTION
ld r5, VCPU_MMCR + 24(r4)
ld r6, VCPU_SIER(r4)
mtspr SPRN_MMCR2, r5
mtspr SPRN_SIER, r6
BEGIN_FTR_SECTION_NESTED(96)
lwz r7, VCPU_PMC + 24(r4)
lwz r8, VCPU_PMC + 28(r4)
ld r9, VCPU_MMCR + 32(r4)
mtspr SPRN_SPMC1, r7
mtspr SPRN_SPMC2, r8
mtspr SPRN_MMCRS, r9
END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
mtspr SPRN_MMCR0, r3
isync
/* Load guest PMU registers; r4 = vcpu pointer here */
mr r3, r4
bl kvmhv_load_guest_pmu
/* Load up FP, VMX and VSX registers */
ld r4, HSTATE_KVM_VCPU(r13)
bl kvmppc_load_fp
ld r14, VCPU_GPR(R14)(r4)
......@@ -1100,73 +1023,40 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
no_xive:
#endif /* CONFIG_KVM_XICS */
deliver_guest_interrupt:
ld r6, VCPU_CTR(r4)
ld r7, VCPU_XER(r4)
mtctr r6
mtxer r7
li r0, 0
stw r0, STACK_SLOT_SHORT_PATH(r1)
kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */
ld r10, VCPU_PC(r4)
ld r11, VCPU_MSR(r4)
deliver_guest_interrupt: /* r4 = vcpu, r13 = paca */
/* Check if we can deliver an external or decrementer interrupt now */
ld r0, VCPU_PENDING_EXC(r4)
BEGIN_FTR_SECTION
/* On POWER9, also check for emulated doorbell interrupt */
lbz r3, VCPU_DBELL_REQ(r4)
or r0, r0, r3
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
cmpdi r0, 0
beq 71f
mr r3, r4
bl kvmppc_guest_entry_inject_int
ld r4, HSTATE_KVM_VCPU(r13)
71:
ld r6, VCPU_SRR0(r4)
ld r7, VCPU_SRR1(r4)
mtspr SPRN_SRR0, r6
mtspr SPRN_SRR1, r7
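/* Entry point used by the P9 short path: __kvmhv_vcpu_entry_p9 branches here */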
fast_guest_entry_c:
ld r10, VCPU_PC(r4)
ld r11, VCPU_MSR(r4)
/* r11 = vcpu->arch.msr & ~MSR_HV */
rldicl r11, r11, 63 - MSR_HV_LG, 1
rotldi r11, r11, 1 + MSR_HV_LG
ori r11, r11, MSR_ME
/* Check if we can deliver an external or decrementer interrupt now */
ld r0, VCPU_PENDING_EXC(r4)
rldicl r0, r0, 64 - BOOK3S_IRQPRIO_EXTERNAL_LEVEL, 63
cmpdi cr1, r0, 0
andi. r8, r11, MSR_EE
mfspr r8, SPRN_LPCR
/* Insert EXTERNAL_LEVEL bit into LPCR at the MER bit position */
rldimi r8, r0, LPCR_MER_SH, 63 - LPCR_MER_SH
mtspr SPRN_LPCR, r8
isync
beq 5f
li r0, BOOK3S_INTERRUPT_EXTERNAL
bne cr1, 12f
mfspr r0, SPRN_DEC
BEGIN_FTR_SECTION
/* On POWER9 check whether the guest has large decrementer enabled */
andis. r8, r8, LPCR_LD@h
bne 15f
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
extsw r0, r0
15: cmpdi r0, 0
li r0, BOOK3S_INTERRUPT_DECREMENTER
bge 5f
12: mtspr SPRN_SRR0, r10
mr r10,r0
mtspr SPRN_SRR1, r11
mr r9, r4
bl kvmppc_msr_interrupt
5:
BEGIN_FTR_SECTION
b fast_guest_return
END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
/* On POWER9, check for pending doorbell requests */
lbz r0, VCPU_DBELL_REQ(r4)
cmpwi r0, 0
beq fast_guest_return
ld r5, HSTATE_KVM_VCORE(r13)
/* Set DPDES register so the CPU will take a doorbell interrupt */
li r0, 1
mtspr SPRN_DPDES, r0
std r0, VCORE_DPDES(r5)
/* Make sure other cpus see vcore->dpdes set before dbell req clear */
lwsync
/* Clear the pending doorbell request */
li r0, 0
stb r0, VCPU_DBELL_REQ(r4)
ld r6, VCPU_CTR(r4)
ld r7, VCPU_XER(r4)
mtctr r6
mtxer r7
/*
* Required state:
......@@ -1202,7 +1092,7 @@ BEGIN_FTR_SECTION
END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
ld r5, VCPU_LR(r4)
lwz r6, VCPU_CR(r4)
ld r6, VCPU_CR(r4)
mtlr r5
mtcr r6
......@@ -1234,6 +1124,83 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
HRFI_TO_GUEST
b .
/*
* Enter the guest on a P9 or later system where we have exactly
* one vcpu per vcore and we don't need to go to real mode
* (which implies that host and guest are both using radix MMU mode).
* r3 = vcpu pointer
* Most SPRs and all the VSRs have been loaded already.
*/
_GLOBAL(__kvmhv_vcpu_entry_p9)
EXPORT_SYMBOL_GPL(__kvmhv_vcpu_entry_p9)
mflr r0
std r0, PPC_LR_STKOFF(r1)
stdu r1, -SFS(r1)
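/* Flag the short-path entry so the exit code takes guest_exit_short_path */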
li r0, 1
stw r0, STACK_SLOT_SHORT_PATH(r1)
std r3, HSTATE_KVM_VCPU(r13)
mfcr r4
stw r4, SFS+8(r1)
std r1, HSTATE_HOST_R1(r13)
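/* Save the host's non-volatile GPRs, then load the guest's r14-r31 */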
reg = 14
.rept 18
std reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
reg = reg + 1
.endr
reg = 14
.rept 18
ld reg, __VCPU_GPR(reg)(r3)
reg = reg + 1
.endr
mfmsr r10
std r10, HSTATE_HOST_MSR(r13)
mr r4, r3
b fast_guest_entry_c
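/*
 * Exit path for guests entered via the P9 short path: save the guest
 * non-volatile GPRs, restore the host's, and return the trap number
 * in r3 (using rfid if we are still in real mode).
 */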
guest_exit_short_path:
li r0, KVM_GUEST_MODE_NONE
stb r0, HSTATE_IN_GUEST(r13)
reg = 14
.rept 18
std reg, __VCPU_GPR(reg)(r9)
reg = reg + 1
.endr
reg = 14
.rept 18
ld reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
reg = reg + 1
.endr
lwz r4, SFS+8(r1)
mtcr r4
mr r3, r12 /* trap number */
addi r1, r1, SFS
ld r0, PPC_LR_STKOFF(r1)
mtlr r0
/* If we are in real mode, do an rfid to get back to the caller */
mfmsr r4
andi. r5, r4, MSR_IR
bnelr
rldicl r5, r4, 64 - MSR_TS_S_LG, 62 /* extract TS field */
mtspr SPRN_SRR0, r0
ld r10, HSTATE_HOST_MSR(r13)
rldimi r10, r5, MSR_TS_S_LG, 63 - MSR_TS_T_LG
mtspr SPRN_SRR1, r10
RFI_TO_KERNEL
b .
secondary_too_late:
li r12, 0
stw r12, STACK_SLOT_TRAP(r1)
......@@ -1313,7 +1280,7 @@ kvmppc_interrupt_hv:
std r3, VCPU_GPR(R12)(r9)
/* CR is in the high half of r12 */
srdi r4, r12, 32
stw r4, VCPU_CR(r9)
std r4, VCPU_CR(r9)
BEGIN_FTR_SECTION
ld r3, HSTATE_CFAR(r13)
std r3, VCPU_CFAR(r9)
......@@ -1387,18 +1354,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
std r3, VCPU_CTR(r9)
std r4, VCPU_XER(r9)
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
/* For softpatch interrupt, go off and do TM instruction emulation */
cmpwi r12, BOOK3S_INTERRUPT_HV_SOFTPATCH
beq kvmppc_tm_emul
#endif
/* Save more register state */
mfdar r3
mfdsisr r4
std r3, VCPU_DAR(r9)
stw r4, VCPU_DSISR(r9)
/* If this is a page table miss then see if it's theirs or ours */
cmpwi r12, BOOK3S_INTERRUPT_H_DATA_STORAGE
beq kvmppc_hdsi
std r3, VCPU_FAULT_DAR(r9)
stw r4, VCPU_FAULT_DSISR(r9)
cmpwi r12, BOOK3S_INTERRUPT_H_INST_STORAGE
beq kvmppc_hisi
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
/* For softpatch interrupt, go off and do TM instruction emulation */
cmpwi r12, BOOK3S_INTERRUPT_HV_SOFTPATCH
beq kvmppc_tm_emul
#endif
/* See if this is a leftover HDEC interrupt */
cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER
bne 2f
......@@ -1418,10 +1393,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
BEGIN_FTR_SECTION
PPC_MSGSYNC
lwsync
/* always exit if we're running a nested guest */
ld r0, VCPU_NESTED(r9)
cmpdi r0, 0
bne guest_exit_cont
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
lbz r0, HSTATE_HOST_IPI(r13)
cmpwi r0, 0
beq 4f
beq maybe_reenter_guest
b guest_exit_cont
3:
/* If it's a hypervisor facility unavailable interrupt, save HFSCR */
......@@ -1433,82 +1412,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
14:
/* External interrupt ? */
cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
bne+ guest_exit_cont
/* External interrupt, first check for host_ipi. If this is
* set, we know the host wants us out so let's do it now
*/
bl kvmppc_read_intr
/*
* Restore the active volatile registers after returning from
* a C function.
*/
ld r9, HSTATE_KVM_VCPU(r13)
li r12, BOOK3S_INTERRUPT_EXTERNAL
/*
* kvmppc_read_intr return codes:
*
* Exit to host (r3 > 0)
* 1 An interrupt is pending that needs to be handled by the host
* Exit guest and return to host by branching to guest_exit_cont
*
* 2 Passthrough that needs completion in the host
* Exit guest and return to host by branching to guest_exit_cont
* However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD
* to indicate to the host to complete handling the interrupt
*
* Before returning to the guest, we check if any CPU is heading out
* to the host; if so, we head out also. If no CPUs are heading out,
* we fall through to the return codes <= 0 below.
*
* Return to guest (r3 <= 0)
* 0 No external interrupt is pending
* -1 A guest wakeup IPI (which has now been cleared)
* In either case, we return to guest to deliver any pending
* guest interrupts.
*
* -2 A PCI passthrough external interrupt was handled
* (interrupt was delivered directly to guest)
* Return to guest to deliver any pending guest interrupts.
*/
cmpdi r3, 1
ble 1f
/* Return code = 2 */
li r12, BOOK3S_INTERRUPT_HV_RM_HARD
stw r12, VCPU_TRAP(r9)
b guest_exit_cont
1: /* Return code <= 1 */
cmpdi r3, 0
bgt guest_exit_cont
/* Return code <= 0 */
4: ld r5, HSTATE_KVM_VCORE(r13)
lwz r0, VCORE_ENTRY_EXIT(r5)
cmpwi r0, 0x100
mr r4, r9
blt deliver_guest_interrupt
guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
/* Save more register state */
mfdar r6
mfdsisr r7
std r6, VCPU_DAR(r9)
stw r7, VCPU_DSISR(r9)
/* don't overwrite fault_dar/fault_dsisr if HDSI */
cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
beq mc_cont
std r6, VCPU_FAULT_DAR(r9)
stw r7, VCPU_FAULT_DSISR(r9)
beq kvmppc_guest_external
/* See if it is a machine check */
cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK
beq machine_check_realmode
mc_cont:
/* Or a hypervisor maintenance interrupt */
cmpwi r12, BOOK3S_INTERRUPT_HMI
beq hmi_realmode
guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
addi r3, r9, VCPU_TB_RMEXIT
mr r4, r9
......@@ -1552,6 +1465,11 @@ mc_cont:
1:
#endif /* CONFIG_KVM_XICS */
/* If we came in through the P9 short path, go back out to C now */
lwz r0, STACK_SLOT_SHORT_PATH(r1)
cmpwi r0, 0
bne guest_exit_short_path
/* For hash guest, read the guest SLB and save it away */
ld r5, VCPU_KVM(r9)
lbz r0, KVM_RADIX(r5)
......@@ -1780,11 +1698,13 @@ BEGIN_FTR_SECTION
b 91f
END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
/*
* NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
* NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
*/
mr r3, r9
ld r4, VCPU_MSR(r3)
li r5, 0 /* don't preserve non-vol regs */
bl kvmppc_save_tm_hv
nop
ld r9, HSTATE_KVM_VCPU(r13)
91:
#endif
......@@ -1802,90 +1722,19 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
25:
/* Save PMU registers if requested */
/* r8 and cr0.eq are live here */
mr r3, r9
li r4, 1
beq 21f /* if no VPA, save PMU stuff anyway */
lbz r4, LPPACA_PMCINUSE(r8)
21: bl kvmhv_save_guest_pmu
ld r9, HSTATE_KVM_VCPU(r13)
/* Restore host values of some registers */
BEGIN_FTR_SECTION
/*
* POWER8 seems to have a hardware bug where setting
* MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE]
* when some counters are already negative doesn't seem
* to cause a performance monitor alert (and hence interrupt).
* The effect of this is that when saving the PMU state,
* if there is no PMU alert pending when we read MMCR0
* before freezing the counters, but one becomes pending
* before we read the counters, we lose it.
* To work around this, we need a way to freeze the counters
* before reading MMCR0. Normally, freezing the counters
* is done by writing MMCR0 (to set MMCR0[FC]) which
* unavoidably writes MMCR0[PMAO] as well. On POWER8,
* we can also freeze the counters using MMCR2, by writing
* 1s to all the counter freeze condition bits (there are
* 9 bits each for 6 counters).
*/
li r3, -1 /* set all freeze bits */
clrrdi r3, r3, 10
mfspr r10, SPRN_MMCR2
mtspr SPRN_MMCR2, r3
isync
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
li r3, 1
sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
mfspr r4, SPRN_MMCR0 /* save MMCR0 */
mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */
mfspr r6, SPRN_MMCRA
/* Clear MMCRA in order to disable SDAR updates */
li r7, 0
mtspr SPRN_MMCRA, r7
isync
beq 21f /* if no VPA, save PMU stuff anyway */
lbz r7, LPPACA_PMCINUSE(r8)
cmpwi r7, 0 /* did they ask for PMU stuff to be saved? */
bne 21f
std r3, VCPU_MMCR(r9) /* if not, set saved MMCR0 to FC */
b 22f
21: mfspr r5, SPRN_MMCR1
mfspr r7, SPRN_SIAR
mfspr r8, SPRN_SDAR
std r4, VCPU_MMCR(r9)
std r5, VCPU_MMCR + 8(r9)
std r6, VCPU_MMCR + 16(r9)
BEGIN_FTR_SECTION
std r10, VCPU_MMCR + 24(r9)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
std r7, VCPU_SIAR(r9)
std r8, VCPU_SDAR(r9)
mfspr r3, SPRN_PMC1
mfspr r4, SPRN_PMC2
mfspr r5, SPRN_PMC3
mfspr r6, SPRN_PMC4
mfspr r7, SPRN_PMC5
mfspr r8, SPRN_PMC6
stw r3, VCPU_PMC(r9)
stw r4, VCPU_PMC + 4(r9)
stw r5, VCPU_PMC + 8(r9)
stw r6, VCPU_PMC + 12(r9)
stw r7, VCPU_PMC + 16(r9)
stw r8, VCPU_PMC + 20(r9)
BEGIN_FTR_SECTION
mfspr r5, SPRN_SIER
std r5, VCPU_SIER(r9)
BEGIN_FTR_SECTION_NESTED(96)
mfspr r6, SPRN_SPMC1
mfspr r7, SPRN_SPMC2
mfspr r8, SPRN_MMCRS
stw r6, VCPU_PMC + 24(r9)
stw r7, VCPU_PMC + 28(r9)
std r8, VCPU_MMCR + 32(r9)
lis r4, 0x8000
mtspr SPRN_MMCRS, r4
END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
22:
/* Restore host values of some registers */
BEGIN_FTR_SECTION
ld r5, STACK_SLOT_CIABR(r1)
ld r6, STACK_SLOT_DAWR(r1)
ld r7, STACK_SLOT_DAWRX(r1)
mtspr SPRN_CIABR, r5
ld r5, STACK_SLOT_CIABR(r1)
ld r6, STACK_SLOT_DAWR(r1)
ld r7, STACK_SLOT_DAWRX(r1)
mtspr SPRN_CIABR, r5
/*
* If the DAWR doesn't work, it's ok to write these here as
* this value should always be zero
......@@ -2010,24 +1859,6 @@ BEGIN_FTR_SECTION
mtspr SPRN_DPDES, r8
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
/* If HMI, call kvmppc_realmode_hmi_handler() */
lwz r12, STACK_SLOT_TRAP(r1)
cmpwi r12, BOOK3S_INTERRUPT_HMI
bne 27f
bl kvmppc_realmode_hmi_handler
nop
cmpdi r3, 0
/*
* At this point kvmppc_realmode_hmi_handler may have resync-ed
* the TB, and if it has, we must not subtract the guest timebase
* offset from the timebase. So, skip it.
*
* Also, do not call kvmppc_subcore_exit_guest() because it has
* been invoked as part of kvmppc_realmode_hmi_handler().
*/
beq 30f
27:
/* Subtract timebase offset from timebase */
ld r8, VCORE_TB_OFFSET_APPL(r5)
cmpdi r8,0
......@@ -2045,7 +1876,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
addis r8,r8,0x100 /* if so, increment upper 40 bits */
mtspr SPRN_TBU40,r8
17: bl kvmppc_subcore_exit_guest
17:
/*
* If this is an HMI, we called kvmppc_realmode_hmi_handler
* above, which may or may not have already called
* kvmppc_subcore_exit_guest. Fortunately, all that
* kvmppc_subcore_exit_guest does is clear a flag, so calling
* it again here is benign even if kvmppc_realmode_hmi_handler
* has already called it.
*/
bl kvmppc_subcore_exit_guest
nop
30: ld r5,HSTATE_KVM_VCORE(r13)
ld r4,VCORE_KVM(r5) /* pointer to struct kvm */
......@@ -2099,6 +1939,67 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
mtlr r0
blr
kvmppc_guest_external:
/* External interrupt, first check for host_ipi. If this is
* set, we know the host wants us out so let's do it now
*/
bl kvmppc_read_intr
/*
* Restore the active volatile registers after returning from
* a C function.
*/
ld r9, HSTATE_KVM_VCPU(r13)
li r12, BOOK3S_INTERRUPT_EXTERNAL
/*
* kvmppc_read_intr return codes:
*
* Exit to host (r3 > 0)
* 1 An interrupt is pending that needs to be handled by the host
* Exit guest and return to host by branching to guest_exit_cont
*
* 2 Passthrough that needs completion in the host
* Exit guest and return to host by branching to guest_exit_cont
* However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD
* to indicate to the host to complete handling the interrupt
*
* Before returning to the guest, we check if any CPU is heading out
* to the host; if so, we head out also. If no CPUs are heading out,
* we fall through to the return codes <= 0 below.
*
* Return to guest (r3 <= 0)
* 0 No external interrupt is pending
* -1 A guest wakeup IPI (which has now been cleared)
* In either case, we return to guest to deliver any pending
* guest interrupts.
*
* -2 A PCI passthrough external interrupt was handled
* (interrupt was delivered directly to guest)
* Return to guest to deliver any pending guest interrupts.
*/
cmpdi r3, 1
ble 1f
/* Return code = 2 */
li r12, BOOK3S_INTERRUPT_HV_RM_HARD
stw r12, VCPU_TRAP(r9)
b guest_exit_cont
1: /* Return code <= 1 */
cmpdi r3, 0
bgt guest_exit_cont
/* Return code <= 0 */
maybe_reenter_guest:
ld r5, HSTATE_KVM_VCORE(r13)
lwz r0, VCORE_ENTRY_EXIT(r5)
cmpwi r0, 0x100
mr r4, r9
blt deliver_guest_interrupt
b guest_exit_cont
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
/*
* Softpatch interrupt for transactional memory emulation cases
......@@ -2302,6 +2203,10 @@ hcall_try_real_mode:
andi. r0,r11,MSR_PR
/* sc 1 from userspace - reflect to guest syscall */
bne sc_1_fast_return
/* sc 1 from nested guest - give it to L1 to handle */
ld r0, VCPU_NESTED(r9)
cmpdi r0, 0
bne guest_exit_cont
clrrdi r3,r3,2
cmpldi r3,hcall_real_table_end - hcall_real_table
bge guest_exit_cont
......@@ -2561,6 +2466,7 @@ hcall_real_table:
hcall_real_table_end:
_GLOBAL(kvmppc_h_set_xdabr)
EXPORT_SYMBOL_GPL(kvmppc_h_set_xdabr)
andi. r0, r5, DABRX_USER | DABRX_KERNEL
beq 6f
li r0, DABRX_USER | DABRX_KERNEL | DABRX_BTI
......@@ -2570,6 +2476,7 @@ _GLOBAL(kvmppc_h_set_xdabr)
blr
_GLOBAL(kvmppc_h_set_dabr)
EXPORT_SYMBOL_GPL(kvmppc_h_set_dabr)
li r5, DABRX_USER | DABRX_KERNEL
3:
BEGIN_FTR_SECTION
......@@ -2682,11 +2589,13 @@ BEGIN_FTR_SECTION
b 91f
END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
/*
* NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
* NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
*/
ld r3, HSTATE_KVM_VCPU(r13)
ld r4, VCPU_MSR(r3)
li r5, 0 /* don't preserve non-vol regs */
bl kvmppc_save_tm_hv
nop
91:
#endif
......@@ -2802,11 +2711,13 @@ BEGIN_FTR_SECTION
b 91f
END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
/*
* NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR
* NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
*/
mr r3, r4
ld r4, VCPU_MSR(r3)
li r5, 0 /* don't preserve non-vol regs */
bl kvmppc_restore_tm_hv
nop
ld r4, HSTATE_KVM_VCPU(r13)
91:
#endif
......@@ -2874,13 +2785,7 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
mr r9, r4
cmpdi r3, 0
bgt guest_exit_cont
/* see if any other thread is already exiting */
lwz r0,VCORE_ENTRY_EXIT(r5)
cmpwi r0,0x100
bge guest_exit_cont
b kvmppc_cede_reentry /* if not go back to guest */
b maybe_reenter_guest
/* cede when already previously prodded case */
kvm_cede_prodded:
......@@ -2947,12 +2852,12 @@ machine_check_realmode:
*/
ld r11, VCPU_MSR(r9)
rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */
bne mc_cont /* if so, exit to host */
bne guest_exit_cont /* if so, exit to host */
/* Check if guest is capable of handling NMI exit */
ld r10, VCPU_KVM(r9)
lbz r10, KVM_FWNMI(r10)
cmpdi r10, 1 /* FWNMI capable? */
beq mc_cont /* if so, exit with KVM_EXIT_NMI. */
beq guest_exit_cont /* if so, exit with KVM_EXIT_NMI. */
/* if not, fall through for backward compatibility. */
andi. r10, r11, MSR_RI /* check for unrecoverable exception */
......@@ -2965,6 +2870,21 @@ machine_check_realmode:
bl kvmppc_msr_interrupt
2: b fast_interrupt_c_return
/*
* Call C code to handle a HMI in real mode.
* Only the primary thread does the call, secondary threads are handled
* by calling hmi_exception_realmode() after kvmppc_hv_entry returns.
* r9 points to the vcpu on entry
*/
hmi_realmode:
lbz r0, HSTATE_PTID(r13)
cmpwi r0, 0
bne guest_exit_cont
bl kvmppc_realmode_hmi_handler
ld r9, HSTATE_KVM_VCPU(r13)
li r12, BOOK3S_INTERRUPT_HMI
b guest_exit_cont
/*
* Check the reason we woke from nap, and take appropriate action.
* Returns (in r3):
......@@ -3130,10 +3050,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
* Save transactional state and TM-related registers.
* Called with r3 pointing to the vcpu struct and r4 containing
* the guest MSR value.
* This can modify all checkpointed registers, but
* r5 is non-zero iff non-volatile register state needs to be maintained.
* If r5 == 0, this can modify all checkpointed registers, but
* restores r1 and r2 before exit.
*/
kvmppc_save_tm_hv:
_GLOBAL_TOC(kvmppc_save_tm_hv)
EXPORT_SYMBOL_GPL(kvmppc_save_tm_hv)
/* See if we need to handle fake suspend mode */
BEGIN_FTR_SECTION
b __kvmppc_save_tm
......@@ -3161,12 +3083,6 @@ BEGIN_FTR_SECTION
END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
nop
std r1, HSTATE_HOST_R1(r13)
/* Clear the MSR RI since r1, r13 may be foobar. */
li r5, 0
mtmsrd r5, 1
/* We have to treclaim here because that's the only way to do S->N */
li r3, TM_CAUSE_KVM_RESCHED
TRECLAIM(R3)
......@@ -3175,22 +3091,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
* We were in fake suspend, so we are not going to save the
* register state as the guest checkpointed state (since
* we already have it), therefore we can now use any volatile GPR.
* In fact treclaim in fake suspend state doesn't modify
* any registers.
*/
/* Reload PACA pointer, stack pointer and TOC. */
GET_PACA(r13)
ld r1, HSTATE_HOST_R1(r13)
ld r2, PACATOC(r13)
/* Set MSR RI now we have r1 and r13 back. */
li r5, MSR_RI
mtmsrd r5, 1
HMT_MEDIUM
ld r6, HSTATE_DSCR(r13)
mtspr SPRN_DSCR, r6
BEGIN_FTR_SECTION_NESTED(96)
BEGIN_FTR_SECTION
bl pnv_power9_force_smt4_release
END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96)
END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
nop
4:
......@@ -3216,10 +3123,12 @@ END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96)
* Restore transactional state and TM-related registers.
* Called with r3 pointing to the vcpu struct
* and r4 containing the guest MSR value.
* r5 is non-zero iff non-volatile register state needs to be maintained.
* This potentially modifies all checkpointed registers.
* It restores r1 and r2 from the PACA.
*/
kvmppc_restore_tm_hv:
_GLOBAL_TOC(kvmppc_restore_tm_hv)
EXPORT_SYMBOL_GPL(kvmppc_restore_tm_hv)
/*
* If we are doing TM emulation for the guest on a POWER9 DD2,
* then we don't actually do a trechkpt -- we either set up
......@@ -3423,6 +3332,194 @@ kvmppc_msr_interrupt:
1: rldimi r11, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
blr
/*
* Load up guest PMU state. R3 points to the vcpu struct.
*/
_GLOBAL(kvmhv_load_guest_pmu)
EXPORT_SYMBOL_GPL(kvmhv_load_guest_pmu)
mr r4, r3
mflr r0
li r3, 1
sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */
isync
BEGIN_FTR_SECTION
ld r3, VCPU_MMCR(r4)
andi. r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
cmpwi r5, MMCR0_PMAO
beql kvmppc_fix_pmao
END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
lwz r3, VCPU_PMC(r4) /* always load up guest PMU registers */
lwz r5, VCPU_PMC + 4(r4) /* to prevent information leak */
lwz r6, VCPU_PMC + 8(r4)
lwz r7, VCPU_PMC + 12(r4)
lwz r8, VCPU_PMC + 16(r4)
lwz r9, VCPU_PMC + 20(r4)
mtspr SPRN_PMC1, r3
mtspr SPRN_PMC2, r5
mtspr SPRN_PMC3, r6
mtspr SPRN_PMC4, r7
mtspr SPRN_PMC5, r8
mtspr SPRN_PMC6, r9
ld r3, VCPU_MMCR(r4)
ld r5, VCPU_MMCR + 8(r4)
ld r6, VCPU_MMCR + 16(r4)
ld r7, VCPU_SIAR(r4)
ld r8, VCPU_SDAR(r4)
mtspr SPRN_MMCR1, r5
mtspr SPRN_MMCRA, r6
mtspr SPRN_SIAR, r7
mtspr SPRN_SDAR, r8
BEGIN_FTR_SECTION
ld r5, VCPU_MMCR + 24(r4)
ld r6, VCPU_SIER(r4)
mtspr SPRN_MMCR2, r5
mtspr SPRN_SIER, r6
BEGIN_FTR_SECTION_NESTED(96)
lwz r7, VCPU_PMC + 24(r4)
lwz r8, VCPU_PMC + 28(r4)
ld r9, VCPU_MMCR + 32(r4)
mtspr SPRN_SPMC1, r7
mtspr SPRN_SPMC2, r8
mtspr SPRN_MMCRS, r9
END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
mtspr SPRN_MMCR0, r3
isync
mtlr r0
blr
/*
* Reload host PMU state saved in the PACA by kvmhv_save_host_pmu.
*/
_GLOBAL(kvmhv_load_host_pmu)
EXPORT_SYMBOL_GPL(kvmhv_load_host_pmu)
mflr r0
lbz r4, PACA_PMCINUSE(r13) /* is the host using the PMU? */
cmpwi r4, 0
beq 23f /* skip if not */
BEGIN_FTR_SECTION
ld r3, HSTATE_MMCR0(r13)
andi. r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
cmpwi r4, MMCR0_PMAO
beql kvmppc_fix_pmao
END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
lwz r3, HSTATE_PMC1(r13)
lwz r4, HSTATE_PMC2(r13)
lwz r5, HSTATE_PMC3(r13)
lwz r6, HSTATE_PMC4(r13)
lwz r8, HSTATE_PMC5(r13)
lwz r9, HSTATE_PMC6(r13)
mtspr SPRN_PMC1, r3
mtspr SPRN_PMC2, r4
mtspr SPRN_PMC3, r5
mtspr SPRN_PMC4, r6
mtspr SPRN_PMC5, r8
mtspr SPRN_PMC6, r9
ld r3, HSTATE_MMCR0(r13)
ld r4, HSTATE_MMCR1(r13)
ld r5, HSTATE_MMCRA(r13)
ld r6, HSTATE_SIAR(r13)
ld r7, HSTATE_SDAR(r13)
mtspr SPRN_MMCR1, r4
mtspr SPRN_MMCRA, r5
mtspr SPRN_SIAR, r6
mtspr SPRN_SDAR, r7
BEGIN_FTR_SECTION
ld r8, HSTATE_MMCR2(r13)
ld r9, HSTATE_SIER(r13)
mtspr SPRN_MMCR2, r8
mtspr SPRN_SIER, r9
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
mtspr SPRN_MMCR0, r3
isync
mtlr r0
23: blr
/*
* Save guest PMU state into the vcpu struct.
* r3 = vcpu, r4 = full save flag (PMU in use flag set in VPA)
*/
_GLOBAL(kvmhv_save_guest_pmu)
EXPORT_SYMBOL_GPL(kvmhv_save_guest_pmu)
mr r9, r3
mr r8, r4
BEGIN_FTR_SECTION
/*
* POWER8 seems to have a hardware bug where setting
* MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE]
* when some counters are already negative doesn't seem
* to cause a performance monitor alert (and hence interrupt).
* The effect of this is that when saving the PMU state,
* if there is no PMU alert pending when we read MMCR0
* before freezing the counters, but one becomes pending
* before we read the counters, we lose it.
* To work around this, we need a way to freeze the counters
* before reading MMCR0. Normally, freezing the counters
* is done by writing MMCR0 (to set MMCR0[FC]) which
* unavoidably writes MMCR0[PMAO] as well. On POWER8,
* we can also freeze the counters using MMCR2, by writing
* 1s to all the counter freeze condition bits (there are
* 9 bits each for 6 counters).
*/
li r3, -1 /* set all freeze bits */
clrrdi r3, r3, 10
mfspr r10, SPRN_MMCR2
mtspr SPRN_MMCR2, r3
isync
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
li r3, 1
sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
mfspr r4, SPRN_MMCR0 /* save MMCR0 */
mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */
mfspr r6, SPRN_MMCRA
/* Clear MMCRA in order to disable SDAR updates */
li r7, 0
mtspr SPRN_MMCRA, r7
isync
cmpwi r8, 0 /* did they ask for PMU stuff to be saved? */
bne 21f
std r3, VCPU_MMCR(r9) /* if not, set saved MMCR0 to FC */
b 22f
21: mfspr r5, SPRN_MMCR1
mfspr r7, SPRN_SIAR
mfspr r8, SPRN_SDAR
std r4, VCPU_MMCR(r9)
std r5, VCPU_MMCR + 8(r9)
std r6, VCPU_MMCR + 16(r9)
BEGIN_FTR_SECTION
std r10, VCPU_MMCR + 24(r9)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
std r7, VCPU_SIAR(r9)
std r8, VCPU_SDAR(r9)
mfspr r3, SPRN_PMC1
mfspr r4, SPRN_PMC2
mfspr r5, SPRN_PMC3
mfspr r6, SPRN_PMC4
mfspr r7, SPRN_PMC5
mfspr r8, SPRN_PMC6
stw r3, VCPU_PMC(r9)
stw r4, VCPU_PMC + 4(r9)
stw r5, VCPU_PMC + 8(r9)
stw r6, VCPU_PMC + 12(r9)
stw r7, VCPU_PMC + 16(r9)
stw r8, VCPU_PMC + 20(r9)
BEGIN_FTR_SECTION
mfspr r5, SPRN_SIER
std r5, VCPU_SIER(r9)
BEGIN_FTR_SECTION_NESTED(96)
mfspr r6, SPRN_SPMC1
mfspr r7, SPRN_SPMC2
mfspr r8, SPRN_MMCRS
stw r6, VCPU_PMC + 24(r9)
stw r7, VCPU_PMC + 28(r9)
std r8, VCPU_MMCR + 32(r9)
lis r4, 0x8000
mtspr SPRN_MMCRS, r4
END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
22: blr
/*
* This works around a hardware bug on POWER8E processors, where
* writing a 1 to the MMCR0[PMAO] bit doesn't generate a
......
......@@ -130,7 +130,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
return RESUME_GUEST;
}
/* Set CR0 to indicate previous transactional state */
vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) |
vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
(((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
/* L=1 => tresume, L=0 => tsuspend */
if (instr & (1 << 21)) {
......@@ -174,7 +174,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
copy_from_checkpoint(vcpu);
/* Set CR0 to indicate previous transactional state */
vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) |
vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
(((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
vcpu->arch.shregs.msr &= ~MSR_TS_MASK;
return RESUME_GUEST;
......@@ -204,7 +204,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
copy_to_checkpoint(vcpu);
/* Set CR0 to indicate previous transactional state */
vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) |
vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
(((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
vcpu->arch.shregs.msr = msr | MSR_TS_S;
return RESUME_GUEST;
......
......@@ -89,7 +89,8 @@ int kvmhv_p9_tm_emulation_early(struct kvm_vcpu *vcpu)
if (instr & (1 << 21))
vcpu->arch.shregs.msr = (msr & ~MSR_TS_MASK) | MSR_TS_T;
/* Set CR0 to 0b0010 */
vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | 0x20000000;
vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
0x20000000;
return 1;
}
......@@ -105,5 +106,5 @@ void kvmhv_emulate_tm_rollback(struct kvm_vcpu *vcpu)
vcpu->arch.shregs.msr &= ~MSR_TS_MASK; /* go to N state */
vcpu->arch.regs.nip = vcpu->arch.tfhar;
copy_from_checkpoint(vcpu);
vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | 0xa0000000;
vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | 0xa0000000;
}
......@@ -167,7 +167,7 @@ void kvmppc_copy_to_svcpu(struct kvm_vcpu *vcpu)
svcpu->gpr[11] = vcpu->arch.regs.gpr[11];
svcpu->gpr[12] = vcpu->arch.regs.gpr[12];
svcpu->gpr[13] = vcpu->arch.regs.gpr[13];
svcpu->cr = vcpu->arch.cr;
svcpu->cr = vcpu->arch.regs.ccr;
svcpu->xer = vcpu->arch.regs.xer;
svcpu->ctr = vcpu->arch.regs.ctr;
svcpu->lr = vcpu->arch.regs.link;
......@@ -249,7 +249,7 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu)
vcpu->arch.regs.gpr[11] = svcpu->gpr[11];
vcpu->arch.regs.gpr[12] = svcpu->gpr[12];
vcpu->arch.regs.gpr[13] = svcpu->gpr[13];
vcpu->arch.cr = svcpu->cr;
vcpu->arch.regs.ccr = svcpu->cr;
vcpu->arch.regs.xer = svcpu->xer;
vcpu->arch.regs.ctr = svcpu->ctr;
vcpu->arch.regs.link = svcpu->lr;
......@@ -1246,7 +1246,6 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
r = RESUME_GUEST;
break;
case BOOK3S_INTERRUPT_EXTERNAL:
case BOOK3S_INTERRUPT_EXTERNAL_LEVEL:
case BOOK3S_INTERRUPT_EXTERNAL_HV:
case BOOK3S_INTERRUPT_H_VIRT:
vcpu->stat.ext_intr_exits++;
......
......@@ -310,7 +310,7 @@ static inline bool icp_try_update(struct kvmppc_icp *icp,
*/
if (new.out_ee) {
kvmppc_book3s_queue_irqprio(icp->vcpu,
BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
BOOK3S_INTERRUPT_EXTERNAL);
if (!change_self)
kvmppc_fast_vcpu_kick(icp->vcpu);
}
......@@ -593,8 +593,7 @@ static noinline unsigned long kvmppc_h_xirr(struct kvm_vcpu *vcpu)
u32 xirr;
/* First, remove EE from the processor */
kvmppc_book3s_dequeue_irqprio(icp->vcpu,
BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL);
/*
* ICP State: Accept_Interrupt
......@@ -754,8 +753,7 @@ static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
* We can remove EE from the current processor, the update
* transaction will set it again if needed
*/
kvmppc_book3s_dequeue_irqprio(icp->vcpu,
BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL);
do {
old_state = new_state = READ_ONCE(icp->state);
......@@ -1167,8 +1165,7 @@ int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
* Deassert the CPU interrupt request.
* icp_try_update will reassert it if necessary.
*/
kvmppc_book3s_dequeue_irqprio(icp->vcpu,
BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL);
/*
* Note that if we displace an interrupt from old_state.xisr,
......@@ -1393,7 +1390,8 @@ static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
}
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
if (cpu_has_feature(CPU_FTR_ARCH_206)) {
if (cpu_has_feature(CPU_FTR_ARCH_206) &&
cpu_has_feature(CPU_FTR_HVMODE)) {
/* Enable real mode support */
xics->real_mode = ENABLE_REALMODE;
xics->real_mode_dbg = DEBUG_REALMODE;
......
......@@ -61,6 +61,69 @@
*/
#define XIVE_Q_GAP 2
/*
* Push a vcpu's context to the XIVE on guest entry.
* This assumes we are in virtual mode (MMU on)
*/
void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu)
{
void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;
u64 pq;
if (!tima)
return;
eieio();
__raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS);
__raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2);
vcpu->arch.xive_pushed = 1;
eieio();
/*
* We clear the irq_pending flag. There is a small chance of a
* race vs. the escalation interrupt happening on another
* processor setting it again, but the only consequence is to
* cause a spurious wakeup on the next H_CEDE, which is not an
* issue.
*/
vcpu->arch.irq_pending = 0;
/*
* In single escalation mode, if the escalation interrupt is
* on, we mask it.
*/
if (vcpu->arch.xive_esc_on) {
pq = __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr +
XIVE_ESB_SET_PQ_01));
mb();
/*
* We have a possible subtle race here: The escalation
* interrupt might have fired and be on its way to the
* host queue while we mask it, and if we unmask it
* early enough (re-cede right away), there is a
* theoretical possibility that it fires again, thus
* landing in the target queue more than once which is
* a big no-no.
*
* Fortunately, solving this is rather easy. If the
* above load setting PQ to 01 returns a previous
* value where P is set, then we know the escalation
* interrupt is somewhere on its way to the host. In
* that case we simply don't clear the xive_esc_on
* flag below. It will be eventually cleared by the
* handler for the escalation interrupt.
*
* Then, when doing a cede, we check that flag again
* before re-enabling the escalation interrupt, and if
* set, we abort the cede.
*/
if (!(pq & XIVE_ESB_VAL_P))
/* Now P is 0, we can clear the flag */
vcpu->arch.xive_esc_on = 0;
}
}
EXPORT_SYMBOL_GPL(kvmppc_xive_push_vcpu);
/*
* This is a simple trigger for a generic XIVE IRQ. This must
* only be called for interrupts that support a trigger page
......
......@@ -280,14 +280,6 @@ X_STATIC unsigned long GLUE(X_PFX,h_xirr)(struct kvm_vcpu *vcpu)
/* First collect pending bits from HW */
GLUE(X_PFX,ack_pending)(xc);
/*
* Cleanup the old-style bits if needed (they may have been
* set by pull or an escalation interrupts).
*/
if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions))
clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL,
&vcpu->arch.pending_exceptions);
pr_devel(" new pending=0x%02x hw_cppr=%d cppr=%d\n",
xc->pending, xc->hw_cppr, xc->cppr);
......
......@@ -182,7 +182,7 @@
*/
PPC_LL r4, PACACURRENT(r13)
PPC_LL r4, (THREAD + THREAD_KVM_VCPU)(r4)
stw r10, VCPU_CR(r4)
PPC_STL r10, VCPU_CR(r4)
PPC_STL r11, VCPU_GPR(R4)(r4)
PPC_STL r5, VCPU_GPR(R5)(r4)
PPC_STL r6, VCPU_GPR(R6)(r4)
......@@ -292,7 +292,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1)
PPC_STL r4, VCPU_GPR(R4)(r11)
PPC_LL r4, THREAD_NORMSAVE(0)(r10)
PPC_STL r5, VCPU_GPR(R5)(r11)
stw r13, VCPU_CR(r11)
PPC_STL r13, VCPU_CR(r11)
mfspr r5, \srr0
PPC_STL r3, VCPU_GPR(R10)(r11)
PPC_LL r3, THREAD_NORMSAVE(2)(r10)
......@@ -319,7 +319,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1)
PPC_STL r4, VCPU_GPR(R4)(r11)
PPC_LL r4, GPR9(r8)
PPC_STL r5, VCPU_GPR(R5)(r11)
stw r9, VCPU_CR(r11)
PPC_STL r9, VCPU_CR(r11)
mfspr r5, \srr0
PPC_STL r3, VCPU_GPR(R8)(r11)
PPC_LL r3, GPR10(r8)
......@@ -643,7 +643,7 @@ lightweight_exit:
PPC_LL r3, VCPU_LR(r4)
PPC_LL r5, VCPU_XER(r4)
PPC_LL r6, VCPU_CTR(r4)
lwz r7, VCPU_CR(r4)
PPC_LL r7, VCPU_CR(r4)
PPC_LL r8, VCPU_PC(r4)
PPC_LD(r9, VCPU_SHARED_MSR, r11)
PPC_LL r0, VCPU_GPR(R0)(r4)
......
......@@ -117,7 +117,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
emulated = EMULATE_FAIL;
vcpu->arch.regs.msr = vcpu->arch.shared->msr;
vcpu->arch.regs.ccr = vcpu->arch.cr;
if (analyse_instr(&op, &vcpu->arch.regs, inst) == 0) {
int type = op.type & INSTR_TYPE_MASK;
int size = GETSIZE(op.type);
......
......@@ -594,7 +594,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = !!(hv_enabled && radix_enabled());
break;
case KVM_CAP_PPC_MMU_HASH_V3:
r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300));
r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300) &&
cpu_has_feature(CPU_FTR_HVMODE));
break;
#endif
case KVM_CAP_SYNC_MMU:
......
......@@ -28,17 +28,25 @@
* Save transactional state and TM-related registers.
* Called with:
* - r3 pointing to the vcpu struct
* - r4 points to the MSR with current TS bits:
* - r4 containing the MSR with current TS bits:
* (For HV KVM, it is VCPU_MSR ; For PR KVM, it is host MSR).
* This can modify all checkpointed registers, but
* restores r1, r2 before exit.
* - r5 containing a flag indicating that non-volatile registers
* must be preserved.
* If r5 == 0, this can modify all checkpointed registers, but
* restores r1, r2 before exit. If r5 != 0, this restores the
* MSR TM/FP/VEC/VSX bits to their state on entry.
*/
_GLOBAL(__kvmppc_save_tm)
mflr r0
std r0, PPC_LR_STKOFF(r1)
stdu r1, -SWITCH_FRAME_SIZE(r1)
mr r9, r3
cmpdi cr7, r5, 0
/* Turn on TM. */
mfmsr r8
mr r10, r8
li r0, 1
rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG
ori r8, r8, MSR_FP
......@@ -51,6 +59,27 @@ _GLOBAL(__kvmppc_save_tm)
std r1, HSTATE_SCRATCH2(r13)
std r3, HSTATE_SCRATCH1(r13)
/* Save CR on the stack - even if r5 == 0 we need to get cr7 back. */
mfcr r6
SAVE_GPR(6, r1)
/* Save DSCR so we can restore it to avoid running with user value */
mfspr r7, SPRN_DSCR
SAVE_GPR(7, r1)
/*
* We are going to do treclaim., which will modify all checkpointed
* registers. Save the non-volatile registers on the stack if
* preservation of non-volatile state has been requested.
*/
beq cr7, 3f
SAVE_NVGPRS(r1)
/* MSR[TS] will be 0 (non-transactional) once we do treclaim. */
li r0, 0
rldimi r10, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
SAVE_GPR(10, r1) /* final MSR value */
3:
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
BEGIN_FTR_SECTION
/* Emulation of the treclaim instruction needs TEXASR before treclaim */
......@@ -74,22 +103,25 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
std r9, PACATMSCRATCH(r13)
ld r9, HSTATE_SCRATCH1(r13)
/* Get a few more GPRs free. */
std r29, VCPU_GPRS_TM(29)(r9)
std r30, VCPU_GPRS_TM(30)(r9)
std r31, VCPU_GPRS_TM(31)(r9)
/* Save away PPR and DSCR soon so don't run with user values. */
mfspr r31, SPRN_PPR
/* Save away PPR soon so we don't run with user value. */
std r0, VCPU_GPRS_TM(0)(r9)
mfspr r0, SPRN_PPR
HMT_MEDIUM
mfspr r30, SPRN_DSCR
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
ld r29, HSTATE_DSCR(r13)
mtspr SPRN_DSCR, r29
#endif
/* Save all but r9, r13 & r29-r31 */
reg = 0
/* Reload stack pointer. */
std r1, VCPU_GPRS_TM(1)(r9)
ld r1, HSTATE_SCRATCH2(r13)
/* Set MSR RI now we have r1 and r13 back. */
std r2, VCPU_GPRS_TM(2)(r9)
li r2, MSR_RI
mtmsrd r2, 1
/* Reload TOC pointer. */
ld r2, PACATOC(r13)
/* Save all but r0-r2, r9 & r13 */
reg = 3
.rept 29
.if (reg != 9) && (reg != 13)
std reg, VCPU_GPRS_TM(reg)(r9)
......@@ -103,33 +135,29 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
ld r4, PACATMSCRATCH(r13)
std r4, VCPU_GPRS_TM(9)(r9)
/* Reload stack pointer and TOC. */
ld r1, HSTATE_SCRATCH2(r13)
ld r2, PACATOC(r13)
/* Set MSR RI now we have r1 and r13 back. */
li r5, MSR_RI
mtmsrd r5, 1
/* Restore host DSCR and CR values, after saving guest values */
mfcr r6
mfspr r7, SPRN_DSCR
stw r6, VCPU_CR_TM(r9)
std r7, VCPU_DSCR_TM(r9)
REST_GPR(6, r1)
REST_GPR(7, r1)
mtcr r6
mtspr SPRN_DSCR, r7
/* Save away checkpinted SPRs. */
std r31, VCPU_PPR_TM(r9)
std r30, VCPU_DSCR_TM(r9)
/* Save away checkpointed SPRs. */
std r0, VCPU_PPR_TM(r9)
mflr r5
mfcr r6
mfctr r7
mfspr r8, SPRN_AMR
mfspr r10, SPRN_TAR
mfxer r11
std r5, VCPU_LR_TM(r9)
stw r6, VCPU_CR_TM(r9)
std r7, VCPU_CTR_TM(r9)
std r8, VCPU_AMR_TM(r9)
std r10, VCPU_TAR_TM(r9)
std r11, VCPU_XER_TM(r9)
/* Restore r12 as trap number. */
lwz r12, VCPU_TRAP(r9)
/* Save FP/VSX. */
addi r3, r9, VCPU_FPRS_TM
bl store_fp_state
......@@ -137,6 +165,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
bl store_vr_state
mfspr r6, SPRN_VRSAVE
stw r6, VCPU_VRSAVE_TM(r9)
/* Restore non-volatile registers if requested to */
beq cr7, 1f
REST_NVGPRS(r1)
REST_GPR(10, r1)
1:
/*
* We need to save these SPRs after the treclaim so that the software
......@@ -146,12 +179,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
*/
mfspr r7, SPRN_TEXASR
std r7, VCPU_TEXASR(r9)
11:
mfspr r5, SPRN_TFHAR
mfspr r6, SPRN_TFIAR
std r5, VCPU_TFHAR(r9)
std r6, VCPU_TFIAR(r9)
/* Restore MSR state if requested */
beq cr7, 2f
mtmsrd r10, 0
2:
addi r1, r1, SWITCH_FRAME_SIZE
ld r0, PPC_LR_STKOFF(r1)
mtlr r0
blr
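The rldimi sequences above splice the 2-bit MSR[TS] field into the saved MSR image in r10: on this save path TS is forced to 0, since treclaim. leaves the thread non-transactional, while the restore path below forces TS to suspended ahead of trechkpt. In C terms, with MSR_TS_MASK and MSR_TS_S as in asm/reg.h, the two computations are roughly:

/* save side: final MSR after treclaim. has TS cleared */
u64 final_msr_save    = entry_msr & ~MSR_TS_MASK;

/* restore side: final MSR after trechkpt. has TS = suspended */
u64 final_msr_restore = (entry_msr & ~MSR_TS_MASK) | MSR_TS_S;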
......@@ -161,49 +198,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
* be invoked from C function by PR KVM only.
*/
_GLOBAL(_kvmppc_save_tm_pr)
mflr r5
std r5, PPC_LR_STKOFF(r1)
stdu r1, -SWITCH_FRAME_SIZE(r1)
SAVE_NVGPRS(r1)
/* save MSR since TM/math bits might be impacted
* by __kvmppc_save_tm().
*/
mfmsr r5
SAVE_GPR(5, r1)
/* also save DSCR/CR/TAR so that it can be recovered later */
mfspr r6, SPRN_DSCR
SAVE_GPR(6, r1)
mfcr r7
stw r7, _CCR(r1)
mflr r0
std r0, PPC_LR_STKOFF(r1)
stdu r1, -PPC_MIN_STKFRM(r1)
mfspr r8, SPRN_TAR
SAVE_GPR(8, r1)
std r8, PPC_MIN_STKFRM-8(r1)
li r5, 1 /* preserve non-volatile registers */
bl __kvmppc_save_tm
REST_GPR(8, r1)
ld r8, PPC_MIN_STKFRM-8(r1)
mtspr SPRN_TAR, r8
ld r7, _CCR(r1)
mtcr r7
REST_GPR(6, r1)
mtspr SPRN_DSCR, r6
/* we need to preserve the current MSR's MSR_TS bits */
REST_GPR(5, r1)
mfmsr r6
rldicl r6, r6, 64 - MSR_TS_S_LG, 62
rldimi r5, r6, MSR_TS_S_LG, 63 - MSR_TS_T_LG
mtmsrd r5
REST_NVGPRS(r1)
addi r1, r1, SWITCH_FRAME_SIZE
ld r5, PPC_LR_STKOFF(r1)
mtlr r5
addi r1, r1, PPC_MIN_STKFRM
ld r0, PPC_LR_STKOFF(r1)
mtlr r0
blr
EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr);
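With the preserve-NV contract in place (r5 = 1), the wrapper no longer needs its own SAVE_NVGPRS/MSR/DSCR/CR/TAR bookkeeping; only LR and TAR still have to be carried across the call. From C, PR KVM invokes it along these lines (a hedged sketch: the wrapper name and locking are illustrative, while the _kvmppc_save_tm_pr prototype taking a vcpu and an MSR matches asm-prototypes.h):

static void save_tm_state(struct kvm_vcpu *vcpu)
{
	/* the host MSR supplies the current TS bits for the save */
	preempt_disable();
	_kvmppc_save_tm_pr(vcpu, mfmsr());
	preempt_enable();
}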
......@@ -215,15 +225,21 @@ EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr);
* - r4 is the guest MSR with desired TS bits:
* For HV KVM, it is VCPU_MSR
* For PR KVM, it is provided by caller
* This potentially modifies all checkpointed registers.
* It restores r1, r2 from the PACA.
* - r5 containing a flag indicating that non-volatile registers
* must be preserved.
* If r5 == 0, this potentially modifies all checkpointed registers, but
* restores r1, r2 from the PACA before exit.
* If r5 != 0, this restores the MSR TM/FP/VEC/VSX bits to their state on entry.
*/
_GLOBAL(__kvmppc_restore_tm)
mflr r0
std r0, PPC_LR_STKOFF(r1)
cmpdi cr7, r5, 0
/* Turn on TM/FP/VSX/VMX so we can restore them. */
mfmsr r5
mr r10, r5
li r6, MSR_TM >> 32
sldi r6, r6, 32
or r5, r5, r6
......@@ -244,8 +260,7 @@ _GLOBAL(__kvmppc_restore_tm)
mr r5, r4
rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
beqlr /* TM not active in guest */
std r1, HSTATE_SCRATCH2(r13)
beq 9f /* TM not active in guest */
/* Make sure the failure summary is set, otherwise we'll program check
* when we trechkpt. It's possible that this might not have been set
......@@ -255,6 +270,26 @@ _GLOBAL(__kvmppc_restore_tm)
oris r7, r7, (TEXASR_FS)@h
mtspr SPRN_TEXASR, r7
/*
* Make a stack frame and save non-volatile registers if requested.
*/
stdu r1, -SWITCH_FRAME_SIZE(r1)
std r1, HSTATE_SCRATCH2(r13)
mfcr r6
mfspr r7, SPRN_DSCR
SAVE_GPR(2, r1)
SAVE_GPR(6, r1)
SAVE_GPR(7, r1)
beq cr7, 4f
SAVE_NVGPRS(r1)
/* MSR[TS] will be 1 (suspended) once we do trechkpt */
li r0, 1
rldimi r10, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
SAVE_GPR(10, r1) /* final MSR value */
4:
/*
* We need to load up the checkpointed state for the guest.
* We need to do this early as it will blow away any GPRs, VSRs and
......@@ -291,8 +326,6 @@ _GLOBAL(__kvmppc_restore_tm)
ld r29, VCPU_DSCR_TM(r3)
ld r30, VCPU_PPR_TM(r3)
std r2, PACATMSCRATCH(r13) /* Save TOC */
/* Clear the MSR RI since r1, r13 are all going to be foobar. */
li r5, 0
mtmsrd r5, 1
......@@ -318,18 +351,31 @@ _GLOBAL(__kvmppc_restore_tm)
/* Now let's get back the state we need. */
HMT_MEDIUM
GET_PACA(r13)
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
ld r29, HSTATE_DSCR(r13)
mtspr SPRN_DSCR, r29
#endif
ld r1, HSTATE_SCRATCH2(r13)
ld r2, PACATMSCRATCH(r13)
REST_GPR(7, r1)
mtspr SPRN_DSCR, r7
/* Set the MSR RI since we have our registers back. */
li r5, MSR_RI
mtmsrd r5, 1
/* Restore TOC pointer and CR */
REST_GPR(2, r1)
REST_GPR(6, r1)
mtcr r6
/* Restore non-volatile registers if requested to. */
beq cr7, 5f
REST_GPR(10, r1)
REST_NVGPRS(r1)
5: addi r1, r1, SWITCH_FRAME_SIZE
ld r0, PPC_LR_STKOFF(r1)
mtlr r0
9: /* Restore MSR bits if requested */
beqlr cr7
mtmsrd r10, 0
blr
/*
......@@ -337,47 +383,23 @@ _GLOBAL(__kvmppc_restore_tm)
* can be invoked from C function by PR KVM only.
*/
_GLOBAL(_kvmppc_restore_tm_pr)
mflr r5
std r5, PPC_LR_STKOFF(r1)
stdu r1, -SWITCH_FRAME_SIZE(r1)
SAVE_NVGPRS(r1)
/* save MSR since the TM/math bits may be changed */
mfmsr r5
SAVE_GPR(5, r1)
/* also save DSCR/CR/TAR so that it can be recovered later */
mfspr r6, SPRN_DSCR
SAVE_GPR(6, r1)
mfcr r7
stw r7, _CCR(r1)
mflr r0
std r0, PPC_LR_STKOFF(r1)
stdu r1, -PPC_MIN_STKFRM(r1)
/* save TAR so that it can be recovered later */
mfspr r8, SPRN_TAR
SAVE_GPR(8, r1)
std r8, PPC_MIN_STKFRM-8(r1)
li r5, 1
bl __kvmppc_restore_tm
REST_GPR(8, r1)
ld r8, PPC_MIN_STKFRM-8(r1)
mtspr SPRN_TAR, r8
ld r7, _CCR(r1)
mtcr r7
REST_GPR(6, r1)
mtspr SPRN_DSCR, r6
/* we need to preserve the current MSR's MSR_TS bits */
REST_GPR(5, r1)
mfmsr r6
rldicl r6, r6, 64 - MSR_TS_S_LG, 62
rldimi r5, r6, MSR_TS_S_LG, 63 - MSR_TS_T_LG
mtmsrd r5
REST_NVGPRS(r1)
addi r1, r1, SWITCH_FRAME_SIZE
ld r5, PPC_LR_STKOFF(r1)
mtlr r5
addi r1, r1, PPC_MIN_STKFRM
ld r0, PPC_LR_STKOFF(r1)
mtlr r0
blr
EXPORT_SYMBOL_GPL(_kvmppc_restore_tm_pr);
......
......@@ -14,7 +14,6 @@
{0x400, "INST_STORAGE"}, \
{0x480, "INST_SEGMENT"}, \
{0x500, "EXTERNAL"}, \
{0x501, "EXTERNAL_LEVEL"}, \
{0x502, "EXTERNAL_HV"}, \
{0x600, "ALIGNMENT"}, \
{0x700, "PROGRAM"}, \
......
......@@ -830,6 +830,15 @@ void radix__flush_pwc_lpid(unsigned int lpid)
}
EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid);
/*
* Flush partition scoped translations from LPID (=LPIDR)
*/
void radix__flush_tlb_lpid(unsigned int lpid)
{
_tlbie_lpid(lpid, RIC_FLUSH_ALL);
}
EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid);
/*
* Flush partition scoped translations from LPID (=LPIDR)
*/
......
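The new radix__flush_tlb_lpid() is the RIC_FLUSH_ALL counterpart of radix__flush_pwc_lpid() just above, needed by the nested-HV code to drop every partition-scoped translation for an LPID in one go. The RIC (radix invalidation control) encodings, as defined at the top of mm/tlb-radix.c, are:

#define RIC_FLUSH_TLB 0	/* TLB entries only */
#define RIC_FLUSH_PWC 1	/* page-walk cache only */
#define RIC_FLUSH_ALL 2	/* TLB entries plus page-walk cache */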
......@@ -15,7 +15,6 @@
{0x400, "INST_STORAGE"}, \
{0x480, "INST_SEGMENT"}, \
{0x500, "EXTERNAL"}, \
{0x501, "EXTERNAL_LEVEL"}, \
{0x502, "EXTERNAL_HV"}, \
{0x600, "ALIGNMENT"}, \
{0x700, "PROGRAM"}, \
......