Commit 7dd2157c authored by Paolo Bonzini's avatar Paolo Bonzini

Merge tag 'kvm-ppc-next-4.20-1' of...

Merge tag 'kvm-ppc-next-4.20-1' of git://git.kernel.org/pub/scm/linux/kernel/git/paulus/powerpc into HEAD

PPC KVM update for 4.20.

The major new feature here is nested HV KVM support.  This allows the
HV KVM module to load inside a radix guest on POWER9 and run radix
guests underneath it.  Unlike with PR KVM, these nested guests can run
in supervisor mode and don't require any additional instructions to be
emulated, so performance is much better than with PR KVM and very close
to that of a non-nested guest.  A nested
hypervisor (a guest with nested guests) can be migrated to another
host and will bring all its nested guests along with it.  A nested
guest can also itself run guests, and so on down to any desired depth
of nesting.

Apart from that, there is a series of updates for IOMMU handling from
Alexey Kardashevskiy, a "one VM per core" mode for HV KVM for
security-paranoid applications, and a small fix for PR KVM.
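
For context only (not part of this commit): with a new enough QEMU the
nested-HV facility is typically requested on the L1 guest's command line,
roughly as below.  The machine option name is QEMU's, not defined by this
kernel series, and is given purely as a hedged example:

	qemu-system-ppc64 -machine pseries,cap-nested-hv=on -cpu host \
		-enable-kvm ...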
parents dd5bd0a6 901f8c3f
...@@ -1922,6 +1922,7 @@ registers, find a list below:
PPC | KVM_REG_PPC_TIDR | 64
PPC | KVM_REG_PPC_PSSCR | 64
PPC | KVM_REG_PPC_DEC_EXPIRY | 64
PPC | KVM_REG_PPC_PTCR | 64
PPC | KVM_REG_PPC_TM_GPR0 | 64
...
PPC | KVM_REG_PPC_TM_GPR31 | 64
...@@ -2269,6 +2270,10 @@ The supported flags are:
The emulated MMU supports 1T segments in addition to the
standard 256M ones.
- KVM_PPC_NO_HASH
This flag indicates that HPT guests are not supported by KVM,
thus all guests must use radix MMU mode.
The "slb_size" field indicates how many SLB entries are supported The "slb_size" field indicates how many SLB entries are supported
The "sps" array contains 8 entries indicating the supported base The "sps" array contains 8 entries indicating the supported base
...@@ -4531,6 +4536,20 @@ With this capability, a guest may read the MSR_PLATFORM_INFO MSR. Otherwise, ...@@ -4531,6 +4536,20 @@ With this capability, a guest may read the MSR_PLATFORM_INFO MSR. Otherwise,
a #GP would be raised when the guest tries to access. Currently, this a #GP would be raised when the guest tries to access. Currently, this
capability does not enable write permissions of this MSR for the guest. capability does not enable write permissions of this MSR for the guest.
7.16 KVM_CAP_PPC_NESTED_HV
Architectures: ppc
Parameters: none
Returns: 0 on success, -EINVAL when the implementation doesn't support
nested-HV virtualization.
HV-KVM on POWER9 and later systems allows for "nested-HV"
virtualization, which provides a way for a guest VM to run guests that
can run using the CPU's supervisor mode (privileged non-hypervisor
state). Enabling this capability on a VM depends on the CPU having
the necessary functionality and on the facility being enabled with a
kvm-hv module parameter.
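
As an illustration (not part of this patch), a VMM could probe and enable
the capability with the generic KVM_CHECK_EXTENSION / KVM_ENABLE_CAP
ioctls roughly as follows; vm_fd is assumed to be an already-created VM
file descriptor:

	struct kvm_enable_cap cap = { .cap = KVM_CAP_PPC_NESTED_HV };

	if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_NESTED_HV) <= 0)
		return -1;	/* host kernel can't provide nested HV */
	/* returns 0 on success, -EINVAL if kvm-hv can't enable it */
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);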
8. Other capabilities.
----------------------
...
...@@ -150,4 +150,25 @@ extern s32 patch__memset_nocache, patch__memcpy_nocache;
extern long flush_count_cache;
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
void kvmppc_save_tm_hv(struct kvm_vcpu *vcpu, u64 msr, bool preserve_nv);
void kvmppc_restore_tm_hv(struct kvm_vcpu *vcpu, u64 msr, bool preserve_nv);
#else
static inline void kvmppc_save_tm_hv(struct kvm_vcpu *vcpu, u64 msr,
bool preserve_nv) { }
static inline void kvmppc_restore_tm_hv(struct kvm_vcpu *vcpu, u64 msr,
bool preserve_nv) { }
#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
void kvmhv_save_host_pmu(void);
void kvmhv_load_host_pmu(void);
void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use);
void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu);
int __kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu);
long kvmppc_h_set_dabr(struct kvm_vcpu *vcpu, unsigned long dabr);
long kvmppc_h_set_xdabr(struct kvm_vcpu *vcpu, unsigned long dabr,
unsigned long dabrx);
#endif /* _ASM_POWERPC_ASM_PROTOTYPES_H */
...@@ -203,6 +203,18 @@ static inline unsigned int mmu_psize_to_shift(unsigned int mmu_psize)
BUG();
}
static inline unsigned int ap_to_shift(unsigned long ap)
{
int psize;
for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
if (mmu_psize_defs[psize].ap == ap)
return mmu_psize_defs[psize].shift;
}
return -1;
}
static inline unsigned long get_sllp_encoding(int psize)
{
unsigned long sllp;
...
...@@ -53,6 +53,7 @@ extern void radix__flush_tlb_lpid_page(unsigned int lpid,
unsigned long addr,
unsigned long page_size);
extern void radix__flush_pwc_lpid(unsigned int lpid);
extern void radix__flush_tlb_lpid(unsigned int lpid);
extern void radix__local_flush_tlb_lpid(unsigned int lpid);
extern void radix__local_flush_tlb_lpid_guest(unsigned int lpid);
...
...@@ -322,6 +322,11 @@
#define H_GET_24X7_DATA 0xF07C
#define H_GET_PERF_COUNTER_INFO 0xF080
/* Platform-specific hcalls used for nested HV KVM */
#define H_SET_PARTITION_TABLE 0xF800
#define H_ENTER_NESTED 0xF804
#define H_TLB_INVALIDATE 0xF808
/* Values for 2nd argument to H_SET_MODE */
#define H_SET_MODE_RESOURCE_SET_CIABR 1
#define H_SET_MODE_RESOURCE_SET_DAWR 2
...@@ -461,6 +466,42 @@ struct h_cpu_char_result {
u64 behaviour;
};
/* Register state for entering a nested guest with H_ENTER_NESTED */
struct hv_guest_state {
u64 version; /* version of this structure layout */
u32 lpid;
u32 vcpu_token;
/* These registers are hypervisor privileged (at least for writing) */
u64 lpcr;
u64 pcr;
u64 amor;
u64 dpdes;
u64 hfscr;
s64 tb_offset;
u64 dawr0;
u64 dawrx0;
u64 ciabr;
u64 hdec_expiry;
u64 purr;
u64 spurr;
u64 ic;
u64 vtb;
u64 hdar;
u64 hdsisr;
u64 heir;
u64 asdr;
/* These are OS privileged but need to be set late in guest entry */
u64 srr0;
u64 srr1;
u64 sprg[4];
u64 pidr;
u64 cfar;
u64 ppr;
};
/* Latest version of hv_guest_state structure */
#define HV_GUEST_STATE_VERSION 1
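
As a hedged illustration only (not code from this patch), an L1 hypervisor
that wants to run an L2 vcpu fills in a struct hv_guest_state and passes its
address to the H_ENTER_NESTED hcall.  A minimal sketch of the setup follows;
all names other than the structure fields (l2_lpid, vcpu_id, l2_lpcr,
hdec_ticks) are assumptions:

	struct hv_guest_state hvs = { 0 };

	hvs.version = HV_GUEST_STATE_VERSION;	/* layout version checked by L0 */
	hvs.lpid = l2_lpid;			/* LPID as numbered by L1 */
	hvs.vcpu_token = vcpu_id;
	hvs.lpcr = l2_lpcr;
	hvs.hdec_expiry = get_tb() + hdec_ticks; /* timebase value at which
						    control returns to L1 */
	/* ... remaining HV-privileged and late-loaded OS registers ... */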
#endif /* __ASSEMBLY__ */
#endif /* __KERNEL__ */
#endif /* _ASM_POWERPC_HVCALL_H */
...@@ -84,7 +84,6 @@
#define BOOK3S_INTERRUPT_INST_STORAGE 0x400
#define BOOK3S_INTERRUPT_INST_SEGMENT 0x480
#define BOOK3S_INTERRUPT_EXTERNAL 0x500
#define BOOK3S_INTERRUPT_EXTERNAL_LEVEL 0x501
#define BOOK3S_INTERRUPT_EXTERNAL_HV 0x502
#define BOOK3S_INTERRUPT_ALIGNMENT 0x600
#define BOOK3S_INTERRUPT_PROGRAM 0x700
...@@ -134,8 +133,7 @@
#define BOOK3S_IRQPRIO_EXTERNAL 14
#define BOOK3S_IRQPRIO_DECREMENTER 15
#define BOOK3S_IRQPRIO_PERFORMANCE_MONITOR 16
#define BOOK3S_IRQPRIO_EXTERNAL_LEVEL 17 #define BOOK3S_IRQPRIO_MAX 17
#define BOOK3S_IRQPRIO_MAX 18
#define BOOK3S_HFLAG_DCBZ32 0x1
#define BOOK3S_HFLAG_SLB 0x2
...
...@@ -188,14 +188,37 @@ extern int kvmppc_book3s_hcall_implemented(struct kvm *kvm, unsigned long hc);
extern int kvmppc_book3s_radix_page_fault(struct kvm_run *run,
struct kvm_vcpu *vcpu,
unsigned long ea, unsigned long dsisr);
extern int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
struct kvmppc_pte *gpte, u64 root,
u64 *pte_ret_p);
extern int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
struct kvmppc_pte *gpte, u64 table,
int table_index, u64 *pte_ret_p);
extern int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
struct kvmppc_pte *gpte, bool data, bool iswrite);
extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
unsigned int shift, struct kvm_memory_slot *memslot,
unsigned int lpid);
extern bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable,
bool writing, unsigned long gpa,
unsigned int lpid);
extern int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
unsigned long gpa,
struct kvm_memory_slot *memslot,
bool writing, bool kvm_ro,
pte_t *inserted_pte, unsigned int *levelp);
extern int kvmppc_init_vm_radix(struct kvm *kvm);
extern void kvmppc_free_radix(struct kvm *kvm);
extern void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd,
unsigned int lpid);
extern int kvmppc_radix_init(void);
extern void kvmppc_radix_exit(void);
extern int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
unsigned long gfn);
extern void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
unsigned long gpa, unsigned int shift,
struct kvm_memory_slot *memslot,
unsigned int lpid);
extern int kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
unsigned long gfn);
extern int kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
...@@ -271,6 +294,21 @@ static inline void kvmppc_save_tm_sprs(struct kvm_vcpu *vcpu) {}
static inline void kvmppc_restore_tm_sprs(struct kvm_vcpu *vcpu) {}
#endif
long kvmhv_nested_init(void);
void kvmhv_nested_exit(void);
void kvmhv_vm_nested_init(struct kvm *kvm);
long kvmhv_set_partition_table(struct kvm_vcpu *vcpu);
void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1);
void kvmhv_release_all_nested(struct kvm *kvm);
long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu);
long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu);
int kvmhv_run_single_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu,
u64 time_limit, unsigned long lpcr);
void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr);
void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
struct hv_guest_state *hr);
long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu);
void kvmppc_giveup_fac(struct kvm_vcpu *vcpu, ulong fac);
extern int kvm_irq_bypass;
...@@ -301,12 +339,12 @@ static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
{
vcpu->arch.cr = val; vcpu->arch.regs.ccr = val;
}
static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
{
return vcpu->arch.cr; return vcpu->arch.regs.ccr;
}
static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val)
...@@ -384,9 +422,6 @@ extern int kvmppc_h_logical_ci_store(struct kvm_vcpu *vcpu);
/* TO = 31 for unconditional trap */
#define INS_TW 0x7fe00008
/* LPIDs we support with this build -- runtime limit may be lower */
#define KVMPPC_NR_LPIDS (LPID_RSVD + 1)
#define SPLIT_HACK_MASK 0xff000000
#define SPLIT_HACK_OFFS 0xfb000000
...
...@@ -23,6 +23,108 @@
#include <linux/string.h>
#include <asm/bitops.h>
#include <asm/book3s/64/mmu-hash.h>
#include <asm/cpu_has_feature.h>
#include <asm/ppc-opcode.h>
#ifdef CONFIG_PPC_PSERIES
static inline bool kvmhv_on_pseries(void)
{
return !cpu_has_feature(CPU_FTR_HVMODE);
}
#else
static inline bool kvmhv_on_pseries(void)
{
return false;
}
#endif
/*
* Structure for a nested guest, that is, for a guest that is managed by
* one of our guests.
*/
struct kvm_nested_guest {
struct kvm *l1_host; /* L1 VM that owns this nested guest */
int l1_lpid; /* lpid L1 guest thinks this guest is */
int shadow_lpid; /* real lpid of this nested guest */
pgd_t *shadow_pgtable; /* our page table for this guest */
u64 l1_gr_to_hr; /* L1's addr of part'n-scoped table */
u64 process_table; /* process table entry for this guest */
long refcnt; /* number of pointers to this struct */
struct mutex tlb_lock; /* serialize page faults and tlbies */
struct kvm_nested_guest *next;
cpumask_t need_tlb_flush;
cpumask_t cpu_in_guest;
short prev_cpu[NR_CPUS];
};
/*
* We define a nested rmap entry as a single 64-bit quantity
* 0xFFF0000000000000 12-bit lpid field
* 0x000FFFFFFFFFF000 40-bit guest 4k page frame number
* 0x0000000000000001 1-bit single entry flag
*/
#define RMAP_NESTED_LPID_MASK 0xFFF0000000000000UL
#define RMAP_NESTED_LPID_SHIFT (52)
#define RMAP_NESTED_GPA_MASK 0x000FFFFFFFFFF000UL
#define RMAP_NESTED_IS_SINGLE_ENTRY 0x0000000000000001UL
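
To make the encoding above concrete, hypothetical helpers (not in this
patch, which open-codes the masks where it needs them) to pack and unpack
an entry would look like:

	static inline u64 nested_rmap_encode(unsigned int lpid, unsigned long gpa)
	{
		return (((u64)lpid << RMAP_NESTED_LPID_SHIFT) &
			RMAP_NESTED_LPID_MASK) |
		       (gpa & RMAP_NESTED_GPA_MASK);
	}

	static inline unsigned long nested_rmap_gpa(u64 rmap)
	{
		return rmap & RMAP_NESTED_GPA_MASK;
	}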
/* Structure for a nested guest rmap entry */
struct rmap_nested {
struct llist_node list;
u64 rmap;
};
/*
* for_each_nest_rmap_safe - iterate over the list of nested rmap entries
* safe against removal of the list entry or NULL list
* @pos: a (struct rmap_nested *) to use as a loop cursor
* @node: pointer to the first entry
* NOTE: this can be NULL
* @rmapp: an (unsigned long *) in which to return the rmap entries on each
* iteration
* NOTE: this must point to already allocated memory
*
* The nested_rmap is a llist of (struct rmap_nested) entries pointed to by the
* rmap entry in the memslot. The list is always terminated by a "single entry"
* stored in the list element of the final entry of the llist. If there is ONLY
* a single entry then this is itself in the rmap entry of the memslot, not a
* llist head pointer.
*
* Note that the iterator below assumes that a nested rmap entry is always
* non-zero. This is true for our usage because the LPID field is always
* non-zero (zero is reserved for the host).
*
* This should be used to iterate over the list of rmap_nested entries with
* processing done on the u64 rmap value given by each iteration. This is safe
* against removal of list entries and it is always safe to call free on (pos).
*
* e.g.
* struct rmap_nested *cursor;
* struct llist_node *first;
* unsigned long rmap;
* for_each_nest_rmap_safe(cursor, first, &rmap) {
* do_something(rmap);
* free(cursor);
* }
*/
#define for_each_nest_rmap_safe(pos, node, rmapp) \
for ((pos) = llist_entry((node), typeof(*(pos)), list); \
(node) && \
(*(rmapp) = ((RMAP_NESTED_IS_SINGLE_ENTRY & ((u64) (node))) ? \
((u64) (node)) : ((pos)->rmap))) && \
(((node) = ((RMAP_NESTED_IS_SINGLE_ENTRY & ((u64) (node))) ? \
((struct llist_node *) ((pos) = NULL)) : \
(pos)->list.next)), true); \
(pos) = llist_entry((node), typeof(*(pos)), list))
struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
bool create);
void kvmhv_put_nested(struct kvm_nested_guest *gp);
int kvmhv_nested_next_lpid(struct kvm *kvm, int lpid);
/* Encoding of first parameter for H_TLB_INVALIDATE */
#define H_TLBIE_P1_ENC(ric, prs, r) (___PPC_RIC(ric) | ___PPC_PRS(prs) | \
___PPC_R(r))
/* Power architecture requires HPT is at least 256kiB, at most 64TiB */
#define PPC_MIN_HPT_ORDER 18
...@@ -435,6 +537,7 @@ static inline struct kvm_memslots *kvm_memslots_raw(struct kvm *kvm)
}
extern void kvmppc_mmu_debugfs_init(struct kvm *kvm);
extern void kvmhv_radix_debugfs_init(struct kvm *kvm);
extern void kvmhv_rm_send_ipi(int cpu);
...@@ -482,7 +585,7 @@ static inline u64 sanitize_msr(u64 msr)
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
static inline void copy_from_checkpoint(struct kvm_vcpu *vcpu)
{
vcpu->arch.cr = vcpu->arch.cr_tm; vcpu->arch.regs.ccr = vcpu->arch.cr_tm;
vcpu->arch.regs.xer = vcpu->arch.xer_tm;
vcpu->arch.regs.link = vcpu->arch.lr_tm;
vcpu->arch.regs.ctr = vcpu->arch.ctr_tm;
...@@ -499,7 +602,7 @@ static inline void copy_from_checkpoint(struct kvm_vcpu *vcpu)
static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu)
{
vcpu->arch.cr_tm = vcpu->arch.cr; vcpu->arch.cr_tm = vcpu->arch.regs.ccr;
vcpu->arch.xer_tm = vcpu->arch.regs.xer;
vcpu->arch.lr_tm = vcpu->arch.regs.link;
vcpu->arch.ctr_tm = vcpu->arch.regs.ctr;
...@@ -515,6 +618,17 @@ static inline void copy_to_checkpoint(struct kvm_vcpu *vcpu)
}
#endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
extern int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
unsigned long gpa, unsigned int level,
unsigned long mmu_seq, unsigned int lpid,
unsigned long *rmapp, struct rmap_nested **n_rmap);
extern void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp,
struct rmap_nested **n_rmap);
extern void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
struct kvm_memory_slot *memslot,
unsigned long gpa, unsigned long hpa,
unsigned long nbytes);
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
#endif /* __ASM_KVM_BOOK3S_64_H__ */
...@@ -25,6 +25,9 @@
#define XICS_MFRR 0xc
#define XICS_IPI 2 /* interrupt source # for IPIs */
/* LPIDs we support with this build -- runtime limit may be lower */
#define KVMPPC_NR_LPIDS (LPID_RSVD + 1)
/* Maximum number of threads per physical core */
#define MAX_SMT_THREADS 8
...
...@@ -46,12 +46,12 @@ static inline ulong kvmppc_get_gpr(struct kvm_vcpu *vcpu, int num)
static inline void kvmppc_set_cr(struct kvm_vcpu *vcpu, u32 val)
{
vcpu->arch.cr = val; vcpu->arch.regs.ccr = val;
}
static inline u32 kvmppc_get_cr(struct kvm_vcpu *vcpu)
{
return vcpu->arch.cr; return vcpu->arch.regs.ccr;
}
static inline void kvmppc_set_xer(struct kvm_vcpu *vcpu, ulong val)
...
...@@ -46,6 +46,7 @@
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
#include <asm/kvm_book3s_asm.h> /* for MAX_SMT_THREADS */
#define KVM_MAX_VCPU_ID (MAX_SMT_THREADS * KVM_MAX_VCORES)
#define KVM_MAX_NESTED_GUESTS KVMPPC_NR_LPIDS
#else
#define KVM_MAX_VCPU_ID KVM_MAX_VCPUS
...@@ -94,6 +95,7 @@ struct dtl_entry;
struct kvmppc_vcpu_book3s;
struct kvmppc_book3s_shadow_vcpu;
struct kvm_nested_guest;
struct kvm_vm_stat {
ulong remote_tlb_flush;
...@@ -287,10 +289,12 @@ struct kvm_arch {
u8 radix;
u8 fwnmi_enabled;
bool threads_indep;
bool nested_enable;
pgd_t *pgtable;
u64 process_table;
struct dentry *debugfs_dir;
struct dentry *htab_dentry;
struct dentry *radix_dentry;
struct kvm_resize_hpt *resize_hpt; /* protected by kvm->lock */
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
#ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
...@@ -311,6 +315,9 @@ struct kvm_arch {
#endif
struct kvmppc_ops *kvm_ops;
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
u64 l1_ptcr;
int max_nested_lpid;
struct kvm_nested_guest *nested_guests[KVM_MAX_NESTED_GUESTS];
/* This array can grow quite large, keep it at the end */
struct kvmppc_vcore *vcores[KVM_MAX_VCORES];
#endif
...@@ -360,7 +367,9 @@ struct kvmppc_pte {
bool may_write : 1;
bool may_execute : 1;
unsigned long wimg;
unsigned long rc;
u8 page_size; /* MMU_PAGE_xxx */
u8 page_shift;
};
struct kvmppc_mmu {
...@@ -537,8 +546,6 @@ struct kvm_vcpu_arch {
ulong tar;
#endif
u32 cr;
#ifdef CONFIG_PPC_BOOK3S
ulong hflags;
ulong guest_owned_ext;
...@@ -707,6 +714,7 @@ struct kvm_vcpu_arch {
u8 hcall_needed;
u8 epr_flags; /* KVMPPC_EPR_xxx */
u8 epr_needed;
u8 external_oneshot; /* clear external irq after delivery */
u32 cpr0_cfgaddr; /* holds the last set cpr0_cfgaddr */
...@@ -781,6 +789,10 @@ struct kvm_vcpu_arch {
u32 emul_inst;
u32 online;
/* For support of nested guests */
struct kvm_nested_guest *nested;
u32 nested_vcpu_id;
#endif
#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
...
...@@ -194,9 +194,7 @@ extern struct kvmppc_spapr_tce_table *kvmppc_find_table(
(iommu_tce_check_ioba((stt)->page_shift, (stt)->offset, \
(stt)->size, (ioba), (npages)) ? \
H_PARAMETER : H_SUCCESS)
extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt, extern long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce,
unsigned long tce);
extern long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
unsigned long *ua, unsigned long **prmap);
extern void kvmppc_tce_put(struct kvmppc_spapr_tce_table *tt,
unsigned long idx, unsigned long tce);
...@@ -327,6 +325,7 @@ struct kvmppc_ops {
int (*set_smt_mode)(struct kvm *kvm, unsigned long mode,
unsigned long flags);
void (*giveup_ext)(struct kvm_vcpu *vcpu, ulong msr);
int (*enable_nested)(struct kvm *kvm);
};
extern struct kvmppc_ops *kvmppc_hv_ops;
...@@ -585,6 +584,7 @@ extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval);
extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
int level, bool line_status);
extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu);
#else
static inline int kvmppc_xive_set_xive(struct kvm *kvm, u32 irq, u32 server,
u32 priority) { return -1; }
...@@ -607,6 +607,7 @@ static inline int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) { retur
static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq,
int level, bool line_status) { return -ENODEV; }
static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { }
#endif /* CONFIG_KVM_XIVE */
/*
...@@ -652,6 +653,7 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
unsigned long mfrr);
int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr);
int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr);
void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu);
/*
* Host-side operations we want to set up while running in real
...
...@@ -104,6 +104,7 @@
#define OP_31_XOP_LHZUX 311
#define OP_31_XOP_MSGSNDP 142
#define OP_31_XOP_MSGCLRP 174
#define OP_31_XOP_TLBIE 306
#define OP_31_XOP_MFSPR 339
#define OP_31_XOP_LWAX 341
#define OP_31_XOP_LHAX 343
...
...@@ -415,6 +415,7 @@
#define HFSCR_DSCR __MASK(FSCR_DSCR_LG)
#define HFSCR_VECVSX __MASK(FSCR_VECVSX_LG)
#define HFSCR_FP __MASK(FSCR_FP_LG)
#define HFSCR_INTR_CAUSE (ASM_CONST(0xFF) << 56) /* interrupt cause */
#define SPRN_TAR 0x32f /* Target Address Register */
#define SPRN_LPCR 0x13E /* LPAR Control Register */
#define LPCR_VPM0 ASM_CONST(0x8000000000000000)
...@@ -766,6 +767,7 @@
#define SPRN_HSRR0 0x13A /* Save/Restore Register 0 */
#define SPRN_HSRR1 0x13B /* Save/Restore Register 1 */
#define HSRR1_DENORM 0x00100000 /* Denorm exception */
#define HSRR1_HISI_WRITE 0x00010000 /* HISI bcs couldn't update mem */
#define SPRN_TBCTL 0x35f /* PA6T Timebase control register */
#define TBCTL_FREEZE 0x0000000000000000ull /* Freeze all tbs */
...
...@@ -634,6 +634,7 @@ struct kvm_ppc_cpu_char {
#define KVM_REG_PPC_DEC_EXPIRY (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xbe)
#define KVM_REG_PPC_ONLINE (KVM_REG_PPC | KVM_REG_SIZE_U32 | 0xbf)
#define KVM_REG_PPC_PTCR (KVM_REG_PPC | KVM_REG_SIZE_U64 | 0xc0)
/* Transactional Memory checkpointed state:
* This is all GPRs, all VSX regs and a subset of SPRs
...
...@@ -438,7 +438,7 @@ int main(void)
#ifdef CONFIG_PPC_BOOK3S
OFFSET(VCPU_TAR, kvm_vcpu, arch.tar);
#endif
OFFSET(VCPU_CR, kvm_vcpu, arch.cr); OFFSET(VCPU_CR, kvm_vcpu, arch.regs.ccr);
OFFSET(VCPU_PC, kvm_vcpu, arch.regs.nip);
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
OFFSET(VCPU_MSR, kvm_vcpu, arch.shregs.msr);
...@@ -503,6 +503,7 @@ int main(void)
OFFSET(VCPU_VPA, kvm_vcpu, arch.vpa.pinned_addr);
OFFSET(VCPU_VPA_DIRTY, kvm_vcpu, arch.vpa.dirty);
OFFSET(VCPU_HEIR, kvm_vcpu, arch.emul_inst);
OFFSET(VCPU_NESTED, kvm_vcpu, arch.nested);
OFFSET(VCPU_CPU, kvm_vcpu, cpu);
OFFSET(VCPU_THREAD_CPU, kvm_vcpu, arch.thread_cpu);
#endif
...@@ -695,7 +696,7 @@ int main(void)
#endif /* CONFIG_PPC_BOOK3S_64 */
#else /* CONFIG_PPC_BOOK3S */
OFFSET(VCPU_CR, kvm_vcpu, arch.cr); OFFSET(VCPU_CR, kvm_vcpu, arch.regs.ccr);
OFFSET(VCPU_XER, kvm_vcpu, arch.regs.xer);
OFFSET(VCPU_LR, kvm_vcpu, arch.regs.link);
OFFSET(VCPU_CTR, kvm_vcpu, arch.regs.ctr);
...
...@@ -147,8 +147,8 @@ __init_hvmode_206:
rldicl. r0,r3,4,63
bnelr
ld r5,CPU_SPEC_FEATURES(r4)
LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE) LOAD_REG_IMMEDIATE(r6,CPU_FTR_HVMODE | CPU_FTR_P9_TM_HV_ASSIST)
xor r5,r5,r6 andc r5,r5,r6
std r5,CPU_SPEC_FEATURES(r4)
blr
...
...@@ -75,7 +75,8 @@ kvm-hv-y += \
book3s_hv.o \
book3s_hv_interrupts.o \
book3s_64_mmu_hv.o \
book3s_64_mmu_radix.o book3s_64_mmu_radix.o \
book3s_hv_nested.o
kvm-hv-$(CONFIG_PPC_TRANSACTIONAL_MEM) += \
book3s_hv_tm.o
...
...@@ -78,8 +78,11 @@ void kvmppc_unfixup_split_real(struct kvm_vcpu *vcpu)
{
if (vcpu->arch.hflags & BOOK3S_HFLAG_SPLIT_HACK) {
ulong pc = kvmppc_get_pc(vcpu);
ulong lr = kvmppc_get_lr(vcpu);
if ((pc & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS)
kvmppc_set_pc(vcpu, pc & ~SPLIT_HACK_MASK);
if ((lr & SPLIT_HACK_MASK) == SPLIT_HACK_OFFS)
kvmppc_set_lr(vcpu, lr & ~SPLIT_HACK_MASK);
vcpu->arch.hflags &= ~BOOK3S_HFLAG_SPLIT_HACK;
}
}
...@@ -150,7 +153,6 @@ static int kvmppc_book3s_vec2irqprio(unsigned int vec)
case 0x400: prio = BOOK3S_IRQPRIO_INST_STORAGE; break;
case 0x480: prio = BOOK3S_IRQPRIO_INST_SEGMENT; break;
case 0x500: prio = BOOK3S_IRQPRIO_EXTERNAL; break;
case 0x501: prio = BOOK3S_IRQPRIO_EXTERNAL_LEVEL; break;
case 0x600: prio = BOOK3S_IRQPRIO_ALIGNMENT; break;
case 0x700: prio = BOOK3S_IRQPRIO_PROGRAM; break;
case 0x800: prio = BOOK3S_IRQPRIO_FP_UNAVAIL; break;
...@@ -236,18 +238,35 @@ EXPORT_SYMBOL_GPL(kvmppc_core_dequeue_dec);
void kvmppc_core_queue_external(struct kvm_vcpu *vcpu,
struct kvm_interrupt *irq)
{
unsigned int vec = BOOK3S_INTERRUPT_EXTERNAL; /*
* This case (KVM_INTERRUPT_SET) should never actually arise for
if (irq->irq == KVM_INTERRUPT_SET_LEVEL) * a pseries guest (because pseries guests expect their interrupt
vec = BOOK3S_INTERRUPT_EXTERNAL_LEVEL; * controllers to continue asserting an external interrupt request
* until it is acknowledged at the interrupt controller), but is
* included to avoid ABI breakage and potentially for other
* sorts of guest.
*
* There is a subtlety here: HV KVM does not test the
* external_oneshot flag in the code that synthesizes
* external interrupts for the guest just before entering
* the guest. That is OK even if userspace did do a
* KVM_INTERRUPT_SET on a pseries guest vcpu, because the
* caller (kvm_vcpu_ioctl_interrupt) does a kvm_vcpu_kick()
* which ends up doing a smp_send_reschedule(), which will
* pull the guest all the way out to the host, meaning that
* we will call kvmppc_core_prepare_to_enter() before entering
* the guest again, and that will handle the external_oneshot
* flag correctly.
*/
if (irq->irq == KVM_INTERRUPT_SET)
vcpu->arch.external_oneshot = 1;
kvmppc_book3s_queue_irqprio(vcpu, vec); kvmppc_book3s_queue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
}
void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu)
{
kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL);
kvmppc_book3s_dequeue_irqprio(vcpu, BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
}
void kvmppc_core_queue_data_storage(struct kvm_vcpu *vcpu, ulong dar,
...@@ -278,7 +297,6 @@ static int kvmppc_book3s_irqprio_deliver(struct kvm_vcpu *vcpu,
vec = BOOK3S_INTERRUPT_DECREMENTER;
break;
case BOOK3S_IRQPRIO_EXTERNAL:
case BOOK3S_IRQPRIO_EXTERNAL_LEVEL:
deliver = (kvmppc_get_msr(vcpu) & MSR_EE) && !crit;
vec = BOOK3S_INTERRUPT_EXTERNAL;
break;
...@@ -352,8 +370,16 @@ static bool clear_irqprio(struct kvm_vcpu *vcpu, unsigned int priority)
case BOOK3S_IRQPRIO_DECREMENTER:
/* DEC interrupts get cleared by mtdec */
return false;
case BOOK3S_IRQPRIO_EXTERNAL_LEVEL: case BOOK3S_IRQPRIO_EXTERNAL:
/* External interrupts get cleared by userspace */ /*
* External interrupts get cleared by userspace
* except when set by the KVM_INTERRUPT ioctl with
* KVM_INTERRUPT_SET (not KVM_INTERRUPT_SET_LEVEL).
*/
if (vcpu->arch.external_oneshot) {
vcpu->arch.external_oneshot = 0;
return true;
}
return false;
}
...
...@@ -268,13 +268,12 @@ int kvmppc_mmu_hv_init(void)
{
unsigned long host_lpid, rsvd_lpid;
if (!cpu_has_feature(CPU_FTR_HVMODE))
return -EINVAL;
if (!mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE))
return -EINVAL;
/* POWER7 has 10-bit LPIDs (12-bit in POWER8) */
host_lpid = 0;
if (cpu_has_feature(CPU_FTR_HVMODE))
host_lpid = mfspr(SPRN_LPID);
rsvd_lpid = LPID_RSVD;
...
...@@ -10,6 +10,9 @@
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/debugfs.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
...@@ -26,87 +29,74 @@
*/
static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };
int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
struct kvmppc_pte *gpte, bool data, bool iswrite) struct kvmppc_pte *gpte, u64 root,
u64 *pte_ret_p)
{ {
struct kvm *kvm = vcpu->kvm; struct kvm *kvm = vcpu->kvm;
u32 pid;
int ret, level, ps; int ret, level, ps;
__be64 prte, rpte; unsigned long rts, bits, offset, index;
unsigned long ptbl; u64 pte, base, gpa;
unsigned long root, pte, index; __be64 rpte;
unsigned long rts, bits, offset;
unsigned long gpa;
unsigned long proc_tbl_size;
/* Work out effective PID */
switch (eaddr >> 62) {
case 0:
pid = vcpu->arch.pid;
break;
case 3:
pid = 0;
break;
default:
return -EINVAL;
}
proc_tbl_size = 1 << ((kvm->arch.process_table & PRTS_MASK) + 12);
if (pid * 16 >= proc_tbl_size)
return -EINVAL;
/* Read partition table to find root of tree for effective PID */
ptbl = (kvm->arch.process_table & PRTB_MASK) + (pid * 16);
ret = kvm_read_guest(kvm, ptbl, &prte, sizeof(prte));
if (ret)
return ret;
root = be64_to_cpu(prte);
rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) | rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
((root & RTS2_MASK) >> RTS2_SHIFT); ((root & RTS2_MASK) >> RTS2_SHIFT);
bits = root & RPDS_MASK; bits = root & RPDS_MASK;
root = root & RPDB_MASK; base = root & RPDB_MASK;
offset = rts + 31; offset = rts + 31;
/* current implementations only support 52-bit space */ /* Current implementations only support 52-bit space */
if (offset != 52) if (offset != 52)
return -EINVAL; return -EINVAL;
/* Walk each level of the radix tree */
for (level = 3; level >= 0; --level) { for (level = 3; level >= 0; --level) {
u64 addr;
/* Check a valid size */
if (level && bits != p9_supported_radix_bits[level]) if (level && bits != p9_supported_radix_bits[level])
return -EINVAL; return -EINVAL;
if (level == 0 && !(bits == 5 || bits == 9)) if (level == 0 && !(bits == 5 || bits == 9))
return -EINVAL; return -EINVAL;
offset -= bits; offset -= bits;
index = (eaddr >> offset) & ((1UL << bits) - 1); index = (eaddr >> offset) & ((1UL << bits) - 1);
/* check that low bits of page table base are zero */ /* Check that low bits of page table base are zero */
if (root & ((1UL << (bits + 3)) - 1)) if (base & ((1UL << (bits + 3)) - 1))
return -EINVAL; return -EINVAL;
ret = kvm_read_guest(kvm, root + index * 8, /* Read the entry from guest memory */
&rpte, sizeof(rpte)); addr = base + (index * sizeof(rpte));
if (ret) ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
if (ret) {
if (pte_ret_p)
*pte_ret_p = addr;
return ret; return ret;
}
pte = __be64_to_cpu(rpte); pte = __be64_to_cpu(rpte);
if (!(pte & _PAGE_PRESENT)) if (!(pte & _PAGE_PRESENT))
return -ENOENT; return -ENOENT;
/* Check if a leaf entry */
if (pte & _PAGE_PTE) if (pte & _PAGE_PTE)
break; break;
bits = pte & 0x1f; /* Get ready to walk the next level */
root = pte & 0x0fffffffffffff00ul; base = pte & RPDB_MASK;
bits = pte & RPDS_MASK;
} }
/* need a leaf at lowest level; 512GB pages not supported */
/* Need a leaf at lowest level; 512GB pages not supported */
if (level < 0 || level == 3) if (level < 0 || level == 3)
return -EINVAL; return -EINVAL;
/* offset is now log base 2 of the page size */ /* We found a valid leaf PTE */
/* Offset is now log base 2 of the page size */
gpa = pte & 0x01fffffffffff000ul; gpa = pte & 0x01fffffffffff000ul;
if (gpa & ((1ul << offset) - 1)) if (gpa & ((1ul << offset) - 1))
return -EINVAL; return -EINVAL;
gpa += eaddr & ((1ul << offset) - 1); gpa |= eaddr & ((1ul << offset) - 1);
for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps) for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
if (offset == mmu_psize_defs[ps].shift) if (offset == mmu_psize_defs[ps].shift)
break; break;
gpte->page_size = ps; gpte->page_size = ps;
gpte->page_shift = offset;
gpte->eaddr = eaddr; gpte->eaddr = eaddr;
gpte->raddr = gpa; gpte->raddr = gpa;
...@@ -115,6 +105,77 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, ...@@ -115,6 +105,77 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
gpte->may_read = !!(pte & _PAGE_READ); gpte->may_read = !!(pte & _PAGE_READ);
gpte->may_write = !!(pte & _PAGE_WRITE); gpte->may_write = !!(pte & _PAGE_WRITE);
gpte->may_execute = !!(pte & _PAGE_EXEC); gpte->may_execute = !!(pte & _PAGE_EXEC);
gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);
if (pte_ret_p)
*pte_ret_p = pte;
return 0;
}
/*
* Used to walk a partition or process table radix tree in guest memory
* Note: We exploit the fact that a partition table and a process
* table have the same layout, a partition-scoped page table and a
* process-scoped page table have the same layout, and the 2nd
* doubleword of a partition table entry has the same layout as
* the PTCR register.
*/
int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
struct kvmppc_pte *gpte, u64 table,
int table_index, u64 *pte_ret_p)
{
struct kvm *kvm = vcpu->kvm;
int ret;
unsigned long size, ptbl, root;
struct prtb_entry entry;
if ((table & PRTS_MASK) > 24)
return -EINVAL;
size = 1ul << ((table & PRTS_MASK) + 12);
/* Is the table big enough to contain this entry? */
if ((table_index * sizeof(entry)) >= size)
return -EINVAL;
/* Read the table to find the root of the radix tree */
ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
if (ret)
return ret;
/* Root is stored in the first double word */
root = be64_to_cpu(entry.prtb0);
return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
}
int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
struct kvmppc_pte *gpte, bool data, bool iswrite)
{
u32 pid;
u64 pte;
int ret;
/* Work out effective PID */
switch (eaddr >> 62) {
case 0:
pid = vcpu->arch.pid;
break;
case 3:
pid = 0;
break;
default:
return -EINVAL;
}
ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
vcpu->kvm->arch.process_table, pid, &pte);
if (ret)
return ret;
/* Check privilege (applies only to process scoped translations) */
if (kvmppc_get_msr(vcpu) & MSR_PR) { if (kvmppc_get_msr(vcpu) & MSR_PR) {
if (pte & _PAGE_PRIVILEGED) { if (pte & _PAGE_PRIVILEGED) {
gpte->may_read = 0; gpte->may_read = 0;
...@@ -137,20 +198,46 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr, ...@@ -137,20 +198,46 @@ int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
} }
static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr, static void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
unsigned int pshift) unsigned int pshift, unsigned int lpid)
{ {
unsigned long psize = PAGE_SIZE; unsigned long psize = PAGE_SIZE;
int psi;
long rc;
unsigned long rb;
if (pshift) if (pshift)
psize = 1UL << pshift; psize = 1UL << pshift;
else
pshift = PAGE_SHIFT;
addr &= ~(psize - 1); addr &= ~(psize - 1);
radix__flush_tlb_lpid_page(kvm->arch.lpid, addr, psize);
if (!kvmhv_on_pseries()) {
radix__flush_tlb_lpid_page(lpid, addr, psize);
return;
}
psi = shift_to_mmu_psize(pshift);
rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
lpid, rb);
if (rc)
pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
} }
static void kvmppc_radix_flush_pwc(struct kvm *kvm) static void kvmppc_radix_flush_pwc(struct kvm *kvm, unsigned int lpid)
{ {
radix__flush_pwc_lpid(kvm->arch.lpid); long rc;
if (!kvmhv_on_pseries()) {
radix__flush_pwc_lpid(lpid);
return;
}
rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
lpid, TLBIEL_INVAL_SET_LPID);
if (rc)
pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
} }
static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep, static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
...@@ -195,23 +282,38 @@ static void kvmppc_pmd_free(pmd_t *pmdp) ...@@ -195,23 +282,38 @@ static void kvmppc_pmd_free(pmd_t *pmdp)
kmem_cache_free(kvm_pmd_cache, pmdp); kmem_cache_free(kvm_pmd_cache, pmdp);
} }
static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, /* Called with kvm->mmu_lock held */
unsigned long gpa, unsigned int shift) void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
unsigned int shift, struct kvm_memory_slot *memslot,
unsigned int lpid)
{ {
unsigned long page_size = 1ul << shift;
unsigned long old; unsigned long old;
unsigned long gfn = gpa >> PAGE_SHIFT;
unsigned long page_size = PAGE_SIZE;
unsigned long hpa;
old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift); old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
kvmppc_radix_tlbie_page(kvm, gpa, shift); kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
if (old & _PAGE_DIRTY) {
unsigned long gfn = gpa >> PAGE_SHIFT; /* The following only applies to L1 entries */
struct kvm_memory_slot *memslot; if (lpid != kvm->arch.lpid)
return;
if (!memslot) {
memslot = gfn_to_memslot(kvm, gfn); memslot = gfn_to_memslot(kvm, gfn);
if (memslot && memslot->dirty_bitmap) if (!memslot)
kvmppc_update_dirty_map(memslot, gfn, page_size); return;
} }
if (shift)
page_size = 1ul << shift;
gpa &= ~(page_size - 1);
hpa = old & PTE_RPN_MASK;
kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);
if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
kvmppc_update_dirty_map(memslot, gfn, page_size);
} }
/* /*
...@@ -224,7 +326,8 @@ static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, ...@@ -224,7 +326,8 @@ static void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte,
* and emit a warning if encountered, but there may already be data * and emit a warning if encountered, but there may already be data
* corruption due to the unexpected mappings. * corruption due to the unexpected mappings.
*/ */
static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full) static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
unsigned int lpid)
{ {
if (full) { if (full) {
memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE); memset(pte, 0, sizeof(long) << PTE_INDEX_SIZE);
...@@ -238,14 +341,15 @@ static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full) ...@@ -238,14 +341,15 @@ static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full)
WARN_ON_ONCE(1); WARN_ON_ONCE(1);
kvmppc_unmap_pte(kvm, p, kvmppc_unmap_pte(kvm, p,
pte_pfn(*p) << PAGE_SHIFT, pte_pfn(*p) << PAGE_SHIFT,
PAGE_SHIFT); PAGE_SHIFT, NULL, lpid);
} }
} }
kvmppc_pte_free(pte); kvmppc_pte_free(pte);
} }
static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full) static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
unsigned int lpid)
{ {
unsigned long im; unsigned long im;
pmd_t *p = pmd; pmd_t *p = pmd;
...@@ -260,20 +364,21 @@ static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full) ...@@ -260,20 +364,21 @@ static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full)
WARN_ON_ONCE(1); WARN_ON_ONCE(1);
kvmppc_unmap_pte(kvm, (pte_t *)p, kvmppc_unmap_pte(kvm, (pte_t *)p,
pte_pfn(*(pte_t *)p) << PAGE_SHIFT, pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
PMD_SHIFT); PMD_SHIFT, NULL, lpid);
} }
} else { } else {
pte_t *pte; pte_t *pte;
pte = pte_offset_map(p, 0); pte = pte_offset_map(p, 0);
kvmppc_unmap_free_pte(kvm, pte, full); kvmppc_unmap_free_pte(kvm, pte, full, lpid);
pmd_clear(p); pmd_clear(p);
} }
} }
kvmppc_pmd_free(pmd); kvmppc_pmd_free(pmd);
} }
static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud) static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
unsigned int lpid)
{ {
unsigned long iu; unsigned long iu;
pud_t *p = pud; pud_t *p = pud;
...@@ -287,36 +392,40 @@ static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud) ...@@ -287,36 +392,40 @@ static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud)
pmd_t *pmd; pmd_t *pmd;
pmd = pmd_offset(p, 0); pmd = pmd_offset(p, 0);
kvmppc_unmap_free_pmd(kvm, pmd, true); kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
pud_clear(p); pud_clear(p);
} }
} }
pud_free(kvm->mm, pud); pud_free(kvm->mm, pud);
} }
void kvmppc_free_radix(struct kvm *kvm) void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, unsigned int lpid)
{ {
unsigned long ig; unsigned long ig;
pgd_t *pgd;
if (!kvm->arch.pgtable)
return;
pgd = kvm->arch.pgtable;
for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) { for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
pud_t *pud; pud_t *pud;
if (!pgd_present(*pgd)) if (!pgd_present(*pgd))
continue; continue;
pud = pud_offset(pgd, 0); pud = pud_offset(pgd, 0);
kvmppc_unmap_free_pud(kvm, pud); kvmppc_unmap_free_pud(kvm, pud, lpid);
pgd_clear(pgd); pgd_clear(pgd);
} }
}
void kvmppc_free_radix(struct kvm *kvm)
{
if (kvm->arch.pgtable) {
kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
kvm->arch.lpid);
pgd_free(kvm->mm, kvm->arch.pgtable); pgd_free(kvm->mm, kvm->arch.pgtable);
kvm->arch.pgtable = NULL; kvm->arch.pgtable = NULL;
}
} }
static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd, static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
unsigned long gpa) unsigned long gpa, unsigned int lpid)
{ {
pte_t *pte = pte_offset_kernel(pmd, 0); pte_t *pte = pte_offset_kernel(pmd, 0);
...@@ -326,13 +435,13 @@ static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd, ...@@ -326,13 +435,13 @@ static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
* flushing the PWC again. * flushing the PWC again.
*/ */
pmd_clear(pmd); pmd_clear(pmd);
kvmppc_radix_flush_pwc(kvm); kvmppc_radix_flush_pwc(kvm, lpid);
kvmppc_unmap_free_pte(kvm, pte, false); kvmppc_unmap_free_pte(kvm, pte, false, lpid);
} }
static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud, static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
unsigned long gpa) unsigned long gpa, unsigned int lpid)
{ {
pmd_t *pmd = pmd_offset(pud, 0); pmd_t *pmd = pmd_offset(pud, 0);
...@@ -342,9 +451,9 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud, ...@@ -342,9 +451,9 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
* so can be freed without flushing the PWC again. * so can be freed without flushing the PWC again.
*/ */
pud_clear(pud); pud_clear(pud);
kvmppc_radix_flush_pwc(kvm); kvmppc_radix_flush_pwc(kvm, lpid);
kvmppc_unmap_free_pmd(kvm, pmd, false); kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
} }
/* /*
...@@ -356,8 +465,10 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud, ...@@ -356,8 +465,10 @@ static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
*/ */
#define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED)) #define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))
static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
unsigned int level, unsigned long mmu_seq) unsigned long gpa, unsigned int level,
unsigned long mmu_seq, unsigned int lpid,
unsigned long *rmapp, struct rmap_nested **n_rmap)
{ {
pgd_t *pgd; pgd_t *pgd;
pud_t *pud, *new_pud = NULL; pud_t *pud, *new_pud = NULL;
...@@ -366,7 +477,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, ...@@ -366,7 +477,7 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
int ret; int ret;
/* Traverse the guest's 2nd-level tree, allocate new levels needed */ /* Traverse the guest's 2nd-level tree, allocate new levels needed */
pgd = kvm->arch.pgtable + pgd_index(gpa); pgd = pgtable + pgd_index(gpa);
pud = NULL; pud = NULL;
if (pgd_present(*pgd)) if (pgd_present(*pgd))
pud = pud_offset(pgd, gpa); pud = pud_offset(pgd, gpa);
...@@ -423,7 +534,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, ...@@ -423,7 +534,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
goto out_unlock; goto out_unlock;
} }
/* Valid 1GB page here already, remove it */ /* Valid 1GB page here already, remove it */
kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT); kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
lpid);
} }
if (level == 2) { if (level == 2) {
if (!pud_none(*pud)) { if (!pud_none(*pud)) {
...@@ -432,9 +544,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, ...@@ -432,9 +544,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
* install a large page, so remove and free the page * install a large page, so remove and free the page
* table page. * table page.
*/ */
kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa); kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
} }
kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte); kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
if (rmapp && n_rmap)
kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
ret = 0; ret = 0;
goto out_unlock; goto out_unlock;
} }
...@@ -472,7 +586,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, ...@@ -472,7 +586,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
goto out_unlock; goto out_unlock;
} }
/* Valid 2MB page here already, remove it */ /* Valid 2MB page here already, remove it */
kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT); kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
lpid);
} }
if (level == 1) { if (level == 1) {
if (!pmd_none(*pmd)) { if (!pmd_none(*pmd)) {
...@@ -481,9 +596,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, ...@@ -481,9 +596,11 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
* install a large page, so remove and free the page * install a large page, so remove and free the page
* table page. * table page.
*/ */
kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa); kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
} }
kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte); kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
if (rmapp && n_rmap)
kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
ret = 0; ret = 0;
goto out_unlock; goto out_unlock;
} }
...@@ -508,6 +625,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, ...@@ -508,6 +625,8 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
goto out_unlock; goto out_unlock;
} }
kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte); kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
if (rmapp && n_rmap)
kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
ret = 0; ret = 0;
out_unlock: out_unlock:
...@@ -521,69 +640,13 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa, ...@@ -521,69 +640,13 @@ static int kvmppc_create_pte(struct kvm *kvm, pte_t pte, unsigned long gpa,
return ret; return ret;
} }
int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, bool kvmppc_hv_handle_set_rc(struct kvm *kvm, pgd_t *pgtable, bool writing,
unsigned long ea, unsigned long dsisr) unsigned long gpa, unsigned int lpid)
{ {
struct kvm *kvm = vcpu->kvm;
unsigned long mmu_seq;
unsigned long gpa, gfn, hva;
struct kvm_memory_slot *memslot;
struct page *page = NULL;
long ret;
bool writing;
bool upgrade_write = false;
bool *upgrade_p = &upgrade_write;
pte_t pte, *ptep;
unsigned long pgflags; unsigned long pgflags;
unsigned int shift, level; unsigned int shift;
pte_t *ptep;
/* Check for unusual errors */
if (dsisr & DSISR_UNSUPP_MMU) {
pr_err("KVM: Got unsupported MMU fault\n");
return -EFAULT;
}
if (dsisr & DSISR_BADACCESS) {
/* Reflect to the guest as DSI */
pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
return RESUME_GUEST;
}
/* Translate the logical address and get the page */
gpa = vcpu->arch.fault_gpa & ~0xfffUL;
gpa &= ~0xF000000000000000ul;
gfn = gpa >> PAGE_SHIFT;
if (!(dsisr & DSISR_PRTABLE_FAULT))
gpa |= ea & 0xfff;
memslot = gfn_to_memslot(kvm, gfn);
/* No memslot means it's an emulated MMIO region */
if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
DSISR_SET_RC)) {
/*
* Bad address in guest page table tree, or other
* unusual error - reflect it to the guest as DSI.
*/
kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
return RESUME_GUEST;
}
return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea,
dsisr & DSISR_ISSTORE);
}
writing = (dsisr & DSISR_ISSTORE) != 0;
if (memslot->flags & KVM_MEM_READONLY) {
if (writing) {
/* give the guest a DSI */
dsisr = DSISR_ISSTORE | DSISR_PROTFAULT;
kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
return RESUME_GUEST;
}
upgrade_p = NULL;
}
if (dsisr & DSISR_SET_RC) {
/* /*
* Need to set an R or C bit in the 2nd-level tables; * Need to set an R or C bit in the 2nd-level tables;
* since we are just helping out the hardware here, * since we are just helping out the hardware here,
...@@ -593,23 +656,33 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, ...@@ -593,23 +656,33 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
if (writing) if (writing)
pgflags |= _PAGE_DIRTY; pgflags |= _PAGE_DIRTY;
/* /*
* We are walking the secondary page table here. We can do this * We are walking the secondary (partition-scoped) page table here.
* without disabling irq. * We can do this without disabling irq because the Linux MM
* subsystem doesn't do THP splits and collapses on this tree.
*/ */
spin_lock(&kvm->mmu_lock); ptep = __find_linux_pte(pgtable, gpa, NULL, &shift);
ptep = __find_linux_pte(kvm->arch.pgtable, if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
gpa, NULL, &shift); kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
if (ptep && pte_present(*ptep) && return true;
(!writing || pte_write(*ptep))) {
kvmppc_radix_update_pte(kvm, ptep, 0, pgflags,
gpa, shift);
dsisr &= ~DSISR_SET_RC;
}
spin_unlock(&kvm->mmu_lock);
if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
DSISR_PROTFAULT | DSISR_SET_RC)))
return RESUME_GUEST;
} }
return false;
}
int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
unsigned long gpa,
struct kvm_memory_slot *memslot,
bool writing, bool kvm_ro,
pte_t *inserted_pte, unsigned int *levelp)
{
struct kvm *kvm = vcpu->kvm;
struct page *page = NULL;
unsigned long mmu_seq;
unsigned long hva, gfn = gpa >> PAGE_SHIFT;
bool upgrade_write = false;
bool *upgrade_p = &upgrade_write;
pte_t pte, *ptep;
unsigned int shift, level;
int ret;
/* used to check for invalidations in progress */ /* used to check for invalidations in progress */
mmu_seq = kvm->mmu_notifier_seq; mmu_seq = kvm->mmu_notifier_seq;
...@@ -622,7 +695,7 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, ...@@ -622,7 +695,7 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
* is that the page is writable. * is that the page is writable.
*/ */
hva = gfn_to_hva_memslot(memslot, gfn); hva = gfn_to_hva_memslot(memslot, gfn);
if (upgrade_p && __get_user_pages_fast(hva, 1, 1, &page) == 1) { if (!kvm_ro && __get_user_pages_fast(hva, 1, 1, &page) == 1) {
upgrade_write = true; upgrade_write = true;
} else { } else {
unsigned long pfn; unsigned long pfn;
...@@ -680,7 +753,12 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, ...@@ -680,7 +753,12 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
} }
/* Allocate space in the tree and write the PTE */ /* Allocate space in the tree and write the PTE */
ret = kvmppc_create_pte(kvm, pte, gpa, level, mmu_seq); ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
mmu_seq, kvm->arch.lpid, NULL, NULL);
if (inserted_pte)
*inserted_pte = pte;
if (levelp)
*levelp = level;
if (page) { if (page) {
if (!ret && (pte_val(pte) & _PAGE_WRITE)) if (!ret && (pte_val(pte) & _PAGE_WRITE))
...@@ -688,6 +766,82 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, ...@@ -688,6 +766,82 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
put_page(page); put_page(page);
} }
return ret;
}
int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
unsigned long ea, unsigned long dsisr)
{
struct kvm *kvm = vcpu->kvm;
unsigned long gpa, gfn;
struct kvm_memory_slot *memslot;
long ret;
bool writing = !!(dsisr & DSISR_ISSTORE);
bool kvm_ro = false;
/* Check for unusual errors */
if (dsisr & DSISR_UNSUPP_MMU) {
pr_err("KVM: Got unsupported MMU fault\n");
return -EFAULT;
}
if (dsisr & DSISR_BADACCESS) {
/* Reflect to the guest as DSI */
pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
return RESUME_GUEST;
}
/* Translate the logical address */
gpa = vcpu->arch.fault_gpa & ~0xfffUL;
gpa &= ~0xF000000000000000ul;
gfn = gpa >> PAGE_SHIFT;
if (!(dsisr & DSISR_PRTABLE_FAULT))
gpa |= ea & 0xfff;
/* Get the corresponding memslot */
memslot = gfn_to_memslot(kvm, gfn);
/* No memslot means it's an emulated MMIO region */
if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
DSISR_SET_RC)) {
/*
* Bad address in guest page table tree, or other
* unusual error - reflect it to the guest as DSI.
*/
kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
return RESUME_GUEST;
}
return kvmppc_hv_emulate_mmio(run, vcpu, gpa, ea, writing);
}
if (memslot->flags & KVM_MEM_READONLY) {
if (writing) {
/* give the guest a DSI */
kvmppc_core_queue_data_storage(vcpu, ea, DSISR_ISSTORE |
DSISR_PROTFAULT);
return RESUME_GUEST;
}
kvm_ro = true;
}
/* Failed to set the reference/change bits */
if (dsisr & DSISR_SET_RC) {
spin_lock(&kvm->mmu_lock);
if (kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable,
writing, gpa, kvm->arch.lpid))
dsisr &= ~DSISR_SET_RC;
spin_unlock(&kvm->mmu_lock);
if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
DSISR_PROTFAULT | DSISR_SET_RC)))
return RESUME_GUEST;
}
/* Try to insert a pte */
ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
kvm_ro, NULL, NULL);
if (ret == 0 || ret == -EAGAIN) if (ret == 0 || ret == -EAGAIN)
ret = RESUME_GUEST; ret = RESUME_GUEST;
return ret; return ret;
...@@ -700,20 +854,11 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot, ...@@ -700,20 +854,11 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
pte_t *ptep; pte_t *ptep;
unsigned long gpa = gfn << PAGE_SHIFT; unsigned long gpa = gfn << PAGE_SHIFT;
unsigned int shift; unsigned int shift;
unsigned long old;
ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift); ptep = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
if (ptep && pte_present(*ptep)) { if (ptep && pte_present(*ptep))
old = kvmppc_radix_update_pte(kvm, ptep, ~0UL, 0, kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
gpa, shift); kvm->arch.lpid);
kvmppc_radix_tlbie_page(kvm, gpa, shift);
if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) {
unsigned long psize = PAGE_SIZE;
if (shift)
psize = 1ul << shift;
kvmppc_update_dirty_map(memslot, gfn, psize);
}
}
return 0; return 0;
} }
...@@ -768,7 +913,7 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm, ...@@ -768,7 +913,7 @@ static int kvm_radix_test_clear_dirty(struct kvm *kvm,
ret = 1 << (shift - PAGE_SHIFT); ret = 1 << (shift - PAGE_SHIFT);
kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0, kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
gpa, shift); gpa, shift);
kvmppc_radix_tlbie_page(kvm, gpa, shift); kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
} }
return ret; return ret;
} }
...@@ -853,6 +998,215 @@ static void pmd_ctor(void *addr) ...@@ -853,6 +998,215 @@ static void pmd_ctor(void *addr)
memset(addr, 0, RADIX_PMD_TABLE_SIZE); memset(addr, 0, RADIX_PMD_TABLE_SIZE);
} }
struct debugfs_radix_state {
struct kvm *kvm;
struct mutex mutex;
unsigned long gpa;
int lpid;
int chars_left;
int buf_index;
char buf[128];
u8 hdr;
};
static int debugfs_radix_open(struct inode *inode, struct file *file)
{
struct kvm *kvm = inode->i_private;
struct debugfs_radix_state *p;
p = kzalloc(sizeof(*p), GFP_KERNEL);
if (!p)
return -ENOMEM;
kvm_get_kvm(kvm);
p->kvm = kvm;
mutex_init(&p->mutex);
file->private_data = p;
return nonseekable_open(inode, file);
}
static int debugfs_radix_release(struct inode *inode, struct file *file)
{
struct debugfs_radix_state *p = file->private_data;
kvm_put_kvm(p->kvm);
kfree(p);
return 0;
}
static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
size_t len, loff_t *ppos)
{
struct debugfs_radix_state *p = file->private_data;
ssize_t ret, r;
unsigned long n;
struct kvm *kvm;
unsigned long gpa;
pgd_t *pgt;
struct kvm_nested_guest *nested;
pgd_t pgd, *pgdp;
pud_t pud, *pudp;
pmd_t pmd, *pmdp;
pte_t *ptep;
int shift;
unsigned long pte;
kvm = p->kvm;
if (!kvm_is_radix(kvm))
return 0;
ret = mutex_lock_interruptible(&p->mutex);
if (ret)
return ret;
if (p->chars_left) {
n = p->chars_left;
if (n > len)
n = len;
r = copy_to_user(buf, p->buf + p->buf_index, n);
n -= r;
p->chars_left -= n;
p->buf_index += n;
buf += n;
len -= n;
ret = n;
if (r) {
if (!n)
ret = -EFAULT;
goto out;
}
}
gpa = p->gpa;
nested = NULL;
pgt = NULL;
while (len != 0 && p->lpid >= 0) {
if (gpa >= RADIX_PGTABLE_RANGE) {
gpa = 0;
pgt = NULL;
if (nested) {
kvmhv_put_nested(nested);
nested = NULL;
}
p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
p->hdr = 0;
if (p->lpid < 0)
break;
}
if (!pgt) {
if (p->lpid == 0) {
pgt = kvm->arch.pgtable;
} else {
nested = kvmhv_get_nested(kvm, p->lpid, false);
if (!nested) {
gpa = RADIX_PGTABLE_RANGE;
continue;
}
pgt = nested->shadow_pgtable;
}
}
n = 0;
if (!p->hdr) {
if (p->lpid > 0)
n = scnprintf(p->buf, sizeof(p->buf),
"\nNested LPID %d: ", p->lpid);
n += scnprintf(p->buf + n, sizeof(p->buf) - n,
"pgdir: %lx\n", (unsigned long)pgt);
p->hdr = 1;
goto copy;
}
pgdp = pgt + pgd_index(gpa);
pgd = READ_ONCE(*pgdp);
if (!(pgd_val(pgd) & _PAGE_PRESENT)) {
gpa = (gpa & PGDIR_MASK) + PGDIR_SIZE;
continue;
}
pudp = pud_offset(&pgd, gpa);
pud = READ_ONCE(*pudp);
if (!(pud_val(pud) & _PAGE_PRESENT)) {
gpa = (gpa & PUD_MASK) + PUD_SIZE;
continue;
}
if (pud_val(pud) & _PAGE_PTE) {
pte = pud_val(pud);
shift = PUD_SHIFT;
goto leaf;
}
pmdp = pmd_offset(&pud, gpa);
pmd = READ_ONCE(*pmdp);
if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
gpa = (gpa & PMD_MASK) + PMD_SIZE;
continue;
}
if (pmd_val(pmd) & _PAGE_PTE) {
pte = pmd_val(pmd);
shift = PMD_SHIFT;
goto leaf;
}
ptep = pte_offset_kernel(&pmd, gpa);
pte = pte_val(READ_ONCE(*ptep));
if (!(pte & _PAGE_PRESENT)) {
gpa += PAGE_SIZE;
continue;
}
shift = PAGE_SHIFT;
leaf:
n = scnprintf(p->buf, sizeof(p->buf),
" %lx: %lx %d\n", gpa, pte, shift);
gpa += 1ul << shift;
copy:
p->chars_left = n;
if (n > len)
n = len;
r = copy_to_user(buf, p->buf, n);
n -= r;
p->chars_left -= n;
p->buf_index = n;
buf += n;
len -= n;
ret += n;
if (r) {
if (!ret)
ret = -EFAULT;
break;
}
}
p->gpa = gpa;
if (nested)
kvmhv_put_nested(nested);
out:
mutex_unlock(&p->mutex);
return ret;
}
static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
size_t len, loff_t *ppos)
{
return -EACCES;
}
static const struct file_operations debugfs_radix_fops = {
.owner = THIS_MODULE,
.open = debugfs_radix_open,
.release = debugfs_radix_release,
.read = debugfs_radix_read,
.write = debugfs_radix_write,
.llseek = generic_file_llseek,
};
void kvmhv_radix_debugfs_init(struct kvm *kvm)
{
kvm->arch.radix_dentry = debugfs_create_file("radix", 0400,
kvm->arch.debugfs_dir, kvm,
&debugfs_radix_fops);
}
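
For reference, once a radix guest is running, the file created above can be dumped from the host. A minimal sketch of a reader, assuming debugfs is mounted at /sys/kernel/debug and using KVM's usual per-VM directory naming (both assumptions, not part of this patch):

/* Hypothetical host-side reader for the "radix" debugfs file created above. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	/* e.g. /sys/kernel/debug/kvm/<pid>-<fd>/radix (path layout is an assumption) */
	const char *path = argc > 1 ? argv[1] : "/sys/kernel/debug/kvm/1234-11/radix";
	char buf[4096];
	ssize_t n;
	int fd = open(path, O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* Each read returns lines of the form " <gpa>: <pte> <shift>" per the code above */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);
	close(fd);
	return 0;
}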
int kvmppc_radix_init(void) int kvmppc_radix_init(void)
{ {
unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE; unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;
......
...@@ -363,6 +363,40 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, ...@@ -363,6 +363,40 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
return ret; return ret;
} }
static long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt,
unsigned long tce)
{
unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
enum dma_data_direction dir = iommu_tce_direction(tce);
struct kvmppc_spapr_tce_iommu_table *stit;
unsigned long ua = 0;
/* Allow userspace to poison TCE table */
if (dir == DMA_NONE)
return H_SUCCESS;
if (iommu_tce_check_gpa(stt->page_shift, gpa))
return H_TOO_HARD;
if (kvmppc_tce_to_ua(stt->kvm, tce, &ua, NULL))
return H_TOO_HARD;
list_for_each_entry_rcu(stit, &stt->iommu_tables, next) {
unsigned long hpa = 0;
struct mm_iommu_table_group_mem_t *mem;
long shift = stit->tbl->it_page_shift;
mem = mm_iommu_lookup(stt->kvm->mm, ua, 1ULL << shift);
if (!mem)
return H_TOO_HARD;
if (mm_iommu_ua_to_hpa(mem, ua, shift, &hpa))
return H_TOO_HARD;
}
return H_SUCCESS;
}
static void kvmppc_clear_tce(struct iommu_table *tbl, unsigned long entry) static void kvmppc_clear_tce(struct iommu_table *tbl, unsigned long entry)
{ {
unsigned long hpa = 0; unsigned long hpa = 0;
...@@ -401,7 +435,7 @@ static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm, ...@@ -401,7 +435,7 @@ static long kvmppc_tce_iommu_do_unmap(struct kvm *kvm,
long ret; long ret;
if (WARN_ON_ONCE(iommu_tce_xchg(tbl, entry, &hpa, &dir))) if (WARN_ON_ONCE(iommu_tce_xchg(tbl, entry, &hpa, &dir)))
return H_HARDWARE; return H_TOO_HARD;
if (dir == DMA_NONE) if (dir == DMA_NONE)
return H_SUCCESS; return H_SUCCESS;
...@@ -449,15 +483,15 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, ...@@ -449,15 +483,15 @@ long kvmppc_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
return H_TOO_HARD; return H_TOO_HARD;
if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, tbl->it_page_shift, &hpa))) if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, tbl->it_page_shift, &hpa)))
return H_HARDWARE; return H_TOO_HARD;
if (mm_iommu_mapped_inc(mem)) if (mm_iommu_mapped_inc(mem))
return H_CLOSED; return H_TOO_HARD;
ret = iommu_tce_xchg(tbl, entry, &hpa, &dir); ret = iommu_tce_xchg(tbl, entry, &hpa, &dir);
if (WARN_ON_ONCE(ret)) { if (WARN_ON_ONCE(ret)) {
mm_iommu_mapped_dec(mem); mm_iommu_mapped_dec(mem);
return H_HARDWARE; return H_TOO_HARD;
} }
if (dir != DMA_NONE) if (dir != DMA_NONE)
...@@ -517,8 +551,7 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, ...@@ -517,8 +551,7 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
idx = srcu_read_lock(&vcpu->kvm->srcu); idx = srcu_read_lock(&vcpu->kvm->srcu);
if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm, if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL)) {
tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) {
ret = H_PARAMETER; ret = H_PARAMETER;
goto unlock_exit; goto unlock_exit;
} }
...@@ -533,14 +566,10 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, ...@@ -533,14 +566,10 @@ long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl, ret = kvmppc_tce_iommu_map(vcpu->kvm, stt, stit->tbl,
entry, ua, dir); entry, ua, dir);
if (ret == H_SUCCESS) if (ret != H_SUCCESS) {
continue;
if (ret == H_TOO_HARD)
goto unlock_exit;
WARN_ON_ONCE(1);
kvmppc_clear_tce(stit->tbl, entry); kvmppc_clear_tce(stit->tbl, entry);
goto unlock_exit;
}
} }
kvmppc_tce_put(stt, entry, tce); kvmppc_tce_put(stt, entry, tce);
...@@ -583,7 +612,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, ...@@ -583,7 +612,7 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
return ret; return ret;
idx = srcu_read_lock(&vcpu->kvm->srcu); idx = srcu_read_lock(&vcpu->kvm->srcu);
if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) { if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL)) {
ret = H_TOO_HARD; ret = H_TOO_HARD;
goto unlock_exit; goto unlock_exit;
} }
...@@ -599,10 +628,26 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, ...@@ -599,10 +628,26 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
ret = kvmppc_tce_validate(stt, tce); ret = kvmppc_tce_validate(stt, tce);
if (ret != H_SUCCESS) if (ret != H_SUCCESS)
goto unlock_exit; goto unlock_exit;
}
for (i = 0; i < npages; ++i) {
/*
* This looks unsafe, because we validate, then regrab
* the TCE from userspace which could have been changed by
* another thread.
*
* But it actually is safe, because the relevant checks will be
* re-executed in the following code. If userspace tries to
* change this dodgily it will result in a messier failure mode
* but won't threaten the host.
*/
if (get_user(tce, tces + i)) {
ret = H_TOO_HARD;
goto unlock_exit;
}
tce = be64_to_cpu(tce);
if (kvmppc_gpa_to_ua(vcpu->kvm, if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
tce & ~(TCE_PCI_READ | TCE_PCI_WRITE),
&ua, NULL))
return H_PARAMETER; return H_PARAMETER;
list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
...@@ -610,14 +655,10 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu, ...@@ -610,14 +655,10 @@ long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
stit->tbl, entry + i, ua, stit->tbl, entry + i, ua,
iommu_tce_direction(tce)); iommu_tce_direction(tce));
if (ret == H_SUCCESS) if (ret != H_SUCCESS) {
continue;
if (ret == H_TOO_HARD)
goto unlock_exit;
WARN_ON_ONCE(1);
kvmppc_clear_tce(stit->tbl, entry); kvmppc_clear_tce(stit->tbl, entry);
goto unlock_exit;
}
} }
kvmppc_tce_put(stt, entry + i, tce); kvmppc_tce_put(stt, entry + i, tce);
......
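
The comment in the loop above about validating and then re-reading the TCE from userspace describes a general pattern: a first pass validates a snapshot of every entry, and the second pass must repeat the per-entry checks on whatever it re-reads before applying it. A minimal standalone sketch of that pattern, with a hypothetical entry_valid() standing in for the real TCE checks:

/*
 * Sketch of the validate-then-re-read pattern used above. The helpers and
 * the 48-bit "address" check are illustrative assumptions, not the real
 * TCE rules.
 */
#include <stdbool.h>
#include <stdint.h>

static bool entry_valid(uint64_t e)
{
	return (e >> 48) == 0;		/* stand-in for the real checks */
}

/* Returns 0 on success, -1 if any entry fails either pass. */
static int put_entries(volatile uint64_t *user, uint64_t *table, int n)
{
	uint64_t e;
	int i;

	for (i = 0; i < n; i++)		/* pass 1: validate a snapshot */
		if (!entry_valid(user[i]))
			return -1;

	for (i = 0; i < n; i++) {	/* pass 2: re-read and re-check */
		e = user[i];		/* may have changed concurrently */
		if (!entry_valid(e))	/* so the check must be repeated */
			return -1;
		table[i] = e;		/* only validated values are applied */
	}
	return 0;
}

int main(void)
{
	uint64_t user[2] = { 0x1000, 0x2000 }, table[2] = { 0 };

	return put_entries(user, table, 2) ? 1 : 0;
}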
...@@ -87,6 +87,7 @@ struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm, ...@@ -87,6 +87,7 @@ struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm,
} }
EXPORT_SYMBOL_GPL(kvmppc_find_table); EXPORT_SYMBOL_GPL(kvmppc_find_table);
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
/* /*
* Validates TCE address. * Validates TCE address.
* At the moment flags and page mask are validated. * At the moment flags and page mask are validated.
...@@ -94,14 +95,14 @@ EXPORT_SYMBOL_GPL(kvmppc_find_table); ...@@ -94,14 +95,14 @@ EXPORT_SYMBOL_GPL(kvmppc_find_table);
* to the table and user space is supposed to process them), we can skip * to the table and user space is supposed to process them), we can skip
* checking other things (such as TCE is a guest RAM address or the page * checking other things (such as TCE is a guest RAM address or the page
* was actually allocated). * was actually allocated).
*
* WARNING: This will be called in real-mode on HV KVM and virtual
* mode on PR KVM
*/ */
long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce) static long kvmppc_rm_tce_validate(struct kvmppc_spapr_tce_table *stt,
unsigned long tce)
{ {
unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE); unsigned long gpa = tce & ~(TCE_PCI_READ | TCE_PCI_WRITE);
enum dma_data_direction dir = iommu_tce_direction(tce); enum dma_data_direction dir = iommu_tce_direction(tce);
struct kvmppc_spapr_tce_iommu_table *stit;
unsigned long ua = 0;
/* Allow userspace to poison TCE table */ /* Allow userspace to poison TCE table */
if (dir == DMA_NONE) if (dir == DMA_NONE)
...@@ -110,9 +111,25 @@ long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce) ...@@ -110,9 +111,25 @@ long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *stt, unsigned long tce)
if (iommu_tce_check_gpa(stt->page_shift, gpa)) if (iommu_tce_check_gpa(stt->page_shift, gpa))
return H_PARAMETER; return H_PARAMETER;
if (kvmppc_tce_to_ua(stt->kvm, tce, &ua, NULL))
return H_TOO_HARD;
list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
unsigned long hpa = 0;
struct mm_iommu_table_group_mem_t *mem;
long shift = stit->tbl->it_page_shift;
mem = mm_iommu_lookup_rm(stt->kvm->mm, ua, 1ULL << shift);
if (!mem)
return H_TOO_HARD;
if (mm_iommu_ua_to_hpa_rm(mem, ua, shift, &hpa))
return H_TOO_HARD;
}
return H_SUCCESS; return H_SUCCESS;
} }
EXPORT_SYMBOL_GPL(kvmppc_tce_validate); #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
/* Note on the use of page_address() in real mode, /* Note on the use of page_address() in real mode,
* *
...@@ -164,10 +181,10 @@ void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt, ...@@ -164,10 +181,10 @@ void kvmppc_tce_put(struct kvmppc_spapr_tce_table *stt,
} }
EXPORT_SYMBOL_GPL(kvmppc_tce_put); EXPORT_SYMBOL_GPL(kvmppc_tce_put);
long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce,
unsigned long *ua, unsigned long **prmap) unsigned long *ua, unsigned long **prmap)
{ {
unsigned long gfn = gpa >> PAGE_SHIFT; unsigned long gfn = tce >> PAGE_SHIFT;
struct kvm_memory_slot *memslot; struct kvm_memory_slot *memslot;
memslot = search_memslots(kvm_memslots(kvm), gfn); memslot = search_memslots(kvm_memslots(kvm), gfn);
...@@ -175,7 +192,7 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, ...@@ -175,7 +192,7 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
return -EINVAL; return -EINVAL;
*ua = __gfn_to_hva_memslot(memslot, gfn) | *ua = __gfn_to_hva_memslot(memslot, gfn) |
(gpa & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE)); (tce & ~(PAGE_MASK | TCE_PCI_READ | TCE_PCI_WRITE));
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
if (prmap) if (prmap)
...@@ -184,7 +201,7 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa, ...@@ -184,7 +201,7 @@ long kvmppc_gpa_to_ua(struct kvm *kvm, unsigned long gpa,
return 0; return 0;
} }
EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua); EXPORT_SYMBOL_GPL(kvmppc_tce_to_ua);
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl, static long iommu_tce_xchg_rm(struct mm_struct *mm, struct iommu_table *tbl,
...@@ -300,10 +317,10 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl, ...@@ -300,10 +317,10 @@ static long kvmppc_rm_tce_iommu_do_map(struct kvm *kvm, struct iommu_table *tbl,
if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, tbl->it_page_shift, if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, tbl->it_page_shift,
&hpa))) &hpa)))
return H_HARDWARE; return H_TOO_HARD;
if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem))) if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem)))
return H_CLOSED; return H_TOO_HARD;
ret = iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir); ret = iommu_tce_xchg_rm(kvm->mm, tbl, entry, &hpa, &dir);
if (ret) { if (ret) {
...@@ -368,13 +385,12 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, ...@@ -368,13 +385,12 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
if (ret != H_SUCCESS) if (ret != H_SUCCESS)
return ret; return ret;
ret = kvmppc_tce_validate(stt, tce); ret = kvmppc_rm_tce_validate(stt, tce);
if (ret != H_SUCCESS) if (ret != H_SUCCESS)
return ret; return ret;
dir = iommu_tce_direction(tce); dir = iommu_tce_direction(tce);
if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm, if ((dir != DMA_NONE) && kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL))
return H_PARAMETER; return H_PARAMETER;
entry = ioba >> stt->page_shift; entry = ioba >> stt->page_shift;
...@@ -387,14 +403,10 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, ...@@ -387,14 +403,10 @@ long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt, ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, stt,
stit->tbl, entry, ua, dir); stit->tbl, entry, ua, dir);
if (ret == H_SUCCESS) if (ret != H_SUCCESS) {
continue;
if (ret == H_TOO_HARD)
return ret;
WARN_ON_ONCE_RM(1);
kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry); kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
return ret;
}
} }
kvmppc_tce_put(stt, entry, tce); kvmppc_tce_put(stt, entry, tce);
...@@ -480,7 +492,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, ...@@ -480,7 +492,7 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
*/ */
struct mm_iommu_table_group_mem_t *mem; struct mm_iommu_table_group_mem_t *mem;
if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, NULL))
return H_TOO_HARD; return H_TOO_HARD;
mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K); mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K);
...@@ -496,12 +508,12 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, ...@@ -496,12 +508,12 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
* We do not require memory to be preregistered in this case * We do not require memory to be preregistered in this case
* so lock rmap and do __find_linux_pte_or_hugepte(). * so lock rmap and do __find_linux_pte_or_hugepte().
*/ */
if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) if (kvmppc_tce_to_ua(vcpu->kvm, tce_list, &ua, &rmap))
return H_TOO_HARD; return H_TOO_HARD;
rmap = (void *) vmalloc_to_phys(rmap); rmap = (void *) vmalloc_to_phys(rmap);
if (WARN_ON_ONCE_RM(!rmap)) if (WARN_ON_ONCE_RM(!rmap))
return H_HARDWARE; return H_TOO_HARD;
/* /*
* Synchronize with the MMU notifier callbacks in * Synchronize with the MMU notifier callbacks in
...@@ -521,14 +533,16 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, ...@@ -521,14 +533,16 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
for (i = 0; i < npages; ++i) { for (i = 0; i < npages; ++i) {
unsigned long tce = be64_to_cpu(((u64 *)tces)[i]); unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);
ret = kvmppc_tce_validate(stt, tce); ret = kvmppc_rm_tce_validate(stt, tce);
if (ret != H_SUCCESS) if (ret != H_SUCCESS)
goto unlock_exit; goto unlock_exit;
}
for (i = 0; i < npages; ++i) {
unsigned long tce = be64_to_cpu(((u64 *)tces)[i]);
ua = 0; ua = 0;
if (kvmppc_gpa_to_ua(vcpu->kvm, if (kvmppc_tce_to_ua(vcpu->kvm, tce, &ua, NULL))
tce & ~(TCE_PCI_READ | TCE_PCI_WRITE),
&ua, NULL))
return H_PARAMETER; return H_PARAMETER;
list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { list_for_each_entry_lockless(stit, &stt->iommu_tables, next) {
...@@ -536,14 +550,11 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu, ...@@ -536,14 +550,11 @@ long kvmppc_rm_h_put_tce_indirect(struct kvm_vcpu *vcpu,
stit->tbl, entry + i, ua, stit->tbl, entry + i, ua,
iommu_tce_direction(tce)); iommu_tce_direction(tce));
if (ret == H_SUCCESS) if (ret != H_SUCCESS) {
continue; kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl,
entry);
if (ret == H_TOO_HARD)
goto unlock_exit; goto unlock_exit;
}
WARN_ON_ONCE_RM(1);
kvmppc_rm_clear_tce(vcpu->kvm, stit->tbl, entry);
} }
kvmppc_tce_put(stt, entry + i, tce); kvmppc_tce_put(stt, entry + i, tce);
......
...@@ -36,7 +36,6 @@ ...@@ -36,7 +36,6 @@
#define OP_31_XOP_MTSR 210 #define OP_31_XOP_MTSR 210
#define OP_31_XOP_MTSRIN 242 #define OP_31_XOP_MTSRIN 242
#define OP_31_XOP_TLBIEL 274 #define OP_31_XOP_TLBIEL 274
#define OP_31_XOP_TLBIE 306
/* Opcode is officially reserved, reuse it as sc 1 when sc 1 doesn't trap */ /* Opcode is officially reserved, reuse it as sc 1 when sc 1 doesn't trap */
#define OP_31_XOP_FAKE_SC1 308 #define OP_31_XOP_FAKE_SC1 308
#define OP_31_XOP_SLBMTE 402 #define OP_31_XOP_SLBMTE 402
...@@ -110,7 +109,7 @@ static inline void kvmppc_copyto_vcpu_tm(struct kvm_vcpu *vcpu) ...@@ -110,7 +109,7 @@ static inline void kvmppc_copyto_vcpu_tm(struct kvm_vcpu *vcpu)
vcpu->arch.ctr_tm = vcpu->arch.regs.ctr; vcpu->arch.ctr_tm = vcpu->arch.regs.ctr;
vcpu->arch.tar_tm = vcpu->arch.tar; vcpu->arch.tar_tm = vcpu->arch.tar;
vcpu->arch.lr_tm = vcpu->arch.regs.link; vcpu->arch.lr_tm = vcpu->arch.regs.link;
vcpu->arch.cr_tm = vcpu->arch.cr; vcpu->arch.cr_tm = vcpu->arch.regs.ccr;
vcpu->arch.xer_tm = vcpu->arch.regs.xer; vcpu->arch.xer_tm = vcpu->arch.regs.xer;
vcpu->arch.vrsave_tm = vcpu->arch.vrsave; vcpu->arch.vrsave_tm = vcpu->arch.vrsave;
} }
...@@ -129,7 +128,7 @@ static inline void kvmppc_copyfrom_vcpu_tm(struct kvm_vcpu *vcpu) ...@@ -129,7 +128,7 @@ static inline void kvmppc_copyfrom_vcpu_tm(struct kvm_vcpu *vcpu)
vcpu->arch.regs.ctr = vcpu->arch.ctr_tm; vcpu->arch.regs.ctr = vcpu->arch.ctr_tm;
vcpu->arch.tar = vcpu->arch.tar_tm; vcpu->arch.tar = vcpu->arch.tar_tm;
vcpu->arch.regs.link = vcpu->arch.lr_tm; vcpu->arch.regs.link = vcpu->arch.lr_tm;
vcpu->arch.cr = vcpu->arch.cr_tm; vcpu->arch.regs.ccr = vcpu->arch.cr_tm;
vcpu->arch.regs.xer = vcpu->arch.xer_tm; vcpu->arch.regs.xer = vcpu->arch.xer_tm;
vcpu->arch.vrsave = vcpu->arch.vrsave_tm; vcpu->arch.vrsave = vcpu->arch.vrsave_tm;
} }
...@@ -141,7 +140,7 @@ static void kvmppc_emulate_treclaim(struct kvm_vcpu *vcpu, int ra_val) ...@@ -141,7 +140,7 @@ static void kvmppc_emulate_treclaim(struct kvm_vcpu *vcpu, int ra_val)
uint64_t texasr; uint64_t texasr;
/* CR0 = 0 | MSR[TS] | 0 */ /* CR0 = 0 | MSR[TS] | 0 */
vcpu->arch.cr = (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)) | vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)) |
(((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1)) (((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1))
<< CR0_SHIFT); << CR0_SHIFT);
...@@ -220,7 +219,7 @@ void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val) ...@@ -220,7 +219,7 @@ void kvmppc_emulate_tabort(struct kvm_vcpu *vcpu, int ra_val)
tm_abort(ra_val); tm_abort(ra_val);
/* CR0 = 0 | MSR[TS] | 0 */ /* CR0 = 0 | MSR[TS] | 0 */
vcpu->arch.cr = (vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT)) | vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)) |
(((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1)) (((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1))
<< CR0_SHIFT); << CR0_SHIFT);
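
The "CR0 = 0 | MSR[TS] | 0" computation above appears in both the treclaim and tabort paths; a standalone worked example may make the bit arithmetic easier to follow. The constant values used below (CR0 as the top nibble of the CR image, the TS field at MSR bits 33-34) are assumptions based on the usual Book3S definitions, not taken from this patch:

/* Worked example of the CR0 = 0 | MSR[TS] | 0 computation (assumed constants). */
#include <stdint.h>
#include <stdio.h>

#define CR0_SHIFT	28			/* CR0 is the top nibble of the CR image */
#define CR0_MASK	0xFULL
#define MSR_TS_S_LG	33			/* TS field: bits 33 (S) and 34 (T) */
#define MSR_TS_MASK	(3ULL << MSR_TS_S_LG)

static uint64_t cr0_from_ts(uint64_t ccr, uint64_t guest_msr)
{
	/* Clear the old CR0, then drop the 2-bit TS field into CR0 bits 1-2 */
	return (ccr & ~(CR0_MASK << CR0_SHIFT)) |
	       (((guest_msr & MSR_TS_MASK) >> (MSR_TS_S_LG - 1)) << CR0_SHIFT);
}

int main(void)
{
	/* Suspended state (TS = 0b01) gives CR0 = 0b0010 */
	uint64_t ccr = cr0_from_ts(0, 1ULL << MSR_TS_S_LG);

	printf("CR0 = 0x%llx\n", (unsigned long long)((ccr >> CR0_SHIFT) & CR0_MASK));
	return 0;
}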
...@@ -494,8 +493,8 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, ...@@ -494,8 +493,8 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
if (!(kvmppc_get_msr(vcpu) & MSR_PR)) { if (!(kvmppc_get_msr(vcpu) & MSR_PR)) {
preempt_disable(); preempt_disable();
vcpu->arch.cr = (CR0_TBEGIN_FAILURE | vcpu->arch.regs.ccr = (CR0_TBEGIN_FAILURE |
(vcpu->arch.cr & ~(CR0_MASK << CR0_SHIFT))); (vcpu->arch.regs.ccr & ~(CR0_MASK << CR0_SHIFT)));
vcpu->arch.texasr = (TEXASR_FS | TEXASR_EXACT | vcpu->arch.texasr = (TEXASR_FS | TEXASR_EXACT |
(((u64)(TM_CAUSE_EMULATE | TM_CAUSE_PERSISTENT)) (((u64)(TM_CAUSE_EMULATE | TM_CAUSE_PERSISTENT))
......
...@@ -50,6 +50,7 @@ ...@@ -50,6 +50,7 @@
#include <asm/reg.h> #include <asm/reg.h>
#include <asm/ppc-opcode.h> #include <asm/ppc-opcode.h>
#include <asm/asm-prototypes.h> #include <asm/asm-prototypes.h>
#include <asm/archrandom.h>
#include <asm/debug.h> #include <asm/debug.h>
#include <asm/disassemble.h> #include <asm/disassemble.h>
#include <asm/cputable.h> #include <asm/cputable.h>
...@@ -104,6 +105,10 @@ static bool indep_threads_mode = true; ...@@ -104,6 +105,10 @@ static bool indep_threads_mode = true;
module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR); module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)"); MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)");
static bool one_vm_per_core;
module_param(one_vm_per_core, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires indep_threads_mode=N)");
#ifdef CONFIG_KVM_XICS #ifdef CONFIG_KVM_XICS
static struct kernel_param_ops module_param_ops = { static struct kernel_param_ops module_param_ops = {
.set = param_set_int, .set = param_set_int,
...@@ -117,6 +122,16 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, 0644); ...@@ -117,6 +122,16 @@ module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, 0644);
MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
#endif #endif
/* If set, guests are allowed to create and control nested guests */
static bool nested = true;
module_param(nested, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(nested, "Enable nested virtualization (only on POWER9)");
static inline bool nesting_enabled(struct kvm *kvm)
{
return kvm->arch.nested_enable && kvm_is_radix(kvm);
}
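
Besides the module parameter, nesting has to be switched on per VM by userspace, which is what kvm->arch.nested_enable reflects. A minimal sketch of how a VMM might do that through the generic KVM_ENABLE_CAP ioctl on the VM file descriptor, assuming the installed <linux/kvm.h> is new enough to define KVM_CAP_PPC_NESTED_HV:

/* Hypothetical userspace snippet: ask KVM to allow nested-HV for this VM. */
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <stdio.h>
#include <string.h>

static int enable_nested_hv(int vm_fd)
{
	struct kvm_enable_cap cap;

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_PPC_NESTED_HV;

	/* Fails unless the host supports it and the kvm-hv facility is enabled */
	if (ioctl(vm_fd, KVM_ENABLE_CAP, &cap) < 0) {
		perror("KVM_ENABLE_CAP(PPC_NESTED_HV)");
		return -1;
	}
	return 0;
}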
/* If set, the threads on each CPU core have to be in the same MMU mode */ /* If set, the threads on each CPU core have to be in the same MMU mode */
static bool no_mixing_hpt_and_radix; static bool no_mixing_hpt_and_radix;
...@@ -173,6 +188,10 @@ static bool kvmppc_ipi_thread(int cpu) ...@@ -173,6 +188,10 @@ static bool kvmppc_ipi_thread(int cpu)
{ {
unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
/* If we're a nested hypervisor, fall back to ordinary IPIs for now */
if (kvmhv_on_pseries())
return false;
/* On POWER9 we can use msgsnd to IPI any cpu */ /* On POWER9 we can use msgsnd to IPI any cpu */
if (cpu_has_feature(CPU_FTR_ARCH_300)) { if (cpu_has_feature(CPU_FTR_ARCH_300)) {
msg |= get_hard_smp_processor_id(cpu); msg |= get_hard_smp_processor_id(cpu);
...@@ -410,8 +429,8 @@ static void kvmppc_dump_regs(struct kvm_vcpu *vcpu) ...@@ -410,8 +429,8 @@ static void kvmppc_dump_regs(struct kvm_vcpu *vcpu)
vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1); vcpu->arch.shregs.sprg0, vcpu->arch.shregs.sprg1);
pr_err("sprg2 = %.16llx sprg3 = %.16llx\n", pr_err("sprg2 = %.16llx sprg3 = %.16llx\n",
vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3); vcpu->arch.shregs.sprg2, vcpu->arch.shregs.sprg3);
pr_err("cr = %.8x xer = %.16lx dsisr = %.8x\n", pr_err("cr = %.8lx xer = %.16lx dsisr = %.8x\n",
vcpu->arch.cr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr); vcpu->arch.regs.ccr, vcpu->arch.regs.xer, vcpu->arch.shregs.dsisr);
pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar); pr_err("dar = %.16llx\n", vcpu->arch.shregs.dar);
pr_err("fault dar = %.16lx dsisr = %.8x\n", pr_err("fault dar = %.16lx dsisr = %.8x\n",
vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); vcpu->arch.fault_dar, vcpu->arch.fault_dsisr);
...@@ -730,8 +749,7 @@ static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu) ...@@ -730,8 +749,7 @@ static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu)
/* /*
* Ensure that the read of vcore->dpdes comes after the read * Ensure that the read of vcore->dpdes comes after the read
* of vcpu->doorbell_request. This barrier matches the * of vcpu->doorbell_request. This barrier matches the
	 * lwsync in book3s_hv_rmhandlers.S just before the	 * smp_wmb() in kvmppc_guest_entry_inject().
* fast_guest_return label.
*/ */
smp_rmb(); smp_rmb();
vc = vcpu->arch.vcore; vc = vcpu->arch.vcore;
...@@ -912,6 +930,19 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) ...@@ -912,6 +930,19 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
break; break;
} }
return RESUME_HOST; return RESUME_HOST;
case H_SET_DABR:
ret = kvmppc_h_set_dabr(vcpu, kvmppc_get_gpr(vcpu, 4));
break;
case H_SET_XDABR:
ret = kvmppc_h_set_xdabr(vcpu, kvmppc_get_gpr(vcpu, 4),
kvmppc_get_gpr(vcpu, 5));
break;
case H_GET_TCE:
ret = kvmppc_h_get_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
kvmppc_get_gpr(vcpu, 5));
if (ret == H_TOO_HARD)
return RESUME_HOST;
break;
case H_PUT_TCE: case H_PUT_TCE:
ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4), ret = kvmppc_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
kvmppc_get_gpr(vcpu, 5), kvmppc_get_gpr(vcpu, 5),
...@@ -935,6 +966,32 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) ...@@ -935,6 +966,32 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
if (ret == H_TOO_HARD) if (ret == H_TOO_HARD)
return RESUME_HOST; return RESUME_HOST;
break; break;
case H_RANDOM:
if (!powernv_get_random_long(&vcpu->arch.regs.gpr[4]))
ret = H_HARDWARE;
break;
case H_SET_PARTITION_TABLE:
ret = H_FUNCTION;
if (nesting_enabled(vcpu->kvm))
ret = kvmhv_set_partition_table(vcpu);
break;
case H_ENTER_NESTED:
ret = H_FUNCTION;
if (!nesting_enabled(vcpu->kvm))
break;
ret = kvmhv_enter_nested_guest(vcpu);
if (ret == H_INTERRUPT) {
kvmppc_set_gpr(vcpu, 3, 0);
return -EINTR;
}
break;
case H_TLB_INVALIDATE:
ret = H_FUNCTION;
if (nesting_enabled(vcpu->kvm))
ret = kvmhv_do_nested_tlbie(vcpu);
break;
default: default:
return RESUME_HOST; return RESUME_HOST;
} }
...@@ -943,6 +1000,24 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) ...@@ -943,6 +1000,24 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
return RESUME_GUEST; return RESUME_GUEST;
} }
/*
* Handle H_CEDE in the nested virtualization case where we haven't
* called the real-mode hcall handlers in book3s_hv_rmhandlers.S.
* This has to be done early, not in kvmppc_pseries_do_hcall(), so
* that the cede logic in kvmppc_run_single_vcpu() works properly.
*/
static void kvmppc_nested_cede(struct kvm_vcpu *vcpu)
{
vcpu->arch.shregs.msr |= MSR_EE;
vcpu->arch.ceded = 1;
smp_mb();
if (vcpu->arch.prodded) {
vcpu->arch.prodded = 0;
smp_mb();
vcpu->arch.ceded = 0;
}
}
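
The barrier pairing in kvmppc_nested_cede() is a small publish-then-check handshake: the ceding vcpu publishes ceded before looking at prodded, and the prodding side sets prodded before checking ceded, so a prod can never be lost. A rough userspace analogue using C11 atomics, with hypothetical names and no claim to match the kernel code exactly:

/*
 * Illustrative C11-atomics analogue of the ceded/prodded handshake above
 * (names and structure are hypothetical; this is not the kernel code).
 */
#include <stdatomic.h>
#include <stdbool.h>

static atomic_int ceded, prodded;

/* vcpu side: announce the cede, then look for a pending prod */
static bool cede(void)
{
	atomic_store(&ceded, 1);
	atomic_thread_fence(memory_order_seq_cst);	/* like smp_mb() */
	if (atomic_load(&prodded)) {
		atomic_store(&prodded, 0);
		atomic_thread_fence(memory_order_seq_cst);
		atomic_store(&ceded, 0);		/* cede is cancelled */
		return false;
	}
	return true;					/* really going idle */
}

/* prodding side: record the prod, then check whether the vcpu had ceded */
static bool prod(void)
{
	atomic_store(&prodded, 1);
	atomic_thread_fence(memory_order_seq_cst);
	return atomic_load(&ceded) != 0;		/* true: needs a wakeup */
}

int main(void)
{
	prod();
	return cede() ? 1 : 0;	/* the prod is seen, so the cede is cancelled */
}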
static int kvmppc_hcall_impl_hv(unsigned long cmd) static int kvmppc_hcall_impl_hv(unsigned long cmd)
{ {
switch (cmd) { switch (cmd) {
...@@ -1085,7 +1160,6 @@ static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu) ...@@ -1085,7 +1160,6 @@ static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu *vcpu)
return RESUME_GUEST; return RESUME_GUEST;
} }
/* Called with vcpu->arch.vcore->lock held */
static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
struct task_struct *tsk) struct task_struct *tsk)
{ {
...@@ -1190,7 +1264,10 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, ...@@ -1190,7 +1264,10 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
break; break;
case BOOK3S_INTERRUPT_H_INST_STORAGE: case BOOK3S_INTERRUPT_H_INST_STORAGE:
vcpu->arch.fault_dar = kvmppc_get_pc(vcpu); vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
vcpu->arch.fault_dsisr = 0; vcpu->arch.fault_dsisr = vcpu->arch.shregs.msr &
DSISR_SRR1_MATCH_64S;
if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
r = RESUME_PAGE_FAULT; r = RESUME_PAGE_FAULT;
break; break;
/* /*
...@@ -1206,10 +1283,7 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, ...@@ -1206,10 +1283,7 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
swab32(vcpu->arch.emul_inst) : swab32(vcpu->arch.emul_inst) :
vcpu->arch.emul_inst; vcpu->arch.emul_inst;
if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) { if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) {
/* Need vcore unlocked to call kvmppc_get_last_inst */
spin_unlock(&vcpu->arch.vcore->lock);
r = kvmppc_emulate_debug_inst(run, vcpu); r = kvmppc_emulate_debug_inst(run, vcpu);
spin_lock(&vcpu->arch.vcore->lock);
} else { } else {
kvmppc_core_queue_program(vcpu, SRR1_PROGILL); kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
r = RESUME_GUEST; r = RESUME_GUEST;
...@@ -1225,12 +1299,8 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, ...@@ -1225,12 +1299,8 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
r = EMULATE_FAIL; r = EMULATE_FAIL;
if (((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG) && if (((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG) &&
cpu_has_feature(CPU_FTR_ARCH_300)) { cpu_has_feature(CPU_FTR_ARCH_300))
/* Need vcore unlocked to call kvmppc_get_last_inst */
spin_unlock(&vcpu->arch.vcore->lock);
r = kvmppc_emulate_doorbell_instr(vcpu); r = kvmppc_emulate_doorbell_instr(vcpu);
spin_lock(&vcpu->arch.vcore->lock);
}
if (r == EMULATE_FAIL) { if (r == EMULATE_FAIL) {
kvmppc_core_queue_program(vcpu, SRR1_PROGILL); kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
r = RESUME_GUEST; r = RESUME_GUEST;
...@@ -1265,6 +1335,104 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, ...@@ -1265,6 +1335,104 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu,
return r; return r;
} }
static int kvmppc_handle_nested_exit(struct kvm_vcpu *vcpu)
{
int r;
int srcu_idx;
vcpu->stat.sum_exits++;
/*
* This can happen if an interrupt occurs in the last stages
* of guest entry or the first stages of guest exit (i.e. after
* setting paca->kvm_hstate.in_guest to KVM_GUEST_MODE_GUEST_HV
* and before setting it to KVM_GUEST_MODE_HOST_HV).
* That can happen due to a bug, or due to a machine check
* occurring at just the wrong time.
*/
if (vcpu->arch.shregs.msr & MSR_HV) {
pr_emerg("KVM trap in HV mode while nested!\n");
pr_emerg("trap=0x%x | pc=0x%lx | msr=0x%llx\n",
vcpu->arch.trap, kvmppc_get_pc(vcpu),
vcpu->arch.shregs.msr);
kvmppc_dump_regs(vcpu);
return RESUME_HOST;
}
switch (vcpu->arch.trap) {
/* We're good on these - the host merely wanted to get our attention */
case BOOK3S_INTERRUPT_HV_DECREMENTER:
vcpu->stat.dec_exits++;
r = RESUME_GUEST;
break;
case BOOK3S_INTERRUPT_EXTERNAL:
vcpu->stat.ext_intr_exits++;
r = RESUME_HOST;
break;
case BOOK3S_INTERRUPT_H_DOORBELL:
case BOOK3S_INTERRUPT_H_VIRT:
vcpu->stat.ext_intr_exits++;
r = RESUME_GUEST;
break;
	/* SR/HMI/PMI are HV interrupts that host has handled. Resume guest. */
case BOOK3S_INTERRUPT_HMI:
case BOOK3S_INTERRUPT_PERFMON:
case BOOK3S_INTERRUPT_SYSTEM_RESET:
r = RESUME_GUEST;
break;
case BOOK3S_INTERRUPT_MACHINE_CHECK:
/* Pass the machine check to the L1 guest */
r = RESUME_HOST;
/* Print the MCE event to host console. */
machine_check_print_event_info(&vcpu->arch.mce_evt, false);
break;
/*
* We get these next two if the guest accesses a page which it thinks
* it has mapped but which is not actually present, either because
	 * it is for an emulated I/O device or because the corresponding
* host page has been paged out.
*/
case BOOK3S_INTERRUPT_H_DATA_STORAGE:
srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
r = kvmhv_nested_page_fault(vcpu);
srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
break;
case BOOK3S_INTERRUPT_H_INST_STORAGE:
vcpu->arch.fault_dar = kvmppc_get_pc(vcpu);
vcpu->arch.fault_dsisr = kvmppc_get_msr(vcpu) &
DSISR_SRR1_MATCH_64S;
if (vcpu->arch.shregs.msr & HSRR1_HISI_WRITE)
vcpu->arch.fault_dsisr |= DSISR_ISSTORE;
srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
r = kvmhv_nested_page_fault(vcpu);
srcu_read_unlock(&vcpu->kvm->srcu, srcu_idx);
break;
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
case BOOK3S_INTERRUPT_HV_SOFTPATCH:
/*
* This occurs for various TM-related instructions that
* we need to emulate on POWER9 DD2.2. We have already
* handled the cases where the guest was in real-suspend
* mode and was transitioning to transactional state.
*/
r = kvmhv_p9_tm_emulation(vcpu);
break;
#endif
case BOOK3S_INTERRUPT_HV_RM_HARD:
vcpu->arch.trap = 0;
r = RESUME_GUEST;
if (!xive_enabled())
kvmppc_xics_rm_complete(vcpu, 0);
break;
default:
r = RESUME_HOST;
break;
}
return r;
}
static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu, static int kvm_arch_vcpu_ioctl_get_sregs_hv(struct kvm_vcpu *vcpu,
struct kvm_sregs *sregs) struct kvm_sregs *sregs)
{ {
...@@ -1555,6 +1723,9 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, ...@@ -1555,6 +1723,9 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
case KVM_REG_PPC_ONLINE: case KVM_REG_PPC_ONLINE:
*val = get_reg_val(id, vcpu->arch.online); *val = get_reg_val(id, vcpu->arch.online);
break; break;
case KVM_REG_PPC_PTCR:
*val = get_reg_val(id, vcpu->kvm->arch.l1_ptcr);
break;
default: default:
r = -EINVAL; r = -EINVAL;
break; break;
...@@ -1786,6 +1957,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, ...@@ -1786,6 +1957,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
atomic_dec(&vcpu->arch.vcore->online_count); atomic_dec(&vcpu->arch.vcore->online_count);
vcpu->arch.online = i; vcpu->arch.online = i;
break; break;
case KVM_REG_PPC_PTCR:
vcpu->kvm->arch.l1_ptcr = set_reg_val(id, *val);
break;
default: default:
r = -EINVAL; r = -EINVAL;
break; break;
...@@ -2019,15 +2193,18 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm, ...@@ -2019,15 +2193,18 @@ static struct kvm_vcpu *kvmppc_core_vcpu_create_hv(struct kvm *kvm,
* Set the default HFSCR for the guest from the host value. * Set the default HFSCR for the guest from the host value.
* This value is only used on POWER9. * This value is only used on POWER9.
* On POWER9, we want to virtualize the doorbell facility, so we * On POWER9, we want to virtualize the doorbell facility, so we
* turn off the HFSCR bit, which causes those instructions to trap. * don't set the HFSCR_MSGP bit, and that causes those instructions
* to trap and then we emulate them.
*/ */
vcpu->arch.hfscr = mfspr(SPRN_HFSCR); vcpu->arch.hfscr = HFSCR_TAR | HFSCR_EBB | HFSCR_PM | HFSCR_BHRB |
HFSCR_DSCR | HFSCR_VECVSX | HFSCR_FP;
if (cpu_has_feature(CPU_FTR_HVMODE)) {
vcpu->arch.hfscr &= mfspr(SPRN_HFSCR);
if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
vcpu->arch.hfscr |= HFSCR_TM; vcpu->arch.hfscr |= HFSCR_TM;
else if (!cpu_has_feature(CPU_FTR_TM_COMP)) }
vcpu->arch.hfscr &= ~HFSCR_TM; if (cpu_has_feature(CPU_FTR_TM_COMP))
if (cpu_has_feature(CPU_FTR_ARCH_300)) vcpu->arch.hfscr |= HFSCR_TM;
vcpu->arch.hfscr &= ~HFSCR_MSGP;
kvmppc_mmu_book3s_hv_init(vcpu); kvmppc_mmu_book3s_hv_init(vcpu);
...@@ -2242,10 +2419,18 @@ static void kvmppc_release_hwthread(int cpu) ...@@ -2242,10 +2419,18 @@ static void kvmppc_release_hwthread(int cpu)
static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
{ {
struct kvm_nested_guest *nested = vcpu->arch.nested;
cpumask_t *cpu_in_guest;
int i; int i;
cpu = cpu_first_thread_sibling(cpu); cpu = cpu_first_thread_sibling(cpu);
if (nested) {
cpumask_set_cpu(cpu, &nested->need_tlb_flush);
cpu_in_guest = &nested->cpu_in_guest;
} else {
cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush); cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
cpu_in_guest = &kvm->arch.cpu_in_guest;
}
/* /*
* Make sure setting of bit in need_tlb_flush precedes * Make sure setting of bit in need_tlb_flush precedes
* testing of cpu_in_guest bits. The matching barrier on * testing of cpu_in_guest bits. The matching barrier on
...@@ -2253,13 +2438,23 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu) ...@@ -2253,13 +2438,23 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
*/ */
smp_mb(); smp_mb();
for (i = 0; i < threads_per_core; ++i) for (i = 0; i < threads_per_core; ++i)
if (cpumask_test_cpu(cpu + i, &kvm->arch.cpu_in_guest)) if (cpumask_test_cpu(cpu + i, cpu_in_guest))
smp_call_function_single(cpu + i, do_nothing, NULL, 1); smp_call_function_single(cpu + i, do_nothing, NULL, 1);
} }
static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu) static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
{ {
struct kvm_nested_guest *nested = vcpu->arch.nested;
struct kvm *kvm = vcpu->kvm; struct kvm *kvm = vcpu->kvm;
int prev_cpu;
if (!cpu_has_feature(CPU_FTR_HVMODE))
return;
if (nested)
prev_cpu = nested->prev_cpu[vcpu->arch.nested_vcpu_id];
else
prev_cpu = vcpu->arch.prev_cpu;
/* /*
* With radix, the guest can do TLB invalidations itself, * With radix, the guest can do TLB invalidations itself,
...@@ -2273,15 +2468,49 @@ static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu) ...@@ -2273,15 +2468,49 @@ static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
* ran to flush the TLB. The TLB is shared between threads, * ran to flush the TLB. The TLB is shared between threads,
* so we use a single bit in .need_tlb_flush for all 4 threads. * so we use a single bit in .need_tlb_flush for all 4 threads.
*/ */
if (vcpu->arch.prev_cpu != pcpu) { if (prev_cpu != pcpu) {
if (vcpu->arch.prev_cpu >= 0 && if (prev_cpu >= 0 &&
cpu_first_thread_sibling(vcpu->arch.prev_cpu) != cpu_first_thread_sibling(prev_cpu) !=
cpu_first_thread_sibling(pcpu)) cpu_first_thread_sibling(pcpu))
radix_flush_cpu(kvm, vcpu->arch.prev_cpu, vcpu); radix_flush_cpu(kvm, prev_cpu, vcpu);
if (nested)
nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu;
else
vcpu->arch.prev_cpu = pcpu; vcpu->arch.prev_cpu = pcpu;
} }
} }
static void kvmppc_radix_check_need_tlb_flush(struct kvm *kvm, int pcpu,
struct kvm_nested_guest *nested)
{
cpumask_t *need_tlb_flush;
int lpid;
if (!cpu_has_feature(CPU_FTR_HVMODE))
return;
if (cpu_has_feature(CPU_FTR_ARCH_300))
pcpu &= ~0x3UL;
if (nested) {
lpid = nested->shadow_lpid;
need_tlb_flush = &nested->need_tlb_flush;
} else {
lpid = kvm->arch.lpid;
need_tlb_flush = &kvm->arch.need_tlb_flush;
}
mtspr(SPRN_LPID, lpid);
isync();
smp_mb();
if (cpumask_test_cpu(pcpu, need_tlb_flush)) {
radix__local_flush_tlb_lpid_guest(lpid);
/* Clear the bit after the TLB flush */
cpumask_clear_cpu(pcpu, need_tlb_flush);
}
}
static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc) static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
{ {
int cpu; int cpu;
...@@ -2493,6 +2722,10 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip) ...@@ -2493,6 +2722,10 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
if (!cpu_has_feature(CPU_FTR_ARCH_207S)) if (!cpu_has_feature(CPU_FTR_ARCH_207S))
return false; return false;
/* In one_vm_per_core mode, require all vcores to be from the same vm */
if (one_vm_per_core && vc->kvm != cip->vc[0]->kvm)
return false;
/* Some POWER9 chips require all threads to be in the same MMU mode */ /* Some POWER9 chips require all threads to be in the same MMU mode */
if (no_mixing_hpt_and_radix && if (no_mixing_hpt_and_radix &&
kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm)) kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm))
...@@ -2600,6 +2833,14 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) ...@@ -2600,6 +2833,14 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
spin_lock(&vc->lock); spin_lock(&vc->lock);
now = get_tb(); now = get_tb();
for_each_runnable_thread(i, vcpu, vc) { for_each_runnable_thread(i, vcpu, vc) {
/*
* It's safe to unlock the vcore in the loop here, because
* for_each_runnable_thread() is safe against removal of
* the vcpu, and the vcore state is VCORE_EXITING here,
* so any vcpus becoming runnable will have their arch.trap
* set to zero and can't actually run in the guest.
*/
spin_unlock(&vc->lock);
/* cancel pending dec exception if dec is positive */ /* cancel pending dec exception if dec is positive */
if (now < vcpu->arch.dec_expires && if (now < vcpu->arch.dec_expires &&
kvmppc_core_pending_dec(vcpu)) kvmppc_core_pending_dec(vcpu))
...@@ -2615,6 +2856,7 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master) ...@@ -2615,6 +2856,7 @@ static void post_guest_process(struct kvmppc_vcore *vc, bool is_master)
vcpu->arch.ret = ret; vcpu->arch.ret = ret;
vcpu->arch.trap = 0; vcpu->arch.trap = 0;
spin_lock(&vc->lock);
if (is_kvmppc_resume_guest(vcpu->arch.ret)) { if (is_kvmppc_resume_guest(vcpu->arch.ret)) {
if (vcpu->arch.pending_exceptions) if (vcpu->arch.pending_exceptions)
kvmppc_core_prepare_to_enter(vcpu); kvmppc_core_prepare_to_enter(vcpu);
...@@ -2963,8 +3205,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) ...@@ -2963,8 +3205,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
spin_unlock(&core_info.vc[sub]->lock); spin_unlock(&core_info.vc[sub]->lock);
if (kvm_is_radix(vc->kvm)) { if (kvm_is_radix(vc->kvm)) {
int tmp = pcpu;
/* /*
* Do we need to flush the process scoped TLB for the LPAR? * Do we need to flush the process scoped TLB for the LPAR?
* *
...@@ -2975,17 +3215,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) ...@@ -2975,17 +3215,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
* *
* Hash must be flushed in realmode in order to use tlbiel. * Hash must be flushed in realmode in order to use tlbiel.
*/ */
mtspr(SPRN_LPID, vc->kvm->arch.lpid); kvmppc_radix_check_need_tlb_flush(vc->kvm, pcpu, NULL);
isync();
if (cpu_has_feature(CPU_FTR_ARCH_300))
tmp &= ~0x3UL;
if (cpumask_test_cpu(tmp, &vc->kvm->arch.need_tlb_flush)) {
radix__local_flush_tlb_lpid_guest(vc->kvm->arch.lpid);
/* Clear the bit after the TLB flush */
cpumask_clear_cpu(tmp, &vc->kvm->arch.need_tlb_flush);
}
} }
/* /*
...@@ -3079,6 +3309,300 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) ...@@ -3079,6 +3309,300 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
trace_kvmppc_run_core(vc, 1); trace_kvmppc_run_core(vc, 1);
} }
/*
* Load up hypervisor-mode registers on P9.
*/
static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit,
unsigned long lpcr)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
s64 hdec;
u64 tb, purr, spurr;
int trap;
unsigned long host_hfscr = mfspr(SPRN_HFSCR);
unsigned long host_ciabr = mfspr(SPRN_CIABR);
unsigned long host_dawr = mfspr(SPRN_DAWR);
unsigned long host_dawrx = mfspr(SPRN_DAWRX);
unsigned long host_psscr = mfspr(SPRN_PSSCR);
unsigned long host_pidr = mfspr(SPRN_PID);
hdec = time_limit - mftb();
if (hdec < 0)
return BOOK3S_INTERRUPT_HV_DECREMENTER;
mtspr(SPRN_HDEC, hdec);
if (vc->tb_offset) {
u64 new_tb = mftb() + vc->tb_offset;
mtspr(SPRN_TBU40, new_tb);
tb = mftb();
if ((tb & 0xffffff) < (new_tb & 0xffffff))
mtspr(SPRN_TBU40, new_tb + 0x1000000);
vc->tb_offset_applied = vc->tb_offset;
}
if (vc->pcr)
mtspr(SPRN_PCR, vc->pcr);
mtspr(SPRN_DPDES, vc->dpdes);
mtspr(SPRN_VTB, vc->vtb);
local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR);
local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR);
mtspr(SPRN_PURR, vcpu->arch.purr);
mtspr(SPRN_SPURR, vcpu->arch.spurr);
if (cpu_has_feature(CPU_FTR_DAWR)) {
mtspr(SPRN_DAWR, vcpu->arch.dawr);
mtspr(SPRN_DAWRX, vcpu->arch.dawrx);
}
mtspr(SPRN_CIABR, vcpu->arch.ciabr);
mtspr(SPRN_IC, vcpu->arch.ic);
mtspr(SPRN_PID, vcpu->arch.pid);
mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC |
(local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
mtspr(SPRN_HFSCR, vcpu->arch.hfscr);
mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0);
mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1);
mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2);
mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3);
mtspr(SPRN_AMOR, ~0UL);
mtspr(SPRN_LPCR, lpcr);
isync();
kvmppc_xive_push_vcpu(vcpu);
mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0);
mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1);
trap = __kvmhv_vcpu_entry_p9(vcpu);
/* Advance host PURR/SPURR by the amount used by guest */
purr = mfspr(SPRN_PURR);
spurr = mfspr(SPRN_SPURR);
mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr +
purr - vcpu->arch.purr);
mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr +
spurr - vcpu->arch.spurr);
vcpu->arch.purr = purr;
vcpu->arch.spurr = spurr;
vcpu->arch.ic = mfspr(SPRN_IC);
vcpu->arch.pid = mfspr(SPRN_PID);
vcpu->arch.psscr = mfspr(SPRN_PSSCR) & PSSCR_GUEST_VIS;
vcpu->arch.shregs.sprg0 = mfspr(SPRN_SPRG0);
vcpu->arch.shregs.sprg1 = mfspr(SPRN_SPRG1);
vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2);
vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3);
mtspr(SPRN_PSSCR, host_psscr);
mtspr(SPRN_HFSCR, host_hfscr);
mtspr(SPRN_CIABR, host_ciabr);
mtspr(SPRN_DAWR, host_dawr);
mtspr(SPRN_DAWRX, host_dawrx);
mtspr(SPRN_PID, host_pidr);
/*
	 * Since this is radix, do an eieio; tlbsync; ptesync sequence in
* case we interrupted the guest between a tlbie and a ptesync.
*/
asm volatile("eieio; tlbsync; ptesync");
mtspr(SPRN_LPID, vcpu->kvm->arch.host_lpid); /* restore host LPID */
isync();
vc->dpdes = mfspr(SPRN_DPDES);
vc->vtb = mfspr(SPRN_VTB);
mtspr(SPRN_DPDES, 0);
if (vc->pcr)
mtspr(SPRN_PCR, 0);
if (vc->tb_offset_applied) {
u64 new_tb = mftb() - vc->tb_offset_applied;
mtspr(SPRN_TBU40, new_tb);
tb = mftb();
if ((tb & 0xffffff) < (new_tb & 0xffffff))
mtspr(SPRN_TBU40, new_tb + 0x1000000);
vc->tb_offset_applied = 0;
}
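/* Set HDEC to a large positive value so we don't take another hypervisor decrementer interrupt straight away */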
mtspr(SPRN_HDEC, 0x7fffffff);
mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr);
return trap;
}
/*
* Virtual-mode guest entry for POWER9 and later when the host and
* guest are both using the radix MMU. The LPIDR has already been set.
*/
int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit,
unsigned long lpcr)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
unsigned long host_dscr = mfspr(SPRN_DSCR);
unsigned long host_tidr = mfspr(SPRN_TIDR);
unsigned long host_iamr = mfspr(SPRN_IAMR);
s64 dec;
u64 tb;
int trap, save_pmu;
dec = mfspr(SPRN_DEC);
tb = mftb();
if (dec < 512)
return BOOK3S_INTERRUPT_HV_DECREMENTER;
local_paca->kvm_hstate.dec_expires = dec + tb;
if (local_paca->kvm_hstate.dec_expires < time_limit)
time_limit = local_paca->kvm_hstate.dec_expires;
vcpu->arch.ceded = 0;
kvmhv_save_host_pmu(); /* saves it to PACA kvm_hstate */
kvmppc_subcore_enter_guest();
vc->entry_exit_map = 1;
vc->in_guest = 1;
if (vcpu->arch.vpa.pinned_addr) {
struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
lp->yield_count = cpu_to_be32(yield_count);
vcpu->arch.vpa.dirty = 1;
}
if (cpu_has_feature(CPU_FTR_TM) ||
cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
kvmhv_load_guest_pmu(vcpu);
msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
load_fp_state(&vcpu->arch.fp);
#ifdef CONFIG_ALTIVEC
load_vr_state(&vcpu->arch.vr);
#endif
mtspr(SPRN_DSCR, vcpu->arch.dscr);
mtspr(SPRN_IAMR, vcpu->arch.iamr);
mtspr(SPRN_PSPB, vcpu->arch.pspb);
mtspr(SPRN_FSCR, vcpu->arch.fscr);
mtspr(SPRN_TAR, vcpu->arch.tar);
mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
mtspr(SPRN_BESCR, vcpu->arch.bescr);
mtspr(SPRN_WORT, vcpu->arch.wort);
mtspr(SPRN_TIDR, vcpu->arch.tid);
mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
mtspr(SPRN_AMR, vcpu->arch.amr);
mtspr(SPRN_UAMOR, vcpu->arch.uamor);
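/* If the guest has cleared its run latch (CTRL[RUN]), clear it on this thread too */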
if (!(vcpu->arch.ctrl & 1))
mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1);
mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb());
if (kvmhv_on_pseries()) {
/* call our hypervisor to load up HV regs and go */
struct hv_guest_state hvregs;
kvmhv_save_hv_regs(vcpu, &hvregs);
hvregs.lpcr = lpcr;
vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
hvregs.version = HV_GUEST_STATE_VERSION;
if (vcpu->arch.nested) {
hvregs.lpid = vcpu->arch.nested->shadow_lpid;
hvregs.vcpu_token = vcpu->arch.nested_vcpu_id;
} else {
hvregs.lpid = vcpu->kvm->arch.lpid;
hvregs.vcpu_token = vcpu->vcpu_id;
}
hvregs.hdec_expiry = time_limit;
trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
__pa(&vcpu->arch.regs));
kvmhv_restore_hv_return_state(vcpu, &hvregs);
vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
/* H_CEDE has to be handled now, not later */
if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
kvmppc_nested_cede(vcpu);
trap = 0;
}
} else {
trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr);
}
vcpu->arch.slb_max = 0;
dec = mfspr(SPRN_DEC);
tb = mftb();
vcpu->arch.dec_expires = dec + tb;
vcpu->cpu = -1;
vcpu->arch.thread_cpu = -1;
vcpu->arch.ctrl = mfspr(SPRN_CTRLF);
vcpu->arch.iamr = mfspr(SPRN_IAMR);
vcpu->arch.pspb = mfspr(SPRN_PSPB);
vcpu->arch.fscr = mfspr(SPRN_FSCR);
vcpu->arch.tar = mfspr(SPRN_TAR);
vcpu->arch.ebbhr = mfspr(SPRN_EBBHR);
vcpu->arch.ebbrr = mfspr(SPRN_EBBRR);
vcpu->arch.bescr = mfspr(SPRN_BESCR);
vcpu->arch.wort = mfspr(SPRN_WORT);
vcpu->arch.tid = mfspr(SPRN_TIDR);
vcpu->arch.amr = mfspr(SPRN_AMR);
vcpu->arch.uamor = mfspr(SPRN_UAMOR);
vcpu->arch.dscr = mfspr(SPRN_DSCR);
mtspr(SPRN_PSPB, 0);
mtspr(SPRN_WORT, 0);
mtspr(SPRN_AMR, 0);
mtspr(SPRN_UAMOR, 0);
mtspr(SPRN_DSCR, host_dscr);
mtspr(SPRN_TIDR, host_tidr);
mtspr(SPRN_IAMR, host_iamr);
mtspr(SPRN_PSPB, 0);
msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
store_fp_state(&vcpu->arch.fp);
#ifdef CONFIG_ALTIVEC
store_vr_state(&vcpu->arch.vr);
#endif
if (cpu_has_feature(CPU_FTR_TM) ||
cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
save_pmu = 1;
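/*
* Bump the guest's yield count in the VPA and check whether the guest
* says its PMU registers are in use; if not, we can skip saving them.
*/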
if (vcpu->arch.vpa.pinned_addr) {
struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
lp->yield_count = cpu_to_be32(yield_count);
vcpu->arch.vpa.dirty = 1;
save_pmu = lp->pmcregs_in_use;
}
kvmhv_save_guest_pmu(vcpu, save_pmu);
vc->entry_exit_map = 0x101;
vc->in_guest = 0;
mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb());
kvmhv_load_host_pmu();
kvmppc_subcore_exit_guest();
return trap;
}
/*
* Wait for some other vcpu thread to execute us, and
* wake us up when we need to handle something in the host.
...@@ -3256,6 +3780,11 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc)
trace_kvmppc_vcore_wakeup(do_sleep, block_ns);
}
/*
* This never fails for a radix guest, as none of the operations it does
* for a radix guest can fail or have a way to report failure.
* kvmhv_run_single_vcpu() relies on this fact.
*/
static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu)
{
int r = 0;
...@@ -3405,6 +3934,171 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
return vcpu->arch.ret;
}
int kvmhv_run_single_vcpu(struct kvm_run *kvm_run,
struct kvm_vcpu *vcpu, u64 time_limit,
unsigned long lpcr)
{
int trap, r, pcpu;
int srcu_idx;
struct kvmppc_vcore *vc;
struct kvm *kvm = vcpu->kvm;
struct kvm_nested_guest *nested = vcpu->arch.nested;
trace_kvmppc_run_vcpu_enter(vcpu);
kvm_run->exit_reason = 0;
vcpu->arch.ret = RESUME_GUEST;
vcpu->arch.trap = 0;
vc = vcpu->arch.vcore;
vcpu->arch.ceded = 0;
vcpu->arch.run_task = current;
vcpu->arch.kvm_run = kvm_run;
vcpu->arch.stolen_logged = vcore_stolen_time(vc, mftb());
vcpu->arch.state = KVMPPC_VCPU_RUNNABLE;
vcpu->arch.busy_preempt = TB_NIL;
vcpu->arch.last_inst = KVM_INST_FETCH_FAILED;
vc->runnable_threads[0] = vcpu;
vc->n_runnable = 1;
vc->runner = vcpu;
/* See if the MMU is ready to go */
if (!kvm->arch.mmu_ready)
kvmhv_setup_mmu(vcpu);
if (need_resched())
cond_resched();
kvmppc_update_vpas(vcpu);
init_vcore_to_run(vc);
vc->preempt_tb = TB_NIL;
preempt_disable();
pcpu = smp_processor_id();
vc->pcpu = pcpu;
kvmppc_prepare_radix_vcpu(vcpu, pcpu);
local_irq_disable();
hard_irq_disable();
if (signal_pending(current))
goto sigpend;
if (lazy_irq_pending() || need_resched() || !kvm->arch.mmu_ready)
goto out;
if (!nested) {
kvmppc_core_prepare_to_enter(vcpu);
if (vcpu->arch.doorbell_request) {
vc->dpdes = 1;
smp_wmb();
vcpu->arch.doorbell_request = 0;
}
if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
&vcpu->arch.pending_exceptions))
lpcr |= LPCR_MER;
} else if (vcpu->arch.pending_exceptions ||
vcpu->arch.doorbell_request ||
xive_interrupt_pending(vcpu)) {
vcpu->arch.ret = RESUME_HOST;
goto out;
}
kvmppc_clear_host_core(pcpu);
local_paca->kvm_hstate.tid = 0;
local_paca->kvm_hstate.napping = 0;
local_paca->kvm_hstate.kvm_split_mode = NULL;
kvmppc_start_thread(vcpu, vc);
kvmppc_create_dtl_entry(vcpu, vc);
trace_kvm_guest_enter(vcpu);
vc->vcore_state = VCORE_RUNNING;
trace_kvmppc_run_core(vc, 0);
if (cpu_has_feature(CPU_FTR_HVMODE))
kvmppc_radix_check_need_tlb_flush(kvm, pcpu, nested);
trace_hardirqs_on();
guest_enter_irqoff();
srcu_idx = srcu_read_lock(&kvm->srcu);
this_cpu_disable_ftrace();
trap = kvmhv_p9_guest_entry(vcpu, time_limit, lpcr);
vcpu->arch.trap = trap;
this_cpu_enable_ftrace();
srcu_read_unlock(&kvm->srcu, srcu_idx);
if (cpu_has_feature(CPU_FTR_HVMODE)) {
mtspr(SPRN_LPID, kvm->arch.host_lpid);
isync();
}
trace_hardirqs_off();
set_irq_happened(trap);
kvmppc_set_host_core(pcpu);
local_irq_enable();
guest_exit();
cpumask_clear_cpu(pcpu, &kvm->arch.cpu_in_guest);
preempt_enable();
/* cancel pending decrementer exception if DEC is now positive */
if (get_tb() < vcpu->arch.dec_expires && kvmppc_core_pending_dec(vcpu))
kvmppc_core_dequeue_dec(vcpu);
trace_kvm_guest_exit(vcpu);
r = RESUME_GUEST;
if (trap) {
if (!nested)
r = kvmppc_handle_exit_hv(kvm_run, vcpu, current);
else
r = kvmppc_handle_nested_exit(vcpu);
}
vcpu->arch.ret = r;
if (is_kvmppc_resume_guest(r) && vcpu->arch.ceded &&
!kvmppc_vcpu_woken(vcpu)) {
kvmppc_set_timer(vcpu);
while (vcpu->arch.ceded && !kvmppc_vcpu_woken(vcpu)) {
if (signal_pending(current)) {
vcpu->stat.signal_exits++;
kvm_run->exit_reason = KVM_EXIT_INTR;
vcpu->arch.ret = -EINTR;
break;
}
spin_lock(&vc->lock);
kvmppc_vcore_blocked(vc);
spin_unlock(&vc->lock);
}
}
vcpu->arch.ceded = 0;
vc->vcore_state = VCORE_INACTIVE;
trace_kvmppc_run_core(vc, 1);
done:
kvmppc_remove_runnable(vc, vcpu);
trace_kvmppc_run_vcpu_exit(vcpu, kvm_run);
return vcpu->arch.ret;
sigpend:
vcpu->stat.signal_exits++;
kvm_run->exit_reason = KVM_EXIT_INTR;
vcpu->arch.ret = -EINTR;
out:
local_irq_enable();
preempt_enable();
goto done;
}
static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
{
int r;
...@@ -3480,6 +4174,10 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST;
do {
if (kvm->arch.threads_indep && kvm_is_radix(kvm))
r = kvmhv_run_single_vcpu(run, vcpu, ~(u64)0,
vcpu->arch.vcore->lpcr);
else
r = kvmppc_run_vcpu(run, vcpu);
if (run->exit_reason == KVM_EXIT_PAPR_HCALL &&
...@@ -3559,6 +4257,10 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01);
kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L);
/* If running as a nested hypervisor, we don't support HPT guests */
if (kvmhv_on_pseries())
info->flags |= KVM_PPC_NO_HASH;
return 0;
}
...@@ -3723,8 +4425,7 @@ void kvmppc_setup_partition_table(struct kvm *kvm)
__pa(kvm->arch.pgtable) | RADIX_PGD_INDEX_SIZE;
dw1 = PATB_GR | kvm->arch.process_table;
}
kvmhv_set_ptbl_entry(kvm->arch.lpid, dw0, dw1);
mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
}
/*
...@@ -3820,6 +4521,8 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
/* Must be called with kvm->lock held and mmu_ready = 0 and no vcpus running */
int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
{
if (nesting_enabled(kvm))
kvmhv_release_all_nested(kvm);
kvmppc_free_radix(kvm);
kvmppc_update_lpcr(kvm, LPCR_VPM1,
LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
...@@ -3841,6 +4544,7 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
kvmppc_free_hpt(&kvm->arch.hpt);
kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR,
LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
kvmppc_rmap_reset(kvm);
kvm->arch.radix = 1;
return 0;
}
...@@ -3940,6 +4644,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
kvmppc_alloc_host_rm_ops();
kvmhv_vm_nested_init(kvm);
/*
* Since we don't flush the TLB when tearing down a VM,
* and this lpid might have previously been used,
...@@ -3958,9 +4664,13 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
kvm->arch.host_sdr1 = mfspr(SPRN_SDR1);
/* Init LPCR for virtual RMA mode */
if (cpu_has_feature(CPU_FTR_HVMODE)) {
kvm->arch.host_lpid = mfspr(SPRN_LPID);
kvm->arch.host_lpcr = lpcr = mfspr(SPRN_LPCR);
lpcr &= LPCR_PECE | LPCR_LPES;
} else {
lpcr = 0;
}
lpcr |= (4UL << LPCR_DPFD_SH) | LPCR_HDICE |
LPCR_VPM0 | LPCR_VPM1;
kvm->arch.vrma_slb_v = SLB_VSID_B_1T |
...@@ -4027,8 +4737,14 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
* On POWER9, we only need to do this if the "indep_threads_mode"
* module parameter has been set to N.
*/
if (cpu_has_feature(CPU_FTR_ARCH_300))
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
if (!indep_threads_mode && !cpu_has_feature(CPU_FTR_HVMODE)) {
pr_warn("KVM: Ignoring indep_threads_mode=N in nested hypervisor\n");
kvm->arch.threads_indep = true;
} else {
kvm->arch.threads_indep = indep_threads_mode;
}
}
if (!kvm->arch.threads_indep)
kvm_hv_vm_activated();
...@@ -4051,6 +4767,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
snprintf(buf, sizeof(buf), "vm%d", current->pid);
kvm->arch.debugfs_dir = debugfs_create_dir(buf, kvm_debugfs_dir);
kvmppc_mmu_debugfs_init(kvm);
if (radix_enabled())
kvmhv_radix_debugfs_init(kvm);
return 0;
}
...@@ -4073,13 +4791,21 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
kvmppc_free_vcores(kvm);
kvmppc_free_lpid(kvm->arch.lpid);
if (kvm_is_radix(kvm))
kvmppc_free_radix(kvm);
else
kvmppc_free_hpt(&kvm->arch.hpt);
/* Perform global invalidation and return lpid to the pool */
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
if (nesting_enabled(kvm))
kvmhv_release_all_nested(kvm);
kvm->arch.process_table = 0;
kvmhv_set_ptbl_entry(kvm->arch.lpid, 0, 0);
}
kvmppc_free_lpid(kvm->arch.lpid);
kvmppc_free_pimap(kvm);
}
...@@ -4104,11 +4830,15 @@ static int kvmppc_core_emulate_mfspr_hv(struct kvm_vcpu *vcpu, int sprn,
static int kvmppc_core_check_processor_compat_hv(void)
{
if (!cpu_has_feature(CPU_FTR_HVMODE) ||
!cpu_has_feature(CPU_FTR_ARCH_206))
return -EIO;
if (cpu_has_feature(CPU_FTR_HVMODE) &&
cpu_has_feature(CPU_FTR_ARCH_206))
return 0;
/* POWER9 in radix mode is capable of being a nested hypervisor. */
if (cpu_has_feature(CPU_FTR_ARCH_300) && radix_enabled())
return 0;
return -EIO;
}
#ifdef CONFIG_KVM_XICS
...@@ -4426,6 +5156,10 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
if (radix && !radix_enabled())
return -EINVAL;
/* If we're a nested hypervisor, we currently only support radix */
if (kvmhv_on_pseries() && !radix)
return -EINVAL;
mutex_lock(&kvm->lock);
if (radix != kvm_is_radix(kvm)) {
if (kvm->arch.mmu_ready) {
...@@ -4458,6 +5192,19 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
return err;
}
static int kvmhv_enable_nested(struct kvm *kvm)
{
if (!nested)
return -EPERM;
if (!cpu_has_feature(CPU_FTR_ARCH_300))
return -ENODEV;
/* kvm == NULL means the caller is testing if the capability exists */
if (kvm)
kvm->arch.nested_enable = true;
return 0;
}
static struct kvmppc_ops kvm_ops_hv = {
.get_sregs = kvm_arch_vcpu_ioctl_get_sregs_hv,
.set_sregs = kvm_arch_vcpu_ioctl_set_sregs_hv,
...@@ -4497,6 +5244,7 @@ static struct kvmppc_ops kvm_ops_hv = {
.configure_mmu = kvmhv_configure_mmu,
.get_rmmu_info = kvmhv_get_rmmu_info,
.set_smt_mode = kvmhv_set_smt_mode,
.enable_nested = kvmhv_enable_nested,
};
static int kvm_init_subcore_bitmap(void)
...@@ -4547,6 +5295,10 @@ static int kvmppc_book3s_init_hv(void)
if (r < 0)
return -ENODEV;
r = kvmhv_nested_init();
if (r)
return r;
r = kvm_init_subcore_bitmap();
if (r)
return r;
...@@ -4557,7 +5309,8 @@ static int kvmppc_book3s_init_hv(void)
* indirectly, via OPAL.
*/
#ifdef CONFIG_SMP
if (!xive_enabled() && !local_paca->kvm_hstate.xics_phys) {
if (!xive_enabled() && !kvmhv_on_pseries() &&
!local_paca->kvm_hstate.xics_phys) {
struct device_node *np;
np = of_find_compatible_node(NULL, NULL, "ibm,opal-intc");
...@@ -4605,6 +5358,7 @@ static void kvmppc_book3s_exit_hv(void)
if (kvmppc_radix_possible())
kvmppc_radix_exit();
kvmppc_hv_ops = NULL;
kvmhv_nested_exit();
}
module_init(kvmppc_book3s_init_hv);
...
...@@ -231,6 +231,15 @@ void kvmhv_rm_send_ipi(int cpu)
void __iomem *xics_phys;
unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER);
/* For a nested hypervisor, use the XICS via hcall */
if (kvmhv_on_pseries()) {
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
plpar_hcall_raw(H_IPI, retbuf, get_hard_smp_processor_id(cpu),
IPI_PRIORITY);
return;
}
/* On POWER9 we can use msgsnd for any destination cpu. */
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
msg |= get_hard_smp_processor_id(cpu);
...@@ -460,12 +469,19 @@ static long kvmppc_read_one_intr(bool *again)
return 1;
/* Now read the interrupt from the ICP */
if (kvmhv_on_pseries()) {
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
rc = plpar_hcall_raw(H_XIRR, retbuf, 0xFF);
xirr = cpu_to_be32(retbuf[0]);
} else {
xics_phys = local_paca->kvm_hstate.xics_phys;
rc = 0;
if (!xics_phys)
rc = opal_int_get_xirr(&xirr, false);
else
xirr = __raw_rm_readl(xics_phys + XICS_XIRR);
}
if (rc < 0)
return 1;
...@@ -494,7 +510,13 @@ static long kvmppc_read_one_intr(bool *again)
*/
if (xisr == XICS_IPI) {
rc = 0;
if (xics_phys) {
if (kvmhv_on_pseries()) {
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
plpar_hcall_raw(H_IPI, retbuf,
hard_smp_processor_id(), 0xff);
plpar_hcall_raw(H_EOI, retbuf, h_xirr);
} else if (xics_phys) {
__raw_rm_writeb(0xff, xics_phys + XICS_MFRR);
__raw_rm_writel(xirr, xics_phys + XICS_XIRR);
} else {
...@@ -520,7 +542,13 @@ static long kvmppc_read_one_intr(bool *again)
/* We raced with the host,
* we need to resend that IPI, bummer
*/
if (xics_phys)
if (kvmhv_on_pseries()) {
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
plpar_hcall_raw(H_IPI, retbuf,
hard_smp_processor_id(),
IPI_PRIORITY);
} else if (xics_phys)
__raw_rm_writeb(IPI_PRIORITY,
xics_phys + XICS_MFRR);
else
...@@ -729,3 +757,51 @@ void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip)
smp_mb();
local_paca->kvm_hstate.kvm_split_mode = NULL;
}
/*
* Is there a PRIV_DOORBELL pending for the guest (on POWER9)?
* Can we inject a Decrementer or an External interrupt?
*/
void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu)
{
int ext;
unsigned long vec = 0;
unsigned long lpcr;
/* Insert EXTERNAL bit into LPCR at the MER bit position */
ext = (vcpu->arch.pending_exceptions >> BOOK3S_IRQPRIO_EXTERNAL) & 1;
lpcr = mfspr(SPRN_LPCR);
lpcr |= ext << LPCR_MER_SH;
mtspr(SPRN_LPCR, lpcr);
isync();
if (vcpu->arch.shregs.msr & MSR_EE) {
if (ext) {
vec = BOOK3S_INTERRUPT_EXTERNAL;
} else {
long int dec = mfspr(SPRN_DEC);
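/* Without the large decrementer (LPCR[LD]), DEC is only 32 bits, so sign-extend it */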
if (!(lpcr & LPCR_LD))
dec = (int) dec;
if (dec < 0)
vec = BOOK3S_INTERRUPT_DECREMENTER;
}
}
if (vec) {
unsigned long msr, old_msr = vcpu->arch.shregs.msr;
kvmppc_set_srr0(vcpu, kvmppc_get_pc(vcpu));
kvmppc_set_srr1(vcpu, old_msr);
kvmppc_set_pc(vcpu, vec);
msr = vcpu->arch.intr_msr;
if (MSR_TM_ACTIVE(old_msr))
msr |= MSR_TS_S;
vcpu->arch.shregs.msr = msr;
}
if (vcpu->arch.doorbell_request) {
mtspr(SPRN_DPDES, 1);
vcpu->arch.vcore->dpdes = 1;
smp_wmb();
vcpu->arch.doorbell_request = 0;
}
}
...@@ -64,52 +64,7 @@ BEGIN_FTR_SECTION
END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
/* Save host PMU registers */
BEGIN_FTR_SECTION
bl kvmhv_save_host_pmu
/* Work around P8 PMAE bug */
li r3, -1
clrrdi r3, r3, 10
mfspr r8, SPRN_MMCR2
mtspr SPRN_MMCR2, r3 /* freeze all counters using MMCR2 */
isync
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
li r3, 1
sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
mfspr r7, SPRN_MMCR0 /* save MMCR0 */
mtspr SPRN_MMCR0, r3 /* freeze all counters, disable interrupts */
mfspr r6, SPRN_MMCRA
/* Clear MMCRA in order to disable SDAR updates */
li r5, 0
mtspr SPRN_MMCRA, r5
isync
lbz r5, PACA_PMCINUSE(r13) /* is the host using the PMU? */
cmpwi r5, 0
beq 31f /* skip if not */
mfspr r5, SPRN_MMCR1
mfspr r9, SPRN_SIAR
mfspr r10, SPRN_SDAR
std r7, HSTATE_MMCR0(r13)
std r5, HSTATE_MMCR1(r13)
std r6, HSTATE_MMCRA(r13)
std r9, HSTATE_SIAR(r13)
std r10, HSTATE_SDAR(r13)
BEGIN_FTR_SECTION
mfspr r9, SPRN_SIER
std r8, HSTATE_MMCR2(r13)
std r9, HSTATE_SIER(r13)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
mfspr r3, SPRN_PMC1
mfspr r5, SPRN_PMC2
mfspr r6, SPRN_PMC3
mfspr r7, SPRN_PMC4
mfspr r8, SPRN_PMC5
mfspr r9, SPRN_PMC6
stw r3, HSTATE_PMC1(r13)
stw r5, HSTATE_PMC2(r13)
stw r6, HSTATE_PMC3(r13)
stw r7, HSTATE_PMC4(r13)
stw r8, HSTATE_PMC5(r13)
stw r9, HSTATE_PMC6(r13)
31:
/*
* Put whatever is in the decrementer into the
...@@ -161,3 +116,51 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
ld r0, PPC_LR_STKOFF(r1)
mtlr r0
blr
_GLOBAL(kvmhv_save_host_pmu)
BEGIN_FTR_SECTION
/* Work around P8 PMAE bug */
li r3, -1
clrrdi r3, r3, 10
mfspr r8, SPRN_MMCR2
mtspr SPRN_MMCR2, r3 /* freeze all counters using MMCR2 */
isync
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
li r3, 1
sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
mfspr r7, SPRN_MMCR0 /* save MMCR0 */
mtspr SPRN_MMCR0, r3 /* freeze all counters, disable interrupts */
mfspr r6, SPRN_MMCRA
/* Clear MMCRA in order to disable SDAR updates */
li r5, 0
mtspr SPRN_MMCRA, r5
isync
lbz r5, PACA_PMCINUSE(r13) /* is the host using the PMU? */
cmpwi r5, 0
beq 31f /* skip if not */
mfspr r5, SPRN_MMCR1
mfspr r9, SPRN_SIAR
mfspr r10, SPRN_SDAR
std r7, HSTATE_MMCR0(r13)
std r5, HSTATE_MMCR1(r13)
std r6, HSTATE_MMCRA(r13)
std r9, HSTATE_SIAR(r13)
std r10, HSTATE_SDAR(r13)
BEGIN_FTR_SECTION
mfspr r9, SPRN_SIER
std r8, HSTATE_MMCR2(r13)
std r9, HSTATE_SIER(r13)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
mfspr r3, SPRN_PMC1
mfspr r5, SPRN_PMC2
mfspr r6, SPRN_PMC3
mfspr r7, SPRN_PMC4
mfspr r8, SPRN_PMC5
mfspr r9, SPRN_PMC6
stw r3, HSTATE_PMC1(r13)
stw r5, HSTATE_PMC2(r13)
stw r6, HSTATE_PMC3(r13)
stw r7, HSTATE_PMC4(r13)
stw r8, HSTATE_PMC5(r13)
stw r9, HSTATE_PMC6(r13)
31: blr
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright IBM Corporation, 2018
* Authors Suraj Jitindar Singh <sjitindarsingh@gmail.com>
* Paul Mackerras <paulus@ozlabs.org>
*
* Description: KVM functions specific to running nested KVM-HV guests
* on Book3S processors (specifically POWER9 and later).
*/
#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <linux/llist.h>
#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/mmu.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
#include <asm/pte-walk.h>
#include <asm/reg.h>
static struct patb_entry *pseries_partition_tb;
static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp);
static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free);
void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
hr->pcr = vc->pcr;
hr->dpdes = vc->dpdes;
hr->hfscr = vcpu->arch.hfscr;
hr->tb_offset = vc->tb_offset;
hr->dawr0 = vcpu->arch.dawr;
hr->dawrx0 = vcpu->arch.dawrx;
hr->ciabr = vcpu->arch.ciabr;
hr->purr = vcpu->arch.purr;
hr->spurr = vcpu->arch.spurr;
hr->ic = vcpu->arch.ic;
hr->vtb = vc->vtb;
hr->srr0 = vcpu->arch.shregs.srr0;
hr->srr1 = vcpu->arch.shregs.srr1;
hr->sprg[0] = vcpu->arch.shregs.sprg0;
hr->sprg[1] = vcpu->arch.shregs.sprg1;
hr->sprg[2] = vcpu->arch.shregs.sprg2;
hr->sprg[3] = vcpu->arch.shregs.sprg3;
hr->pidr = vcpu->arch.pid;
hr->cfar = vcpu->arch.cfar;
hr->ppr = vcpu->arch.ppr;
}
static void byteswap_pt_regs(struct pt_regs *regs)
{
unsigned long *addr = (unsigned long *) regs;
for (; addr < ((unsigned long *) (regs + 1)); addr++)
*addr = swab64(*addr);
}
static void byteswap_hv_regs(struct hv_guest_state *hr)
{
hr->version = swab64(hr->version);
hr->lpid = swab32(hr->lpid);
hr->vcpu_token = swab32(hr->vcpu_token);
hr->lpcr = swab64(hr->lpcr);
hr->pcr = swab64(hr->pcr);
hr->amor = swab64(hr->amor);
hr->dpdes = swab64(hr->dpdes);
hr->hfscr = swab64(hr->hfscr);
hr->tb_offset = swab64(hr->tb_offset);
hr->dawr0 = swab64(hr->dawr0);
hr->dawrx0 = swab64(hr->dawrx0);
hr->ciabr = swab64(hr->ciabr);
hr->hdec_expiry = swab64(hr->hdec_expiry);
hr->purr = swab64(hr->purr);
hr->spurr = swab64(hr->spurr);
hr->ic = swab64(hr->ic);
hr->vtb = swab64(hr->vtb);
hr->hdar = swab64(hr->hdar);
hr->hdsisr = swab64(hr->hdsisr);
hr->heir = swab64(hr->heir);
hr->asdr = swab64(hr->asdr);
hr->srr0 = swab64(hr->srr0);
hr->srr1 = swab64(hr->srr1);
hr->sprg[0] = swab64(hr->sprg[0]);
hr->sprg[1] = swab64(hr->sprg[1]);
hr->sprg[2] = swab64(hr->sprg[2]);
hr->sprg[3] = swab64(hr->sprg[3]);
hr->pidr = swab64(hr->pidr);
hr->cfar = swab64(hr->cfar);
hr->ppr = swab64(hr->ppr);
}
static void save_hv_return_state(struct kvm_vcpu *vcpu, int trap,
struct hv_guest_state *hr)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
hr->dpdes = vc->dpdes;
hr->hfscr = vcpu->arch.hfscr;
hr->purr = vcpu->arch.purr;
hr->spurr = vcpu->arch.spurr;
hr->ic = vcpu->arch.ic;
hr->vtb = vc->vtb;
hr->srr0 = vcpu->arch.shregs.srr0;
hr->srr1 = vcpu->arch.shregs.srr1;
hr->sprg[0] = vcpu->arch.shregs.sprg0;
hr->sprg[1] = vcpu->arch.shregs.sprg1;
hr->sprg[2] = vcpu->arch.shregs.sprg2;
hr->sprg[3] = vcpu->arch.shregs.sprg3;
hr->pidr = vcpu->arch.pid;
hr->cfar = vcpu->arch.cfar;
hr->ppr = vcpu->arch.ppr;
switch (trap) {
case BOOK3S_INTERRUPT_H_DATA_STORAGE:
hr->hdar = vcpu->arch.fault_dar;
hr->hdsisr = vcpu->arch.fault_dsisr;
hr->asdr = vcpu->arch.fault_gpa;
break;
case BOOK3S_INTERRUPT_H_INST_STORAGE:
hr->asdr = vcpu->arch.fault_gpa;
break;
case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
hr->heir = vcpu->arch.emul_inst;
break;
}
}
static void sanitise_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
{
/*
* Don't let L1 enable features for L2 which we've disabled for L1,
* but preserve the interrupt cause field.
*/
hr->hfscr &= (HFSCR_INTR_CAUSE | vcpu->arch.hfscr);
/* Don't let data address watchpoint match in hypervisor state */
hr->dawrx0 &= ~DAWRX_HYP;
/* Don't let completed instruction address breakpt match in HV state */
if ((hr->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
hr->ciabr &= ~CIABR_PRIV;
}
static void restore_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
vc->pcr = hr->pcr;
vc->dpdes = hr->dpdes;
vcpu->arch.hfscr = hr->hfscr;
vcpu->arch.dawr = hr->dawr0;
vcpu->arch.dawrx = hr->dawrx0;
vcpu->arch.ciabr = hr->ciabr;
vcpu->arch.purr = hr->purr;
vcpu->arch.spurr = hr->spurr;
vcpu->arch.ic = hr->ic;
vc->vtb = hr->vtb;
vcpu->arch.shregs.srr0 = hr->srr0;
vcpu->arch.shregs.srr1 = hr->srr1;
vcpu->arch.shregs.sprg0 = hr->sprg[0];
vcpu->arch.shregs.sprg1 = hr->sprg[1];
vcpu->arch.shregs.sprg2 = hr->sprg[2];
vcpu->arch.shregs.sprg3 = hr->sprg[3];
vcpu->arch.pid = hr->pidr;
vcpu->arch.cfar = hr->cfar;
vcpu->arch.ppr = hr->ppr;
}
void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
struct hv_guest_state *hr)
{
struct kvmppc_vcore *vc = vcpu->arch.vcore;
vc->dpdes = hr->dpdes;
vcpu->arch.hfscr = hr->hfscr;
vcpu->arch.purr = hr->purr;
vcpu->arch.spurr = hr->spurr;
vcpu->arch.ic = hr->ic;
vc->vtb = hr->vtb;
vcpu->arch.fault_dar = hr->hdar;
vcpu->arch.fault_dsisr = hr->hdsisr;
vcpu->arch.fault_gpa = hr->asdr;
vcpu->arch.emul_inst = hr->heir;
vcpu->arch.shregs.srr0 = hr->srr0;
vcpu->arch.shregs.srr1 = hr->srr1;
vcpu->arch.shregs.sprg0 = hr->sprg[0];
vcpu->arch.shregs.sprg1 = hr->sprg[1];
vcpu->arch.shregs.sprg2 = hr->sprg[2];
vcpu->arch.shregs.sprg3 = hr->sprg[3];
vcpu->arch.pid = hr->pidr;
vcpu->arch.cfar = hr->cfar;
vcpu->arch.ppr = hr->ppr;
}
long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
{
long int err, r;
struct kvm_nested_guest *l2;
struct pt_regs l2_regs, saved_l1_regs;
struct hv_guest_state l2_hv, saved_l1_hv;
struct kvmppc_vcore *vc = vcpu->arch.vcore;
u64 hv_ptr, regs_ptr;
u64 hdec_exp;
s64 delta_purr, delta_spurr, delta_ic, delta_vtb;
u64 mask;
unsigned long lpcr;
if (vcpu->kvm->arch.l1_ptcr == 0)
return H_NOT_AVAILABLE;
/* copy parameters in */
hv_ptr = kvmppc_get_gpr(vcpu, 4);
err = kvm_vcpu_read_guest(vcpu, hv_ptr, &l2_hv,
sizeof(struct hv_guest_state));
if (err)
return H_PARAMETER;
if (kvmppc_need_byteswap(vcpu))
byteswap_hv_regs(&l2_hv);
if (l2_hv.version != HV_GUEST_STATE_VERSION)
return H_P2;
regs_ptr = kvmppc_get_gpr(vcpu, 5);
err = kvm_vcpu_read_guest(vcpu, regs_ptr, &l2_regs,
sizeof(struct pt_regs));
if (err)
return H_PARAMETER;
if (kvmppc_need_byteswap(vcpu))
byteswap_pt_regs(&l2_regs);
if (l2_hv.vcpu_token >= NR_CPUS)
return H_PARAMETER;
/* translate lpid */
l2 = kvmhv_get_nested(vcpu->kvm, l2_hv.lpid, true);
if (!l2)
return H_PARAMETER;
if (!l2->l1_gr_to_hr) {
mutex_lock(&l2->tlb_lock);
kvmhv_update_ptbl_cache(l2);
mutex_unlock(&l2->tlb_lock);
}
/* save l1 values of things */
vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
saved_l1_regs = vcpu->arch.regs;
kvmhv_save_hv_regs(vcpu, &saved_l1_hv);
/* convert TB values/offsets to host (L0) values */
hdec_exp = l2_hv.hdec_expiry - vc->tb_offset;
vc->tb_offset += l2_hv.tb_offset;
/* set L1 state to L2 state */
vcpu->arch.nested = l2;
vcpu->arch.nested_vcpu_id = l2_hv.vcpu_token;
vcpu->arch.regs = l2_regs;
vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
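/* L1 may only set this restricted set of LPCR bits for L2; the rest come from L1's own LPCR */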
mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD |
LPCR_LPES | LPCR_MER;
lpcr = (vc->lpcr & ~mask) | (l2_hv.lpcr & mask);
sanitise_hv_regs(vcpu, &l2_hv);
restore_hv_regs(vcpu, &l2_hv);
vcpu->arch.ret = RESUME_GUEST;
vcpu->arch.trap = 0;
do {
if (mftb() >= hdec_exp) {
vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
r = RESUME_HOST;
break;
}
r = kvmhv_run_single_vcpu(vcpu->arch.kvm_run, vcpu, hdec_exp,
lpcr);
} while (is_kvmppc_resume_guest(r));
/* save L2 state for return */
l2_regs = vcpu->arch.regs;
l2_regs.msr = vcpu->arch.shregs.msr;
delta_purr = vcpu->arch.purr - l2_hv.purr;
delta_spurr = vcpu->arch.spurr - l2_hv.spurr;
delta_ic = vcpu->arch.ic - l2_hv.ic;
delta_vtb = vc->vtb - l2_hv.vtb;
save_hv_return_state(vcpu, vcpu->arch.trap, &l2_hv);
/* restore L1 state */
vcpu->arch.nested = NULL;
vcpu->arch.regs = saved_l1_regs;
vcpu->arch.shregs.msr = saved_l1_regs.msr & ~MSR_TS_MASK;
/* set L1 MSR TS field according to L2 transaction state */
if (l2_regs.msr & MSR_TS_MASK)
vcpu->arch.shregs.msr |= MSR_TS_S;
vc->tb_offset = saved_l1_hv.tb_offset;
restore_hv_regs(vcpu, &saved_l1_hv);
vcpu->arch.purr += delta_purr;
vcpu->arch.spurr += delta_spurr;
vcpu->arch.ic += delta_ic;
vc->vtb += delta_vtb;
kvmhv_put_nested(l2);
/* copy l2_hv_state and regs back to guest */
if (kvmppc_need_byteswap(vcpu)) {
byteswap_hv_regs(&l2_hv);
byteswap_pt_regs(&l2_regs);
}
err = kvm_vcpu_write_guest(vcpu, hv_ptr, &l2_hv,
sizeof(struct hv_guest_state));
if (err)
return H_AUTHORITY;
err = kvm_vcpu_write_guest(vcpu, regs_ptr, &l2_regs,
sizeof(struct pt_regs));
if (err)
return H_AUTHORITY;
if (r == -EINTR)
return H_INTERRUPT;
return vcpu->arch.trap;
}
long kvmhv_nested_init(void)
{
long int ptb_order;
unsigned long ptcr;
long rc;
if (!kvmhv_on_pseries())
return 0;
if (!radix_enabled())
return -ENODEV;
/* find log base 2 of KVMPPC_NR_LPIDS, rounding up */
ptb_order = __ilog2(KVMPPC_NR_LPIDS - 1) + 1;
if (ptb_order < 8)
ptb_order = 8;
pseries_partition_tb = kmalloc(sizeof(struct patb_entry) << ptb_order,
GFP_KERNEL);
if (!pseries_partition_tb) {
pr_err("kvm-hv: failed to allocated nested partition table\n");
return -ENOMEM;
}
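/*
* Register the table with the L0 hypervisor. The PTCR value encodes
* the base real address and log2(size in bytes) - 12.
*/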
ptcr = __pa(pseries_partition_tb) | (ptb_order - 8);
rc = plpar_hcall_norets(H_SET_PARTITION_TABLE, ptcr);
if (rc != H_SUCCESS) {
pr_err("kvm-hv: Parent hypervisor does not support nesting (rc=%ld)\n",
rc);
kfree(pseries_partition_tb);
pseries_partition_tb = NULL;
return -ENODEV;
}
return 0;
}
void kvmhv_nested_exit(void)
{
/*
* N.B. the kvmhv_on_pseries() test is there because it enables
* the compiler to remove the call to plpar_hcall_norets()
* when CONFIG_PPC_PSERIES=n.
*/
if (kvmhv_on_pseries() && pseries_partition_tb) {
plpar_hcall_norets(H_SET_PARTITION_TABLE, 0);
kfree(pseries_partition_tb);
pseries_partition_tb = NULL;
}
}
static void kvmhv_flush_lpid(unsigned int lpid)
{
long rc;
if (!kvmhv_on_pseries()) {
radix__flush_tlb_lpid(lpid);
return;
}
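/*
* We are a nested hypervisor, so we can't execute tlbie for the LPID
* ourselves; ask the L0 hypervisor to do the invalidation for us.
*/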
rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(2, 0, 1),
lpid, TLBIEL_INVAL_SET_LPID);
if (rc)
pr_err("KVM: TLB LPID invalidation hcall failed, rc=%ld\n", rc);
}
void kvmhv_set_ptbl_entry(unsigned int lpid, u64 dw0, u64 dw1)
{
if (!kvmhv_on_pseries()) {
mmu_partition_table_set_entry(lpid, dw0, dw1);
return;
}
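/*
* Nested HV case: update our copy of the partition table and get the
* L0 hypervisor to invalidate any entry it may have cached.
*/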
pseries_partition_tb[lpid].patb0 = cpu_to_be64(dw0);
pseries_partition_tb[lpid].patb1 = cpu_to_be64(dw1);
/* L0 will do the necessary barriers */
kvmhv_flush_lpid(lpid);
}
static void kvmhv_set_nested_ptbl(struct kvm_nested_guest *gp)
{
unsigned long dw0;
dw0 = PATB_HR | radix__get_tree_size() |
__pa(gp->shadow_pgtable) | RADIX_PGD_INDEX_SIZE;
kvmhv_set_ptbl_entry(gp->shadow_lpid, dw0, gp->process_table);
}
void kvmhv_vm_nested_init(struct kvm *kvm)
{
kvm->arch.max_nested_lpid = -1;
}
/*
* Handle the H_SET_PARTITION_TABLE hcall.
* r4 = guest real address of partition table + log_2(size) - 12
* (formatted as for the PTCR).
*/
long kvmhv_set_partition_table(struct kvm_vcpu *vcpu)
{
struct kvm *kvm = vcpu->kvm;
unsigned long ptcr = kvmppc_get_gpr(vcpu, 4);
int srcu_idx;
long ret = H_SUCCESS;
srcu_idx = srcu_read_lock(&kvm->srcu);
/*
* Limit the partition table to 4096 entries (because that's what
* hardware supports), and check the base address.
*/
if ((ptcr & PRTS_MASK) > 12 - 8 ||
!kvm_is_visible_gfn(vcpu->kvm, (ptcr & PRTB_MASK) >> PAGE_SHIFT))
ret = H_PARAMETER;
srcu_read_unlock(&kvm->srcu, srcu_idx);
if (ret == H_SUCCESS)
kvm->arch.l1_ptcr = ptcr;
return ret;
}
/*
* Reload the partition table entry for a guest.
* Caller must hold gp->tlb_lock.
*/
static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp)
{
int ret;
struct patb_entry ptbl_entry;
unsigned long ptbl_addr;
struct kvm *kvm = gp->l1_host;
ret = -EFAULT;
ptbl_addr = (kvm->arch.l1_ptcr & PRTB_MASK) + (gp->l1_lpid << 4);
if (gp->l1_lpid < (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 8)))
ret = kvm_read_guest(kvm, ptbl_addr,
&ptbl_entry, sizeof(ptbl_entry));
if (ret) {
gp->l1_gr_to_hr = 0;
gp->process_table = 0;
} else {
gp->l1_gr_to_hr = be64_to_cpu(ptbl_entry.patb0);
gp->process_table = be64_to_cpu(ptbl_entry.patb1);
}
kvmhv_set_nested_ptbl(gp);
}
struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
{
struct kvm_nested_guest *gp;
long shadow_lpid;
gp = kzalloc(sizeof(*gp), GFP_KERNEL);
if (!gp)
return NULL;
gp->l1_host = kvm;
gp->l1_lpid = lpid;
mutex_init(&gp->tlb_lock);
gp->shadow_pgtable = pgd_alloc(kvm->mm);
if (!gp->shadow_pgtable)
goto out_free;
shadow_lpid = kvmppc_alloc_lpid();
if (shadow_lpid < 0)
goto out_free2;
gp->shadow_lpid = shadow_lpid;
memset(gp->prev_cpu, -1, sizeof(gp->prev_cpu));
return gp;
out_free2:
pgd_free(kvm->mm, gp->shadow_pgtable);
out_free:
kfree(gp);
return NULL;
}
/*
* Free up any resources allocated for a nested guest.
*/
static void kvmhv_release_nested(struct kvm_nested_guest *gp)
{
struct kvm *kvm = gp->l1_host;
if (gp->shadow_pgtable) {
/*
* No vcpu is using this struct and no call to
* kvmhv_get_nested can find this struct,
* so we don't need to hold kvm->mmu_lock.
*/
kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable,
gp->shadow_lpid);
pgd_free(kvm->mm, gp->shadow_pgtable);
}
kvmhv_set_ptbl_entry(gp->shadow_lpid, 0, 0);
kvmppc_free_lpid(gp->shadow_lpid);
kfree(gp);
}
static void kvmhv_remove_nested(struct kvm_nested_guest *gp)
{
struct kvm *kvm = gp->l1_host;
int lpid = gp->l1_lpid;
long ref;
spin_lock(&kvm->mmu_lock);
if (gp == kvm->arch.nested_guests[lpid]) {
kvm->arch.nested_guests[lpid] = NULL;
if (lpid == kvm->arch.max_nested_lpid) {
while (--lpid >= 0 && !kvm->arch.nested_guests[lpid])
;
kvm->arch.max_nested_lpid = lpid;
}
--gp->refcnt;
}
ref = gp->refcnt;
spin_unlock(&kvm->mmu_lock);
if (ref == 0)
kvmhv_release_nested(gp);
}
/*
* Free up all nested resources allocated for this guest.
* This is called with no vcpus of the guest running, when
* switching the guest to HPT mode or when destroying the
* guest.
*/
void kvmhv_release_all_nested(struct kvm *kvm)
{
int i;
struct kvm_nested_guest *gp;
struct kvm_nested_guest *freelist = NULL;
struct kvm_memory_slot *memslot;
int srcu_idx;
spin_lock(&kvm->mmu_lock);
for (i = 0; i <= kvm->arch.max_nested_lpid; i++) {
gp = kvm->arch.nested_guests[i];
if (!gp)
continue;
kvm->arch.nested_guests[i] = NULL;
if (--gp->refcnt == 0) {
gp->next = freelist;
freelist = gp;
}
}
kvm->arch.max_nested_lpid = -1;
spin_unlock(&kvm->mmu_lock);
while ((gp = freelist) != NULL) {
freelist = gp->next;
kvmhv_release_nested(gp);
}
srcu_idx = srcu_read_lock(&kvm->srcu);
kvm_for_each_memslot(memslot, kvm_memslots(kvm))
kvmhv_free_memslot_nest_rmap(memslot);
srcu_read_unlock(&kvm->srcu, srcu_idx);
}
/* caller must hold gp->tlb_lock */
static void kvmhv_flush_nested(struct kvm_nested_guest *gp)
{
struct kvm *kvm = gp->l1_host;
spin_lock(&kvm->mmu_lock);
kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable, gp->shadow_lpid);
spin_unlock(&kvm->mmu_lock);
kvmhv_flush_lpid(gp->shadow_lpid);
kvmhv_update_ptbl_cache(gp);
if (gp->l1_gr_to_hr == 0)
kvmhv_remove_nested(gp);
}
struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
bool create)
{
struct kvm_nested_guest *gp, *newgp;
if (l1_lpid >= KVM_MAX_NESTED_GUESTS ||
l1_lpid >= (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4)))
return NULL;
spin_lock(&kvm->mmu_lock);
gp = kvm->arch.nested_guests[l1_lpid];
if (gp)
++gp->refcnt;
spin_unlock(&kvm->mmu_lock);
if (gp || !create)
return gp;
newgp = kvmhv_alloc_nested(kvm, l1_lpid);
if (!newgp)
return NULL;
spin_lock(&kvm->mmu_lock);
if (kvm->arch.nested_guests[l1_lpid]) {
/* someone else beat us to it */
gp = kvm->arch.nested_guests[l1_lpid];
} else {
kvm->arch.nested_guests[l1_lpid] = newgp;
++newgp->refcnt;
gp = newgp;
newgp = NULL;
if (l1_lpid > kvm->arch.max_nested_lpid)
kvm->arch.max_nested_lpid = l1_lpid;
}
++gp->refcnt;
spin_unlock(&kvm->mmu_lock);
if (newgp)
kvmhv_release_nested(newgp);
return gp;
}
void kvmhv_put_nested(struct kvm_nested_guest *gp)
{
struct kvm *kvm = gp->l1_host;
long ref;
spin_lock(&kvm->mmu_lock);
ref = --gp->refcnt;
spin_unlock(&kvm->mmu_lock);
if (ref == 0)
kvmhv_release_nested(gp);
}
static struct kvm_nested_guest *kvmhv_find_nested(struct kvm *kvm, int lpid)
{
if (lpid > kvm->arch.max_nested_lpid)
return NULL;
return kvm->arch.nested_guests[lpid];
}
static inline bool kvmhv_n_rmap_is_equal(u64 rmap_1, u64 rmap_2)
{
return !((rmap_1 ^ rmap_2) & (RMAP_NESTED_LPID_MASK |
RMAP_NESTED_GPA_MASK));
}
void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp,
struct rmap_nested **n_rmap)
{
struct llist_node *entry = ((struct llist_head *) rmapp)->first;
struct rmap_nested *cursor;
u64 rmap, new_rmap = (*n_rmap)->rmap;
/* Are there any existing entries? */
if (!(*rmapp)) {
/* No -> use the rmap as a single entry */
*rmapp = new_rmap | RMAP_NESTED_IS_SINGLE_ENTRY;
return;
}
/* Do any entries match what we're trying to insert? */
for_each_nest_rmap_safe(cursor, entry, &rmap) {
if (kvmhv_n_rmap_is_equal(rmap, new_rmap))
return;
}
/* Do we need to create a list or just add the new entry? */
rmap = *rmapp;
if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
*rmapp = 0UL;
llist_add(&((*n_rmap)->list), (struct llist_head *) rmapp);
if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
(*n_rmap)->list.next = (struct llist_node *) rmap;
/* Set NULL so not freed by caller */
*n_rmap = NULL;
}
static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap,
unsigned long hpa, unsigned long mask)
{
struct kvm_nested_guest *gp;
unsigned long gpa;
unsigned int shift, lpid;
pte_t *ptep;
gpa = n_rmap & RMAP_NESTED_GPA_MASK;
lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;
gp = kvmhv_find_nested(kvm, lpid);
if (!gp)
return;
/* Find and invalidate the pte */
ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
/* Don't spuriously invalidate ptes if the pfn has changed */
if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa))
kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
}
static void kvmhv_remove_nest_rmap_list(struct kvm *kvm, unsigned long *rmapp,
unsigned long hpa, unsigned long mask)
{
struct llist_node *entry = llist_del_all((struct llist_head *) rmapp);
struct rmap_nested *cursor;
unsigned long rmap;
for_each_nest_rmap_safe(cursor, entry, &rmap) {
kvmhv_remove_nest_rmap(kvm, rmap, hpa, mask);
kfree(cursor);
}
}
/* called with kvm->mmu_lock held */
void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
struct kvm_memory_slot *memslot,
unsigned long gpa, unsigned long hpa,
unsigned long nbytes)
{
unsigned long gfn, end_gfn;
unsigned long addr_mask;
if (!memslot)
return;
gfn = (gpa >> PAGE_SHIFT) - memslot->base_gfn;
end_gfn = gfn + (nbytes >> PAGE_SHIFT);
addr_mask = PTE_RPN_MASK & ~(nbytes - 1);
hpa &= addr_mask;
for (; gfn < end_gfn; gfn++) {
unsigned long *rmap = &memslot->arch.rmap[gfn];
kvmhv_remove_nest_rmap_list(kvm, rmap, hpa, addr_mask);
}
}
static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free)
{
unsigned long page;
for (page = 0; page < free->npages; page++) {
unsigned long rmap, *rmapp = &free->arch.rmap[page];
struct rmap_nested *cursor;
struct llist_node *entry;
entry = llist_del_all((struct llist_head *) rmapp);
for_each_nest_rmap_safe(cursor, entry, &rmap)
kfree(cursor);
}
}
static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu *vcpu,
struct kvm_nested_guest *gp,
long gpa, int *shift_ret)
{
struct kvm *kvm = vcpu->kvm;
bool ret = false;
pte_t *ptep;
int shift;
spin_lock(&kvm->mmu_lock);
ptep = __find_linux_pte(gp->shadow_pgtable, gpa, NULL, &shift);
if (!shift)
shift = PAGE_SHIFT;
if (ptep && pte_present(*ptep)) {
kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
ret = true;
}
spin_unlock(&kvm->mmu_lock);
if (shift_ret)
*shift_ret = shift;
return ret;
}
static inline int get_ric(unsigned int instr)
{
return (instr >> 18) & 0x3;
}
static inline int get_prs(unsigned int instr)
{
return (instr >> 17) & 0x1;
}
static inline int get_r(unsigned int instr)
{
return (instr >> 16) & 0x1;
}
static inline int get_lpid(unsigned long r_val)
{
return r_val & 0xffffffff;
}
static inline int get_is(unsigned long r_val)
{
return (r_val >> 10) & 0x3;
}
static inline int get_ap(unsigned long r_val)
{
return (r_val >> 5) & 0x7;
}
static inline long get_epn(unsigned long r_val)
{
return r_val >> 12;
}
static int kvmhv_emulate_tlbie_tlb_addr(struct kvm_vcpu *vcpu, int lpid,
int ap, long epn)
{
struct kvm *kvm = vcpu->kvm;
struct kvm_nested_guest *gp;
long npages;
int shift, shadow_shift;
unsigned long addr;
shift = ap_to_shift(ap);
addr = epn << 12;
if (shift < 0)
/* Invalid ap encoding */
return -EINVAL;
addr &= ~((1UL << shift) - 1);
npages = 1UL << (shift - PAGE_SHIFT);
gp = kvmhv_get_nested(kvm, lpid, false);
if (!gp) /* No such guest -> nothing to do */
return 0;
mutex_lock(&gp->tlb_lock);
/* There may be more than one host page backing this single guest pte */
do {
kvmhv_invalidate_shadow_pte(vcpu, gp, addr, &shadow_shift);
npages -= 1UL << (shadow_shift - PAGE_SHIFT);
addr += 1UL << shadow_shift;
} while (npages > 0);
mutex_unlock(&gp->tlb_lock);
kvmhv_put_nested(gp);
return 0;
}
static void kvmhv_emulate_tlbie_lpid(struct kvm_vcpu *vcpu,
struct kvm_nested_guest *gp, int ric)
{
struct kvm *kvm = vcpu->kvm;
mutex_lock(&gp->tlb_lock);
switch (ric) {
case 0:
/* Invalidate TLB */
spin_lock(&kvm->mmu_lock);
kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable,
gp->shadow_lpid);
kvmhv_flush_lpid(gp->shadow_lpid);
spin_unlock(&kvm->mmu_lock);
break;
case 1:
/*
* Invalidate PWC
* We don't cache this -> nothing to do
*/
break;
case 2:
/* Invalidate TLB, PWC and caching of partition table entries */
kvmhv_flush_nested(gp);
break;
default:
break;
}
mutex_unlock(&gp->tlb_lock);
}
static void kvmhv_emulate_tlbie_all_lpid(struct kvm_vcpu *vcpu, int ric)
{
struct kvm *kvm = vcpu->kvm;
struct kvm_nested_guest *gp;
int i;
spin_lock(&kvm->mmu_lock);
for (i = 0; i <= kvm->arch.max_nested_lpid; i++) {
gp = kvm->arch.nested_guests[i];
if (gp) {
spin_unlock(&kvm->mmu_lock);
kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
spin_lock(&kvm->mmu_lock);
}
}
spin_unlock(&kvm->mmu_lock);
}
static int kvmhv_emulate_priv_tlbie(struct kvm_vcpu *vcpu, unsigned int instr,
unsigned long rsval, unsigned long rbval)
{
struct kvm *kvm = vcpu->kvm;
struct kvm_nested_guest *gp;
int r, ric, prs, is, ap;
int lpid;
long epn;
int ret = 0;
ric = get_ric(instr);
prs = get_prs(instr);
r = get_r(instr);
lpid = get_lpid(rsval);
is = get_is(rbval);
/*
* These cases are invalid and are not handled:
* r != 1 -> Only radix supported
* prs == 1 -> Not HV privileged
* ric == 3 -> No cluster bombs for radix
* is == 1 -> Partition scoped translations not associated with pid
* (!is) && (ric == 1 || ric == 2) -> Not supported by ISA
*/
if ((!r) || (prs) || (ric == 3) || (is == 1) ||
((!is) && (ric == 1 || ric == 2)))
return -EINVAL;
switch (is) {
case 0:
/*
* We know ric == 0
* Invalidate TLB for a given target address
*/
epn = get_epn(rbval);
ap = get_ap(rbval);
ret = kvmhv_emulate_tlbie_tlb_addr(vcpu, lpid, ap, epn);
break;
case 2:
/* Invalidate matching LPID */
gp = kvmhv_get_nested(kvm, lpid, false);
if (gp) {
kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
kvmhv_put_nested(gp);
}
break;
case 3:
/* Invalidate ALL LPIDs */
kvmhv_emulate_tlbie_all_lpid(vcpu, ric);
break;
default:
ret = -EINVAL;
break;
}
return ret;
}
/*
* This handles the H_TLB_INVALIDATE hcall.
* Parameters are (r4) tlbie instruction code, (r5) rS contents,
* (r6) rB contents.
*/
long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu)
{
int ret;
ret = kvmhv_emulate_priv_tlbie(vcpu, kvmppc_get_gpr(vcpu, 4),
kvmppc_get_gpr(vcpu, 5), kvmppc_get_gpr(vcpu, 6));
if (ret)
return H_PARAMETER;
return H_SUCCESS;
}
/* Used to convert a nested guest real address to a L1 guest real address */
static int kvmhv_translate_addr_nested(struct kvm_vcpu *vcpu,
struct kvm_nested_guest *gp,
unsigned long n_gpa, unsigned long dsisr,
struct kvmppc_pte *gpte_p)
{
u64 fault_addr, flags = dsisr & DSISR_ISSTORE;
int ret;
ret = kvmppc_mmu_walk_radix_tree(vcpu, n_gpa, gpte_p, gp->l1_gr_to_hr,
&fault_addr);
if (ret) {
/* We didn't find a pte */
if (ret == -EINVAL) {
/* Unsupported mmu config */
flags |= DSISR_UNSUPP_MMU;
} else if (ret == -ENOENT) {
/* No translation found */
flags |= DSISR_NOHPTE;
} else if (ret == -EFAULT) {
/* Couldn't access L1 real address */
flags |= DSISR_PRTABLE_FAULT;
vcpu->arch.fault_gpa = fault_addr;
} else {
/* Unknown error */
return ret;
}
goto forward_to_l1;
} else {
/* We found a pte -> check permissions */
if (dsisr & DSISR_ISSTORE) {
/* Can we write? */
if (!gpte_p->may_write) {
flags |= DSISR_PROTFAULT;
goto forward_to_l1;
}
} else if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
/* Can we execute? */
if (!gpte_p->may_execute) {
flags |= SRR1_ISI_N_OR_G;
goto forward_to_l1;
}
} else {
/* Can we read? */
if (!gpte_p->may_read && !gpte_p->may_write) {
flags |= DSISR_PROTFAULT;
goto forward_to_l1;
}
}
}
return 0;
forward_to_l1:
vcpu->arch.fault_dsisr = flags;
if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
vcpu->arch.shregs.msr &= ~0x783f0000ul;
vcpu->arch.shregs.msr |= flags;
}
return RESUME_HOST;
}
static long kvmhv_handle_nested_set_rc(struct kvm_vcpu *vcpu,
struct kvm_nested_guest *gp,
unsigned long n_gpa,
struct kvmppc_pte gpte,
unsigned long dsisr)
{
struct kvm *kvm = vcpu->kvm;
bool writing = !!(dsisr & DSISR_ISSTORE);
u64 pgflags;
bool ret;
/* Are the rc bits set in the L1 partition scoped pte? */
pgflags = _PAGE_ACCESSED;
if (writing)
pgflags |= _PAGE_DIRTY;
if (pgflags & ~gpte.rc)
return RESUME_HOST;
spin_lock(&kvm->mmu_lock);
/* Set the rc bit in the pte of our (L0) pgtable for the L1 guest */
ret = kvmppc_hv_handle_set_rc(kvm, kvm->arch.pgtable, writing,
gpte.raddr, kvm->arch.lpid);
spin_unlock(&kvm->mmu_lock);
if (!ret)
return -EINVAL;
/* Set the rc bit in the pte of the shadow_pgtable for the nested guest */
ret = kvmppc_hv_handle_set_rc(kvm, gp->shadow_pgtable, writing, n_gpa,
gp->shadow_lpid);
if (!ret)
return -EINVAL;
return 0;
}
static inline int kvmppc_radix_level_to_shift(int level)
{
switch (level) {
case 2:
return PUD_SHIFT;
case 1:
return PMD_SHIFT;
default:
return PAGE_SHIFT;
}
}
static inline int kvmppc_radix_shift_to_level(int shift)
{
if (shift == PUD_SHIFT)
return 2;
if (shift == PMD_SHIFT)
return 1;
if (shift == PAGE_SHIFT)
return 0;
WARN_ON_ONCE(1);
return 0;
}
/* called with gp->tlb_lock held */
static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu,
struct kvm_nested_guest *gp)
{
struct kvm *kvm = vcpu->kvm;
struct kvm_memory_slot *memslot;
struct rmap_nested *n_rmap;
struct kvmppc_pte gpte;
pte_t pte, *pte_p;
unsigned long mmu_seq;
unsigned long dsisr = vcpu->arch.fault_dsisr;
unsigned long ea = vcpu->arch.fault_dar;
unsigned long *rmapp;
unsigned long n_gpa, gpa, gfn, perm = 0UL;
unsigned int shift, l1_shift, level;
bool writing = !!(dsisr & DSISR_ISSTORE);
bool kvm_ro = false;
long int ret;
if (!gp->l1_gr_to_hr) {
kvmhv_update_ptbl_cache(gp);
if (!gp->l1_gr_to_hr)
return RESUME_HOST;
}
/* Convert the nested guest real address into a L1 guest real address */
n_gpa = vcpu->arch.fault_gpa & ~0xF000000000000FFFULL;
if (!(dsisr & DSISR_PRTABLE_FAULT))
n_gpa |= ea & 0xFFF;
ret = kvmhv_translate_addr_nested(vcpu, gp, n_gpa, dsisr, &gpte);
/*
* If the hardware found a translation but we don't now have a usable
* translation in the l1 partition-scoped tree, remove the shadow pte
* and let the guest retry.
*/
if (ret == RESUME_HOST &&
(dsisr & (DSISR_PROTFAULT | DSISR_BADACCESS | DSISR_NOEXEC_OR_G |
DSISR_BAD_COPYPASTE)))
goto inval;
if (ret)
return ret;
/* Failed to set the reference/change bits */
if (dsisr & DSISR_SET_RC) {
ret = kvmhv_handle_nested_set_rc(vcpu, gp, n_gpa, gpte, dsisr);
if (ret == RESUME_HOST)
return ret;
if (ret)
goto inval;
dsisr &= ~DSISR_SET_RC;
if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
DSISR_PROTFAULT)))
return RESUME_GUEST;
}
/*
* We took an HISI or HDSI while we were running a nested guest which
* means we have no partition scoped translation for that. This means
* we need to insert a pte for the mapping into our shadow_pgtable.
*/
l1_shift = gpte.page_shift;
if (l1_shift < PAGE_SHIFT) {
/* We don't support l1 using a page size smaller than our own */
pr_err("KVM: L1 guest page shift (%d) less than our own (%d)\n",
l1_shift, PAGE_SHIFT);
return -EINVAL;
}
gpa = gpte.raddr;
gfn = gpa >> PAGE_SHIFT;
/* 1. Get the corresponding host memslot */
memslot = gfn_to_memslot(kvm, gfn);
if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS)) {
/* unusual error -> reflect to the guest as a DSI */
kvmppc_core_queue_data_storage(vcpu, ea, dsisr);
return RESUME_GUEST;
}
/* passthrough of emulated MMIO case... */
pr_err("emulated MMIO passthrough?\n");
return -EINVAL;
}
if (memslot->flags & KVM_MEM_READONLY) {
if (writing) {
/* Give the guest a DSI */
kvmppc_core_queue_data_storage(vcpu, ea,
DSISR_ISSTORE | DSISR_PROTFAULT);
return RESUME_GUEST;
}
kvm_ro = true;
}
/* 2. Find the host pte for this L1 guest real address */
/* Used to check for invalidations in progress */
mmu_seq = kvm->mmu_notifier_seq;
smp_rmb();
/* See if can find translation in our partition scoped tables for L1 */
pte = __pte(0);
spin_lock(&kvm->mmu_lock);
pte_p = __find_linux_pte(kvm->arch.pgtable, gpa, NULL, &shift);
if (!shift)
shift = PAGE_SHIFT;
if (pte_p)
pte = *pte_p;
spin_unlock(&kvm->mmu_lock);
if (!pte_present(pte) || (writing && !(pte_val(pte) & _PAGE_WRITE))) {
/* No suitable pte found -> try to insert a mapping */
ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot,
writing, kvm_ro, &pte, &level);
if (ret == -EAGAIN)
return RESUME_GUEST;
else if (ret)
return ret;
shift = kvmppc_radix_level_to_shift(level);
}
/* 3. Compute the pte we need to insert for nest_gpa -> host r_addr */
/* The permissions are the combination of the host and L1 guest ptes */
perm |= gpte.may_read ? 0UL : _PAGE_READ;
perm |= gpte.may_write ? 0UL : _PAGE_WRITE;
perm |= gpte.may_execute ? 0UL : _PAGE_EXEC;
pte = __pte(pte_val(pte) & ~perm);
/* What size pte can we insert? */
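/*
* The host mapping may be larger than the L1 guest's page. If so,
* insert at most a PMD-size (or base page size) mapping, no larger
* than the L1 page, and fold the extra guest-physical offset bits
* into the pte.
*/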
if (shift > l1_shift) {
u64 mask;
unsigned int actual_shift = PAGE_SHIFT;
if (PMD_SHIFT < l1_shift)
actual_shift = PMD_SHIFT;
mask = (1UL << shift) - (1UL << actual_shift);
pte = __pte(pte_val(pte) | (gpa & mask));
shift = actual_shift;
}
level = kvmppc_radix_shift_to_level(shift);
n_gpa &= ~((1UL << shift) - 1);
/* 4. Insert the pte into our shadow_pgtable */
n_rmap = kzalloc(sizeof(*n_rmap), GFP_KERNEL);
if (!n_rmap)
return RESUME_GUEST; /* Let the guest try again */
n_rmap->rmap = (n_gpa & RMAP_NESTED_GPA_MASK) |
(((unsigned long) gp->l1_lpid) << RMAP_NESTED_LPID_SHIFT);
rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
ret = kvmppc_create_pte(kvm, gp->shadow_pgtable, pte, n_gpa, level,
mmu_seq, gp->shadow_lpid, rmapp, &n_rmap);
if (n_rmap)
kfree(n_rmap);
if (ret == -EAGAIN)
ret = RESUME_GUEST; /* Let the guest try again */
return ret;
inval:
kvmhv_invalidate_shadow_pte(vcpu, gp, n_gpa, NULL);
return RESUME_GUEST;
}
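As a side note on step 3 above, the permission combination can be illustrated with a small standalone sketch. The struct, the bit values and the helper below are simplified stand-ins for illustration only, not the kernel's definitions; the point is that the shadow pte is never more permissive than either the host pte or the L1 guest pte.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-ins for the kernel's _PAGE_* permission bits. */
#define PERM_READ  0x1UL
#define PERM_WRITE 0x2UL
#define PERM_EXEC  0x4UL

struct l1_pte_info {		/* what the L1 tables said about n_gpa */
	bool may_read;
	bool may_write;
	bool may_execute;
};

/* Strip from the host pte every permission the L1 guest did not grant. */
static uint64_t combine_perms(uint64_t host_pte, struct l1_pte_info gpte)
{
	uint64_t deny = 0;

	deny |= gpte.may_read    ? 0 : PERM_READ;
	deny |= gpte.may_write   ? 0 : PERM_WRITE;
	deny |= gpte.may_execute ? 0 : PERM_EXEC;
	return host_pte & ~deny;
}

int main(void)
{
	/* Host allows RWX, but L1 mapped the page read/execute only. */
	struct l1_pte_info gpte = {
		.may_read = true, .may_write = false, .may_execute = true,
	};
	uint64_t shadow = combine_perms(PERM_READ | PERM_WRITE | PERM_EXEC, gpte);

	printf("shadow pte permissions: %#llx\n", (unsigned long long)shadow);
	return 0;	/* prints 0x5: read + execute, write stripped */
}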
long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu)
{
struct kvm_nested_guest *gp = vcpu->arch.nested;
long int ret;
mutex_lock(&gp->tlb_lock);
ret = __kvmhv_nested_page_fault(vcpu, gp);
mutex_unlock(&gp->tlb_lock);
return ret;
}
int kvmhv_nested_next_lpid(struct kvm *kvm, int lpid)
{
int ret = -1;
spin_lock(&kvm->mmu_lock);
while (++lpid <= kvm->arch.max_nested_lpid) {
if (kvm->arch.nested_guests[lpid]) {
ret = lpid;
break;
}
}
spin_unlock(&kvm->mmu_lock);
return ret;
}
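kvmhv_nested_next_lpid() scans kvm->arch.nested_guests[] for the next live entry above the lpid passed in and returns -1 when there is none. The toy model below (plain C, no locking, array size and names invented for illustration) shows the iteration contract a caller would rely on.

#include <stdio.h>

#define MAX_NESTED_LPID 8	/* toy limit, not the kernel's */

/* Toy stand-in for kvm->arch.nested_guests[]: non-NULL slots are live. */
static void *nested_guests[MAX_NESTED_LPID + 1];

/* Same contract as kvmhv_nested_next_lpid(): next live lpid, or -1. */
static int next_lpid(int lpid)
{
	while (++lpid <= MAX_NESTED_LPID)
		if (nested_guests[lpid])
			return lpid;
	return -1;
}

int main(void)
{
	int dummy, lpid = 0;

	nested_guests[2] = &dummy;
	nested_guests[5] = &dummy;

	while ((lpid = next_lpid(lpid)) >= 0)	/* visits lpid 2, then 5 */
		printf("nested guest with L1 lpid %d is active\n", lpid);
	return 0;
}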
...@@ -177,6 +177,7 @@ void kvmppc_subcore_enter_guest(void) ...@@ -177,6 +177,7 @@ void kvmppc_subcore_enter_guest(void)
local_paca->sibling_subcore_state->in_guest[subcore_id] = 1; local_paca->sibling_subcore_state->in_guest[subcore_id] = 1;
} }
EXPORT_SYMBOL_GPL(kvmppc_subcore_enter_guest);
void kvmppc_subcore_exit_guest(void) void kvmppc_subcore_exit_guest(void)
{ {
...@@ -187,6 +188,7 @@ void kvmppc_subcore_exit_guest(void) ...@@ -187,6 +188,7 @@ void kvmppc_subcore_exit_guest(void)
local_paca->sibling_subcore_state->in_guest[subcore_id] = 0; local_paca->sibling_subcore_state->in_guest[subcore_id] = 0;
} }
EXPORT_SYMBOL_GPL(kvmppc_subcore_exit_guest);
static bool kvmppc_tb_resync_required(void) static bool kvmppc_tb_resync_required(void)
{ {
...@@ -331,5 +333,13 @@ long kvmppc_realmode_hmi_handler(void) ...@@ -331,5 +333,13 @@ long kvmppc_realmode_hmi_handler(void)
} else { } else {
wait_for_tb_resync(); wait_for_tb_resync();
} }
/*
* Reset tb_offset_applied so the guest exit code won't try
* to subtract the previous timebase offset from the timebase.
*/
if (local_paca->kvm_hstate.kvm_vcore)
local_paca->kvm_hstate.kvm_vcore->tb_offset_applied = 0;
return 0; return 0;
} }
...@@ -136,7 +136,7 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, ...@@ -136,7 +136,7 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
/* Mark the target VCPU as having an interrupt pending */ /* Mark the target VCPU as having an interrupt pending */
vcpu->stat.queue_intr++; vcpu->stat.queue_intr++;
set_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); set_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);
/* Kick self ? Just set MER and return */ /* Kick self ? Just set MER and return */
if (vcpu == this_vcpu) { if (vcpu == this_vcpu) {
...@@ -170,8 +170,7 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu, ...@@ -170,8 +170,7 @@ static void icp_rm_set_vcpu_irq(struct kvm_vcpu *vcpu,
static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu) static void icp_rm_clr_vcpu_irq(struct kvm_vcpu *vcpu)
{ {
/* Note: Only called on self ! */ /* Note: Only called on self ! */
clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL, clear_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions);
&vcpu->arch.pending_exceptions);
mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_MER); mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~LPCR_MER);
} }
...@@ -768,6 +767,14 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again) ...@@ -768,6 +767,14 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, __be32 xirr, bool *again)
void __iomem *xics_phys; void __iomem *xics_phys;
int64_t rc; int64_t rc;
if (kvmhv_on_pseries()) {
unsigned long retbuf[PLPAR_HCALL_BUFSIZE];
iosync();
plpar_hcall_raw(H_EOI, retbuf, hwirq);
return;
}
rc = pnv_opal_pci_msi_eoi(c, hwirq); rc = pnv_opal_pci_msi_eoi(c, hwirq);
if (rc) if (rc)
......
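The hunk above teaches icp_eoi() that, when the HV module itself is running as a guest (kvmhv_on_pseries()), the EOI has to go out as an H_EOI hypercall rather than through OPAL. A toy dispatcher capturing just that decision, with placeholder functions instead of the real plpar/OPAL calls, might look like this:

#include <stdbool.h>
#include <stdio.h>

/* Placeholders for illustration; not the kernel's implementations. */
static void hcall_eoi(unsigned int hwirq) { printf("H_EOI(%u)\n", hwirq); }
static void opal_eoi(unsigned int hwirq)  { printf("OPAL EOI(%u)\n", hwirq); }

static bool on_pseries;		/* stands in for kvmhv_on_pseries() */

/* Nested HV has no direct hardware access, so the EOI becomes a hypercall. */
static void icp_eoi_example(unsigned int hwirq)
{
	if (on_pseries) {
		hcall_eoi(hwirq);
		return;
	}
	opal_eoi(hwirq);
}

int main(void)
{
	on_pseries = true;
	icp_eoi_example(23);
	return 0;
}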
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
#include <asm/exception-64s.h> #include <asm/exception-64s.h>
#include <asm/kvm_book3s_asm.h> #include <asm/kvm_book3s_asm.h>
#include <asm/book3s/64/mmu-hash.h> #include <asm/book3s/64/mmu-hash.h>
#include <asm/export.h>
#include <asm/tm.h> #include <asm/tm.h>
#include <asm/opal.h> #include <asm/opal.h>
#include <asm/xive-regs.h> #include <asm/xive-regs.h>
...@@ -46,8 +47,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) ...@@ -46,8 +47,9 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
#define NAPPING_NOVCPU 2 #define NAPPING_NOVCPU 2
/* Stack frame offsets for kvmppc_hv_entry */ /* Stack frame offsets for kvmppc_hv_entry */
#define SFS 160 #define SFS 208
#define STACK_SLOT_TRAP (SFS-4) #define STACK_SLOT_TRAP (SFS-4)
#define STACK_SLOT_SHORT_PATH (SFS-8)
#define STACK_SLOT_TID (SFS-16) #define STACK_SLOT_TID (SFS-16)
#define STACK_SLOT_PSSCR (SFS-24) #define STACK_SLOT_PSSCR (SFS-24)
#define STACK_SLOT_PID (SFS-32) #define STACK_SLOT_PID (SFS-32)
...@@ -56,6 +58,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) ...@@ -56,6 +58,8 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
#define STACK_SLOT_DAWR (SFS-56) #define STACK_SLOT_DAWR (SFS-56)
#define STACK_SLOT_DAWRX (SFS-64) #define STACK_SLOT_DAWRX (SFS-64)
#define STACK_SLOT_HFSCR (SFS-72) #define STACK_SLOT_HFSCR (SFS-72)
/* the following is used by the P9 short path */
#define STACK_SLOT_NVGPRS (SFS-152) /* 18 gprs */
/* /*
* Call kvmppc_hv_entry in real mode. * Call kvmppc_hv_entry in real mode.
...@@ -113,45 +117,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) ...@@ -113,45 +117,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
mtspr SPRN_SPRG_VDSO_WRITE,r3 mtspr SPRN_SPRG_VDSO_WRITE,r3
/* Reload the host's PMU registers */ /* Reload the host's PMU registers */
lbz r4, PACA_PMCINUSE(r13) /* is the host using the PMU? */ bl kvmhv_load_host_pmu
cmpwi r4, 0
beq 23f /* skip if not */
BEGIN_FTR_SECTION
ld r3, HSTATE_MMCR0(r13)
andi. r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
cmpwi r4, MMCR0_PMAO
beql kvmppc_fix_pmao
END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
lwz r3, HSTATE_PMC1(r13)
lwz r4, HSTATE_PMC2(r13)
lwz r5, HSTATE_PMC3(r13)
lwz r6, HSTATE_PMC4(r13)
lwz r8, HSTATE_PMC5(r13)
lwz r9, HSTATE_PMC6(r13)
mtspr SPRN_PMC1, r3
mtspr SPRN_PMC2, r4
mtspr SPRN_PMC3, r5
mtspr SPRN_PMC4, r6
mtspr SPRN_PMC5, r8
mtspr SPRN_PMC6, r9
ld r3, HSTATE_MMCR0(r13)
ld r4, HSTATE_MMCR1(r13)
ld r5, HSTATE_MMCRA(r13)
ld r6, HSTATE_SIAR(r13)
ld r7, HSTATE_SDAR(r13)
mtspr SPRN_MMCR1, r4
mtspr SPRN_MMCRA, r5
mtspr SPRN_SIAR, r6
mtspr SPRN_SDAR, r7
BEGIN_FTR_SECTION
ld r8, HSTATE_MMCR2(r13)
ld r9, HSTATE_SIER(r13)
mtspr SPRN_MMCR2, r8
mtspr SPRN_SIER, r9
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
mtspr SPRN_MMCR0, r3
isync
23:
/* /*
* Reload DEC. HDEC interrupts were disabled when * Reload DEC. HDEC interrupts were disabled when
...@@ -796,66 +762,23 @@ BEGIN_FTR_SECTION ...@@ -796,66 +762,23 @@ BEGIN_FTR_SECTION
b 91f b 91f
END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
/* /*
* NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
*/ */
mr r3, r4 mr r3, r4
ld r4, VCPU_MSR(r3) ld r4, VCPU_MSR(r3)
li r5, 0 /* don't preserve non-vol regs */
bl kvmppc_restore_tm_hv bl kvmppc_restore_tm_hv
nop
ld r4, HSTATE_KVM_VCPU(r13) ld r4, HSTATE_KVM_VCPU(r13)
91: 91:
#endif #endif
/* Load guest PMU registers */ /* Load guest PMU registers; r4 = vcpu pointer here */
/* R4 is live here (vcpu pointer) */ mr r3, r4
li r3, 1 bl kvmhv_load_guest_pmu
sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */
isync
BEGIN_FTR_SECTION
ld r3, VCPU_MMCR(r4)
andi. r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
cmpwi r5, MMCR0_PMAO
beql kvmppc_fix_pmao
END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
lwz r3, VCPU_PMC(r4) /* always load up guest PMU registers */
lwz r5, VCPU_PMC + 4(r4) /* to prevent information leak */
lwz r6, VCPU_PMC + 8(r4)
lwz r7, VCPU_PMC + 12(r4)
lwz r8, VCPU_PMC + 16(r4)
lwz r9, VCPU_PMC + 20(r4)
mtspr SPRN_PMC1, r3
mtspr SPRN_PMC2, r5
mtspr SPRN_PMC3, r6
mtspr SPRN_PMC4, r7
mtspr SPRN_PMC5, r8
mtspr SPRN_PMC6, r9
ld r3, VCPU_MMCR(r4)
ld r5, VCPU_MMCR + 8(r4)
ld r6, VCPU_MMCR + 16(r4)
ld r7, VCPU_SIAR(r4)
ld r8, VCPU_SDAR(r4)
mtspr SPRN_MMCR1, r5
mtspr SPRN_MMCRA, r6
mtspr SPRN_SIAR, r7
mtspr SPRN_SDAR, r8
BEGIN_FTR_SECTION
ld r5, VCPU_MMCR + 24(r4)
ld r6, VCPU_SIER(r4)
mtspr SPRN_MMCR2, r5
mtspr SPRN_SIER, r6
BEGIN_FTR_SECTION_NESTED(96)
lwz r7, VCPU_PMC + 24(r4)
lwz r8, VCPU_PMC + 28(r4)
ld r9, VCPU_MMCR + 32(r4)
mtspr SPRN_SPMC1, r7
mtspr SPRN_SPMC2, r8
mtspr SPRN_MMCRS, r9
END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
mtspr SPRN_MMCR0, r3
isync
/* Load up FP, VMX and VSX registers */ /* Load up FP, VMX and VSX registers */
ld r4, HSTATE_KVM_VCPU(r13)
bl kvmppc_load_fp bl kvmppc_load_fp
ld r14, VCPU_GPR(R14)(r4) ld r14, VCPU_GPR(R14)(r4)
...@@ -1100,73 +1023,40 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) ...@@ -1100,73 +1023,40 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
no_xive: no_xive:
#endif /* CONFIG_KVM_XICS */ #endif /* CONFIG_KVM_XICS */
deliver_guest_interrupt: li r0, 0
ld r6, VCPU_CTR(r4) stw r0, STACK_SLOT_SHORT_PATH(r1)
ld r7, VCPU_XER(r4)
mtctr r6
mtxer r7
kvmppc_cede_reentry: /* r4 = vcpu, r13 = paca */ deliver_guest_interrupt: /* r4 = vcpu, r13 = paca */
ld r10, VCPU_PC(r4) /* Check if we can deliver an external or decrementer interrupt now */
ld r11, VCPU_MSR(r4) ld r0, VCPU_PENDING_EXC(r4)
BEGIN_FTR_SECTION
/* On POWER9, also check for emulated doorbell interrupt */
lbz r3, VCPU_DBELL_REQ(r4)
or r0, r0, r3
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
cmpdi r0, 0
beq 71f
mr r3, r4
bl kvmppc_guest_entry_inject_int
ld r4, HSTATE_KVM_VCPU(r13)
71:
ld r6, VCPU_SRR0(r4) ld r6, VCPU_SRR0(r4)
ld r7, VCPU_SRR1(r4) ld r7, VCPU_SRR1(r4)
mtspr SPRN_SRR0, r6 mtspr SPRN_SRR0, r6
mtspr SPRN_SRR1, r7 mtspr SPRN_SRR1, r7
fast_guest_entry_c:
ld r10, VCPU_PC(r4)
ld r11, VCPU_MSR(r4)
/* r11 = vcpu->arch.msr & ~MSR_HV */ /* r11 = vcpu->arch.msr & ~MSR_HV */
rldicl r11, r11, 63 - MSR_HV_LG, 1 rldicl r11, r11, 63 - MSR_HV_LG, 1
rotldi r11, r11, 1 + MSR_HV_LG rotldi r11, r11, 1 + MSR_HV_LG
ori r11, r11, MSR_ME ori r11, r11, MSR_ME
/* Check if we can deliver an external or decrementer interrupt now */ ld r6, VCPU_CTR(r4)
ld r0, VCPU_PENDING_EXC(r4) ld r7, VCPU_XER(r4)
rldicl r0, r0, 64 - BOOK3S_IRQPRIO_EXTERNAL_LEVEL, 63 mtctr r6
cmpdi cr1, r0, 0 mtxer r7
andi. r8, r11, MSR_EE
mfspr r8, SPRN_LPCR
/* Insert EXTERNAL_LEVEL bit into LPCR at the MER bit position */
rldimi r8, r0, LPCR_MER_SH, 63 - LPCR_MER_SH
mtspr SPRN_LPCR, r8
isync
beq 5f
li r0, BOOK3S_INTERRUPT_EXTERNAL
bne cr1, 12f
mfspr r0, SPRN_DEC
BEGIN_FTR_SECTION
/* On POWER9 check whether the guest has large decrementer enabled */
andis. r8, r8, LPCR_LD@h
bne 15f
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
extsw r0, r0
15: cmpdi r0, 0
li r0, BOOK3S_INTERRUPT_DECREMENTER
bge 5f
12: mtspr SPRN_SRR0, r10
mr r10,r0
mtspr SPRN_SRR1, r11
mr r9, r4
bl kvmppc_msr_interrupt
5:
BEGIN_FTR_SECTION
b fast_guest_return
END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
/* On POWER9, check for pending doorbell requests */
lbz r0, VCPU_DBELL_REQ(r4)
cmpwi r0, 0
beq fast_guest_return
ld r5, HSTATE_KVM_VCORE(r13)
/* Set DPDES register so the CPU will take a doorbell interrupt */
li r0, 1
mtspr SPRN_DPDES, r0
std r0, VCORE_DPDES(r5)
/* Make sure other cpus see vcore->dpdes set before dbell req clear */
lwsync
/* Clear the pending doorbell request */
li r0, 0
stb r0, VCPU_DBELL_REQ(r4)
/* /*
* Required state: * Required state:
...@@ -1202,7 +1092,7 @@ BEGIN_FTR_SECTION ...@@ -1202,7 +1092,7 @@ BEGIN_FTR_SECTION
END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
ld r5, VCPU_LR(r4) ld r5, VCPU_LR(r4)
lwz r6, VCPU_CR(r4) ld r6, VCPU_CR(r4)
mtlr r5 mtlr r5
mtcr r6 mtcr r6
...@@ -1234,6 +1124,83 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) ...@@ -1234,6 +1124,83 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
HRFI_TO_GUEST HRFI_TO_GUEST
b . b .
/*
* Enter the guest on a P9 or later system where we have exactly
* one vcpu per vcore and we don't need to go to real mode
* (which implies that host and guest are both using radix MMU mode).
* r3 = vcpu pointer
* Most SPRs and all the VSRs have been loaded already.
*/
_GLOBAL(__kvmhv_vcpu_entry_p9)
EXPORT_SYMBOL_GPL(__kvmhv_vcpu_entry_p9)
mflr r0
std r0, PPC_LR_STKOFF(r1)
stdu r1, -SFS(r1)
li r0, 1
stw r0, STACK_SLOT_SHORT_PATH(r1)
std r3, HSTATE_KVM_VCPU(r13)
mfcr r4
stw r4, SFS+8(r1)
std r1, HSTATE_HOST_R1(r13)
reg = 14
.rept 18
std reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
reg = reg + 1
.endr
reg = 14
.rept 18
ld reg, __VCPU_GPR(reg)(r3)
reg = reg + 1
.endr
mfmsr r10
std r10, HSTATE_HOST_MSR(r13)
mr r4, r3
b fast_guest_entry_c
guest_exit_short_path:
li r0, KVM_GUEST_MODE_NONE
stb r0, HSTATE_IN_GUEST(r13)
reg = 14
.rept 18
std reg, __VCPU_GPR(reg)(r9)
reg = reg + 1
.endr
reg = 14
.rept 18
ld reg, STACK_SLOT_NVGPRS + ((reg - 14) * 8)(r1)
reg = reg + 1
.endr
lwz r4, SFS+8(r1)
mtcr r4
mr r3, r12 /* trap number */
addi r1, r1, SFS
ld r0, PPC_LR_STKOFF(r1)
mtlr r0
/* If we are in real mode, do a rfid to get back to the caller */
mfmsr r4
andi. r5, r4, MSR_IR
bnelr
rldicl r5, r4, 64 - MSR_TS_S_LG, 62 /* extract TS field */
mtspr SPRN_SRR0, r0
ld r10, HSTATE_HOST_MSR(r13)
rldimi r10, r5, MSR_TS_S_LG, 63 - MSR_TS_T_LG
mtspr SPRN_SRR1, r10
RFI_TO_KERNEL
b .
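The C-side caller of __kvmhv_vcpu_entry_p9 lives in book3s_hv.c, which is not among the hunks shown here, so the wrapper below is only a hedged sketch: the prototype and the surrounding steps are assumptions for illustration, and only the entry point itself comes from the assembly above.

struct kvm_vcpu;				/* opaque for this sketch */
extern int __kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu);

/* Hypothetical wrapper: all SPRs and VSRs must already hold guest values. */
static int example_p9_guest_run(struct kvm_vcpu *vcpu)
{
	int trap = __kvmhv_vcpu_entry_p9(vcpu);	/* returns the exit trap (r12 above) */

	/* guest register state is back in the vcpu struct; handle 'trap' in C */
	return trap;
}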
secondary_too_late: secondary_too_late:
li r12, 0 li r12, 0
stw r12, STACK_SLOT_TRAP(r1) stw r12, STACK_SLOT_TRAP(r1)
...@@ -1313,7 +1280,7 @@ kvmppc_interrupt_hv: ...@@ -1313,7 +1280,7 @@ kvmppc_interrupt_hv:
std r3, VCPU_GPR(R12)(r9) std r3, VCPU_GPR(R12)(r9)
/* CR is in the high half of r12 */ /* CR is in the high half of r12 */
srdi r4, r12, 32 srdi r4, r12, 32
stw r4, VCPU_CR(r9) std r4, VCPU_CR(r9)
BEGIN_FTR_SECTION BEGIN_FTR_SECTION
ld r3, HSTATE_CFAR(r13) ld r3, HSTATE_CFAR(r13)
std r3, VCPU_CFAR(r9) std r3, VCPU_CFAR(r9)
...@@ -1387,18 +1354,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ...@@ -1387,18 +1354,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
std r3, VCPU_CTR(r9) std r3, VCPU_CTR(r9)
std r4, VCPU_XER(r9) std r4, VCPU_XER(r9)
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM /* Save more register state */
/* For softpatch interrupt, go off and do TM instruction emulation */ mfdar r3
cmpwi r12, BOOK3S_INTERRUPT_HV_SOFTPATCH mfdsisr r4
beq kvmppc_tm_emul std r3, VCPU_DAR(r9)
#endif stw r4, VCPU_DSISR(r9)
/* If this is a page table miss then see if it's theirs or ours */ /* If this is a page table miss then see if it's theirs or ours */
cmpwi r12, BOOK3S_INTERRUPT_H_DATA_STORAGE cmpwi r12, BOOK3S_INTERRUPT_H_DATA_STORAGE
beq kvmppc_hdsi beq kvmppc_hdsi
std r3, VCPU_FAULT_DAR(r9)
stw r4, VCPU_FAULT_DSISR(r9)
cmpwi r12, BOOK3S_INTERRUPT_H_INST_STORAGE cmpwi r12, BOOK3S_INTERRUPT_H_INST_STORAGE
beq kvmppc_hisi beq kvmppc_hisi
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
/* For softpatch interrupt, go off and do TM instruction emulation */
cmpwi r12, BOOK3S_INTERRUPT_HV_SOFTPATCH
beq kvmppc_tm_emul
#endif
/* See if this is a leftover HDEC interrupt */ /* See if this is a leftover HDEC interrupt */
cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER cmpwi r12,BOOK3S_INTERRUPT_HV_DECREMENTER
bne 2f bne 2f
...@@ -1418,10 +1393,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ...@@ -1418,10 +1393,14 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
BEGIN_FTR_SECTION BEGIN_FTR_SECTION
PPC_MSGSYNC PPC_MSGSYNC
lwsync lwsync
/* always exit if we're running a nested guest */
ld r0, VCPU_NESTED(r9)
cmpdi r0, 0
bne guest_exit_cont
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
lbz r0, HSTATE_HOST_IPI(r13) lbz r0, HSTATE_HOST_IPI(r13)
cmpwi r0, 0 cmpwi r0, 0
beq 4f beq maybe_reenter_guest
b guest_exit_cont b guest_exit_cont
3: 3:
/* If it's a hypervisor facility unavailable interrupt, save HFSCR */ /* If it's a hypervisor facility unavailable interrupt, save HFSCR */
...@@ -1433,82 +1412,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) ...@@ -1433,82 +1412,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
14: 14:
/* External interrupt ? */ /* External interrupt ? */
cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL cmpwi r12, BOOK3S_INTERRUPT_EXTERNAL
bne+ guest_exit_cont beq kvmppc_guest_external
/* External interrupt, first check for host_ipi. If this is
* set, we know the host wants us out so let's do it now
*/
bl kvmppc_read_intr
/*
* Restore the active volatile registers after returning from
* a C function.
*/
ld r9, HSTATE_KVM_VCPU(r13)
li r12, BOOK3S_INTERRUPT_EXTERNAL
/*
* kvmppc_read_intr return codes:
*
* Exit to host (r3 > 0)
* 1 An interrupt is pending that needs to be handled by the host
* Exit guest and return to host by branching to guest_exit_cont
*
* 2 Passthrough that needs completion in the host
* Exit guest and return to host by branching to guest_exit_cont
* However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD
* to indicate to the host to complete handling the interrupt
*
* Before returning to guest, we check if any CPU is heading out
* to the host and if so, we head out also. If no CPUs are heading
* check return values <= 0.
*
* Return to guest (r3 <= 0)
* 0 No external interrupt is pending
* -1 A guest wakeup IPI (which has now been cleared)
* In either case, we return to guest to deliver any pending
* guest interrupts.
*
* -2 A PCI passthrough external interrupt was handled
* (interrupt was delivered directly to guest)
* Return to guest to deliver any pending guest interrupts.
*/
cmpdi r3, 1
ble 1f
/* Return code = 2 */
li r12, BOOK3S_INTERRUPT_HV_RM_HARD
stw r12, VCPU_TRAP(r9)
b guest_exit_cont
1: /* Return code <= 1 */
cmpdi r3, 0
bgt guest_exit_cont
/* Return code <= 0 */
4: ld r5, HSTATE_KVM_VCORE(r13)
lwz r0, VCORE_ENTRY_EXIT(r5)
cmpwi r0, 0x100
mr r4, r9
blt deliver_guest_interrupt
guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
/* Save more register state */
mfdar r6
mfdsisr r7
std r6, VCPU_DAR(r9)
stw r7, VCPU_DSISR(r9)
/* don't overwrite fault_dar/fault_dsisr if HDSI */
cmpwi r12,BOOK3S_INTERRUPT_H_DATA_STORAGE
beq mc_cont
std r6, VCPU_FAULT_DAR(r9)
stw r7, VCPU_FAULT_DSISR(r9)
/* See if it is a machine check */ /* See if it is a machine check */
cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK cmpwi r12, BOOK3S_INTERRUPT_MACHINE_CHECK
beq machine_check_realmode beq machine_check_realmode
mc_cont: /* Or a hypervisor maintenance interrupt */
cmpwi r12, BOOK3S_INTERRUPT_HMI
beq hmi_realmode
guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */
#ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING #ifdef CONFIG_KVM_BOOK3S_HV_EXIT_TIMING
addi r3, r9, VCPU_TB_RMEXIT addi r3, r9, VCPU_TB_RMEXIT
mr r4, r9 mr r4, r9
...@@ -1552,6 +1465,11 @@ mc_cont: ...@@ -1552,6 +1465,11 @@ mc_cont:
1: 1:
#endif /* CONFIG_KVM_XICS */ #endif /* CONFIG_KVM_XICS */
/* If we came in through the P9 short path, go back out to C now */
lwz r0, STACK_SLOT_SHORT_PATH(r1)
cmpwi r0, 0
bne guest_exit_short_path
/* For hash guest, read the guest SLB and save it away */ /* For hash guest, read the guest SLB and save it away */
ld r5, VCPU_KVM(r9) ld r5, VCPU_KVM(r9)
lbz r0, KVM_RADIX(r5) lbz r0, KVM_RADIX(r5)
...@@ -1780,11 +1698,13 @@ BEGIN_FTR_SECTION ...@@ -1780,11 +1698,13 @@ BEGIN_FTR_SECTION
b 91f b 91f
END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
/* /*
* NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
*/ */
mr r3, r9 mr r3, r9
ld r4, VCPU_MSR(r3) ld r4, VCPU_MSR(r3)
li r5, 0 /* don't preserve non-vol regs */
bl kvmppc_save_tm_hv bl kvmppc_save_tm_hv
nop
ld r9, HSTATE_KVM_VCPU(r13) ld r9, HSTATE_KVM_VCPU(r13)
91: 91:
#endif #endif
...@@ -1802,90 +1722,19 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) ...@@ -1802,90 +1722,19 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
25: 25:
/* Save PMU registers if requested */ /* Save PMU registers if requested */
/* r8 and cr0.eq are live here */ /* r8 and cr0.eq are live here */
mr r3, r9
li r4, 1
beq 21f /* if no VPA, save PMU stuff anyway */
lbz r4, LPPACA_PMCINUSE(r8)
21: bl kvmhv_save_guest_pmu
ld r9, HSTATE_KVM_VCPU(r13)
/* Restore host values of some registers */
BEGIN_FTR_SECTION BEGIN_FTR_SECTION
/* ld r5, STACK_SLOT_CIABR(r1)
* POWER8 seems to have a hardware bug where setting ld r6, STACK_SLOT_DAWR(r1)
* MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE] ld r7, STACK_SLOT_DAWRX(r1)
* when some counters are already negative doesn't seem mtspr SPRN_CIABR, r5
* to cause a performance monitor alert (and hence interrupt).
* The effect of this is that when saving the PMU state,
* if there is no PMU alert pending when we read MMCR0
* before freezing the counters, but one becomes pending
* before we read the counters, we lose it.
* To work around this, we need a way to freeze the counters
* before reading MMCR0. Normally, freezing the counters
* is done by writing MMCR0 (to set MMCR0[FC]) which
* unavoidably writes MMCR0[PMA0] as well. On POWER8,
* we can also freeze the counters using MMCR2, by writing
* 1s to all the counter freeze condition bits (there are
* 9 bits each for 6 counters).
*/
li r3, -1 /* set all freeze bits */
clrrdi r3, r3, 10
mfspr r10, SPRN_MMCR2
mtspr SPRN_MMCR2, r3
isync
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
li r3, 1
sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
mfspr r4, SPRN_MMCR0 /* save MMCR0 */
mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */
mfspr r6, SPRN_MMCRA
/* Clear MMCRA in order to disable SDAR updates */
li r7, 0
mtspr SPRN_MMCRA, r7
isync
beq 21f /* if no VPA, save PMU stuff anyway */
lbz r7, LPPACA_PMCINUSE(r8)
cmpwi r7, 0 /* did they ask for PMU stuff to be saved? */
bne 21f
std r3, VCPU_MMCR(r9) /* if not, set saved MMCR0 to FC */
b 22f
21: mfspr r5, SPRN_MMCR1
mfspr r7, SPRN_SIAR
mfspr r8, SPRN_SDAR
std r4, VCPU_MMCR(r9)
std r5, VCPU_MMCR + 8(r9)
std r6, VCPU_MMCR + 16(r9)
BEGIN_FTR_SECTION
std r10, VCPU_MMCR + 24(r9)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
std r7, VCPU_SIAR(r9)
std r8, VCPU_SDAR(r9)
mfspr r3, SPRN_PMC1
mfspr r4, SPRN_PMC2
mfspr r5, SPRN_PMC3
mfspr r6, SPRN_PMC4
mfspr r7, SPRN_PMC5
mfspr r8, SPRN_PMC6
stw r3, VCPU_PMC(r9)
stw r4, VCPU_PMC + 4(r9)
stw r5, VCPU_PMC + 8(r9)
stw r6, VCPU_PMC + 12(r9)
stw r7, VCPU_PMC + 16(r9)
stw r8, VCPU_PMC + 20(r9)
BEGIN_FTR_SECTION
mfspr r5, SPRN_SIER
std r5, VCPU_SIER(r9)
BEGIN_FTR_SECTION_NESTED(96)
mfspr r6, SPRN_SPMC1
mfspr r7, SPRN_SPMC2
mfspr r8, SPRN_MMCRS
stw r6, VCPU_PMC + 24(r9)
stw r7, VCPU_PMC + 28(r9)
std r8, VCPU_MMCR + 32(r9)
lis r4, 0x8000
mtspr SPRN_MMCRS, r4
END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
22:
/* Restore host values of some registers */
BEGIN_FTR_SECTION
ld r5, STACK_SLOT_CIABR(r1)
ld r6, STACK_SLOT_DAWR(r1)
ld r7, STACK_SLOT_DAWRX(r1)
mtspr SPRN_CIABR, r5
/* /*
* If the DAWR doesn't work, it's ok to write these here as * If the DAWR doesn't work, it's ok to write these here as
* this value should always be zero * this value should always be zero
...@@ -2010,24 +1859,6 @@ BEGIN_FTR_SECTION ...@@ -2010,24 +1859,6 @@ BEGIN_FTR_SECTION
mtspr SPRN_DPDES, r8 mtspr SPRN_DPDES, r8
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
/* If HMI, call kvmppc_realmode_hmi_handler() */
lwz r12, STACK_SLOT_TRAP(r1)
cmpwi r12, BOOK3S_INTERRUPT_HMI
bne 27f
bl kvmppc_realmode_hmi_handler
nop
cmpdi r3, 0
/*
* At this point kvmppc_realmode_hmi_handler may have resync-ed
* the TB, and if it has, we must not subtract the guest timebase
* offset from the timebase. So, skip it.
*
* Also, do not call kvmppc_subcore_exit_guest() because it has
* been invoked as part of kvmppc_realmode_hmi_handler().
*/
beq 30f
27:
/* Subtract timebase offset from timebase */ /* Subtract timebase offset from timebase */
ld r8, VCORE_TB_OFFSET_APPL(r5) ld r8, VCORE_TB_OFFSET_APPL(r5)
cmpdi r8,0 cmpdi r8,0
...@@ -2045,7 +1876,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) ...@@ -2045,7 +1876,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
addis r8,r8,0x100 /* if so, increment upper 40 bits */ addis r8,r8,0x100 /* if so, increment upper 40 bits */
mtspr SPRN_TBU40,r8 mtspr SPRN_TBU40,r8
17: bl kvmppc_subcore_exit_guest 17:
/*
* If this is an HMI, we called kvmppc_realmode_hmi_handler
* above, which may or may not have already called
* kvmppc_subcore_exit_guest. Fortunately, all that
* kvmppc_subcore_exit_guest does is clear a flag, so calling
* it again here is benign even if kvmppc_realmode_hmi_handler
* has already called it.
*/
bl kvmppc_subcore_exit_guest
nop nop
30: ld r5,HSTATE_KVM_VCORE(r13) 30: ld r5,HSTATE_KVM_VCORE(r13)
ld r4,VCORE_KVM(r5) /* pointer to struct kvm */ ld r4,VCORE_KVM(r5) /* pointer to struct kvm */
...@@ -2099,6 +1939,67 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) ...@@ -2099,6 +1939,67 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
mtlr r0 mtlr r0
blr blr
kvmppc_guest_external:
/* External interrupt, first check for host_ipi. If this is
* set, we know the host wants us out so let's do it now
*/
bl kvmppc_read_intr
/*
* Restore the active volatile registers after returning from
* a C function.
*/
ld r9, HSTATE_KVM_VCPU(r13)
li r12, BOOK3S_INTERRUPT_EXTERNAL
/*
* kvmppc_read_intr return codes:
*
* Exit to host (r3 > 0)
* 1 An interrupt is pending that needs to be handled by the host
* Exit guest and return to host by branching to guest_exit_cont
*
* 2 Passthrough that needs completion in the host
* Exit guest and return to host by branching to guest_exit_cont
* However, we also set r12 to BOOK3S_INTERRUPT_HV_RM_HARD
* to indicate to the host to complete handling the interrupt
*
* Before returning to guest, we check if any CPU is heading out
* to the host and if so, we head out also. If no CPUs are heading
* out, we go on to check the return values <= 0 below.
*
* Return to guest (r3 <= 0)
* 0 No external interrupt is pending
* -1 A guest wakeup IPI (which has now been cleared)
* In either case, we return to guest to deliver any pending
* guest interrupts.
*
* -2 A PCI passthrough external interrupt was handled
* (interrupt was delivered directly to guest)
* Return to guest to deliver any pending guest interrupts.
*/
cmpdi r3, 1
ble 1f
/* Return code = 2 */
li r12, BOOK3S_INTERRUPT_HV_RM_HARD
stw r12, VCPU_TRAP(r9)
b guest_exit_cont
1: /* Return code <= 1 */
cmpdi r3, 0
bgt guest_exit_cont
/* Return code <= 0 */
maybe_reenter_guest:
ld r5, HSTATE_KVM_VCORE(r13)
lwz r0, VCORE_ENTRY_EXIT(r5)
cmpwi r0, 0x100
mr r4, r9
blt deliver_guest_interrupt
b guest_exit_cont
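Restating the branch logic that follows kvmppc_read_intr purely for readability: the sketch below mirrors the return-code handling from the comment, with a placeholder value for the passthrough trap constant and the "other CPUs heading out" check left out.

#include <stdio.h>

#define TRAP_EXTERNAL	0x500	/* BOOK3S_INTERRUPT_EXTERNAL */
#define TRAP_RM_HARD	0x5555	/* placeholder for BOOK3S_INTERRUPT_HV_RM_HARD */

/* Returns 1 to exit to the host, 0 to re-enter the guest. */
static int classify_read_intr(int r3, int *trap)
{
	*trap = TRAP_EXTERNAL;
	if (r3 > 1) {			/* 2: passthrough, host must complete it */
		*trap = TRAP_RM_HARD;
		return 1;
	}
	if (r3 > 0)			/* 1: host interrupt pending */
		return 1;
	return 0;			/* 0, -1 or -2: go back into the guest */
}

int main(void)
{
	int trap;

	printf("r3=2  -> exit=%d trap=%#x\n", classify_read_intr(2, &trap), trap);
	printf("r3=-1 -> exit=%d\n", classify_read_intr(-1, &trap));
	return 0;
}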
#ifdef CONFIG_PPC_TRANSACTIONAL_MEM #ifdef CONFIG_PPC_TRANSACTIONAL_MEM
/* /*
* Softpatch interrupt for transactional memory emulation cases * Softpatch interrupt for transactional memory emulation cases
...@@ -2302,6 +2203,10 @@ hcall_try_real_mode: ...@@ -2302,6 +2203,10 @@ hcall_try_real_mode:
andi. r0,r11,MSR_PR andi. r0,r11,MSR_PR
/* sc 1 from userspace - reflect to guest syscall */ /* sc 1 from userspace - reflect to guest syscall */
bne sc_1_fast_return bne sc_1_fast_return
/* sc 1 from nested guest - give it to L1 to handle */
ld r0, VCPU_NESTED(r9)
cmpdi r0, 0
bne guest_exit_cont
clrrdi r3,r3,2 clrrdi r3,r3,2
cmpldi r3,hcall_real_table_end - hcall_real_table cmpldi r3,hcall_real_table_end - hcall_real_table
bge guest_exit_cont bge guest_exit_cont
...@@ -2561,6 +2466,7 @@ hcall_real_table: ...@@ -2561,6 +2466,7 @@ hcall_real_table:
hcall_real_table_end: hcall_real_table_end:
_GLOBAL(kvmppc_h_set_xdabr) _GLOBAL(kvmppc_h_set_xdabr)
EXPORT_SYMBOL_GPL(kvmppc_h_set_xdabr)
andi. r0, r5, DABRX_USER | DABRX_KERNEL andi. r0, r5, DABRX_USER | DABRX_KERNEL
beq 6f beq 6f
li r0, DABRX_USER | DABRX_KERNEL | DABRX_BTI li r0, DABRX_USER | DABRX_KERNEL | DABRX_BTI
...@@ -2570,6 +2476,7 @@ _GLOBAL(kvmppc_h_set_xdabr) ...@@ -2570,6 +2476,7 @@ _GLOBAL(kvmppc_h_set_xdabr)
blr blr
_GLOBAL(kvmppc_h_set_dabr) _GLOBAL(kvmppc_h_set_dabr)
EXPORT_SYMBOL_GPL(kvmppc_h_set_dabr)
li r5, DABRX_USER | DABRX_KERNEL li r5, DABRX_USER | DABRX_KERNEL
3: 3:
BEGIN_FTR_SECTION BEGIN_FTR_SECTION
...@@ -2682,11 +2589,13 @@ BEGIN_FTR_SECTION ...@@ -2682,11 +2589,13 @@ BEGIN_FTR_SECTION
b 91f b 91f
END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
/* /*
* NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
*/ */
ld r3, HSTATE_KVM_VCPU(r13) ld r3, HSTATE_KVM_VCPU(r13)
ld r4, VCPU_MSR(r3) ld r4, VCPU_MSR(r3)
li r5, 0 /* don't preserve non-vol regs */
bl kvmppc_save_tm_hv bl kvmppc_save_tm_hv
nop
91: 91:
#endif #endif
...@@ -2802,11 +2711,13 @@ BEGIN_FTR_SECTION ...@@ -2802,11 +2711,13 @@ BEGIN_FTR_SECTION
b 91f b 91f
END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
/* /*
* NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS (but not CR)
*/ */
mr r3, r4 mr r3, r4
ld r4, VCPU_MSR(r3) ld r4, VCPU_MSR(r3)
li r5, 0 /* don't preserve non-vol regs */
bl kvmppc_restore_tm_hv bl kvmppc_restore_tm_hv
nop
ld r4, HSTATE_KVM_VCPU(r13) ld r4, HSTATE_KVM_VCPU(r13)
91: 91:
#endif #endif
...@@ -2874,13 +2785,7 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0) ...@@ -2874,13 +2785,7 @@ END_FTR_SECTION(CPU_FTR_TM | CPU_FTR_P9_TM_HV_ASSIST, 0)
mr r9, r4 mr r9, r4
cmpdi r3, 0 cmpdi r3, 0
bgt guest_exit_cont bgt guest_exit_cont
b maybe_reenter_guest
/* see if any other thread is already exiting */
lwz r0,VCORE_ENTRY_EXIT(r5)
cmpwi r0,0x100
bge guest_exit_cont
b kvmppc_cede_reentry /* if not go back to guest */
/* cede when already previously prodded case */ /* cede when already previously prodded case */
kvm_cede_prodded: kvm_cede_prodded:
...@@ -2947,12 +2852,12 @@ machine_check_realmode: ...@@ -2947,12 +2852,12 @@ machine_check_realmode:
*/ */
ld r11, VCPU_MSR(r9) ld r11, VCPU_MSR(r9)
rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */ rldicl. r0, r11, 64-MSR_HV_LG, 63 /* check if it happened in HV mode */
bne mc_cont /* if so, exit to host */ bne guest_exit_cont /* if so, exit to host */
/* Check if guest is capable of handling NMI exit */ /* Check if guest is capable of handling NMI exit */
ld r10, VCPU_KVM(r9) ld r10, VCPU_KVM(r9)
lbz r10, KVM_FWNMI(r10) lbz r10, KVM_FWNMI(r10)
cmpdi r10, 1 /* FWNMI capable? */ cmpdi r10, 1 /* FWNMI capable? */
beq mc_cont /* if so, exit with KVM_EXIT_NMI. */ beq guest_exit_cont /* if so, exit with KVM_EXIT_NMI. */
/* if not, fall through for backward compatibility. */ /* if not, fall through for backward compatibility. */
andi. r10, r11, MSR_RI /* check for unrecoverable exception */ andi. r10, r11, MSR_RI /* check for unrecoverable exception */
...@@ -2965,6 +2870,21 @@ machine_check_realmode: ...@@ -2965,6 +2870,21 @@ machine_check_realmode:
bl kvmppc_msr_interrupt bl kvmppc_msr_interrupt
2: b fast_interrupt_c_return 2: b fast_interrupt_c_return
/*
* Call C code to handle a HMI in real mode.
* Only the primary thread does the call; secondary threads are handled
* by calling hmi_exception_realmode() after kvmppc_hv_entry returns.
* r9 points to the vcpu on entry
*/
hmi_realmode:
lbz r0, HSTATE_PTID(r13)
cmpwi r0, 0
bne guest_exit_cont
bl kvmppc_realmode_hmi_handler
ld r9, HSTATE_KVM_VCPU(r13)
li r12, BOOK3S_INTERRUPT_HMI
b guest_exit_cont
/* /*
* Check the reason we woke from nap, and take appropriate action. * Check the reason we woke from nap, and take appropriate action.
* Returns (in r3): * Returns (in r3):
...@@ -3130,10 +3050,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) ...@@ -3130,10 +3050,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
* Save transactional state and TM-related registers. * Save transactional state and TM-related registers.
* Called with r3 pointing to the vcpu struct and r4 containing * Called with r3 pointing to the vcpu struct and r4 containing
* the guest MSR value. * the guest MSR value.
* This can modify all checkpointed registers, but * r5 is non-zero iff non-volatile register state needs to be maintained.
* If r5 == 0, this can modify all checkpointed registers, but
* restores r1 and r2 before exit. * restores r1 and r2 before exit.
*/ */
kvmppc_save_tm_hv: _GLOBAL_TOC(kvmppc_save_tm_hv)
EXPORT_SYMBOL_GPL(kvmppc_save_tm_hv)
/* See if we need to handle fake suspend mode */ /* See if we need to handle fake suspend mode */
BEGIN_FTR_SECTION BEGIN_FTR_SECTION
b __kvmppc_save_tm b __kvmppc_save_tm
...@@ -3161,12 +3083,6 @@ BEGIN_FTR_SECTION ...@@ -3161,12 +3083,6 @@ BEGIN_FTR_SECTION
END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG) END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
nop nop
std r1, HSTATE_HOST_R1(r13)
/* Clear the MSR RI since r1, r13 may be foobar. */
li r5, 0
mtmsrd r5, 1
/* We have to treclaim here because that's the only way to do S->N */ /* We have to treclaim here because that's the only way to do S->N */
li r3, TM_CAUSE_KVM_RESCHED li r3, TM_CAUSE_KVM_RESCHED
TRECLAIM(R3) TRECLAIM(R3)
...@@ -3175,22 +3091,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG) ...@@ -3175,22 +3091,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
* We were in fake suspend, so we are not going to save the * We were in fake suspend, so we are not going to save the
* register state as the guest checkpointed state (since * register state as the guest checkpointed state (since
* we already have it), therefore we can now use any volatile GPR. * we already have it), therefore we can now use any volatile GPR.
* In fact treclaim in fake suspend state doesn't modify
* any registers.
*/ */
/* Reload PACA pointer, stack pointer and TOC. */
GET_PACA(r13)
ld r1, HSTATE_HOST_R1(r13)
ld r2, PACATOC(r13)
/* Set MSR RI now we have r1 and r13 back. */
li r5, MSR_RI
mtmsrd r5, 1
HMT_MEDIUM BEGIN_FTR_SECTION
ld r6, HSTATE_DSCR(r13)
mtspr SPRN_DSCR, r6
BEGIN_FTR_SECTION_NESTED(96)
bl pnv_power9_force_smt4_release bl pnv_power9_force_smt4_release
END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96) END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_XER_SO_BUG)
nop nop
4: 4:
...@@ -3216,10 +3123,12 @@ END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96) ...@@ -3216,10 +3123,12 @@ END_FTR_SECTION_NESTED(CPU_FTR_P9_TM_XER_SO_BUG, CPU_FTR_P9_TM_XER_SO_BUG, 96)
* Restore transactional state and TM-related registers. * Restore transactional state and TM-related registers.
* Called with r3 pointing to the vcpu struct * Called with r3 pointing to the vcpu struct
* and r4 containing the guest MSR value. * and r4 containing the guest MSR value.
* r5 is non-zero iff non-volatile register state needs to be maintained.
* This potentially modifies all checkpointed registers. * This potentially modifies all checkpointed registers.
* It restores r1 and r2 from the PACA. * It restores r1 and r2 from the PACA.
*/ */
kvmppc_restore_tm_hv: _GLOBAL_TOC(kvmppc_restore_tm_hv)
EXPORT_SYMBOL_GPL(kvmppc_restore_tm_hv)
/* /*
* If we are doing TM emulation for the guest on a POWER9 DD2, * If we are doing TM emulation for the guest on a POWER9 DD2,
* then we don't actually do a trechkpt -- we either set up * then we don't actually do a trechkpt -- we either set up
...@@ -3423,6 +3332,194 @@ kvmppc_msr_interrupt: ...@@ -3423,6 +3332,194 @@ kvmppc_msr_interrupt:
1: rldimi r11, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG 1: rldimi r11, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
blr blr
/*
* Load up guest PMU state. R3 points to the vcpu struct.
*/
_GLOBAL(kvmhv_load_guest_pmu)
EXPORT_SYMBOL_GPL(kvmhv_load_guest_pmu)
mr r4, r3
mflr r0
li r3, 1
sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */
isync
BEGIN_FTR_SECTION
ld r3, VCPU_MMCR(r4)
andi. r5, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
cmpwi r5, MMCR0_PMAO
beql kvmppc_fix_pmao
END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
lwz r3, VCPU_PMC(r4) /* always load up guest PMU registers */
lwz r5, VCPU_PMC + 4(r4) /* to prevent information leak */
lwz r6, VCPU_PMC + 8(r4)
lwz r7, VCPU_PMC + 12(r4)
lwz r8, VCPU_PMC + 16(r4)
lwz r9, VCPU_PMC + 20(r4)
mtspr SPRN_PMC1, r3
mtspr SPRN_PMC2, r5
mtspr SPRN_PMC3, r6
mtspr SPRN_PMC4, r7
mtspr SPRN_PMC5, r8
mtspr SPRN_PMC6, r9
ld r3, VCPU_MMCR(r4)
ld r5, VCPU_MMCR + 8(r4)
ld r6, VCPU_MMCR + 16(r4)
ld r7, VCPU_SIAR(r4)
ld r8, VCPU_SDAR(r4)
mtspr SPRN_MMCR1, r5
mtspr SPRN_MMCRA, r6
mtspr SPRN_SIAR, r7
mtspr SPRN_SDAR, r8
BEGIN_FTR_SECTION
ld r5, VCPU_MMCR + 24(r4)
ld r6, VCPU_SIER(r4)
mtspr SPRN_MMCR2, r5
mtspr SPRN_SIER, r6
BEGIN_FTR_SECTION_NESTED(96)
lwz r7, VCPU_PMC + 24(r4)
lwz r8, VCPU_PMC + 28(r4)
ld r9, VCPU_MMCR + 32(r4)
mtspr SPRN_SPMC1, r7
mtspr SPRN_SPMC2, r8
mtspr SPRN_MMCRS, r9
END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
mtspr SPRN_MMCR0, r3
isync
mtlr r0
blr
/*
* Reload host PMU state saved in the PACA by kvmhv_save_host_pmu.
*/
_GLOBAL(kvmhv_load_host_pmu)
EXPORT_SYMBOL_GPL(kvmhv_load_host_pmu)
mflr r0
lbz r4, PACA_PMCINUSE(r13) /* is the host using the PMU? */
cmpwi r4, 0
beq 23f /* skip if not */
BEGIN_FTR_SECTION
ld r3, HSTATE_MMCR0(r13)
andi. r4, r3, MMCR0_PMAO_SYNC | MMCR0_PMAO
cmpwi r4, MMCR0_PMAO
beql kvmppc_fix_pmao
END_FTR_SECTION_IFSET(CPU_FTR_PMAO_BUG)
lwz r3, HSTATE_PMC1(r13)
lwz r4, HSTATE_PMC2(r13)
lwz r5, HSTATE_PMC3(r13)
lwz r6, HSTATE_PMC4(r13)
lwz r8, HSTATE_PMC5(r13)
lwz r9, HSTATE_PMC6(r13)
mtspr SPRN_PMC1, r3
mtspr SPRN_PMC2, r4
mtspr SPRN_PMC3, r5
mtspr SPRN_PMC4, r6
mtspr SPRN_PMC5, r8
mtspr SPRN_PMC6, r9
ld r3, HSTATE_MMCR0(r13)
ld r4, HSTATE_MMCR1(r13)
ld r5, HSTATE_MMCRA(r13)
ld r6, HSTATE_SIAR(r13)
ld r7, HSTATE_SDAR(r13)
mtspr SPRN_MMCR1, r4
mtspr SPRN_MMCRA, r5
mtspr SPRN_SIAR, r6
mtspr SPRN_SDAR, r7
BEGIN_FTR_SECTION
ld r8, HSTATE_MMCR2(r13)
ld r9, HSTATE_SIER(r13)
mtspr SPRN_MMCR2, r8
mtspr SPRN_SIER, r9
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
mtspr SPRN_MMCR0, r3
isync
mtlr r0
23: blr
/*
* Save guest PMU state into the vcpu struct.
* r3 = vcpu, r4 = full save flag (PMU in use flag set in VPA)
*/
_GLOBAL(kvmhv_save_guest_pmu)
EXPORT_SYMBOL_GPL(kvmhv_save_guest_pmu)
mr r9, r3
mr r8, r4
BEGIN_FTR_SECTION
/*
* POWER8 seems to have a hardware bug where setting
* MMCR0[PMAE] along with MMCR0[PMC1CE] and/or MMCR0[PMCjCE]
* when some counters are already negative doesn't seem
* to cause a performance monitor alert (and hence interrupt).
* The effect of this is that when saving the PMU state,
* if there is no PMU alert pending when we read MMCR0
* before freezing the counters, but one becomes pending
* before we read the counters, we lose it.
* To work around this, we need a way to freeze the counters
* before reading MMCR0. Normally, freezing the counters
* is done by writing MMCR0 (to set MMCR0[FC]) which
* unavoidably writes MMCR0[PMAO] as well. On POWER8,
* we can also freeze the counters using MMCR2, by writing
* 1s to all the counter freeze condition bits (there are
* 9 bits each for 6 counters).
*/
li r3, -1 /* set all freeze bits */
clrrdi r3, r3, 10
mfspr r10, SPRN_MMCR2
mtspr SPRN_MMCR2, r3
isync
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
li r3, 1
sldi r3, r3, 31 /* MMCR0_FC (freeze counters) bit */
mfspr r4, SPRN_MMCR0 /* save MMCR0 */
mtspr SPRN_MMCR0, r3 /* freeze all counters, disable ints */
mfspr r6, SPRN_MMCRA
/* Clear MMCRA in order to disable SDAR updates */
li r7, 0
mtspr SPRN_MMCRA, r7
isync
cmpwi r8, 0 /* did they ask for PMU stuff to be saved? */
bne 21f
std r3, VCPU_MMCR(r9) /* if not, set saved MMCR0 to FC */
b 22f
21: mfspr r5, SPRN_MMCR1
mfspr r7, SPRN_SIAR
mfspr r8, SPRN_SDAR
std r4, VCPU_MMCR(r9)
std r5, VCPU_MMCR + 8(r9)
std r6, VCPU_MMCR + 16(r9)
BEGIN_FTR_SECTION
std r10, VCPU_MMCR + 24(r9)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
std r7, VCPU_SIAR(r9)
std r8, VCPU_SDAR(r9)
mfspr r3, SPRN_PMC1
mfspr r4, SPRN_PMC2
mfspr r5, SPRN_PMC3
mfspr r6, SPRN_PMC4
mfspr r7, SPRN_PMC5
mfspr r8, SPRN_PMC6
stw r3, VCPU_PMC(r9)
stw r4, VCPU_PMC + 4(r9)
stw r5, VCPU_PMC + 8(r9)
stw r6, VCPU_PMC + 12(r9)
stw r7, VCPU_PMC + 16(r9)
stw r8, VCPU_PMC + 20(r9)
BEGIN_FTR_SECTION
mfspr r5, SPRN_SIER
std r5, VCPU_SIER(r9)
BEGIN_FTR_SECTION_NESTED(96)
mfspr r6, SPRN_SPMC1
mfspr r7, SPRN_SPMC2
mfspr r8, SPRN_MMCRS
stw r6, VCPU_PMC + 24(r9)
stw r7, VCPU_PMC + 28(r9)
std r8, VCPU_MMCR + 32(r9)
lis r4, 0x8000
mtspr SPRN_MMCRS, r4
END_FTR_SECTION_NESTED(CPU_FTR_ARCH_300, 0, 96)
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
22: blr
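To make the ordering in the POWER8 comment above easier to follow, here is a standalone sketch of just the sequence of SPR accesses. read_spr()/write_spr(), the enum and the masks are invented for illustration; they stand in for mfspr/mtspr and are not kernel interfaces.

#include <stdio.h>

enum spr { SPR_MMCR0, SPR_MMCR2, SPR_MMCRA, SPR_MAX };	/* toy register file */
static unsigned long spr[SPR_MAX];

#define MMCR0_FC	 (1UL << 31)		/* freeze counters */
#define MMCR2_FREEZE_ALL (~0UL << 10)		/* 1s in all freeze condition bits */

static unsigned long read_spr(enum spr r)          { return spr[r]; }
static void write_spr(enum spr r, unsigned long v) { spr[r] = v; }

/* Order of operations from the workaround comment above (POWER8 case). */
static unsigned long save_guest_pmu_order(void)
{
	/* 1. Save MMCR2, then freeze via MMCR2 so a late PMU alert is not lost. */
	unsigned long mmcr2_save = read_spr(SPR_MMCR2);
	write_spr(SPR_MMCR2, MMCR2_FREEZE_ALL);

	/* 2. Only now read MMCR0, then freeze everything via MMCR0[FC]. */
	unsigned long mmcr0_save = read_spr(SPR_MMCR0);
	write_spr(SPR_MMCR0, MMCR0_FC);

	/* 3. Clear MMCRA to stop SDAR updates before reading the counters. */
	write_spr(SPR_MMCRA, 0);

	(void)mmcr2_save;	/* the saved MMCR2 is kept as guest state */
	return mmcr0_save;	/* PMCs, SIAR and SDAR are read after this */
}

int main(void)
{
	printf("saved MMCR0 = %#lx\n", save_guest_pmu_order());
	return 0;
}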
/* /*
* This works around a hardware bug on POWER8E processors, where * This works around a hardware bug on POWER8E processors, where
* writing a 1 to the MMCR0[PMAO] bit doesn't generate a * writing a 1 to the MMCR0[PMAO] bit doesn't generate a
......
...@@ -130,7 +130,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) ...@@ -130,7 +130,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
return RESUME_GUEST; return RESUME_GUEST;
} }
/* Set CR0 to indicate previous transactional state */ /* Set CR0 to indicate previous transactional state */
vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
(((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28); (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
/* L=1 => tresume, L=0 => tsuspend */ /* L=1 => tresume, L=0 => tsuspend */
if (instr & (1 << 21)) { if (instr & (1 << 21)) {
...@@ -174,7 +174,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) ...@@ -174,7 +174,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
copy_from_checkpoint(vcpu); copy_from_checkpoint(vcpu);
/* Set CR0 to indicate previous transactional state */ /* Set CR0 to indicate previous transactional state */
vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
(((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28); (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
vcpu->arch.shregs.msr &= ~MSR_TS_MASK; vcpu->arch.shregs.msr &= ~MSR_TS_MASK;
return RESUME_GUEST; return RESUME_GUEST;
...@@ -204,7 +204,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu) ...@@ -204,7 +204,7 @@ int kvmhv_p9_tm_emulation(struct kvm_vcpu *vcpu)
copy_to_checkpoint(vcpu); copy_to_checkpoint(vcpu);
/* Set CR0 to indicate previous transactional state */ /* Set CR0 to indicate previous transactional state */
vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
(((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28); (((msr & MSR_TS_MASK) >> MSR_TS_S_LG) << 28);
vcpu->arch.shregs.msr = msr | MSR_TS_S; vcpu->arch.shregs.msr = msr | MSR_TS_S;
return RESUME_GUEST; return RESUME_GUEST;
......
...@@ -89,7 +89,8 @@ int kvmhv_p9_tm_emulation_early(struct kvm_vcpu *vcpu) ...@@ -89,7 +89,8 @@ int kvmhv_p9_tm_emulation_early(struct kvm_vcpu *vcpu)
if (instr & (1 << 21)) if (instr & (1 << 21))
vcpu->arch.shregs.msr = (msr & ~MSR_TS_MASK) | MSR_TS_T; vcpu->arch.shregs.msr = (msr & ~MSR_TS_MASK) | MSR_TS_T;
/* Set CR0 to 0b0010 */ /* Set CR0 to 0b0010 */
vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | 0x20000000; vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) |
0x20000000;
return 1; return 1;
} }
...@@ -105,5 +106,5 @@ void kvmhv_emulate_tm_rollback(struct kvm_vcpu *vcpu) ...@@ -105,5 +106,5 @@ void kvmhv_emulate_tm_rollback(struct kvm_vcpu *vcpu)
vcpu->arch.shregs.msr &= ~MSR_TS_MASK; /* go to N state */ vcpu->arch.shregs.msr &= ~MSR_TS_MASK; /* go to N state */
vcpu->arch.regs.nip = vcpu->arch.tfhar; vcpu->arch.regs.nip = vcpu->arch.tfhar;
copy_from_checkpoint(vcpu); copy_from_checkpoint(vcpu);
vcpu->arch.cr = (vcpu->arch.cr & 0x0fffffff) | 0xa0000000; vcpu->arch.regs.ccr = (vcpu->arch.regs.ccr & 0x0fffffff) | 0xa0000000;
} }
...@@ -167,7 +167,7 @@ void kvmppc_copy_to_svcpu(struct kvm_vcpu *vcpu) ...@@ -167,7 +167,7 @@ void kvmppc_copy_to_svcpu(struct kvm_vcpu *vcpu)
svcpu->gpr[11] = vcpu->arch.regs.gpr[11]; svcpu->gpr[11] = vcpu->arch.regs.gpr[11];
svcpu->gpr[12] = vcpu->arch.regs.gpr[12]; svcpu->gpr[12] = vcpu->arch.regs.gpr[12];
svcpu->gpr[13] = vcpu->arch.regs.gpr[13]; svcpu->gpr[13] = vcpu->arch.regs.gpr[13];
svcpu->cr = vcpu->arch.cr; svcpu->cr = vcpu->arch.regs.ccr;
svcpu->xer = vcpu->arch.regs.xer; svcpu->xer = vcpu->arch.regs.xer;
svcpu->ctr = vcpu->arch.regs.ctr; svcpu->ctr = vcpu->arch.regs.ctr;
svcpu->lr = vcpu->arch.regs.link; svcpu->lr = vcpu->arch.regs.link;
...@@ -249,7 +249,7 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu) ...@@ -249,7 +249,7 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu)
vcpu->arch.regs.gpr[11] = svcpu->gpr[11]; vcpu->arch.regs.gpr[11] = svcpu->gpr[11];
vcpu->arch.regs.gpr[12] = svcpu->gpr[12]; vcpu->arch.regs.gpr[12] = svcpu->gpr[12];
vcpu->arch.regs.gpr[13] = svcpu->gpr[13]; vcpu->arch.regs.gpr[13] = svcpu->gpr[13];
vcpu->arch.cr = svcpu->cr; vcpu->arch.regs.ccr = svcpu->cr;
vcpu->arch.regs.xer = svcpu->xer; vcpu->arch.regs.xer = svcpu->xer;
vcpu->arch.regs.ctr = svcpu->ctr; vcpu->arch.regs.ctr = svcpu->ctr;
vcpu->arch.regs.link = svcpu->lr; vcpu->arch.regs.link = svcpu->lr;
...@@ -1246,7 +1246,6 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, ...@@ -1246,7 +1246,6 @@ int kvmppc_handle_exit_pr(struct kvm_run *run, struct kvm_vcpu *vcpu,
r = RESUME_GUEST; r = RESUME_GUEST;
break; break;
case BOOK3S_INTERRUPT_EXTERNAL: case BOOK3S_INTERRUPT_EXTERNAL:
case BOOK3S_INTERRUPT_EXTERNAL_LEVEL:
case BOOK3S_INTERRUPT_EXTERNAL_HV: case BOOK3S_INTERRUPT_EXTERNAL_HV:
case BOOK3S_INTERRUPT_H_VIRT: case BOOK3S_INTERRUPT_H_VIRT:
vcpu->stat.ext_intr_exits++; vcpu->stat.ext_intr_exits++;
......
...@@ -310,7 +310,7 @@ static inline bool icp_try_update(struct kvmppc_icp *icp, ...@@ -310,7 +310,7 @@ static inline bool icp_try_update(struct kvmppc_icp *icp,
*/ */
if (new.out_ee) { if (new.out_ee) {
kvmppc_book3s_queue_irqprio(icp->vcpu, kvmppc_book3s_queue_irqprio(icp->vcpu,
BOOK3S_INTERRUPT_EXTERNAL_LEVEL); BOOK3S_INTERRUPT_EXTERNAL);
if (!change_self) if (!change_self)
kvmppc_fast_vcpu_kick(icp->vcpu); kvmppc_fast_vcpu_kick(icp->vcpu);
} }
...@@ -593,8 +593,7 @@ static noinline unsigned long kvmppc_h_xirr(struct kvm_vcpu *vcpu) ...@@ -593,8 +593,7 @@ static noinline unsigned long kvmppc_h_xirr(struct kvm_vcpu *vcpu)
u32 xirr; u32 xirr;
/* First, remove EE from the processor */ /* First, remove EE from the processor */
kvmppc_book3s_dequeue_irqprio(icp->vcpu, kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL);
BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
/* /*
* ICP State: Accept_Interrupt * ICP State: Accept_Interrupt
...@@ -754,8 +753,7 @@ static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr) ...@@ -754,8 +753,7 @@ static noinline void kvmppc_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
* We can remove EE from the current processor, the update * We can remove EE from the current processor, the update
* transaction will set it again if needed * transaction will set it again if needed
*/ */
kvmppc_book3s_dequeue_irqprio(icp->vcpu, kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL);
BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
do { do {
old_state = new_state = READ_ONCE(icp->state); old_state = new_state = READ_ONCE(icp->state);
...@@ -1167,8 +1165,7 @@ int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval) ...@@ -1167,8 +1165,7 @@ int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval)
* Deassert the CPU interrupt request. * Deassert the CPU interrupt request.
* icp_try_update will reassert it if necessary. * icp_try_update will reassert it if necessary.
*/ */
kvmppc_book3s_dequeue_irqprio(icp->vcpu, kvmppc_book3s_dequeue_irqprio(icp->vcpu, BOOK3S_INTERRUPT_EXTERNAL);
BOOK3S_INTERRUPT_EXTERNAL_LEVEL);
/* /*
* Note that if we displace an interrupt from old_state.xisr, * Note that if we displace an interrupt from old_state.xisr,
...@@ -1393,7 +1390,8 @@ static int kvmppc_xics_create(struct kvm_device *dev, u32 type) ...@@ -1393,7 +1390,8 @@ static int kvmppc_xics_create(struct kvm_device *dev, u32 type)
} }
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
if (cpu_has_feature(CPU_FTR_ARCH_206)) { if (cpu_has_feature(CPU_FTR_ARCH_206) &&
cpu_has_feature(CPU_FTR_HVMODE)) {
/* Enable real mode support */ /* Enable real mode support */
xics->real_mode = ENABLE_REALMODE; xics->real_mode = ENABLE_REALMODE;
xics->real_mode_dbg = DEBUG_REALMODE; xics->real_mode_dbg = DEBUG_REALMODE;
......
...@@ -61,6 +61,69 @@ ...@@ -61,6 +61,69 @@
*/ */
#define XIVE_Q_GAP 2 #define XIVE_Q_GAP 2
/*
* Push a vcpu's context to the XIVE on guest entry.
* This assumes we are in virtual mode (MMU on)
*/
void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu)
{
void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt;
u64 pq;
if (!tima)
return;
eieio();
__raw_writeq(vcpu->arch.xive_saved_state.w01, tima + TM_QW1_OS);
__raw_writel(vcpu->arch.xive_cam_word, tima + TM_QW1_OS + TM_WORD2);
vcpu->arch.xive_pushed = 1;
eieio();
/*
* We clear the irq_pending flag. There is a small chance of a
* race vs. the escalation interrupt happening on another
* processor setting it again, but the only consequence is to
* cause a spurious wakeup on the next H_CEDE, which is not an
* issue.
*/
vcpu->arch.irq_pending = 0;
/*
* In single escalation mode, if the escalation interrupt is
* on, we mask it.
*/
if (vcpu->arch.xive_esc_on) {
pq = __raw_readq((void __iomem *)(vcpu->arch.xive_esc_vaddr +
XIVE_ESB_SET_PQ_01));
mb();
/*
* We have a possible subtle race here: The escalation
* interrupt might have fired and be on its way to the
* host queue while we mask it, and if we unmask it
* early enough (re-cede right away), there is a
* theoretical possibility that it fires again, thus
* landing in the target queue more than once which is
* a big no-no.
*
* Fortunately, solving this is rather easy. If the
* above load setting PQ to 01 returns a previous
* value where P is set, then we know the escalation
* interrupt is somewhere on its way to the host. In
* that case we simply don't clear the xive_esc_on
* flag below. It will be eventually cleared by the
* handler for the escalation interrupt.
*
* Then, when doing a cede, we check that flag again
* before re-enabling the escalation interrupt, and if
* set, we abort the cede.
*/
if (!(pq & XIVE_ESB_VAL_P))
/* Now P is 0, we can clear the flag */
vcpu->arch.xive_esc_on = 0;
}
}
EXPORT_SYMBOL_GPL(kvmppc_xive_push_vcpu);
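The race described in the comment boils down to one test: the ESB load that sets PQ to 01 also returns the previous PQ value, and xive_esc_on may only be cleared if P was not already set. A toy version of just that predicate follows; the bit value is a stand-in for XIVE_ESB_VAL_P.

#include <stdbool.h>
#include <stdio.h>

#define ESB_VAL_P 0x2	/* stand-in for XIVE_ESB_VAL_P */

/*
 * 'old_pq' is what the "set PQ to 01" load returned.  If P was already
 * set, the escalation interrupt is on its way to the host queue, so the
 * flag must be left set and the escalation handler will clear it later.
 */
static bool may_clear_esc_on(unsigned long old_pq)
{
	return !(old_pq & ESB_VAL_P);
}

int main(void)
{
	printf("old PQ=00 -> clear flag now: %d\n", may_clear_esc_on(0x0));
	printf("old PQ has P -> leave flag set: %d\n", !may_clear_esc_on(ESB_VAL_P));
	return 0;
}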
/* /*
* This is a simple trigger for a generic XIVE IRQ. This must * This is a simple trigger for a generic XIVE IRQ. This must
* only be called for interrupts that support a trigger page * only be called for interrupts that support a trigger page
......
...@@ -280,14 +280,6 @@ X_STATIC unsigned long GLUE(X_PFX,h_xirr)(struct kvm_vcpu *vcpu) ...@@ -280,14 +280,6 @@ X_STATIC unsigned long GLUE(X_PFX,h_xirr)(struct kvm_vcpu *vcpu)
/* First collect pending bits from HW */ /* First collect pending bits from HW */
GLUE(X_PFX,ack_pending)(xc); GLUE(X_PFX,ack_pending)(xc);
/*
* Cleanup the old-style bits if needed (they may have been
* set by pull or an escalation interrupts).
*/
if (test_bit(BOOK3S_IRQPRIO_EXTERNAL, &vcpu->arch.pending_exceptions))
clear_bit(BOOK3S_IRQPRIO_EXTERNAL_LEVEL,
&vcpu->arch.pending_exceptions);
pr_devel(" new pending=0x%02x hw_cppr=%d cppr=%d\n", pr_devel(" new pending=0x%02x hw_cppr=%d cppr=%d\n",
xc->pending, xc->hw_cppr, xc->cppr); xc->pending, xc->hw_cppr, xc->cppr);
......
...@@ -182,7 +182,7 @@ ...@@ -182,7 +182,7 @@
*/ */
PPC_LL r4, PACACURRENT(r13) PPC_LL r4, PACACURRENT(r13)
PPC_LL r4, (THREAD + THREAD_KVM_VCPU)(r4) PPC_LL r4, (THREAD + THREAD_KVM_VCPU)(r4)
stw r10, VCPU_CR(r4) PPC_STL r10, VCPU_CR(r4)
PPC_STL r11, VCPU_GPR(R4)(r4) PPC_STL r11, VCPU_GPR(R4)(r4)
PPC_STL r5, VCPU_GPR(R5)(r4) PPC_STL r5, VCPU_GPR(R5)(r4)
PPC_STL r6, VCPU_GPR(R6)(r4) PPC_STL r6, VCPU_GPR(R6)(r4)
...@@ -292,7 +292,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1) ...@@ -292,7 +292,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1)
PPC_STL r4, VCPU_GPR(R4)(r11) PPC_STL r4, VCPU_GPR(R4)(r11)
PPC_LL r4, THREAD_NORMSAVE(0)(r10) PPC_LL r4, THREAD_NORMSAVE(0)(r10)
PPC_STL r5, VCPU_GPR(R5)(r11) PPC_STL r5, VCPU_GPR(R5)(r11)
stw r13, VCPU_CR(r11) PPC_STL r13, VCPU_CR(r11)
mfspr r5, \srr0 mfspr r5, \srr0
PPC_STL r3, VCPU_GPR(R10)(r11) PPC_STL r3, VCPU_GPR(R10)(r11)
PPC_LL r3, THREAD_NORMSAVE(2)(r10) PPC_LL r3, THREAD_NORMSAVE(2)(r10)
...@@ -319,7 +319,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1) ...@@ -319,7 +319,7 @@ _GLOBAL(kvmppc_handler_\intno\()_\srr1)
PPC_STL r4, VCPU_GPR(R4)(r11) PPC_STL r4, VCPU_GPR(R4)(r11)
PPC_LL r4, GPR9(r8) PPC_LL r4, GPR9(r8)
PPC_STL r5, VCPU_GPR(R5)(r11) PPC_STL r5, VCPU_GPR(R5)(r11)
stw r9, VCPU_CR(r11) PPC_STL r9, VCPU_CR(r11)
mfspr r5, \srr0 mfspr r5, \srr0
PPC_STL r3, VCPU_GPR(R8)(r11) PPC_STL r3, VCPU_GPR(R8)(r11)
PPC_LL r3, GPR10(r8) PPC_LL r3, GPR10(r8)
...@@ -643,7 +643,7 @@ lightweight_exit: ...@@ -643,7 +643,7 @@ lightweight_exit:
PPC_LL r3, VCPU_LR(r4) PPC_LL r3, VCPU_LR(r4)
PPC_LL r5, VCPU_XER(r4) PPC_LL r5, VCPU_XER(r4)
PPC_LL r6, VCPU_CTR(r4) PPC_LL r6, VCPU_CTR(r4)
lwz r7, VCPU_CR(r4) PPC_LL r7, VCPU_CR(r4)
PPC_LL r8, VCPU_PC(r4) PPC_LL r8, VCPU_PC(r4)
PPC_LD(r9, VCPU_SHARED_MSR, r11) PPC_LD(r9, VCPU_SHARED_MSR, r11)
PPC_LL r0, VCPU_GPR(R0)(r4) PPC_LL r0, VCPU_GPR(R0)(r4)
......
...@@ -117,7 +117,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu) ...@@ -117,7 +117,6 @@ int kvmppc_emulate_loadstore(struct kvm_vcpu *vcpu)
emulated = EMULATE_FAIL; emulated = EMULATE_FAIL;
vcpu->arch.regs.msr = vcpu->arch.shared->msr; vcpu->arch.regs.msr = vcpu->arch.shared->msr;
vcpu->arch.regs.ccr = vcpu->arch.cr;
if (analyse_instr(&op, &vcpu->arch.regs, inst) == 0) { if (analyse_instr(&op, &vcpu->arch.regs, inst) == 0) {
int type = op.type & INSTR_TYPE_MASK; int type = op.type & INSTR_TYPE_MASK;
int size = GETSIZE(op.type); int size = GETSIZE(op.type);
......
...@@ -594,7 +594,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) ...@@ -594,7 +594,12 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = !!(hv_enabled && radix_enabled()); r = !!(hv_enabled && radix_enabled());
break; break;
case KVM_CAP_PPC_MMU_HASH_V3: case KVM_CAP_PPC_MMU_HASH_V3:
r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300)); r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300) &&
cpu_has_feature(CPU_FTR_HVMODE));
break;
case KVM_CAP_PPC_NESTED_HV:
r = !!(hv_enabled && kvmppc_hv_ops->enable_nested &&
!kvmppc_hv_ops->enable_nested(NULL));
break; break;
#endif #endif
case KVM_CAP_SYNC_MMU: case KVM_CAP_SYNC_MMU:
...@@ -2114,6 +2119,14 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm, ...@@ -2114,6 +2119,14 @@ static int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
r = kvm->arch.kvm_ops->set_smt_mode(kvm, mode, flags); r = kvm->arch.kvm_ops->set_smt_mode(kvm, mode, flags);
break; break;
} }
case KVM_CAP_PPC_NESTED_HV:
r = -EINVAL;
if (!is_kvmppc_hv_enabled(kvm) ||
!kvm->arch.kvm_ops->enable_nested)
break;
r = kvm->arch.kvm_ops->enable_nested(kvm);
break;
#endif #endif
default: default:
r = -EINVAL; r = -EINVAL;
......
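From userspace, the new capability is exercised with the standard KVM ioctls; the sketch below probes and then enables nested HV on a VM file descriptor (error handling trimmed, and the caller is assumed to have created the VM already).

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Returns 0 when nested HV was enabled on this VM, negative otherwise. */
static int enable_nested_hv(int vm_fd)
{
	struct kvm_enable_cap cap;

	if (ioctl(vm_fd, KVM_CHECK_EXTENSION, KVM_CAP_PPC_NESTED_HV) <= 0)
		return -1;		/* kernel or CPU cannot do nested HV */

	memset(&cap, 0, sizeof(cap));
	cap.cap = KVM_CAP_PPC_NESTED_HV;
	return ioctl(vm_fd, KVM_ENABLE_CAP, &cap);	/* 0 on success */
}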
...@@ -28,17 +28,25 @@ ...@@ -28,17 +28,25 @@
* Save transactional state and TM-related registers. * Save transactional state and TM-related registers.
* Called with: * Called with:
* - r3 pointing to the vcpu struct * - r3 pointing to the vcpu struct
* - r4 points to the MSR with current TS bits: * - r4 containing the MSR with current TS bits:
* (For HV KVM, it is VCPU_MSR ; For PR KVM, it is host MSR). * (For HV KVM, it is VCPU_MSR ; For PR KVM, it is host MSR).
* This can modify all checkpointed registers, but * - r5 containing a flag indicating that non-volatile registers
* restores r1, r2 before exit. * must be preserved.
* If r5 == 0, this can modify all checkpointed registers, but
* restores r1, r2 before exit. If r5 != 0, this restores the
* MSR TM/FP/VEC/VSX bits to their state on entry.
*/ */
_GLOBAL(__kvmppc_save_tm) _GLOBAL(__kvmppc_save_tm)
mflr r0 mflr r0
std r0, PPC_LR_STKOFF(r1) std r0, PPC_LR_STKOFF(r1)
stdu r1, -SWITCH_FRAME_SIZE(r1)
mr r9, r3
cmpdi cr7, r5, 0
/* Turn on TM. */ /* Turn on TM. */
mfmsr r8 mfmsr r8
mr r10, r8
li r0, 1 li r0, 1
rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG rldimi r8, r0, MSR_TM_LG, 63-MSR_TM_LG
ori r8, r8, MSR_FP ori r8, r8, MSR_FP
...@@ -51,6 +59,27 @@ _GLOBAL(__kvmppc_save_tm) ...@@ -51,6 +59,27 @@ _GLOBAL(__kvmppc_save_tm)
std r1, HSTATE_SCRATCH2(r13) std r1, HSTATE_SCRATCH2(r13)
std r3, HSTATE_SCRATCH1(r13) std r3, HSTATE_SCRATCH1(r13)
/* Save CR on the stack - even if r5 == 0 we need to get cr7 back. */
mfcr r6
SAVE_GPR(6, r1)
/* Save DSCR so we can restore it to avoid running with user value */
mfspr r7, SPRN_DSCR
SAVE_GPR(7, r1)
/*
* We are going to do treclaim., which will modify all checkpointed
* registers. Save the non-volatile registers on the stack if
* preservation of non-volatile state has been requested.
*/
beq cr7, 3f
SAVE_NVGPRS(r1)
/* MSR[TS] will be 0 (non-transactional) once we do treclaim. */
li r0, 0
rldimi r10, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
SAVE_GPR(10, r1) /* final MSR value */
3:
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
BEGIN_FTR_SECTION BEGIN_FTR_SECTION
/* Emulation of the treclaim instruction needs TEXASR before treclaim */ /* Emulation of the treclaim instruction needs TEXASR before treclaim */
...@@ -74,22 +103,25 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST) ...@@ -74,22 +103,25 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
std r9, PACATMSCRATCH(r13) std r9, PACATMSCRATCH(r13)
ld r9, HSTATE_SCRATCH1(r13) ld r9, HSTATE_SCRATCH1(r13)
/* Get a few more GPRs free. */ /* Save away PPR soon so we don't run with user value. */
std r29, VCPU_GPRS_TM(29)(r9) std r0, VCPU_GPRS_TM(0)(r9)
std r30, VCPU_GPRS_TM(30)(r9) mfspr r0, SPRN_PPR
std r31, VCPU_GPRS_TM(31)(r9)
/* Save away PPR and DSCR soon so don't run with user values. */
mfspr r31, SPRN_PPR
HMT_MEDIUM HMT_MEDIUM
mfspr r30, SPRN_DSCR
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
ld r29, HSTATE_DSCR(r13)
mtspr SPRN_DSCR, r29
#endif
/* Save all but r9, r13 & r29-r31 */ /* Reload stack pointer. */
reg = 0 std r1, VCPU_GPRS_TM(1)(r9)
ld r1, HSTATE_SCRATCH2(r13)
/* Set MSR RI now we have r1 and r13 back. */
std r2, VCPU_GPRS_TM(2)(r9)
li r2, MSR_RI
mtmsrd r2, 1
/* Reload TOC pointer. */
ld r2, PACATOC(r13)
/* Save all but r0-r2, r9 & r13 */
reg = 3
.rept 29 .rept 29
.if (reg != 9) && (reg != 13) .if (reg != 9) && (reg != 13)
std reg, VCPU_GPRS_TM(reg)(r9) std reg, VCPU_GPRS_TM(reg)(r9)
...@@ -103,33 +135,29 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST) ...@@ -103,33 +135,29 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
ld r4, PACATMSCRATCH(r13) ld r4, PACATMSCRATCH(r13)
std r4, VCPU_GPRS_TM(9)(r9) std r4, VCPU_GPRS_TM(9)(r9)
/* Reload stack pointer and TOC. */ /* Restore host DSCR and CR values, after saving guest values */
ld r1, HSTATE_SCRATCH2(r13) mfcr r6
ld r2, PACATOC(r13) mfspr r7, SPRN_DSCR
stw r6, VCPU_CR_TM(r9)
/* Set MSR RI now we have r1 and r13 back. */ std r7, VCPU_DSCR_TM(r9)
li r5, MSR_RI REST_GPR(6, r1)
mtmsrd r5, 1 REST_GPR(7, r1)
mtcr r6
mtspr SPRN_DSCR, r7
/* Save away checkpinted SPRs. */ /* Save away checkpointed SPRs. */
std r31, VCPU_PPR_TM(r9) std r0, VCPU_PPR_TM(r9)
std r30, VCPU_DSCR_TM(r9)
mflr r5 mflr r5
mfcr r6
mfctr r7 mfctr r7
mfspr r8, SPRN_AMR mfspr r8, SPRN_AMR
mfspr r10, SPRN_TAR mfspr r10, SPRN_TAR
mfxer r11 mfxer r11
std r5, VCPU_LR_TM(r9) std r5, VCPU_LR_TM(r9)
stw r6, VCPU_CR_TM(r9)
std r7, VCPU_CTR_TM(r9) std r7, VCPU_CTR_TM(r9)
std r8, VCPU_AMR_TM(r9) std r8, VCPU_AMR_TM(r9)
std r10, VCPU_TAR_TM(r9) std r10, VCPU_TAR_TM(r9)
std r11, VCPU_XER_TM(r9) std r11, VCPU_XER_TM(r9)
/* Restore r12 as trap number. */
lwz r12, VCPU_TRAP(r9)
/* Save FP/VSX. */ /* Save FP/VSX. */
addi r3, r9, VCPU_FPRS_TM addi r3, r9, VCPU_FPRS_TM
bl store_fp_state bl store_fp_state
...@@ -137,6 +165,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST) ...@@ -137,6 +165,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
bl store_vr_state bl store_vr_state
mfspr r6, SPRN_VRSAVE mfspr r6, SPRN_VRSAVE
stw r6, VCPU_VRSAVE_TM(r9) stw r6, VCPU_VRSAVE_TM(r9)
/* Restore non-volatile registers if requested to */
beq cr7, 1f
REST_NVGPRS(r1)
REST_GPR(10, r1)
1: 1:
/* /*
* We need to save these SPRs after the treclaim so that the software * We need to save these SPRs after the treclaim so that the software
...@@ -146,12 +179,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST) ...@@ -146,12 +179,16 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
*/ */
mfspr r7, SPRN_TEXASR mfspr r7, SPRN_TEXASR
std r7, VCPU_TEXASR(r9) std r7, VCPU_TEXASR(r9)
11:
mfspr r5, SPRN_TFHAR mfspr r5, SPRN_TFHAR
mfspr r6, SPRN_TFIAR mfspr r6, SPRN_TFIAR
std r5, VCPU_TFHAR(r9) std r5, VCPU_TFHAR(r9)
std r6, VCPU_TFIAR(r9) std r6, VCPU_TFIAR(r9)
/* Restore MSR state if requested */
beq cr7, 2f
mtmsrd r10, 0
2:
addi r1, r1, SWITCH_FRAME_SIZE
ld r0, PPC_LR_STKOFF(r1) ld r0, PPC_LR_STKOFF(r1)
mtlr r0 mtlr r0
blr blr
...@@ -161,49 +198,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST) ...@@ -161,49 +198,22 @@ END_FTR_SECTION_IFSET(CPU_FTR_P9_TM_HV_ASSIST)
* be invoked from C function by PR KVM only. * be invoked from C function by PR KVM only.
*/ */
_GLOBAL(_kvmppc_save_tm_pr) _GLOBAL(_kvmppc_save_tm_pr)
mflr r5 mflr r0
std r5, PPC_LR_STKOFF(r1) std r0, PPC_LR_STKOFF(r1)
stdu r1, -SWITCH_FRAME_SIZE(r1) stdu r1, -PPC_MIN_STKFRM(r1)
SAVE_NVGPRS(r1)
/* save MSR since TM/math bits might be impacted
* by __kvmppc_save_tm().
*/
mfmsr r5
SAVE_GPR(5, r1)
/* also save DSCR/CR/TAR so that it can be recovered later */
mfspr r6, SPRN_DSCR
SAVE_GPR(6, r1)
mfcr r7
stw r7, _CCR(r1)
mfspr r8, SPRN_TAR mfspr r8, SPRN_TAR
SAVE_GPR(8, r1) std r8, PPC_MIN_STKFRM-8(r1)
li r5, 1 /* preserve non-volatile registers */
bl __kvmppc_save_tm bl __kvmppc_save_tm
REST_GPR(8, r1) ld r8, PPC_MIN_STKFRM-8(r1)
mtspr SPRN_TAR, r8 mtspr SPRN_TAR, r8
ld r7, _CCR(r1) addi r1, r1, PPC_MIN_STKFRM
mtcr r7 ld r0, PPC_LR_STKOFF(r1)
mtlr r0
REST_GPR(6, r1)
mtspr SPRN_DSCR, r6
/* need preserve current MSR's MSR_TS bits */
REST_GPR(5, r1)
mfmsr r6
rldicl r6, r6, 64 - MSR_TS_S_LG, 62
rldimi r5, r6, MSR_TS_S_LG, 63 - MSR_TS_T_LG
mtmsrd r5
REST_NVGPRS(r1)
addi r1, r1, SWITCH_FRAME_SIZE
ld r5, PPC_LR_STKOFF(r1)
mtlr r5
blr blr
EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr); EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr);
...@@ -215,15 +225,21 @@ EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr); ...@@ -215,15 +225,21 @@ EXPORT_SYMBOL_GPL(_kvmppc_save_tm_pr);
* - r4 is the guest MSR with desired TS bits: * - r4 is the guest MSR with desired TS bits:
* For HV KVM, it is VCPU_MSR * For HV KVM, it is VCPU_MSR
* For PR KVM, it is provided by caller * For PR KVM, it is provided by caller
* This potentially modifies all checkpointed registers. * - r5 containing a flag indicating that non-volatile registers
* It restores r1, r2 from the PACA. * must be preserved.
* If r5 == 0, this potentially modifies all checkpointed registers, but
* restores r1, r2 from the PACA before exit.
* If r5 != 0, this restores the MSR TM/FP/VEC/VSX bits to their state on entry.
*/ */
_GLOBAL(__kvmppc_restore_tm) _GLOBAL(__kvmppc_restore_tm)
mflr r0 mflr r0
std r0, PPC_LR_STKOFF(r1) std r0, PPC_LR_STKOFF(r1)
cmpdi cr7, r5, 0
/* Turn on TM/FP/VSX/VMX so we can restore them. */ /* Turn on TM/FP/VSX/VMX so we can restore them. */
mfmsr r5 mfmsr r5
mr r10, r5
li r6, MSR_TM >> 32 li r6, MSR_TM >> 32
sldi r6, r6, 32 sldi r6, r6, 32
or r5, r5, r6 or r5, r5, r6
...@@ -244,8 +260,7 @@ _GLOBAL(__kvmppc_restore_tm) ...@@ -244,8 +260,7 @@ _GLOBAL(__kvmppc_restore_tm)
mr r5, r4 mr r5, r4
rldicl. r5, r5, 64 - MSR_TS_S_LG, 62 rldicl. r5, r5, 64 - MSR_TS_S_LG, 62
beqlr /* TM not active in guest */ beq 9f /* TM not active in guest */
std r1, HSTATE_SCRATCH2(r13)
/* Make sure the failure summary is set, otherwise we'll program check /* Make sure the failure summary is set, otherwise we'll program check
* when we trechkpt. It's possible that this might have been not set * when we trechkpt. It's possible that this might have been not set
...@@ -255,6 +270,26 @@ _GLOBAL(__kvmppc_restore_tm) ...@@ -255,6 +270,26 @@ _GLOBAL(__kvmppc_restore_tm)
oris r7, r7, (TEXASR_FS)@h oris r7, r7, (TEXASR_FS)@h
mtspr SPRN_TEXASR, r7 mtspr SPRN_TEXASR, r7
/*
* Make a stack frame and save non-volatile registers if requested.
*/
stdu r1, -SWITCH_FRAME_SIZE(r1)
std r1, HSTATE_SCRATCH2(r13)
mfcr r6
mfspr r7, SPRN_DSCR
SAVE_GPR(2, r1)
SAVE_GPR(6, r1)
SAVE_GPR(7, r1)
beq cr7, 4f
SAVE_NVGPRS(r1)
/* MSR[TS] will be 1 (suspended) once we do trechkpt */
li r0, 1
rldimi r10, r0, MSR_TS_S_LG, 63 - MSR_TS_T_LG
SAVE_GPR(10, r1) /* final MSR value */
4:
/* /*
* We need to load up the checkpointed state for the guest. * We need to load up the checkpointed state for the guest.
* We need to do this early as it will blow away any GPRs, VSRs and * We need to do this early as it will blow away any GPRs, VSRs and
...@@ -291,8 +326,6 @@ _GLOBAL(__kvmppc_restore_tm) ...@@ -291,8 +326,6 @@ _GLOBAL(__kvmppc_restore_tm)
ld r29, VCPU_DSCR_TM(r3) ld r29, VCPU_DSCR_TM(r3)
ld r30, VCPU_PPR_TM(r3) ld r30, VCPU_PPR_TM(r3)
std r2, PACATMSCRATCH(r13) /* Save TOC */
/* Clear the MSR RI since r1, r13 are all going to be foobar. */ /* Clear the MSR RI since r1, r13 are all going to be foobar. */
li r5, 0 li r5, 0
mtmsrd r5, 1 mtmsrd r5, 1
...@@ -318,18 +351,31 @@ _GLOBAL(__kvmppc_restore_tm) ...@@ -318,18 +351,31 @@ _GLOBAL(__kvmppc_restore_tm)
/* Now let's get back the state we need. */ /* Now let's get back the state we need. */
HMT_MEDIUM HMT_MEDIUM
GET_PACA(r13) GET_PACA(r13)
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
ld r29, HSTATE_DSCR(r13)
mtspr SPRN_DSCR, r29
#endif
ld r1, HSTATE_SCRATCH2(r13) ld r1, HSTATE_SCRATCH2(r13)
ld r2, PACATMSCRATCH(r13) REST_GPR(7, r1)
mtspr SPRN_DSCR, r7
/* Set the MSR RI since we have our registers back. */ /* Set the MSR RI since we have our registers back. */
li r5, MSR_RI li r5, MSR_RI
mtmsrd r5, 1 mtmsrd r5, 1
/* Restore TOC pointer and CR */
REST_GPR(2, r1)
REST_GPR(6, r1)
mtcr r6
/* Restore non-volatile registers if requested to. */
beq cr7, 5f
REST_GPR(10, r1)
REST_NVGPRS(r1)
5: addi r1, r1, SWITCH_FRAME_SIZE
ld r0, PPC_LR_STKOFF(r1) ld r0, PPC_LR_STKOFF(r1)
mtlr r0 mtlr r0
9: /* Restore MSR bits if requested */
beqlr cr7
mtmsrd r10, 0
blr blr
/* /*
...@@ -337,47 +383,23 @@ _GLOBAL(__kvmppc_restore_tm) ...@@ -337,47 +383,23 @@ _GLOBAL(__kvmppc_restore_tm)
* can be invoked from C function by PR KVM only. * can be invoked from C function by PR KVM only.
*/ */
_GLOBAL(_kvmppc_restore_tm_pr) _GLOBAL(_kvmppc_restore_tm_pr)
mflr r5 mflr r0
std r5, PPC_LR_STKOFF(r1) std r0, PPC_LR_STKOFF(r1)
stdu r1, -SWITCH_FRAME_SIZE(r1) stdu r1, -PPC_MIN_STKFRM(r1)
SAVE_NVGPRS(r1)
/* save MSR to avoid TM/math bits change */
mfmsr r5
SAVE_GPR(5, r1)
/* also save DSCR/CR/TAR so that it can be recovered later */
mfspr r6, SPRN_DSCR
SAVE_GPR(6, r1)
mfcr r7
stw r7, _CCR(r1)
/* save TAR so that it can be recovered later */
mfspr r8, SPRN_TAR mfspr r8, SPRN_TAR
SAVE_GPR(8, r1) std r8, PPC_MIN_STKFRM-8(r1)
li r5, 1
bl __kvmppc_restore_tm bl __kvmppc_restore_tm
REST_GPR(8, r1) ld r8, PPC_MIN_STKFRM-8(r1)
mtspr SPRN_TAR, r8 mtspr SPRN_TAR, r8
ld r7, _CCR(r1) addi r1, r1, PPC_MIN_STKFRM
mtcr r7 ld r0, PPC_LR_STKOFF(r1)
mtlr r0
REST_GPR(6, r1)
mtspr SPRN_DSCR, r6
/* need preserve current MSR's MSR_TS bits */
REST_GPR(5, r1)
mfmsr r6
rldicl r6, r6, 64 - MSR_TS_S_LG, 62
rldimi r5, r6, MSR_TS_S_LG, 63 - MSR_TS_T_LG
mtmsrd r5
REST_NVGPRS(r1)
addi r1, r1, SWITCH_FRAME_SIZE
ld r5, PPC_LR_STKOFF(r1)
mtlr r5
blr blr
EXPORT_SYMBOL_GPL(_kvmppc_restore_tm_pr); EXPORT_SYMBOL_GPL(_kvmppc_restore_tm_pr);
......
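One detail of the tm.S changes above that is easy to miss: the early exit near the top of __kvmppc_restore_tm (the rldicl. of r4's MSR[TS] field followed by the branch to label 9) simply tests whether either TS bit is set in the guest MSR. A C-level sketch of that guard, assuming the usual MSR_TS_* definitions from asm/reg.h, looks like this:

/* Sketch: the restore path only runs the trechkpt sequence when the
 * guest was in a transaction, i.e. MSR[TS] is transactional or suspended. */
static inline bool guest_tm_active(u64 guest_msr)
{
	return (guest_msr & MSR_TS_MASK) != 0;	/* MSR_TS_MASK = MSR_TS_T | MSR_TS_S */
}

When this returns false the routine now also restores the caller's MSR bits if r5 (the preserve-nv flag) was non-zero, instead of returning via the old beqlr.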
...@@ -14,7 +14,6 @@ ...@@ -14,7 +14,6 @@
{0x400, "INST_STORAGE"}, \ {0x400, "INST_STORAGE"}, \
{0x480, "INST_SEGMENT"}, \ {0x480, "INST_SEGMENT"}, \
{0x500, "EXTERNAL"}, \ {0x500, "EXTERNAL"}, \
{0x501, "EXTERNAL_LEVEL"}, \
{0x502, "EXTERNAL_HV"}, \ {0x502, "EXTERNAL_HV"}, \
{0x600, "ALIGNMENT"}, \ {0x600, "ALIGNMENT"}, \
{0x700, "PROGRAM"}, \ {0x700, "PROGRAM"}, \
......
...@@ -830,6 +830,15 @@ void radix__flush_pwc_lpid(unsigned int lpid) ...@@ -830,6 +830,15 @@ void radix__flush_pwc_lpid(unsigned int lpid)
} }
EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid); EXPORT_SYMBOL_GPL(radix__flush_pwc_lpid);
/*
* Flush partition scoped translations from LPID (=LPIDR)
*/
void radix__flush_tlb_lpid(unsigned int lpid)
{
_tlbie_lpid(lpid, RIC_FLUSH_ALL);
}
EXPORT_SYMBOL_GPL(radix__flush_tlb_lpid);
/* /*
* Flush partition scoped translations from LPID (=LPIDR) * Flush partition scoped translations from LPID (=LPIDR)
*/ */
......
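The new radix__flush_tlb_lpid() above is a thin wrapper around _tlbie_lpid(lpid, RIC_FLUSH_ALL). A hedged usage sketch follows; the surrounding function name and the source of the lpid argument are assumptions for illustration, not taken from this patch:

/* Illustrative only: drop every partition-scoped translation for an LPID
 * (TLB entries and page-walk cache) before the hypervisor reuses or tears
 * down that partition, e.g. for a nested guest's shadow partition table. */
static void example_invalidate_lpid(unsigned int lpid)
{
	radix__flush_tlb_lpid(lpid);
}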
...@@ -719,6 +719,7 @@ struct kvm_ppc_one_seg_page_size { ...@@ -719,6 +719,7 @@ struct kvm_ppc_one_seg_page_size {
#define KVM_PPC_PAGE_SIZES_REAL 0x00000001 #define KVM_PPC_PAGE_SIZES_REAL 0x00000001
#define KVM_PPC_1T_SEGMENTS 0x00000002 #define KVM_PPC_1T_SEGMENTS 0x00000002
#define KVM_PPC_NO_HASH 0x00000004
struct kvm_ppc_smmu_info { struct kvm_ppc_smmu_info {
__u64 flags; __u64 flags;
...@@ -953,6 +954,7 @@ struct kvm_ppc_resize_hpt { ...@@ -953,6 +954,7 @@ struct kvm_ppc_resize_hpt {
#define KVM_CAP_NESTED_STATE 157 #define KVM_CAP_NESTED_STATE 157
#define KVM_CAP_ARM_INJECT_SERROR_ESR 158 #define KVM_CAP_ARM_INJECT_SERROR_ESR 158
#define KVM_CAP_MSR_PLATFORM_INFO 159 #define KVM_CAP_MSR_PLATFORM_INFO 159
#define KVM_CAP_PPC_NESTED_HV 160
#ifdef KVM_CAP_IRQ_ROUTING #ifdef KVM_CAP_IRQ_ROUTING
......
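The KVM_PPC_NO_HASH flag added above is reported through the existing KVM_PPC_GET_SMMU_INFO VM ioctl; when it is set, HPT guests cannot be run and userspace must configure a radix MMU. A minimal sketch of checking it (the vm_fd is assumed to come from the usual KVM_CREATE_VM sequence, and failure handling is illustrative):

#include <string.h>
#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Sketch: decide whether a hash-MMU (HPT) guest can be offered at all. */
static int host_supports_hash_guests(int vm_fd)
{
	struct kvm_ppc_smmu_info info;

	memset(&info, 0, sizeof(info));
	if (ioctl(vm_fd, KVM_PPC_GET_SMMU_INFO, &info) < 0)
		return 0;	/* treat failure as "radix only" for this example */

	return !(info.flags & KVM_PPC_NO_HASH);
}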
...@@ -15,7 +15,6 @@ ...@@ -15,7 +15,6 @@
{0x400, "INST_STORAGE"}, \ {0x400, "INST_STORAGE"}, \
{0x480, "INST_SEGMENT"}, \ {0x480, "INST_SEGMENT"}, \
{0x500, "EXTERNAL"}, \ {0x500, "EXTERNAL"}, \
{0x501, "EXTERNAL_LEVEL"}, \
{0x502, "EXTERNAL_HV"}, \ {0x502, "EXTERNAL_HV"}, \
{0x600, "ALIGNMENT"}, \ {0x600, "ALIGNMENT"}, \
{0x700, "PROGRAM"}, \ {0x700, "PROGRAM"}, \
......