Commit 6d6ab940 authored by Paolo Bonzini

Merge branch 'kvm-ppc-next' of...

Merge branch 'kvm-ppc-next' of git://git.kernel.org/pub/scm/linux/kernel/git/paulus/powerpc into HEAD

Apart from various bugfixes and code cleanups, the major new feature
is the ability to run guests using the hashed page table (HPT) MMU
mode on a host that is using the radix MMU mode.  Because of limitations
in the current POWER9 chip (all SMT threads in each core must use the
same MMU mode, HPT or radix), this requires the host to be configured
to run similarly to POWER8: the host runs in single-threaded mode (only
thread 0 of each core online), and KVM must be able to wake up the other
threads when a KVM guest is to be run and use them for
running guest VCPUs.  A new module parameter, called "indep_threads_mode",
is normally Y on POWER9 but must be set to N before any HPT guests can
be run on a radix host:

    # echo N >/sys/module/kvm_hv/parameters/indep_threads_mode
    # ppc64_cpu --smt=off
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
parents 9ffd986c c0101509
......@@ -216,7 +216,8 @@ extern kvm_pfn_t kvmppc_gpa_to_pfn(struct kvm_vcpu *vcpu, gpa_t gpa,
bool writing, bool *writable);
extern void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
unsigned long *rmap, long pte_index, int realmode);
extern void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize);
extern void kvmppc_update_dirty_map(struct kvm_memory_slot *memslot,
unsigned long gfn, unsigned long psize);
extern void kvmppc_invalidate_hpte(struct kvm *kvm, __be64 *hptep,
unsigned long pte_index);
void kvmppc_clear_ref_hpte(struct kvm *kvm, __be64 *hptep,
......
......@@ -20,6 +20,8 @@
#ifndef __ASM_KVM_BOOK3S_64_H__
#define __ASM_KVM_BOOK3S_64_H__
#include <linux/string.h>
#include <asm/bitops.h>
#include <asm/book3s/64/mmu-hash.h>
/* Power architecture requires HPT is at least 256kiB, at most 64TiB */
......@@ -107,18 +109,96 @@ static inline void __unlock_hpte(__be64 *hpte, unsigned long hpte_v)
hpte[0] = cpu_to_be64(hpte_v);
}
/*
* These functions encode knowledge of the POWER7/8/9 hardware
* interpretations of the HPTE LP (large page size) field.
*/
/*
 * Decode the page-size information of an HPTE.
 *
 * @h: first doubleword of the HPTE (tested for HPTE_V_LARGE)
 * @l: second doubleword of the HPTE (bits 12..19 are examined)
 *
 * Returns a plain shift when only one page size is involved, a value of
 * the form (larger_shift << 8) + smaller_shift for the mixed-size
 * encodings, or -1 when the encoding is not one of those handled here.
 */
static inline int kvmppc_hpte_page_shifts(unsigned long h, unsigned long l)
{
	unsigned int lp_lo, lp_hi;

	/* Without the large-page bit the entry is a plain 4kB page. */
	if (!(h & HPTE_V_LARGE))
		return 12;

	lp_lo = (l >> 12) & 0xf;
	lp_hi = (l >> 16) & 0xf;

	if (lp_lo == 0)
		return lp_hi ? -1 : 24;		/* 16MB */
	if (lp_lo == 1)
		return 16;			/* 64kB */
	if (lp_lo == 3)
		return lp_hi ? -1 : 34;		/* 16GB */
	if (lp_lo == 7)
		return (16 << 8) + 12;		/* 64kB in 4kB */
	if (lp_lo == 8) {
		if (lp_hi == 0)
			return (24 << 8) + 16;	/* 16MB in 64kB */
		if (lp_hi == 3)
			return (24 << 8) + 12;	/* 16MB in 4kB */
	}

	/* Any other combination is not recognised. */
	return -1;
}
/*
 * Return the low byte of the value produced by kvmppc_hpte_page_shifts()
 * for the HPTE doublewords (h, l).
 *
 * NOTE(review): the mask is applied unconditionally, so a -1 result from
 * kvmppc_hpte_page_shifts() comes back as 0xff here - confirm callers
 * never pass an unrecognised encoding.
 */
static inline int kvmppc_hpte_base_page_shift(unsigned long h, unsigned long l)
{
	int shifts = kvmppc_hpte_page_shifts(h, l);

	return shifts & 0xff;
}
/*
 * Return the upper shift packed into the kvmppc_hpte_page_shifts() result
 * for the HPTE doublewords (h, l) when that result is 0x100 or more;
 * otherwise return the result unchanged (including a negative value).
 */
static inline int kvmppc_hpte_actual_page_shift(unsigned long h, unsigned long l)
{
	int shifts = kvmppc_hpte_page_shifts(h, l);

	/* Values of 0x100 and above carry a second shift above the low byte. */
	return (shifts >= 0x100) ? (shifts >> 8) : shifts;
}
/*
 * Return the actual page size, in bytes, of the HPTE described by the
 * doublewords (v, r), or 0 if kvmppc_hpte_actual_page_shift() reports a
 * negative shift for them.
 *
 * Shifting by a negative count is undefined behaviour in C, so the
 * negative case must be filtered out before the shift.  Returning 0 lets
 * callers that check for a zero size treat the entry as invalid.
 */
static inline unsigned long kvmppc_actual_pgsz(unsigned long v, unsigned long r)
{
	int shift = kvmppc_hpte_actual_page_shift(v, r);

	if (shift < 0)
		return 0;
	return 1ul << shift;
}
/*
 * Map a (base page shift, actual page shift) pair to an LP encoding value.
 *
 * @base_shift:   log2 of the base page size (12, 16 or 24 are handled)
 * @actual_shift: log2 of the actual page size
 *
 * Returns the encoding for the pair, or -1 if the combination is not one
 * of those listed below.  A base shift of 24 yields 0 whatever the
 * actual shift is.
 */
static inline int kvmppc_pgsize_lp_encoding(int base_shift, int actual_shift)
{
	if (base_shift == 24)
		return 0;

	if (base_shift == 12) {
		if (actual_shift == 12)
			return 0;
		if (actual_shift == 16)
			return 7;
		if (actual_shift == 24)
			return 0x38;
		return -1;
	}

	if (base_shift == 16) {
		if (actual_shift == 16)
			return 1;
		if (actual_shift == 24)
			return 8;
		return -1;
	}

	return -1;
}
static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
unsigned long pte_index)
{
int i, b_psize = MMU_PAGE_4K, a_psize = MMU_PAGE_4K;
unsigned int penc;
int a_pgshift, b_pgshift;
unsigned long rb = 0, va_low, sllp;
unsigned int lp = (r >> LP_SHIFT) & ((1 << LP_BITS) - 1);
if (v & HPTE_V_LARGE) {
i = hpte_page_sizes[lp];
b_psize = i & 0xf;
a_psize = i >> 4;
b_pgshift = a_pgshift = kvmppc_hpte_page_shifts(v, r);
if (a_pgshift >= 0x100) {
b_pgshift &= 0xff;
a_pgshift >>= 8;
}
/*
......@@ -152,37 +232,33 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
va_low ^= v >> (SID_SHIFT_1T - 16);
va_low &= 0x7ff;
switch (b_psize) {
case MMU_PAGE_4K:
sllp = get_sllp_encoding(a_psize);
rb |= sllp << 5; /* AP field */
if (b_pgshift == 12) {
if (a_pgshift > 12) {
sllp = (a_pgshift == 16) ? 5 : 4;
rb |= sllp << 5; /* AP field */
}
rb |= (va_low & 0x7ff) << 12; /* remaining 11 bits of AVA */
break;
default:
{
} else {
int aval_shift;
/*
* remaining bits of AVA/LP fields
* Also contain the rr bits of LP
*/
rb |= (va_low << mmu_psize_defs[b_psize].shift) & 0x7ff000;
rb |= (va_low << b_pgshift) & 0x7ff000;
/*
* Now clear not needed LP bits based on actual psize
*/
rb &= ~((1ul << mmu_psize_defs[a_psize].shift) - 1);
rb &= ~((1ul << a_pgshift) - 1);
/*
* AVAL field 58..77 - base_page_shift bits of va
* we have space for 58..64 bits, Missing bits should
* be zero filled. +1 is to take care of L bit shift
*/
aval_shift = 64 - (77 - mmu_psize_defs[b_psize].shift) + 1;
aval_shift = 64 - (77 - b_pgshift) + 1;
rb |= ((va_low << aval_shift) & 0xfe);
rb |= 1; /* L field */
penc = mmu_psize_defs[b_psize].penc[a_psize];
rb |= penc << 12; /* LP field */
break;
}
rb |= r & 0xff000 & ((1ul << a_pgshift) - 1); /* LP field */
}
rb |= (v >> HPTE_V_SSIZE_SHIFT) << 8; /* B field */
return rb;
......@@ -370,6 +446,28 @@ static inline unsigned long kvmppc_hpt_mask(struct kvm_hpt_info *hpt)
return (1UL << (hpt->order - 7)) - 1;
}
/* Set bits in a dirty bitmap, which is in LE format */
/*
 * Mark npages consecutive pages, starting at bit index i, in the bitmap
 * at map.
 *
 * Runs of eight or more pages are written a byte at a time with memset();
 * shorter runs are set bit by bit with the non-atomic __set_bit_le().
 *
 * NOTE(review): the memset() path starts at byte i / 8 and writes
 * npages / 8 bytes, so it is only exact when i and npages are multiples
 * of 8 - presumably callers pass aligned power-of-two runs; confirm.
 */
static inline void set_dirty_bits(unsigned long *map, unsigned long i,
				  unsigned long npages)
{
	unsigned long bit, end;

	if (npages >= 8) {
		memset((char *)map + i / 8, 0xff, npages / 8);
		return;
	}

	end = i + npages;
	for (bit = i; bit != end; ++bit)
		__set_bit_le(bit, map);
}
/*
 * Mark npages consecutive pages, starting at bit index i, in the bitmap
 * at map, using set_bit_le() for runs shorter than eight pages.
 *
 * Runs of eight or more pages are written a byte at a time with memset().
 *
 * NOTE(review): the memset() path starts at byte i / 8 and writes
 * npages / 8 bytes, so it is only exact when i and npages are multiples
 * of 8 - presumably callers pass aligned power-of-two runs; confirm.
 */
static inline void set_dirty_bits_atomic(unsigned long *map, unsigned long i,
					 unsigned long npages)
{
	unsigned long bit, end;

	if (npages >= 8) {
		memset((char *)map + i / 8, 0xff, npages / 8);
		return;
	}

	end = i + npages;
	for (bit = i; bit != end; ++bit)
		set_bit_le(bit, map);
}
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
#endif /* __ASM_KVM_BOOK3S_64_H__ */
......@@ -82,6 +82,16 @@ struct kvm_split_mode {
u8 do_nap;
u8 napped[MAX_SMT_THREADS];
struct kvmppc_vcore *vc[MAX_SUBCORES];
/* Bits for changing lpcr on P9 */
unsigned long lpcr_req;
unsigned long lpidr_req;
unsigned long host_lpcr;
u32 do_set;
u32 do_restore;
union {
u32 allphases;
u8 phase[4];
} lpcr_sync;
};
/*
......@@ -104,14 +114,11 @@ struct kvmppc_host_state {
u8 napping;
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
/*
* hwthread_req/hwthread_state pair is used to pull sibling threads
* out of guest on pre-ISAv3.0B CPUs where threads share MMU.
*/
u8 hwthread_req;
u8 hwthread_state;
u8 host_ipi;
u8 ptid;
u8 ptid; /* thread number within subcore when split */
u8 tid; /* thread number within whole core */
struct kvm_vcpu *kvm_vcpu;
struct kvmppc_vcore *kvm_vcore;
void __iomem *xics_phys;
......
......@@ -235,10 +235,7 @@ struct revmap_entry {
*/
#define KVMPPC_RMAP_LOCK_BIT 63
#define KVMPPC_RMAP_RC_SHIFT 32
#define KVMPPC_RMAP_CHG_SHIFT 48
#define KVMPPC_RMAP_REFERENCED (HPTE_R_R << KVMPPC_RMAP_RC_SHIFT)
#define KVMPPC_RMAP_CHANGED (HPTE_R_C << KVMPPC_RMAP_RC_SHIFT)
#define KVMPPC_RMAP_CHG_ORDER (0x3ful << KVMPPC_RMAP_CHG_SHIFT)
#define KVMPPC_RMAP_PRESENT 0x100000000ul
#define KVMPPC_RMAP_INDEX 0xfffffffful
......@@ -276,7 +273,7 @@ struct kvm_arch {
int tlbie_lock;
unsigned long lpcr;
unsigned long vrma_slb_v;
int hpte_setup_done;
int mmu_ready;
atomic_t vcpus_running;
u32 online_vcores;
atomic_t hpte_mod_interest;
......@@ -284,6 +281,7 @@ struct kvm_arch {
cpumask_t cpu_in_guest;
u8 radix;
u8 fwnmi_enabled;
bool threads_indep;
pgd_t *pgtable;
u64 process_table;
struct dentry *debugfs_dir;
......
......@@ -168,6 +168,7 @@ extern int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order);
extern void kvmppc_set_hpt(struct kvm *kvm, struct kvm_hpt_info *info);
extern long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order);
extern void kvmppc_free_hpt(struct kvm_hpt_info *info);
extern void kvmppc_rmap_reset(struct kvm *kvm);
extern long kvmppc_prepare_vrma(struct kvm *kvm,
struct kvm_userspace_memory_region *mem);
extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
......@@ -177,6 +178,8 @@ extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd,
struct iommu_group *grp);
extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm,
struct iommu_group *grp);
extern int kvmppc_switch_mmu_to_hpt(struct kvm *kvm);
extern int kvmppc_switch_mmu_to_radix(struct kvm *kvm);
extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
struct kvm_create_spapr_tce_64 *args);
......
......@@ -642,6 +642,7 @@ int main(void)
HSTATE_FIELD(HSTATE_SAVED_XIRR, saved_xirr);
HSTATE_FIELD(HSTATE_HOST_IPI, host_ipi);
HSTATE_FIELD(HSTATE_PTID, ptid);
HSTATE_FIELD(HSTATE_TID, tid);
HSTATE_FIELD(HSTATE_MMCR0, host_mmcr[0]);
HSTATE_FIELD(HSTATE_MMCR1, host_mmcr[1]);
HSTATE_FIELD(HSTATE_MMCRA, host_mmcr[2]);
......@@ -667,6 +668,8 @@ int main(void)
OFFSET(KVM_SPLIT_LDBAR, kvm_split_mode, ldbar);
OFFSET(KVM_SPLIT_DO_NAP, kvm_split_mode, do_nap);
OFFSET(KVM_SPLIT_NAPPED, kvm_split_mode, napped);
OFFSET(KVM_SPLIT_DO_SET, kvm_split_mode, do_set);
OFFSET(KVM_SPLIT_DO_RESTORE, kvm_split_mode, do_restore);
#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
#ifdef CONFIG_PPC_BOOK3S_64
......
......@@ -319,20 +319,13 @@ enter_winkle:
/*
* r3 - PSSCR value corresponding to the requested stop state.
*/
power_enter_stop:
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
power_enter_stop_kvm_rm:
/*
* This is currently unused because POWER9 KVM does not have to
* gather secondary threads into sibling mode, but the code is
* here in case that function is required.
*
* Tell KVM we're entering idle.
*/
/* Tell KVM we're entering idle */
li r4,KVM_HWTHREAD_IN_IDLE
/* DO THIS IN REAL MODE! See comment above. */
stb r4,HSTATE_HWTHREAD_STATE(r13)
#endif
power_enter_stop:
/*
* Check if we are executing the lite variant with ESL=EC=0
*/
......@@ -496,18 +489,6 @@ pnv_powersave_wakeup_mce:
b pnv_powersave_wakeup
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
kvm_start_guest_check:
li r0,KVM_HWTHREAD_IN_KERNEL
stb r0,HSTATE_HWTHREAD_STATE(r13)
/* Order setting hwthread_state vs. testing hwthread_req */
sync
lbz r0,HSTATE_HWTHREAD_REQ(r13)
cmpwi r0,0
beqlr
b kvm_start_guest
#endif
/*
* Called from reset vector for powersave wakeups.
* cr3 - set to gt if waking up with partial/complete hypervisor state loss
......@@ -532,9 +513,15 @@ ALT_FTR_SECTION_END_IFSET(CPU_FTR_ARCH_300)
mr r3,r12
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
BEGIN_FTR_SECTION
bl kvm_start_guest_check
END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
li r0,KVM_HWTHREAD_IN_KERNEL
stb r0,HSTATE_HWTHREAD_STATE(r13)
/* Order setting hwthread_state vs. testing hwthread_req */
sync
lbz r0,HSTATE_HWTHREAD_REQ(r13)
cmpwi r0,0
beq 1f
b kvm_start_guest
1:
#endif
/* Return SRR1 from power7_nap() */
......
......@@ -73,8 +73,6 @@ struct kvm_resize_hpt {
struct kvm_hpt_info hpt;
};
static void kvmppc_rmap_reset(struct kvm *kvm);
int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
{
unsigned long hpt = 0;
......@@ -106,7 +104,6 @@ int kvmppc_allocate_hpt(struct kvm_hpt_info *info, u32 order)
/* Allocate reverse map array */
rev = vmalloc(sizeof(struct revmap_entry) * npte);
if (!rev) {
pr_err("kvmppc_allocate_hpt: Couldn't alloc reverse map array\n");
if (cma)
kvm_free_hpt_cma(page, 1 << (order - PAGE_SHIFT));
else
......@@ -137,19 +134,22 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
long err = -EBUSY;
struct kvm_hpt_info info;
if (kvm_is_radix(kvm))
return -EINVAL;
mutex_lock(&kvm->lock);
if (kvm->arch.hpte_setup_done) {
kvm->arch.hpte_setup_done = 0;
/* order hpte_setup_done vs. vcpus_running */
if (kvm->arch.mmu_ready) {
kvm->arch.mmu_ready = 0;
/* order mmu_ready vs. vcpus_running */
smp_mb();
if (atomic_read(&kvm->arch.vcpus_running)) {
kvm->arch.hpte_setup_done = 1;
kvm->arch.mmu_ready = 1;
goto out;
}
}
if (kvm_is_radix(kvm)) {
err = kvmppc_switch_mmu_to_hpt(kvm);
if (err)
goto out;
}
if (kvm->arch.hpt.order == order) {
/* We already have a suitable HPT */
......@@ -183,6 +183,7 @@ long kvmppc_alloc_reset_hpt(struct kvm *kvm, int order)
void kvmppc_free_hpt(struct kvm_hpt_info *info)
{
vfree(info->rev);
info->rev = NULL;
if (info->cma)
kvm_free_hpt_cma(virt_to_page(info->virt),
1 << (info->order - PAGE_SHIFT));
......@@ -334,7 +335,7 @@ static unsigned long kvmppc_mmu_get_real_addr(unsigned long v, unsigned long r,
{
unsigned long ra_mask;
ra_mask = hpte_page_size(v, r) - 1;
ra_mask = kvmppc_actual_pgsz(v, r) - 1;
return (r & HPTE_R_RPN & ~ra_mask) | (ea & ra_mask);
}
......@@ -350,6 +351,9 @@ static int kvmppc_mmu_book3s_64_hv_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
int index;
int virtmode = vcpu->arch.shregs.msr & (data ? MSR_DR : MSR_IR);
if (kvm_is_radix(vcpu->kvm))
return kvmppc_mmu_radix_xlate(vcpu, eaddr, gpte, data, iswrite);
/* Get SLB entry */
if (virtmode) {
slbe = kvmppc_mmu_book3s_hv_find_slbe(vcpu, eaddr);
......@@ -505,7 +509,8 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
mmio_update = atomic64_read(&kvm->arch.mmio_update);
if (mmio_update == vcpu->arch.pgfault_cache->mmio_update) {
r = vcpu->arch.pgfault_cache->rpte;
psize = hpte_page_size(vcpu->arch.pgfault_hpte[0], r);
psize = kvmppc_actual_pgsz(vcpu->arch.pgfault_hpte[0],
r);
gpa_base = r & HPTE_R_RPN & ~(psize - 1);
gfn_base = gpa_base >> PAGE_SHIFT;
gpa = gpa_base | (ea & (psize - 1));
......@@ -534,7 +539,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
return RESUME_GUEST;
/* Translate the logical address and get the page */
psize = hpte_page_size(hpte[0], r);
psize = kvmppc_actual_pgsz(hpte[0], r);
gpa_base = r & HPTE_R_RPN & ~(psize - 1);
gfn_base = gpa_base >> PAGE_SHIFT;
gpa = gpa_base | (ea & (psize - 1));
......@@ -710,7 +715,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
goto out_put;
}
static void kvmppc_rmap_reset(struct kvm *kvm)
void kvmppc_rmap_reset(struct kvm *kvm)
{
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
......@@ -776,6 +781,7 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
/* Must be called with both HPTE and rmap locked */
static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
struct kvm_memory_slot *memslot,
unsigned long *rmapp, unsigned long gfn)
{
__be64 *hptep = (__be64 *) (kvm->arch.hpt.virt + (i << 4));
......@@ -798,7 +804,7 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
/* Now check and modify the HPTE */
ptel = rev[i].guest_rpte;
psize = hpte_page_size(be64_to_cpu(hptep[0]), ptel);
psize = kvmppc_actual_pgsz(be64_to_cpu(hptep[0]), ptel);
if ((be64_to_cpu(hptep[0]) & HPTE_V_VALID) &&
hpte_rpn(ptel, psize) == gfn) {
hptep[0] |= cpu_to_be64(HPTE_V_ABSENT);
......@@ -807,8 +813,8 @@ static void kvmppc_unmap_hpte(struct kvm *kvm, unsigned long i,
/* Harvest R and C */
rcbits = be64_to_cpu(hptep[1]) & (HPTE_R_R | HPTE_R_C);
*rmapp |= rcbits << KVMPPC_RMAP_RC_SHIFT;
if (rcbits & HPTE_R_C)
kvmppc_update_rmap_change(rmapp, psize);
if ((rcbits & HPTE_R_C) && memslot->dirty_bitmap)
kvmppc_update_dirty_map(memslot, gfn, psize);
if (rcbits & ~rev[i].guest_rpte) {
rev[i].guest_rpte = ptel | rcbits;
note_hpte_modification(kvm, &rev[i]);
......@@ -846,7 +852,7 @@ static int kvm_unmap_rmapp(struct kvm *kvm, struct kvm_memory_slot *memslot,
continue;
}
kvmppc_unmap_hpte(kvm, i, rmapp, gfn);
kvmppc_unmap_hpte(kvm, i, memslot, rmapp, gfn);
unlock_rmap(rmapp);
__unlock_hpte(hptep, be64_to_cpu(hptep[0]));
}
......@@ -1029,14 +1035,6 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
retry:
lock_rmap(rmapp);
if (*rmapp & KVMPPC_RMAP_CHANGED) {
long change_order = (*rmapp & KVMPPC_RMAP_CHG_ORDER)
>> KVMPPC_RMAP_CHG_SHIFT;
*rmapp &= ~(KVMPPC_RMAP_CHANGED | KVMPPC_RMAP_CHG_ORDER);
npages_dirty = 1;
if (change_order > PAGE_SHIFT)
npages_dirty = 1ul << (change_order - PAGE_SHIFT);
}
if (!(*rmapp & KVMPPC_RMAP_PRESENT)) {
unlock_rmap(rmapp);
return npages_dirty;
......@@ -1092,7 +1090,7 @@ static int kvm_test_clear_dirty_npages(struct kvm *kvm, unsigned long *rmapp)
rev[i].guest_rpte |= HPTE_R_C;
note_hpte_modification(kvm, &rev[i]);
}
n = hpte_page_size(v, r);
n = kvmppc_actual_pgsz(v, r);
n = (n + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (n > npages_dirty)
npages_dirty = n;
......@@ -1128,7 +1126,7 @@ void kvmppc_harvest_vpa_dirty(struct kvmppc_vpa *vpa,
long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm,
struct kvm_memory_slot *memslot, unsigned long *map)
{
unsigned long i, j;
unsigned long i;
unsigned long *rmapp;
preempt_disable();
......@@ -1140,9 +1138,8 @@ long kvmppc_hv_get_dirty_log_hpt(struct kvm *kvm,
* since we always put huge-page HPTEs in the rmap chain
* corresponding to their page base address.
*/
if (npages && map)
for (j = i; npages; ++j, --npages)
__set_bit_le(j, map);
if (npages)
set_dirty_bits(map, i, npages);
++rmapp;
}
preempt_enable();
......@@ -1186,7 +1183,6 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
struct page *page = virt_to_page(va);
struct kvm_memory_slot *memslot;
unsigned long gfn;
unsigned long *rmap;
int srcu_idx;
put_page(page);
......@@ -1194,20 +1190,12 @@ void kvmppc_unpin_guest_page(struct kvm *kvm, void *va, unsigned long gpa,
if (!dirty)
return;
/* We need to mark this page dirty in the rmap chain */
/* We need to mark this page dirty in the memslot dirty_bitmap, if any */
gfn = gpa >> PAGE_SHIFT;
srcu_idx = srcu_read_lock(&kvm->srcu);
memslot = gfn_to_memslot(kvm, gfn);
if (memslot) {
if (!kvm_is_radix(kvm)) {
rmap = &memslot->arch.rmap[gfn - memslot->base_gfn];
lock_rmap(rmap);
*rmap |= KVMPPC_RMAP_CHANGED;
unlock_rmap(rmap);
} else if (memslot->dirty_bitmap) {
mark_page_dirty(kvm, gfn);
}
}
if (memslot && memslot->dirty_bitmap)
set_bit_le(gfn - memslot->base_gfn, memslot->dirty_bitmap);
srcu_read_unlock(&kvm->srcu, srcu_idx);
}
......@@ -1267,7 +1255,7 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,
guest_rpte = rev->guest_rpte;
ret = -EIO;
apsize = hpte_page_size(vpte, guest_rpte);
apsize = kvmppc_actual_pgsz(vpte, guest_rpte);
if (!apsize)
goto out;
......@@ -1282,7 +1270,7 @@ static unsigned long resize_hpt_rehash_hpte(struct kvm_resize_hpt *resize,
rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
lock_rmap(rmapp);
kvmppc_unmap_hpte(kvm, idx, rmapp, gfn);
kvmppc_unmap_hpte(kvm, idx, memslot, rmapp, gfn);
unlock_rmap(rmapp);
}
......@@ -1455,7 +1443,7 @@ long kvm_vm_ioctl_resize_hpt_prepare(struct kvm *kvm,
struct kvm_resize_hpt *resize;
int ret;
if (flags != 0)
if (flags != 0 || kvm_is_radix(kvm))
return -EINVAL;
if (shift && ((shift < 18) || (shift > 46)))
......@@ -1521,7 +1509,7 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
struct kvm_resize_hpt *resize;
long ret;
if (flags != 0)
if (flags != 0 || kvm_is_radix(kvm))
return -EINVAL;
if (shift && ((shift < 18) || (shift > 46)))
......@@ -1533,15 +1521,15 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
/* This shouldn't be possible */
ret = -EIO;
if (WARN_ON(!kvm->arch.hpte_setup_done))
if (WARN_ON(!kvm->arch.mmu_ready))
goto out_no_hpt;
/* Stop VCPUs from running while we mess with the HPT */
kvm->arch.hpte_setup_done = 0;
kvm->arch.mmu_ready = 0;
smp_mb();
/* Boot all CPUs out of the guest so they re-read
* hpte_setup_done */
* mmu_ready */
on_each_cpu(resize_hpt_boot_vcpu, NULL, 1);
ret = -ENXIO;
......@@ -1564,7 +1552,7 @@ long kvm_vm_ioctl_resize_hpt_commit(struct kvm *kvm,
out:
/* Let VCPUs run again */
kvm->arch.hpte_setup_done = 1;
kvm->arch.mmu_ready = 1;
smp_mb();
out_no_hpt:
resize_hpt_release(kvm, resize);
......@@ -1707,6 +1695,8 @@ static ssize_t kvm_htab_read(struct file *file, char __user *buf,
if (!access_ok(VERIFY_WRITE, buf, count))
return -EFAULT;
if (kvm_is_radix(kvm))
return 0;
first_pass = ctx->first_pass;
flags = ctx->flags;
......@@ -1800,20 +1790,22 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
unsigned long tmp[2];
ssize_t nb;
long int err, ret;
int hpte_setup;
int mmu_ready;
if (!access_ok(VERIFY_READ, buf, count))
return -EFAULT;
if (kvm_is_radix(kvm))
return -EINVAL;
/* lock out vcpus from running while we're doing this */
mutex_lock(&kvm->lock);
hpte_setup = kvm->arch.hpte_setup_done;
if (hpte_setup) {
kvm->arch.hpte_setup_done = 0; /* temporarily */
/* order hpte_setup_done vs. vcpus_running */
mmu_ready = kvm->arch.mmu_ready;
if (mmu_ready) {
kvm->arch.mmu_ready = 0; /* temporarily */
/* order mmu_ready vs. vcpus_running */
smp_mb();
if (atomic_read(&kvm->arch.vcpus_running)) {
kvm->arch.hpte_setup_done = 1;
kvm->arch.mmu_ready = 1;
mutex_unlock(&kvm->lock);
return -EBUSY;
}
......@@ -1866,7 +1858,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
"r=%lx\n", ret, i, v, r);
goto out;
}
if (!hpte_setup && is_vrma_hpte(v)) {
if (!mmu_ready && is_vrma_hpte(v)) {
unsigned long psize = hpte_base_page_size(v, r);
unsigned long senc = slb_pgsize_encoding(psize);
unsigned long lpcr;
......@@ -1875,7 +1867,7 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
(VRMA_VSID << SLB_VSID_SHIFT_1T);
lpcr = senc << (LPCR_VRMASD_SH - 4);
kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
hpte_setup = 1;
mmu_ready = 1;
}
++i;
hptp += 2;
......@@ -1891,9 +1883,9 @@ static ssize_t kvm_htab_write(struct file *file, const char __user *buf,
}
out:
/* Order HPTE updates vs. hpte_setup_done */
/* Order HPTE updates vs. mmu_ready */
smp_wmb();
kvm->arch.hpte_setup_done = hpte_setup;
kvm->arch.mmu_ready = mmu_ready;
mutex_unlock(&kvm->lock);
if (err)
......@@ -2002,6 +1994,10 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
struct kvm *kvm;
__be64 *hptp;
kvm = p->kvm;
if (kvm_is_radix(kvm))
return 0;
ret = mutex_lock_interruptible(&p->mutex);
if (ret)
return ret;
......@@ -2024,7 +2020,6 @@ static ssize_t debugfs_htab_read(struct file *file, char __user *buf,
}
}
kvm = p->kvm;
i = p->hpt_index;
hptp = (__be64 *)(kvm->arch.hpt.virt + (i * HPTE_SIZE));
for (; len != 0 && i < kvmppc_hpt_npte(&kvm->arch.hpt);
......@@ -2099,10 +2094,7 @@ void kvmppc_mmu_book3s_hv_init(struct kvm_vcpu *vcpu)
vcpu->arch.slb_nr = 32; /* POWER7/POWER8 */
if (kvm_is_radix(vcpu->kvm))
mmu->xlate = kvmppc_mmu_radix_xlate;
else
mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
mmu->xlate = kvmppc_mmu_book3s_64_hv_xlate;
mmu->reset_msr = kvmppc_mmu_book3s_64_hv_reset_msr;
vcpu->arch.hflags |= BOOK3S_HFLAG_SLB;
......
......@@ -474,26 +474,6 @@ int kvmppc_book3s_radix_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu,
return ret;
}
static void mark_pages_dirty(struct kvm *kvm, struct kvm_memory_slot *memslot,
unsigned long gfn, unsigned int order)
{
unsigned long i, limit;
unsigned long *dp;
if (!memslot->dirty_bitmap)
return;
limit = 1ul << order;
if (limit < BITS_PER_LONG) {
for (i = 0; i < limit; ++i)
mark_page_dirty(kvm, gfn + i);
return;
}
dp = memslot->dirty_bitmap + (gfn - memslot->base_gfn);
limit /= BITS_PER_LONG;
for (i = 0; i < limit; ++i)
*dp++ = ~0ul;
}
/* Called with kvm->lock held */
int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
unsigned long gfn)
......@@ -508,12 +488,11 @@ int kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_PRESENT, 0,
gpa, shift);
kvmppc_radix_tlbie_page(kvm, gpa, shift);
if (old & _PAGE_DIRTY) {
if (!shift)
mark_page_dirty(kvm, gfn);
else
mark_pages_dirty(kvm, memslot,
gfn, shift - PAGE_SHIFT);
if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap) {
unsigned long npages = 1;
if (shift)
npages = 1ul << (shift - PAGE_SHIFT);
kvmppc_update_dirty_map(memslot, gfn, npages);
}
}
return 0;
......@@ -579,20 +558,8 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
struct kvm_memory_slot *memslot, unsigned long *map)
{
unsigned long i, j;
unsigned long n, *p;
int npages;
/*
* Radix accumulates dirty bits in the first half of the
* memslot's dirty_bitmap area, for when pages are paged
* out or modified by the host directly. Pick up these
* bits and add them to the map.
*/
n = kvm_dirty_bitmap_bytes(memslot) / sizeof(long);
p = memslot->dirty_bitmap;
for (i = 0; i < n; ++i)
map[i] |= xchg(&p[i], 0);
for (i = 0; i < memslot->npages; i = j) {
npages = kvm_radix_test_clear_dirty(kvm, memslot, i);
......@@ -604,9 +571,10 @@ long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
* real address, if npages > 1 we can skip to i + npages.
*/
j = i + 1;
if (npages)
for (j = i; npages; ++j, --npages)
__set_bit_le(j, map);
if (npages) {
set_dirty_bits(map, i, npages);
/*
 * The enclosing for loop advances with "i = j", so the skip
 * over the remaining pages must be stored in j; assigning to
 * i here would simply be overwritten by the loop step.
 */
j = i + npages;
}
}
return 0;
}
......@@ -694,6 +662,7 @@ void kvmppc_free_radix(struct kvm *kvm)
pgd_clear(pgd);
}
pgd_free(kvm->mm, kvm->arch.pgtable);
kvm->arch.pgtable = NULL;
}
static void pte_ctor(void *addr)
......
......@@ -113,7 +113,7 @@ slb_do_enter:
/* Remove all SLB entries that are in use. */
li r0, r0
li r0, 0
slbmte r0, r0
slbia
......
......@@ -19,6 +19,7 @@
*/
#include <linux/kvm_host.h>
#include <linux/kernel.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/preempt.h>
......@@ -97,6 +98,10 @@ static int target_smt_mode;
module_param(target_smt_mode, int, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)");
static bool indep_threads_mode = true;
module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)");
#ifdef CONFIG_KVM_XICS
static struct kernel_param_ops module_param_ops = {
.set = param_set_int,
......@@ -114,6 +119,7 @@ MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
static void kvmppc_end_cede(struct kvm_vcpu *vcpu);
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
static void kvmppc_setup_partition_table(struct kvm *kvm);
static inline struct kvm_vcpu *next_runnable_thread(struct kvmppc_vcore *vc,
int *ip)
......@@ -1732,9 +1738,9 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id,
* MMU mode (radix or HPT), unfortunately, but since we only support
* HPT guests on a HPT host so far, that isn't an impediment yet.
*/
static int threads_per_vcore(void)
static int threads_per_vcore(struct kvm *kvm)
{
if (cpu_has_feature(CPU_FTR_ARCH_300))
if (kvm->arch.threads_indep)
return 1;
return threads_per_subcore;
}
......@@ -1772,7 +1778,7 @@ static struct debugfs_timings_element {
{"cede", offsetof(struct kvm_vcpu, arch.cede_time)},
};
#define N_TIMINGS (sizeof(timings) / sizeof(timings[0]))
#define N_TIMINGS (ARRAY_SIZE(timings))
struct debugfs_timings_state {
struct kvm_vcpu *vcpu;
......@@ -2117,15 +2123,6 @@ static int kvmppc_grab_hwthread(int cpu)
struct paca_struct *tpaca;
long timeout = 10000;
/*
* ISA v3.0 idle routines do not set hwthread_state or test
* hwthread_req, so they can not grab idle threads.
*/
if (cpu_has_feature(CPU_FTR_ARCH_300)) {
WARN(1, "KVM: can not control sibling threads\n");
return -EBUSY;
}
tpaca = &paca[cpu];
/* Ensure the thread won't go into the kernel if it wakes */
......@@ -2160,12 +2157,10 @@ static void kvmppc_release_hwthread(int cpu)
struct paca_struct *tpaca;
tpaca = &paca[cpu];
tpaca->kvm_hstate.hwthread_req = 0;
tpaca->kvm_hstate.kvm_vcpu = NULL;
tpaca->kvm_hstate.kvm_vcore = NULL;
tpaca->kvm_hstate.kvm_split_mode = NULL;
if (!cpu_has_feature(CPU_FTR_ARCH_300))
tpaca->kvm_hstate.hwthread_req = 0;
}
static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
......@@ -2237,11 +2232,10 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, struct kvmppc_vcore *vc)
kvmppc_ipi_thread(cpu);
}
static void kvmppc_wait_for_nap(void)
static void kvmppc_wait_for_nap(int n_threads)
{
int cpu = smp_processor_id();
int i, loops;
int n_threads = threads_per_vcore();
if (n_threads <= 1)
return;
......@@ -2328,7 +2322,7 @@ static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc)
vc->vcore_state = VCORE_PREEMPT;
vc->pcpu = smp_processor_id();
if (vc->num_threads < threads_per_vcore()) {
if (vc->num_threads < threads_per_vcore(vc->kvm)) {
spin_lock(&lp->lock);
list_add_tail(&vc->preempt_list, &lp->list);
spin_unlock(&lp->lock);
......@@ -2366,7 +2360,7 @@ struct core_info {
/*
* This mapping means subcores 0 and 1 can use threads 0-3 and 4-7
* respectively in 2-way micro-threading (split-core) mode.
* respectively in 2-way micro-threading (split-core) mode on POWER8.
*/
static int subcore_thread_map[MAX_SUBCORES] = { 0, 4, 2, 6 };
......@@ -2382,7 +2376,14 @@ static void init_core_info(struct core_info *cip, struct kvmppc_vcore *vc)
static bool subcore_config_ok(int n_subcores, int n_threads)
{
/* Can only dynamically split if unsplit to begin with */
/*
* POWER9 "SMT4" cores are permanently in what is effectively a 4-way split-core
* mode, with one thread per subcore.
*/
if (cpu_has_feature(CPU_FTR_ARCH_300))
return n_subcores <= 4 && n_threads == 1;
/* On POWER8, can only dynamically split if unsplit to begin with */
if (n_subcores > 1 && threads_per_subcore < MAX_SMT_THREADS)
return false;
if (n_subcores > MAX_SUBCORES)
......@@ -2413,6 +2414,11 @@ static bool can_dynamic_split(struct kvmppc_vcore *vc, struct core_info *cip)
if (!cpu_has_feature(CPU_FTR_ARCH_207S))
return false;
/* POWER9 currently requires all threads to be in the same MMU mode */
if (cpu_has_feature(CPU_FTR_ARCH_300) &&
kvm_is_radix(vc->kvm) != kvm_is_radix(cip->vc[0]->kvm))
return false;
if (n_threads < cip->max_subcore_threads)
n_threads = cip->max_subcore_threads;
if (!subcore_config_ok(cip->n_subcores + 1, n_threads))
......@@ -2638,6 +2644,8 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
int target_threads;
int controlled_threads;
int trap;
bool is_power8;
bool hpt_on_radix;
/*
* Remove from the list any threads that have a signal pending
......@@ -2660,15 +2668,19 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
* the number of threads per subcore, except on POWER9,
* where it's 1 because the threads are (mostly) independent.
*/
controlled_threads = threads_per_vcore();
controlled_threads = threads_per_vcore(vc->kvm);
/*
* Make sure we are running on primary threads, and that secondary
* threads are offline. Also check if the number of threads in this
* guest are greater than the current system threads per guest.
* On POWER9, we need to be not in independent-threads mode if
* this is a HPT guest on a radix host.
*/
if ((controlled_threads > 1) &&
((vc->num_threads > threads_per_subcore) || !on_primary_thread())) {
hpt_on_radix = radix_enabled() && !kvm_is_radix(vc->kvm);
if (((controlled_threads > 1) &&
((vc->num_threads > threads_per_subcore) || !on_primary_thread())) ||
(hpt_on_radix && vc->kvm->arch.threads_indep)) {
for_each_runnable_thread(i, vcpu, vc) {
vcpu->arch.ret = -EBUSY;
kvmppc_remove_runnable(vc, vcpu);
......@@ -2731,32 +2743,51 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
cmd_bit = stat_bit = 0;
split = core_info.n_subcores;
sip = NULL;
if (split > 1) {
/* threads_per_subcore must be MAX_SMT_THREADS (8) here */
if (split == 2 && (dynamic_mt_modes & 2)) {
cmd_bit = HID0_POWER8_1TO2LPAR;
stat_bit = HID0_POWER8_2LPARMODE;
} else {
split = 4;
cmd_bit = HID0_POWER8_1TO4LPAR;
stat_bit = HID0_POWER8_4LPARMODE;
}
subcore_size = MAX_SMT_THREADS / split;
is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S)
&& !cpu_has_feature(CPU_FTR_ARCH_300);
if (split > 1 || hpt_on_radix) {
sip = &split_info;
memset(&split_info, 0, sizeof(split_info));
split_info.rpr = mfspr(SPRN_RPR);
split_info.pmmar = mfspr(SPRN_PMMAR);
split_info.ldbar = mfspr(SPRN_LDBAR);
split_info.subcore_size = subcore_size;
for (sub = 0; sub < core_info.n_subcores; ++sub)
split_info.vc[sub] = core_info.vc[sub];
if (is_power8) {
if (split == 2 && (dynamic_mt_modes & 2)) {
cmd_bit = HID0_POWER8_1TO2LPAR;
stat_bit = HID0_POWER8_2LPARMODE;
} else {
split = 4;
cmd_bit = HID0_POWER8_1TO4LPAR;
stat_bit = HID0_POWER8_4LPARMODE;
}
subcore_size = MAX_SMT_THREADS / split;
split_info.rpr = mfspr(SPRN_RPR);
split_info.pmmar = mfspr(SPRN_PMMAR);
split_info.ldbar = mfspr(SPRN_LDBAR);
split_info.subcore_size = subcore_size;
} else {
split_info.subcore_size = 1;
if (hpt_on_radix) {
/* Use the split_info for LPCR/LPIDR changes */
split_info.lpcr_req = vc->lpcr;
split_info.lpidr_req = vc->kvm->arch.lpid;
split_info.host_lpcr = vc->kvm->arch.host_lpcr;
split_info.do_set = 1;
}
}
/* order writes to split_info before kvm_split_mode pointer */
smp_wmb();
}
for (thr = 0; thr < controlled_threads; ++thr)
for (thr = 0; thr < controlled_threads; ++thr) {
paca[pcpu + thr].kvm_hstate.tid = thr;
paca[pcpu + thr].kvm_hstate.napping = 0;
paca[pcpu + thr].kvm_hstate.kvm_split_mode = sip;
}
/* Initiate micro-threading (split-core) if required */
/* Initiate micro-threading (split-core) on POWER8 if required */
if (cmd_bit) {
unsigned long hid0 = mfspr(SPRN_HID0);
......@@ -2775,7 +2806,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
/* Start all the threads */
active = 0;
for (sub = 0; sub < core_info.n_subcores; ++sub) {
thr = subcore_thread_map[sub];
thr = is_power8 ? subcore_thread_map[sub] : sub;
thr0_done = false;
active |= 1 << thr;
pvc = core_info.vc[sub];
......@@ -2802,18 +2833,20 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
* the vcore pointer in the PACA of the secondaries.
*/
smp_mb();
if (cmd_bit)
split_info.do_nap = 1; /* ask secondaries to nap when done */
/*
* When doing micro-threading, poke the inactive threads as well.
* This gets them to the nap instruction after kvm_do_nap,
* which reduces the time taken to unsplit later.
* For POWER9 HPT guest on radix host, we need all the secondary
* threads woken up so they can do the LPCR/LPIDR change.
*/
if (split > 1)
if (cmd_bit || hpt_on_radix) {
split_info.do_nap = 1; /* ask secondaries to nap when done */
for (thr = 1; thr < threads_per_subcore; ++thr)
if (!(active & (1 << thr)))
kvmppc_ipi_thread(pcpu + thr);
}
vc->vcore_state = VCORE_RUNNING;
preempt_disable();
......@@ -2847,10 +2880,10 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
vc->vcore_state = VCORE_EXITING;
/* wait for secondary threads to finish writing their state to memory */
kvmppc_wait_for_nap();
kvmppc_wait_for_nap(controlled_threads);
/* Return to whole-core mode if we split the core earlier */
if (split > 1) {
if (cmd_bit) {
unsigned long hid0 = mfspr(SPRN_HID0);
unsigned long loops = 0;
......@@ -2866,8 +2899,17 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc)
cpu_relax();
++loops;
}
split_info.do_nap = 0;
} else if (hpt_on_radix) {
/* Wait for all threads to have seen final sync */
for (thr = 1; thr < controlled_threads; ++thr) {
while (paca[pcpu + thr].kvm_hstate.kvm_split_mode) {
HMT_low();
barrier();
}
HMT_medium();
}
}
split_info.do_nap = 0;
kvmppc_set_host_core(pcpu);
......@@ -3208,6 +3250,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
unsigned long ebb_regs[3] = {}; /* shut up GCC */
unsigned long user_tar = 0;
unsigned int user_vrsave;
struct kvm *kvm;
if (!vcpu->arch.sane) {
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
......@@ -3245,13 +3288,25 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
return -EINTR;
}
atomic_inc(&vcpu->kvm->arch.vcpus_running);
/* Order vcpus_running vs. hpte_setup_done, see kvmppc_alloc_reset_hpt */
kvm = vcpu->kvm;
atomic_inc(&kvm->arch.vcpus_running);
/* Order vcpus_running vs. mmu_ready, see kvmppc_alloc_reset_hpt */
smp_mb();
/* On the first time here, set up HTAB and VRMA */
if (!kvm_is_radix(vcpu->kvm) && !vcpu->kvm->arch.hpte_setup_done) {
r = kvmppc_hv_setup_htab_rma(vcpu);
/* On the first time here, set up MMU if necessary */
if (!vcpu->kvm->arch.mmu_ready) {
mutex_lock(&kvm->lock);
r = 0;
if (!kvm->arch.mmu_ready) {
if (!kvm_is_radix(vcpu->kvm))
r = kvmppc_hv_setup_htab_rma(vcpu);
if (!r) {
if (cpu_has_feature(CPU_FTR_ARCH_300))
kvmppc_setup_partition_table(kvm);
kvm->arch.mmu_ready = 1;
}
}
mutex_unlock(&kvm->lock);
if (r)
goto out;
}
......@@ -3310,22 +3365,21 @@ static int kvmppc_vcpu_run_hv(struct kvm_run *run, struct kvm_vcpu *vcpu)
}
static void kvmppc_add_seg_page_size(struct kvm_ppc_one_seg_page_size **sps,
int linux_psize)
int shift, int sllp)
{
struct mmu_psize_def *def = &mmu_psize_defs[linux_psize];
if (!def->shift)
return;
(*sps)->page_shift = def->shift;
(*sps)->slb_enc = def->sllp;
(*sps)->enc[0].page_shift = def->shift;
(*sps)->enc[0].pte_enc = def->penc[linux_psize];
(*sps)->page_shift = shift;
(*sps)->slb_enc = sllp;
(*sps)->enc[0].page_shift = shift;
(*sps)->enc[0].pte_enc = kvmppc_pgsize_lp_encoding(shift, shift);
/*
* Add 16MB MPSS support if host supports it
* Add 16MB MPSS support (may get filtered out by userspace)
*/
if (linux_psize != MMU_PAGE_16M && def->penc[MMU_PAGE_16M] != -1) {
(*sps)->enc[1].page_shift = 24;
(*sps)->enc[1].pte_enc = def->penc[MMU_PAGE_16M];
if (shift != 24) {
int penc = kvmppc_pgsize_lp_encoding(shift, 24);
if (penc != -1) {
(*sps)->enc[1].page_shift = 24;
(*sps)->enc[1].pte_enc = penc;
}
}
(*sps)++;
}
......@@ -3335,13 +3389,6 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
{
struct kvm_ppc_one_seg_page_size *sps;
/*
* Since we don't yet support HPT guests on a radix host,
* return an error if the host uses radix.
*/
if (radix_enabled())
return -EINVAL;
/*
* POWER7, POWER8 and POWER9 all support 32 storage keys for data.
* POWER7 doesn't support keys for instruction accesses,
......@@ -3350,16 +3397,15 @@ static int kvm_vm_ioctl_get_smmu_info_hv(struct kvm *kvm,
info->data_keys = 32;
info->instr_keys = cpu_has_feature(CPU_FTR_ARCH_207S) ? 32 : 0;
info->flags = KVM_PPC_PAGE_SIZES_REAL;
if (mmu_has_feature(MMU_FTR_1T_SEGMENT))
info->flags |= KVM_PPC_1T_SEGMENTS;
info->slb_size = mmu_slb_size;
/* POWER7, 8 and 9 all have 1T segments and 32-entry SLB */
info->flags = KVM_PPC_PAGE_SIZES_REAL | KVM_PPC_1T_SEGMENTS;
info->slb_size = 32;
/* We only support these sizes for now, and no multi-size segments */
sps = &info->sps[0];
kvmppc_add_seg_page_size(&sps, MMU_PAGE_4K);
kvmppc_add_seg_page_size(&sps, MMU_PAGE_64K);
kvmppc_add_seg_page_size(&sps, MMU_PAGE_16M);
kvmppc_add_seg_page_size(&sps, 12, 0);
kvmppc_add_seg_page_size(&sps, 16, SLB_VSID_L | SLB_VSID_LP_01);
kvmppc_add_seg_page_size(&sps, 24, SLB_VSID_L);
return 0;
}
......@@ -3374,7 +3420,7 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
struct kvm_memory_slot *memslot;
int i, r;
unsigned long n;
unsigned long *buf;
unsigned long *buf, *p;
struct kvm_vcpu *vcpu;
mutex_lock(&kvm->slots_lock);
......@@ -3390,8 +3436,8 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
goto out;
/*
* Use second half of bitmap area because radix accumulates
* bits in the first half.
* Use second half of bitmap area because both HPT and radix
* accumulate bits in the first half.
*/
n = kvm_dirty_bitmap_bytes(memslot);
buf = memslot->dirty_bitmap + n / sizeof(long);
......@@ -3404,6 +3450,16 @@ static int kvm_vm_ioctl_get_dirty_log_hv(struct kvm *kvm,
if (r)
goto out;
/*
* We accumulate dirty bits in the first half of the
* memslot's dirty_bitmap area, for when pages are paged
* out or modified by the host directly. Pick up these
* bits and add them to the map.
*/
p = memslot->dirty_bitmap;
for (i = 0; i < n / sizeof(long); ++i)
buf[i] |= xchg(&p[i], 0);
/* Harvest dirty bits from VPA and DTL updates */
/* Note: we never modify the SLB shadow buffer areas */
kvm_for_each_vcpu(i, vcpu, kvm) {
......@@ -3435,15 +3491,6 @@ static void kvmppc_core_free_memslot_hv(struct kvm_memory_slot *free,
static int kvmppc_core_create_memslot_hv(struct kvm_memory_slot *slot,
unsigned long npages)
{
/*
* For now, if radix_enabled() then we only support radix guests,
* and in that case we don't need the rmap array.
*/
if (radix_enabled()) {
slot->arch.rmap = NULL;
return 0;
}
slot->arch.rmap = vzalloc(npages * sizeof(*slot->arch.rmap));
if (!slot->arch.rmap)
return -ENOMEM;
......@@ -3464,8 +3511,6 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
const struct kvm_memory_slot *new)
{
unsigned long npages = mem->memory_size >> PAGE_SHIFT;
struct kvm_memslots *slots;
struct kvm_memory_slot *memslot;
/*
* If we are making a new memslot, it might make
......@@ -3475,18 +3520,6 @@ static void kvmppc_core_commit_memory_region_hv(struct kvm *kvm,
*/
if (npages)
atomic64_inc(&kvm->arch.mmio_update);
if (npages && old->npages && !kvm_is_radix(kvm)) {
/*
* If modifying a memslot, reset all the rmap dirty bits.
* If this is a new memslot, we don't need to do anything
* since the rmap array starts out as all zeroes,
* i.e. no pages are dirty.
*/
slots = kvm_memslots(kvm);
memslot = id_to_memslot(slots, mem->slot);
kvmppc_hv_get_dirty_log_hpt(kvm, memslot, NULL);
}
}
/*
......@@ -3542,6 +3575,10 @@ static void kvmppc_setup_partition_table(struct kvm *kvm)
mmu_partition_table_set_entry(kvm->arch.lpid, dw0, dw1);
}
/*
* Set up HPT (hashed page table) and RMA (real-mode area).
* Must be called with kvm->lock held.
*/
static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
{
int err = 0;
......@@ -3553,10 +3590,6 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
unsigned long psize, porder;
int srcu_idx;
mutex_lock(&kvm->lock);
if (kvm->arch.hpte_setup_done)
goto out; /* another vcpu beat us to it */
/* Allocate hashed page table (if not done already) and reset it */
if (!kvm->arch.hpt.virt) {
int order = KVM_DEFAULT_HPT_ORDER;
......@@ -3615,18 +3648,14 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
/* the -4 is to account for senc values starting at 0x10 */
lpcr = senc << (LPCR_VRMASD_SH - 4);
kvmppc_update_lpcr(kvm, lpcr, LPCR_VRMASD);
} else {
kvmppc_setup_partition_table(kvm);
}
/* Order updates to kvm->arch.lpcr etc. vs. hpte_setup_done */
/* Order updates to kvm->arch.lpcr etc. vs. mmu_ready */
smp_wmb();
kvm->arch.hpte_setup_done = 1;
err = 0;
out_srcu:
srcu_read_unlock(&kvm->srcu, srcu_idx);
out:
mutex_unlock(&kvm->lock);
return err;
up_out:
......@@ -3634,6 +3663,34 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
goto out_srcu;
}
/*
 * Switch the guest's MMU mode from radix to HPT.
 *
 * Caller must hold kvm->lock, with mmu_ready = 0 and no vcpus running.
 * Always returns 0.
 */
int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
{
	/* LPCR bits that are rewritten whenever the guest MMU mode changes */
	const unsigned long mode_mask =
		LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR;

	/* Drop the radix state first, then move the LPCR over to HPT mode */
	kvmppc_free_radix(kvm);
	kvmppc_update_lpcr(kvm, LPCR_VPM1, mode_mask);

	kvmppc_rmap_reset(kvm);

	/* The guest is no longer radix and has no process table yet */
	kvm->arch.radix = 0;
	kvm->arch.process_table = 0;

	return 0;
}
/*
 * Switch the guest's MMU mode from HPT to radix.
 *
 * Caller must hold kvm->lock, with mmu_ready = 0 and no vcpus running.
 * Returns 0 on success.  If kvmppc_init_vm_radix() fails, its error is
 * returned before this function has touched the HPT, the LPCR or
 * kvm->arch.radix.
 */
int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
{
	/* LPCR bits that are rewritten whenever the guest MMU mode changes */
	const unsigned long mode_mask =
		LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR;
	int err = kvmppc_init_vm_radix(kvm);

	if (err)
		return err;

	/* Radix setup succeeded: release the HPT and flip the LPCR over */
	kvmppc_free_hpt(&kvm->arch.hpt);
	kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR, mode_mask);
	kvm->arch.radix = 1;

	return 0;
}
#ifdef CONFIG_KVM_XICS
/*
* Allocate a per-core structure for managing state about which cores are
......@@ -3777,10 +3834,11 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
}
/*
* For now, if the host uses radix, the guest must be radix.
* If the host uses radix, the guest starts out as radix.
*/
if (radix_enabled()) {
kvm->arch.radix = 1;
kvm->arch.mmu_ready = 1;
lpcr &= ~LPCR_VPM1;
lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR;
ret = kvmppc_init_vm_radix(kvm);
......@@ -3800,7 +3858,7 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
* Work out how many sets the TLB has, for the use of
* the TLB invalidation loop in book3s_hv_rmhandlers.S.
*/
if (kvm_is_radix(kvm))
if (radix_enabled())
kvm->arch.tlb_sets = POWER9_TLB_SETS_RADIX; /* 128 */
else if (cpu_has_feature(CPU_FTR_ARCH_300))
kvm->arch.tlb_sets = POWER9_TLB_SETS_HASH; /* 256 */
......@@ -3812,10 +3870,12 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
/*
* Track that we now have a HV mode VM active. This blocks secondary
* CPU threads from coming online.
* On POWER9, we only need to do this for HPT guests on a radix
* host, which is not yet supported.
* On POWER9, we only need to do this if the "indep_threads_mode"
* module parameter has been set to N.
*/
if (!cpu_has_feature(CPU_FTR_ARCH_300))
if (cpu_has_feature(CPU_FTR_ARCH_300))
kvm->arch.threads_indep = indep_threads_mode;
if (!kvm->arch.threads_indep)
kvm_hv_vm_activated();
/*
......@@ -3855,7 +3915,7 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
{
debugfs_remove_recursive(kvm->arch.debugfs_dir);
if (!cpu_has_feature(CPU_FTR_ARCH_300))
if (!kvm->arch.threads_indep)
kvm_hv_vm_deactivated();
kvmppc_free_vcores(kvm);
......@@ -4190,6 +4250,7 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
{
unsigned long lpcr;
int radix;
int err;
/* If not on a POWER9, reject it */
if (!cpu_has_feature(CPU_FTR_ARCH_300))
......@@ -4199,12 +4260,8 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
if (cfg->flags & ~(KVM_PPC_MMUV3_RADIX | KVM_PPC_MMUV3_GTSE))
return -EINVAL;
/* We can't change a guest to/from radix yet */
radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
if (radix != kvm_is_radix(kvm))
return -EINVAL;
/* GR (guest radix) bit in process_table field must match */
radix = !!(cfg->flags & KVM_PPC_MMUV3_RADIX);
if (!!(cfg->process_table & PATB_GR) != radix)
return -EINVAL;
......@@ -4212,15 +4269,40 @@ static int kvmhv_configure_mmu(struct kvm *kvm, struct kvm_ppc_mmuv3_cfg *cfg)
if ((cfg->process_table & PRTS_MASK) > 24)
return -EINVAL;
/* We can change a guest to/from radix now, if the host is radix */
if (radix && !radix_enabled())
return -EINVAL;
mutex_lock(&kvm->lock);
if (radix != kvm_is_radix(kvm)) {
if (kvm->arch.mmu_ready) {
kvm->arch.mmu_ready = 0;
/* order mmu_ready vs. vcpus_running */
smp_mb();
if (atomic_read(&kvm->arch.vcpus_running)) {
kvm->arch.mmu_ready = 1;
err = -EBUSY;
goto out_unlock;
}
}
if (radix)
err = kvmppc_switch_mmu_to_radix(kvm);
else
err = kvmppc_switch_mmu_to_hpt(kvm);
if (err)
goto out_unlock;
}
kvm->arch.process_table = cfg->process_table;
kvmppc_setup_partition_table(kvm);
lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0;
kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE);
mutex_unlock(&kvm->lock);
err = 0;
return 0;
out_unlock:
mutex_unlock(&kvm->lock);
return err;
}
static struct kvmppc_ops kvm_ops_hv = {
......@@ -4362,4 +4444,3 @@ module_exit(kvmppc_book3s_exit_hv);
MODULE_LICENSE("GPL");
MODULE_ALIAS_MISCDEV(KVM_MINOR);
MODULE_ALIAS("devname:kvm");
......@@ -278,7 +278,8 @@ void kvmhv_commence_exit(int trap)
struct kvmppc_vcore *vc = local_paca->kvm_hstate.kvm_vcore;
int ptid = local_paca->kvm_hstate.ptid;
struct kvm_split_mode *sip = local_paca->kvm_hstate.kvm_split_mode;
int me, ee, i;
int me, ee, i, t;
int cpu0;
/* Set our bit in the threads-exiting-guest map in the 0xff00
bits of vcore->entry_exit_map */
......@@ -320,6 +321,22 @@ void kvmhv_commence_exit(int trap)
if ((ee >> 8) == 0)
kvmhv_interrupt_vcore(vc, ee);
}
/*
* On POWER9 when running a HPT guest on a radix host (sip != NULL),
* we have to interrupt inactive CPU threads to get them to
* restore the host LPCR value.
*/
if (sip->lpcr_req) {
if (cmpxchg(&sip->do_restore, 0, 1) == 0) {
vc = local_paca->kvm_hstate.kvm_vcore;
cpu0 = vc->pcpu + ptid - local_paca->kvm_hstate.tid;
for (t = 1; t < threads_per_core; ++t) {
if (sip->napped[t])
kvmhv_rm_send_ipi(cpu0 + t);
}
}
}
}
struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
......@@ -529,6 +546,8 @@ static inline bool is_rm(void)
unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
{
if (!kvmppc_xics_enabled(vcpu))
return H_TOO_HARD;
if (xive_enabled()) {
if (is_rm())
return xive_rm_h_xirr(vcpu);
......@@ -541,6 +560,8 @@ unsigned long kvmppc_rm_h_xirr(struct kvm_vcpu *vcpu)
unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu)
{
if (!kvmppc_xics_enabled(vcpu))
return H_TOO_HARD;
vcpu->arch.gpr[5] = get_tb();
if (xive_enabled()) {
if (is_rm())
......@@ -554,6 +575,8 @@ unsigned long kvmppc_rm_h_xirr_x(struct kvm_vcpu *vcpu)
unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server)
{
if (!kvmppc_xics_enabled(vcpu))
return H_TOO_HARD;
if (xive_enabled()) {
if (is_rm())
return xive_rm_h_ipoll(vcpu, server);
......@@ -567,6 +590,8 @@ unsigned long kvmppc_rm_h_ipoll(struct kvm_vcpu *vcpu, unsigned long server)
int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
unsigned long mfrr)
{
if (!kvmppc_xics_enabled(vcpu))
return H_TOO_HARD;
if (xive_enabled()) {
if (is_rm())
return xive_rm_h_ipi(vcpu, server, mfrr);
......@@ -579,6 +604,8 @@ int kvmppc_rm_h_ipi(struct kvm_vcpu *vcpu, unsigned long server,
int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
{
if (!kvmppc_xics_enabled(vcpu))
return H_TOO_HARD;
if (xive_enabled()) {
if (is_rm())
return xive_rm_h_cppr(vcpu, cppr);
......@@ -591,6 +618,8 @@ int kvmppc_rm_h_cppr(struct kvm_vcpu *vcpu, unsigned long cppr)
int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
{
if (!kvmppc_xics_enabled(vcpu))
return H_TOO_HARD;
if (xive_enabled()) {
if (is_rm())
return xive_rm_h_eoi(vcpu, xirr);
......@@ -601,3 +630,89 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr)
return xics_rm_h_eoi(vcpu, xirr);
}
#endif /* CONFIG_KVM_XICS */
/*
 * C-level handler for an unexpected interrupt taken in the KVM
 * entry/exit code.  It is reached from the kvmppc_bad_host_intr
 * assembly path, which builds a register frame on the emergency stack
 * and passes its address as @regs.
 *
 * Reports the saved state through die() and then panics.  It is not
 * expected to return; the assembly caller simply loops back to the
 * call if it ever does.
 */
void kvmppc_bad_interrupt(struct pt_regs *regs)
{
die("Bad interrupt in KVM entry/exit code", regs, SIGABRT);
panic("Bad KVM trap");
}
/*
* Functions used to switch LPCR HR and UPRT bits on all threads
* when entering and exiting HPT guests on a radix host.
*/
/*
 * Phase bits: each thread ORs the bit for a phase into its own
 * lpcr_sync.phase[] entry once it has reached that phase.
 */
#define PHASE_REALMODE 1 /* in real mode */
#define PHASE_SET_LPCR 2 /* have set LPCR */
#define PHASE_OUT_OF_GUEST 4 /* have finished executing in guest */
#define PHASE_RESET_LPCR 8 /* have reset LPCR to host value */
/*
 * Replicate a phase bit into all four bytes of a 32-bit word, so one
 * masked compare of lpcr_sync.allphases can test four phase[] entries.
 * NOTE(review): this presumes allphases overlays phase[0..3] (a union
 * in struct kvm_split_mode) - confirm against the structure definition.
 */
#define ALL(p) (((p) << 24) | ((p) << 16) | ((p) << 8) | (p))
/*
 * Mark the calling thread as having reached @phase, then spin until
 * the @phase bit is set in every byte of sip->lpcr_sync.allphases.
 *
 * @sip:   split-mode info shared by the threads being synchronized
 * @phase: one of the PHASE_* bits above
 */
static void wait_for_sync(struct kvm_split_mode *sip, int phase)
{
/* index of our own entry in the phase array */
int thr = local_paca->kvm_hstate.tid;
/* publish our own arrival before looking at anybody else */
sip->lpcr_sync.phase[thr] |= phase;
/* from here on 'phase' is the all-threads mask, not the single bit */
phase = ALL(phase);
while ((sip->lpcr_sync.allphases & phase) != phase) {
/* spin with HMT_low() in effect; HMT_medium() is restored below */
HMT_low();
/* compiler barrier: force allphases to be re-read each iteration */
barrier();
}
HMT_medium();
}
/*
 * Load the LPCR and LPID values requested in @sip (lpcr_req and
 * lpidr_req) into this thread's registers, in step with the other
 * threads taking part in the wait_for_sync() rendezvous.
 *
 * The thread whose tid is 0 additionally clears sip->do_set and runs
 * the tlbiel loop over POWER9_TLB_SETS_RADIX sets.
 *
 * Called from the assembly entry paths when KVM_SPLIT_DO_SET is
 * non-zero (kvmppc_call_hv_entry, kvm_secondary_got_guest and
 * kvmhv_do_set).
 */
void kvmhv_p9_set_lpcr(struct kvm_split_mode *sip)
{
unsigned long rb, set;
/* wait for every other thread to get to real mode */
wait_for_sync(sip, PHASE_REALMODE);
/* Set LPCR and LPIDR */
mtspr(SPRN_LPCR, sip->lpcr_req);
mtspr(SPRN_LPID, sip->lpidr_req);
isync();
/* Invalidate the TLB on thread 0 */
if (local_paca->kvm_hstate.tid == 0) {
/* request has been taken; pollers of do_set must not redo it */
sip->do_set = 0;
asm volatile("ptesync" : : : "memory");
/* one tlbiel per TLB set, with the set index encoded in rb */
for (set = 0; set < POWER9_TLB_SETS_RADIX; ++set) {
rb = TLBIEL_INVAL_SET_LPID +
(set << TLBIEL_INVAL_SET_SHIFT);
asm volatile(PPC_TLBIEL(%0, %1, 0, 0, 0) : :
"r" (rb), "r" (0));
}
asm volatile("ptesync" : : : "memory");
}
/* indicate that we have done so and wait for others */
wait_for_sync(sip, PHASE_SET_LPCR);
/* order read of sip->lpcr_sync.allphases vs. sip->do_set */
smp_rmb();
}
/*
 * Called when a thread that has been in the guest needs
 * to reload the host LPCR value - but only on POWER9 when
 * running a HPT guest on a radix host.
 *
 * Also reached via kvmhv_do_restore by threads that find
 * KVM_SPLIT_DO_RESTORE set while polling in kvm_no_guest.
 *
 * Sets LPID to 0 and LPCR to sip->host_lpcr, using wait_for_sync()
 * before and after the register writes, then clears this thread's
 * kvm_split_mode pointer.  kvmppc_run_core() spins on that pointer
 * for each secondary thread ("Wait for all threads to have seen
 * final sync").
 */
void kvmhv_p9_restore_lpcr(struct kvm_split_mode *sip)
{
/* we're out of the guest... */
wait_for_sync(sip, PHASE_OUT_OF_GUEST);
/* switch this thread back to the host partition and host LPCR */
mtspr(SPRN_LPID, 0);
mtspr(SPRN_LPCR, sip->host_lpcr);
isync();
/* only the thread with tid 0 clears the shared do_restore flag */
if (local_paca->kvm_hstate.tid == 0) {
sip->do_restore = 0;
smp_wmb(); /* order store of do_restore vs. phase */
}
wait_for_sync(sip, PHASE_RESET_LPCR);
/* full barrier before the pointer is cleared; sip is not used after this */
smp_mb();
local_paca->kvm_hstate.kvm_split_mode = NULL;
}
......@@ -107,30 +107,50 @@ void kvmppc_add_revmap_chain(struct kvm *kvm, struct revmap_entry *rev,
}
EXPORT_SYMBOL_GPL(kvmppc_add_revmap_chain);
/* Update the changed page order field of an rmap entry */
void kvmppc_update_rmap_change(unsigned long *rmap, unsigned long psize)
/* Update the dirty bitmap of a memslot */
void kvmppc_update_dirty_map(struct kvm_memory_slot *memslot,
unsigned long gfn, unsigned long psize)
{
unsigned long order;
unsigned long npages;
if (!psize)
if (!psize || !memslot->dirty_bitmap)
return;
order = ilog2(psize);
order <<= KVMPPC_RMAP_CHG_SHIFT;
if (order > (*rmap & KVMPPC_RMAP_CHG_ORDER))
*rmap = (*rmap & ~KVMPPC_RMAP_CHG_ORDER) | order;
npages = (psize + PAGE_SIZE - 1) / PAGE_SIZE;
gfn -= memslot->base_gfn;
set_dirty_bits_atomic(memslot->dirty_bitmap, gfn, npages);
}
EXPORT_SYMBOL_GPL(kvmppc_update_dirty_map);
/*
 * Mark the guest page(s) mapped by an HPTE as dirty.
 *
 * @hpte_v and @hpte_gr are the two HPTE doublewords used to derive the
 * actual page size and the guest frame number.  Nothing is done when
 * the frame lies in no memslot, or in a memslot that has no dirty
 * bitmap.
 */
static void kvmppc_set_dirty_from_hpte(struct kvm *kvm,
				unsigned long hpte_v, unsigned long hpte_gr)
{
	unsigned long pgsz = kvmppc_actual_pgsz(hpte_v, hpte_gr);
	unsigned long gfn = hpte_rpn(hpte_gr, pgsz);
	struct kvm_memory_slot *slot;

	slot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
	if (!slot || !slot->dirty_bitmap)
		return;

	kvmppc_update_dirty_map(slot, gfn, pgsz);
}
EXPORT_SYMBOL_GPL(kvmppc_update_rmap_change);
/* Returns a pointer to the revmap entry for the page mapped by a HPTE */
static unsigned long *revmap_for_hpte(struct kvm *kvm, unsigned long hpte_v,
unsigned long hpte_gr)
unsigned long hpte_gr,
struct kvm_memory_slot **memslotp,
unsigned long *gfnp)
{
struct kvm_memory_slot *memslot;
unsigned long *rmap;
unsigned long gfn;
gfn = hpte_rpn(hpte_gr, hpte_page_size(hpte_v, hpte_gr));
gfn = hpte_rpn(hpte_gr, kvmppc_actual_pgsz(hpte_v, hpte_gr));
memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn);
if (memslotp)
*memslotp = memslot;
if (gfnp)
*gfnp = gfn;
if (!memslot)
return NULL;
......@@ -147,10 +167,12 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
unsigned long ptel, head;
unsigned long *rmap;
unsigned long rcbits;
struct kvm_memory_slot *memslot;
unsigned long gfn;
rcbits = hpte_r & (HPTE_R_R | HPTE_R_C);
ptel = rev->guest_rpte |= rcbits;
rmap = revmap_for_hpte(kvm, hpte_v, ptel);
rmap = revmap_for_hpte(kvm, hpte_v, ptel, &memslot, &gfn);
if (!rmap)
return;
lock_rmap(rmap);
......@@ -169,7 +191,8 @@ static void remove_revmap_chain(struct kvm *kvm, long pte_index,
}
*rmap |= rcbits << KVMPPC_RMAP_RC_SHIFT;
if (rcbits & HPTE_R_C)
kvmppc_update_rmap_change(rmap, hpte_page_size(hpte_v, hpte_r));
kvmppc_update_dirty_map(memslot, gfn,
kvmppc_actual_pgsz(hpte_v, hpte_r));
unlock_rmap(rmap);
}
......@@ -193,7 +216,7 @@ long kvmppc_do_h_enter(struct kvm *kvm, unsigned long flags,
if (kvm_is_radix(kvm))
return H_FUNCTION;
psize = hpte_page_size(pteh, ptel);
psize = kvmppc_actual_pgsz(pteh, ptel);
if (!psize)
return H_PARAMETER;
writing = hpte_is_writable(ptel);
......@@ -797,7 +820,7 @@ long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
gr |= r & (HPTE_R_R | HPTE_R_C);
if (r & HPTE_R_R) {
kvmppc_clear_ref_hpte(kvm, hpte, pte_index);
rmap = revmap_for_hpte(kvm, v, gr);
rmap = revmap_for_hpte(kvm, v, gr, NULL, NULL);
if (rmap) {
lock_rmap(rmap);
*rmap |= KVMPPC_RMAP_REFERENCED;
......@@ -819,7 +842,6 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
__be64 *hpte;
unsigned long v, r, gr;
struct revmap_entry *rev;
unsigned long *rmap;
long ret = H_NOT_FOUND;
if (kvm_is_radix(kvm))
......@@ -848,16 +870,9 @@ long kvmppc_h_clear_mod(struct kvm_vcpu *vcpu, unsigned long flags,
r = be64_to_cpu(hpte[1]);
gr |= r & (HPTE_R_R | HPTE_R_C);
if (r & HPTE_R_C) {
unsigned long psize = hpte_page_size(v, r);
hpte[1] = cpu_to_be64(r & ~HPTE_R_C);
eieio();
rmap = revmap_for_hpte(kvm, v, gr);
if (rmap) {
lock_rmap(rmap);
*rmap |= KVMPPC_RMAP_CHANGED;
kvmppc_update_rmap_change(rmap, psize);
unlock_rmap(rmap);
}
kvmppc_set_dirty_from_hpte(kvm, v, gr);
}
}
vcpu->arch.gpr[4] = gr;
......@@ -1014,7 +1029,7 @@ long kvmppc_hv_find_lock_hpte(struct kvm *kvm, gva_t eaddr, unsigned long slb_v,
* Check the HPTE again, including base page size
*/
if ((v & valid) && (v & mask) == val &&
hpte_base_page_size(v, r) == (1ul << pshift))
kvmppc_hpte_base_page_shift(v, r) == pshift)
/* Return with the HPTE still locked */
return (hash << 3) + (i >> 1);
......
......@@ -31,6 +31,7 @@
#include <asm/tm.h>
#include <asm/opal.h>
#include <asm/xive-regs.h>
#include <asm/thread_info.h>
/* Sign-extend HDEC if not on POWER9 */
#define EXTEND_HDEC(reg) \
......@@ -81,6 +82,19 @@ _GLOBAL_TOC(kvmppc_hv_entry_trampoline)
RFI
kvmppc_call_hv_entry:
BEGIN_FTR_SECTION
/* On P9, do LPCR setting, if necessary */
ld r3, HSTATE_SPLIT_MODE(r13)
cmpdi r3, 0
beq 46f
lwz r4, KVM_SPLIT_DO_SET(r3)
cmpwi r4, 0
beq 46f
bl kvmhv_p9_set_lpcr
nop
46:
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
ld r4, HSTATE_KVM_VCPU(r13)
bl kvmppc_hv_entry
......@@ -149,11 +163,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
subf r4, r4, r3
mtspr SPRN_DEC, r4
BEGIN_FTR_SECTION
/* hwthread_req may have got set by cede or no vcpu, so clear it */
li r0, 0
stb r0, HSTATE_HWTHREAD_REQ(r13)
END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
/*
* For external interrupts we need to call the Linux
......@@ -316,7 +328,6 @@ kvm_novcpu_exit:
* Relocation is off and most register values are lost.
* r13 points to the PACA.
* r3 contains the SRR1 wakeup value, SRR1 is trashed.
* This is not used by ISAv3.0B processors.
*/
.globl kvm_start_guest
kvm_start_guest:
......@@ -390,6 +401,7 @@ kvm_secondary_got_guest:
ld r6, HSTATE_SPLIT_MODE(r13)
cmpdi r6, 0
beq 63f
BEGIN_FTR_SECTION
ld r0, KVM_SPLIT_RPR(r6)
mtspr SPRN_RPR, r0
ld r0, KVM_SPLIT_PMMAR(r6)
......@@ -397,6 +409,15 @@ kvm_secondary_got_guest:
ld r0, KVM_SPLIT_LDBAR(r6)
mtspr SPRN_LDBAR, r0
isync
FTR_SECTION_ELSE
/* On P9 we use the split_info for coordinating LPCR changes */
lwz r4, KVM_SPLIT_DO_SET(r6)
cmpwi r4, 0
beq 63f
mr r3, r6
bl kvmhv_p9_set_lpcr
nop
ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300)
63:
/* Order load of vcpu after load of vcore */
lwsync
......@@ -435,9 +456,6 @@ kvm_secondary_got_guest:
* While waiting we also need to check if we get given a vcpu to run.
*/
kvm_no_guest:
BEGIN_FTR_SECTION
twi 31,0,0
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
lbz r3, HSTATE_HWTHREAD_REQ(r13)
cmpwi r3, 0
bne 53f
......@@ -470,6 +488,12 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
ld r3, HSTATE_SPLIT_MODE(r13)
cmpdi r3, 0
beq kvm_no_guest
lwz r0, KVM_SPLIT_DO_SET(r3)
cmpwi r0, 0
bne kvmhv_do_set
lwz r0, KVM_SPLIT_DO_RESTORE(r3)
cmpwi r0, 0
bne kvmhv_do_restore
lbz r0, KVM_SPLIT_DO_NAP(r3)
cmpwi r0, 0
beq kvm_no_guest
......@@ -482,6 +506,19 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
stb r0, HSTATE_HWTHREAD_STATE(r13)
b kvm_no_guest
kvmhv_do_set:
/* Set LPCR, LPIDR etc. on P9 */
HMT_MEDIUM
bl kvmhv_p9_set_lpcr
nop
b kvm_no_guest
kvmhv_do_restore:
HMT_MEDIUM
bl kvmhv_p9_restore_lpcr
nop
b kvm_no_guest
/*
* Here the primary thread is trying to return the core to
* whole-core mode, so we need to nap.
......@@ -519,8 +556,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
/* Set kvm_split_mode.napped[tid] = 1 */
ld r3, HSTATE_SPLIT_MODE(r13)
li r0, 1
lhz r4, PACAPACAINDEX(r13)
clrldi r4, r4, 61 /* micro-threading => P8 => 8 threads/core */
lbz r4, HSTATE_TID(r13)
addi r4, r4, KVM_SPLIT_NAPPED
stbx r0, r3, r4
/* Check the do_nap flag again after setting napped[] */
......@@ -1914,10 +1950,26 @@ END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S)
19: lis r8,0x7fff /* MAX_INT@h */
mtspr SPRN_HDEC,r8
16: ld r8,KVM_HOST_LPCR(r4)
16:
BEGIN_FTR_SECTION
/* On POWER9 with HPT-on-radix we need to wait for all other threads */
ld r3, HSTATE_SPLIT_MODE(r13)
cmpdi r3, 0
beq 47f
lwz r8, KVM_SPLIT_DO_RESTORE(r3)
cmpwi r8, 0
beq 47f
stw r12, STACK_SLOT_TRAP(r1)
bl kvmhv_p9_restore_lpcr
nop
lwz r12, STACK_SLOT_TRAP(r1)
b 48f
47:
END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
ld r8,KVM_HOST_LPCR(r4)
mtspr SPRN_LPCR,r8
isync
48:
/* load host SLB entries */
BEGIN_MMU_FTR_SECTION
b 0f
......@@ -2543,10 +2595,8 @@ kvm_do_nap:
clrrdi r0, r0, 1
mtspr SPRN_CTRLT, r0
BEGIN_FTR_SECTION
li r0,1
stb r0,HSTATE_HWTHREAD_REQ(r13)
END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
mfspr r5,SPRN_LPCR
ori r5,r5,LPCR_PECE0 | LPCR_PECE1
BEGIN_FTR_SECTION
......@@ -3134,10 +3184,139 @@ kvmppc_restore_tm:
/*
* We come here if we get any exception or interrupt while we are
* executing host real mode code while in guest MMU context.
* For now just spin, but we should do something better.
* r12 is (CR << 32) | vector
* r13 points to our PACA
* r12 is saved in HSTATE_SCRATCH0(r13)
* ctr is saved in HSTATE_SCRATCH1(r13) if RELOCATABLE
* r9 is saved in HSTATE_SCRATCH2(r13)
* r13 is saved in HSPRG1
* cfar is saved in HSTATE_CFAR(r13)
* ppr is saved in HSTATE_PPR(r13)
*/
kvmppc_bad_host_intr:
/*
* Switch to the emergency stack, but start half-way down in
* case we were already on it.
*/
mr r9, r1
std r1, PACAR1(r13)
ld r1, PACAEMERGSP(r13)
subi r1, r1, THREAD_SIZE/2 + INT_FRAME_SIZE
std r9, 0(r1)
std r0, GPR0(r1)
std r9, GPR1(r1)
std r2, GPR2(r1)
SAVE_4GPRS(3, r1)
SAVE_2GPRS(7, r1)
srdi r0, r12, 32
clrldi r12, r12, 32
std r0, _CCR(r1)
std r12, _TRAP(r1)
andi. r0, r12, 2
beq 1f
mfspr r3, SPRN_HSRR0
mfspr r4, SPRN_HSRR1
mfspr r5, SPRN_HDAR
mfspr r6, SPRN_HDSISR
b 2f
1: mfspr r3, SPRN_SRR0
mfspr r4, SPRN_SRR1
mfspr r5, SPRN_DAR
mfspr r6, SPRN_DSISR
2: std r3, _NIP(r1)
std r4, _MSR(r1)
std r5, _DAR(r1)
std r6, _DSISR(r1)
ld r9, HSTATE_SCRATCH2(r13)
ld r12, HSTATE_SCRATCH0(r13)
GET_SCRATCH0(r0)
SAVE_4GPRS(9, r1)
std r0, GPR13(r1)
SAVE_NVGPRS(r1)
ld r5, HSTATE_CFAR(r13)
std r5, ORIG_GPR3(r1)
mflr r3
#ifdef CONFIG_RELOCATABLE
ld r4, HSTATE_SCRATCH1(r13)
#else
mfctr r4
#endif
mfxer r5
lbz r6, PACASOFTIRQEN(r13)
std r3, _LINK(r1)
std r4, _CTR(r1)
std r5, _XER(r1)
std r6, SOFTE(r1)
ld r2, PACATOC(r13)
LOAD_REG_IMMEDIATE(3, 0x7265677368657265)
std r3, STACK_FRAME_OVERHEAD-16(r1)
/*
* On POWER9 do a minimal restore of the MMU and call C code,
* which will print a message and panic.
* XXX On POWER7 and POWER8, we just spin here since we don't
* know what the other threads are doing (and we don't want to
* coordinate with them) - but at least we now have register state
* in memory that we might be able to look at from another CPU.
*/
BEGIN_FTR_SECTION
b .
END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300)
ld r9, HSTATE_KVM_VCPU(r13)
ld r10, VCPU_KVM(r9)
li r0, 0
mtspr SPRN_AMR, r0
mtspr SPRN_IAMR, r0
mtspr SPRN_CIABR, r0
mtspr SPRN_DAWRX, r0
/* Flush the ERAT on radix P9 DD1 guest exit */
BEGIN_FTR_SECTION
PPC_INVALIDATE_ERAT
END_FTR_SECTION_IFSET(CPU_FTR_POWER9_DD1)
BEGIN_MMU_FTR_SECTION
b 4f
END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX)
slbmte r0, r0
slbia
ptesync
ld r8, PACA_SLBSHADOWPTR(r13)
.rept SLB_NUM_BOLTED
li r3, SLBSHADOW_SAVEAREA
LDX_BE r5, r8, r3
addi r3, r3, 8
LDX_BE r6, r8, r3
andis. r7, r5, SLB_ESID_V@h
beq 3f
slbmte r6, r5
3: addi r8, r8, 16
.endr
4: lwz r7, KVM_HOST_LPID(r10)
mtspr SPRN_LPID, r7
mtspr SPRN_PID, r0
ld r8, KVM_HOST_LPCR(r10)
mtspr SPRN_LPCR, r8
isync
li r0, KVM_GUEST_MODE_NONE
stb r0, HSTATE_IN_GUEST(r13)
/*
* Turn on the MMU and jump to C code
*/
bcl 20, 31, .+4
5: mflr r3
addi r3, r3, 9f - 5b
ld r4, PACAKMSR(r13)
mtspr SPRN_SRR0, r3
mtspr SPRN_SRR1, r4
rfid
9: addi r3, r1, STACK_FRAME_OVERHEAD
bl kvmppc_bad_interrupt
b 9b
/*
* This mimics the MSR transition on IRQ delivery. The new guest MSR is taken
......
......@@ -1326,12 +1326,22 @@ static int kvm_arch_vcpu_ioctl_set_sregs_pr(struct kvm_vcpu *vcpu,
kvmppc_set_pvr_pr(vcpu, sregs->pvr);
vcpu3s->sdr1 = sregs->u.s.sdr1;
#ifdef CONFIG_PPC_BOOK3S_64
if (vcpu->arch.hflags & BOOK3S_HFLAG_SLB) {
/* Flush all SLB entries */
vcpu->arch.mmu.slbmte(vcpu, 0, 0);
vcpu->arch.mmu.slbia(vcpu);
for (i = 0; i < 64; i++) {
vcpu->arch.mmu.slbmte(vcpu, sregs->u.s.ppc64.slb[i].slbv,
sregs->u.s.ppc64.slb[i].slbe);
u64 rb = sregs->u.s.ppc64.slb[i].slbe;
u64 rs = sregs->u.s.ppc64.slb[i].slbv;
if (rb & SLB_ESID_V)
vcpu->arch.mmu.slbmte(vcpu, rs, rb);
}
} else {
} else
#endif
{
for (i = 0; i < 16; i++) {
vcpu->arch.mmu.mtsrin(vcpu, i, sregs->u.s.ppc32.sr[i]);
}
......
......@@ -419,6 +419,8 @@ int kvmppc_hcall_impl_pr(unsigned long cmd)
case H_PROTECT:
case H_BULK_REMOVE:
case H_PUT_TCE:
case H_PUT_TCE_INDIRECT:
case H_STUFF_TCE:
case H_CEDE:
case H_LOGICAL_CI_LOAD:
case H_LOGICAL_CI_STORE:
......
......@@ -377,7 +377,7 @@ static inline int kvmppc_e500_shadow_map(struct kvmppc_vcpu_e500 *vcpu_e500,
start = vma->vm_pgoff;
end = start +
((vma->vm_end - vma->vm_start) >> PAGE_SHIFT);
vma_pages(vma);
pfn = start + ((hva - vma->vm_start) >> PAGE_SHIFT);
......
......@@ -590,8 +590,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
r = !!(hv_enabled && radix_enabled());
break;
case KVM_CAP_PPC_MMU_HASH_V3:
r = !!(hv_enabled && !radix_enabled() &&
cpu_has_feature(CPU_FTR_ARCH_300));
r = !!(hv_enabled && cpu_has_feature(CPU_FTR_ARCH_300));
break;
#endif
case KVM_CAP_SYNC_MMU:
......@@ -644,8 +643,8 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
break;
#endif
case KVM_CAP_PPC_HTM:
r = cpu_has_feature(CPU_FTR_TM_COMP) &&
is_kvmppc_hv_enabled(kvm);
r = is_kvmppc_hv_enabled(kvm) &&
(cur_cpu_spec->cpu_user_features2 & PPC_FEATURE2_HTM_COMP);
break;
default:
r = 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment