Commit 2f673816 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm

Pull kvm fixes from Paolo Bonzini:
 "Bugfixes, including a TLB flush fix that affects processors without
  nested page tables"

* tag 'for-linus' of git://git.kernel.org/pub/scm/virt/kvm/kvm:
  kvm: fix previous commit for 32-bit builds
  kvm: avoid speculation-based attacks from out-of-range memslot accesses
  KVM: x86: Unload MMU on guest TLB flush if TDP disabled to force MMU sync
  KVM: x86: Ensure liveliness of nested VM-Enter fail tracepoint message
  selftests: kvm: Add support for customized slot0 memory size
  KVM: selftests: introduce P47V64 for s390x
  KVM: x86: Ensure PV TLB flush tracepoint reflects KVM behavior
  KVM: X86: MMU: Use the correct inherited permissions to get shadow page
  KVM: LAPIC: Write 0 to TMICT should also cancel vmx-preemption timer
  KVM: SVM: Fix SEV SEND_START session length & SEND_UPDATE_DATA query length after commit 238eca82
parents 368094df 4422829e
......@@ -171,8 +171,8 @@ Shadow pages contain the following information:
shadow pages) so role.quadrant takes values in the range 0..3. Each
quadrant maps 1GB virtual address space.
role.access:
Inherited guest access permissions in the form uwx. Note execute
permission is positive, not negative.
Inherited guest access permissions from the parent ptes in the form uwx.
Note execute permission is positive, not negative.
role.invalid:
The page is invalid and should not be used. It is a root page that is
currently pinned (by a cpu hardware register pointing to it); once it is
......
......@@ -1494,6 +1494,15 @@ static void limit_periodic_timer_frequency(struct kvm_lapic *apic)
static void cancel_hv_timer(struct kvm_lapic *apic);
static void cancel_apic_timer(struct kvm_lapic *apic)
{
hrtimer_cancel(&apic->lapic_timer.timer);
preempt_disable();
if (apic->lapic_timer.hv_timer_in_use)
cancel_hv_timer(apic);
preempt_enable();
}
static void apic_update_lvtt(struct kvm_lapic *apic)
{
u32 timer_mode = kvm_lapic_get_reg(apic, APIC_LVTT) &
......@@ -1502,11 +1511,7 @@ static void apic_update_lvtt(struct kvm_lapic *apic)
if (apic->lapic_timer.timer_mode != timer_mode) {
if (apic_lvtt_tscdeadline(apic) != (timer_mode ==
APIC_LVT_TIMER_TSCDEADLINE)) {
hrtimer_cancel(&apic->lapic_timer.timer);
preempt_disable();
if (apic->lapic_timer.hv_timer_in_use)
cancel_hv_timer(apic);
preempt_enable();
cancel_apic_timer(apic);
kvm_lapic_set_reg(apic, APIC_TMICT, 0);
apic->lapic_timer.period = 0;
apic->lapic_timer.tscdeadline = 0;
......@@ -2092,7 +2097,7 @@ int kvm_lapic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val)
if (apic_lvtt_tscdeadline(apic))
break;
hrtimer_cancel(&apic->lapic_timer.timer);
cancel_apic_timer(apic);
kvm_lapic_set_reg(apic, APIC_TMICT, val);
start_apic_timer(apic);
break;
......
......@@ -90,8 +90,8 @@ struct guest_walker {
gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
pt_element_t __user *ptep_user[PT_MAX_FULL_LEVELS];
bool pte_writable[PT_MAX_FULL_LEVELS];
unsigned pt_access;
unsigned pte_access;
unsigned int pt_access[PT_MAX_FULL_LEVELS];
unsigned int pte_access;
gfn_t gfn;
struct x86_exception fault;
};
......@@ -418,13 +418,15 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
}
walker->ptes[walker->level - 1] = pte;
/* Convert to ACC_*_MASK flags for struct guest_walker. */
walker->pt_access[walker->level - 1] = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
} while (!is_last_gpte(mmu, walker->level, pte));
pte_pkey = FNAME(gpte_pkeys)(vcpu, pte);
accessed_dirty = have_ad ? pte_access & PT_GUEST_ACCESSED_MASK : 0;
/* Convert to ACC_*_MASK flags for struct guest_walker. */
walker->pt_access = FNAME(gpte_access)(pt_access ^ walk_nx_mask);
walker->pte_access = FNAME(gpte_access)(pte_access ^ walk_nx_mask);
errcode = permission_fault(vcpu, mmu, walker->pte_access, pte_pkey, access);
if (unlikely(errcode))
......@@ -463,7 +465,8 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
}
pgprintk("%s: pte %llx pte_access %x pt_access %x\n",
__func__, (u64)pte, walker->pte_access, walker->pt_access);
__func__, (u64)pte, walker->pte_access,
walker->pt_access[walker->level - 1]);
return 1;
error:
......@@ -643,7 +646,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
bool huge_page_disallowed = exec && nx_huge_page_workaround_enabled;
struct kvm_mmu_page *sp = NULL;
struct kvm_shadow_walk_iterator it;
unsigned direct_access, access = gw->pt_access;
unsigned int direct_access, access;
int top_level, level, req_level, ret;
gfn_t base_gfn = gw->gfn;
......@@ -675,6 +678,7 @@ static int FNAME(fetch)(struct kvm_vcpu *vcpu, gpa_t addr,
sp = NULL;
if (!is_shadow_present_pte(*it.sptep)) {
table_gfn = gw->table_gfn[it.level - 2];
access = gw->pt_access[it.level - 2];
sp = kvm_mmu_get_page(vcpu, table_gfn, addr, it.level-1,
false, access);
}
......
......@@ -1103,10 +1103,9 @@ __sev_send_start_query_session_length(struct kvm *kvm, struct kvm_sev_cmd *argp,
struct sev_data_send_start data;
int ret;
memset(&data, 0, sizeof(data));
data.handle = sev->handle;
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_START, &data, &argp->error);
if (ret < 0)
return ret;
params->session_len = data.session_len;
if (copy_to_user((void __user *)(uintptr_t)argp->data, params,
......@@ -1215,10 +1214,9 @@ __sev_send_update_data_query_lengths(struct kvm *kvm, struct kvm_sev_cmd *argp,
struct sev_data_send_update_data data;
int ret;
memset(&data, 0, sizeof(data));
data.handle = sev->handle;
ret = sev_issue_cmd(kvm, SEV_CMD_SEND_UPDATE_DATA, &data, &argp->error);
if (ret < 0)
return ret;
params->hdr_len = data.hdr_len;
params->trans_len = data.trans_len;
......
......@@ -1550,16 +1550,16 @@ TRACE_EVENT(kvm_nested_vmenter_failed,
TP_ARGS(msg, err),
TP_STRUCT__entry(
__field(const char *, msg)
__string(msg, msg)
__field(u32, err)
),
TP_fast_assign(
__entry->msg = msg;
__assign_str(msg, msg);
__entry->err = err;
),
TP_printk("%s%s", __entry->msg, !__entry->err ? "" :
TP_printk("%s%s", __get_str(msg), !__entry->err ? "" :
__print_symbolic(__entry->err, VMX_VMENTER_INSTRUCTION_ERRORS))
);
......
......@@ -3072,6 +3072,19 @@ static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu)
static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
if (!tdp_enabled) {
/*
* A TLB flush on behalf of the guest is equivalent to
* INVPCID(all), toggling CR4.PGE, etc., which requires
* a forced sync of the shadow page tables. Unload the
* entire MMU here and the subsequent load will sync the
* shadow page tables, and also flush the TLB.
*/
kvm_mmu_unload(vcpu);
return;
}
static_call(kvm_x86_tlb_flush_guest)(vcpu);
}
......@@ -3101,9 +3114,11 @@ static void record_steal_time(struct kvm_vcpu *vcpu)
* expensive IPIs.
*/
if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) {
u8 st_preempted = xchg(&st->preempted, 0);
trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
st->preempted & KVM_VCPU_FLUSH_TLB);
if (xchg(&st->preempted, 0) & KVM_VCPU_FLUSH_TLB)
st_preempted & KVM_VCPU_FLUSH_TLB);
if (st_preempted & KVM_VCPU_FLUSH_TLB)
kvm_vcpu_flush_tlb_guest(vcpu);
} else {
st->preempted = 0;
......
......@@ -1185,7 +1185,15 @@ __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn)
static inline unsigned long
__gfn_to_hva_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
{
return slot->userspace_addr + (gfn - slot->base_gfn) * PAGE_SIZE;
/*
* The index was checked originally in search_memslots. To avoid
* that a malicious guest builds a Spectre gadget out of e.g. page
* table walks, do not let the processor speculate loads outside
* the guest's registered memslots.
*/
unsigned long offset = gfn - slot->base_gfn;
offset = array_index_nospec(offset, slot->npages);
return slot->userspace_addr + offset * PAGE_SIZE;
}
static inline int memslot_id(struct kvm *kvm, gfn_t gfn)
......
......@@ -43,6 +43,7 @@ enum vm_guest_mode {
VM_MODE_P40V48_4K,
VM_MODE_P40V48_64K,
VM_MODE_PXXV48_4K, /* For 48bits VA but ANY bits PA */
VM_MODE_P47V64_4K,
NUM_VM_MODES,
};
......@@ -60,7 +61,7 @@ enum vm_guest_mode {
#elif defined(__s390x__)
#define VM_MODE_DEFAULT VM_MODE_P52V48_4K
#define VM_MODE_DEFAULT VM_MODE_P47V64_4K
#define MIN_PAGE_SHIFT 12U
#define ptes_per_page(page_size) ((page_size) / 16)
......@@ -285,10 +286,11 @@ struct kvm_vm *vm_create_default_with_vcpus(uint32_t nr_vcpus, uint64_t extra_me
uint32_t num_percpu_pages, void *guest_code,
uint32_t vcpuids[]);
/* Like vm_create_default_with_vcpus, but accepts mode as a parameter */
/* Like vm_create_default_with_vcpus, but accepts mode and slot0 memory as a parameter */
struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
uint64_t extra_mem_pages, uint32_t num_percpu_pages,
void *guest_code, uint32_t vcpuids[]);
uint64_t slot0_mem_pages, uint64_t extra_mem_pages,
uint32_t num_percpu_pages, void *guest_code,
uint32_t vcpuids[]);
/*
* Adds a vCPU with reasonable defaults (e.g. a stack)
......
......@@ -268,7 +268,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg)
/* Create a VM with enough guest pages */
guest_num_pages = test_mem_size / guest_page_size;
vm = vm_create_with_vcpus(mode, nr_vcpus,
vm = vm_create_with_vcpus(mode, nr_vcpus, DEFAULT_GUEST_PHY_PAGES,
guest_num_pages, 0, guest_code, NULL);
/* Align down GPA of the testing memslot */
......
......@@ -175,6 +175,7 @@ const char *vm_guest_mode_string(uint32_t i)
[VM_MODE_P40V48_4K] = "PA-bits:40, VA-bits:48, 4K pages",
[VM_MODE_P40V48_64K] = "PA-bits:40, VA-bits:48, 64K pages",
[VM_MODE_PXXV48_4K] = "PA-bits:ANY, VA-bits:48, 4K pages",
[VM_MODE_P47V64_4K] = "PA-bits:47, VA-bits:64, 4K pages",
};
_Static_assert(sizeof(strings)/sizeof(char *) == NUM_VM_MODES,
"Missing new mode strings?");
......@@ -192,6 +193,7 @@ const struct vm_guest_mode_params vm_guest_mode_params[] = {
{ 40, 48, 0x1000, 12 },
{ 40, 48, 0x10000, 16 },
{ 0, 0, 0x1000, 12 },
{ 47, 64, 0x1000, 12 },
};
_Static_assert(sizeof(vm_guest_mode_params)/sizeof(struct vm_guest_mode_params) == NUM_VM_MODES,
"Missing new mode params?");
......@@ -277,6 +279,9 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
TEST_FAIL("VM_MODE_PXXV48_4K not supported on non-x86 platforms");
#endif
break;
case VM_MODE_P47V64_4K:
vm->pgtable_levels = 5;
break;
default:
TEST_FAIL("Unknown guest mode, mode: 0x%x", mode);
}
......@@ -308,21 +313,50 @@ struct kvm_vm *vm_create(enum vm_guest_mode mode, uint64_t phy_pages, int perm)
return vm;
}
/*
* VM Create with customized parameters
*
* Input Args:
* mode - VM Mode (e.g. VM_MODE_P52V48_4K)
* nr_vcpus - VCPU count
* slot0_mem_pages - Slot0 physical memory size
* extra_mem_pages - Non-slot0 physical memory total size
* num_percpu_pages - Per-cpu physical memory pages
* guest_code - Guest entry point
* vcpuids - VCPU IDs
*
* Output Args: None
*
* Return:
* Pointer to opaque structure that describes the created VM.
*
* Creates a VM with the mode specified by mode (e.g. VM_MODE_P52V48_4K),
* with customized slot0 memory size, at least 512 pages currently.
* extra_mem_pages is only used to calculate the maximum page table size,
* no real memory allocation for non-slot0 memory in this function.
*/
struct kvm_vm *vm_create_with_vcpus(enum vm_guest_mode mode, uint32_t nr_vcpus,
uint64_t extra_mem_pages, uint32_t num_percpu_pages,
void *guest_code, uint32_t vcpuids[])
uint64_t slot0_mem_pages, uint64_t extra_mem_pages,
uint32_t num_percpu_pages, void *guest_code,
uint32_t vcpuids[])
{
uint64_t vcpu_pages, extra_pg_pages, pages;
struct kvm_vm *vm;
int i;
/* Force slot0 memory size not small than DEFAULT_GUEST_PHY_PAGES */
if (slot0_mem_pages < DEFAULT_GUEST_PHY_PAGES)
slot0_mem_pages = DEFAULT_GUEST_PHY_PAGES;
/* The maximum page table size for a memory region will be when the
* smallest pages are used. Considering each page contains x page
* table descriptors, the total extra size for page tables (for extra
* N pages) will be: N/x+N/x^2+N/x^3+... which is definitely smaller
* than N/x*2.
*/
uint64_t vcpu_pages = (DEFAULT_STACK_PGS + num_percpu_pages) * nr_vcpus;
uint64_t extra_pg_pages = (extra_mem_pages + vcpu_pages) / PTES_PER_MIN_PAGE * 2;
uint64_t pages = DEFAULT_GUEST_PHY_PAGES + extra_mem_pages + vcpu_pages + extra_pg_pages;
struct kvm_vm *vm;
int i;
vcpu_pages = (DEFAULT_STACK_PGS + num_percpu_pages) * nr_vcpus;
extra_pg_pages = (slot0_mem_pages + extra_mem_pages + vcpu_pages) / PTES_PER_MIN_PAGE * 2;
pages = slot0_mem_pages + vcpu_pages + extra_pg_pages;
TEST_ASSERT(nr_vcpus <= kvm_check_cap(KVM_CAP_MAX_VCPUS),
"nr_vcpus = %d too large for host, max-vcpus = %d",
......@@ -354,8 +388,8 @@ struct kvm_vm *vm_create_default_with_vcpus(uint32_t nr_vcpus, uint64_t extra_me
uint32_t num_percpu_pages, void *guest_code,
uint32_t vcpuids[])
{
return vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, extra_mem_pages,
num_percpu_pages, guest_code, vcpuids);
return vm_create_with_vcpus(VM_MODE_DEFAULT, nr_vcpus, DEFAULT_GUEST_PHY_PAGES,
extra_mem_pages, num_percpu_pages, guest_code, vcpuids);
}
struct kvm_vm *vm_create_default(uint32_t vcpuid, uint64_t extra_mem_pages,
......
......@@ -69,7 +69,7 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus,
TEST_ASSERT(vcpu_memory_bytes % perf_test_args.guest_page_size == 0,
"Guest memory size is not guest page size aligned.");
vm = vm_create_with_vcpus(mode, vcpus,
vm = vm_create_with_vcpus(mode, vcpus, DEFAULT_GUEST_PHY_PAGES,
(vcpus * vcpu_memory_bytes) / perf_test_args.guest_page_size,
0, guest_code, NULL);
......
......@@ -267,7 +267,7 @@ static bool prepare_vm(struct vm_data *data, int nslots, uint64_t *maxslots,
data->hva_slots = malloc(sizeof(*data->hva_slots) * data->nslots);
TEST_ASSERT(data->hva_slots, "malloc() fail");
data->vm = vm_create_default(VCPU_ID, 1024, guest_code);
data->vm = vm_create_default(VCPU_ID, mempages, guest_code);
pr_info_v("Adding slots 1..%i, each slot with %"PRIu64" pages + %"PRIu64" extra pages last\n",
max_mem_slots - 1, data->pages_per_slot, rempages);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment