Commit 8a7ae055, authored by Izik Eidus, committed by Avi Kivity

KVM: MMU: Partial swapping of guest memory

This allows guest memory to be swapped.  Pages which are currently mapped
via shadow page tables are pinned into memory, but all other pages can
be freely swapped.

The patch makes gfn_to_page() elevate the page's reference count, and
introduces kvm_release_page() that pairs with it.
Signed-off-by: Izik Eidus <izike@qumranet.com>
Signed-off-by: Avi Kivity <avi@qumranet.com>
parent cea7bb21
...@@ -409,6 +409,7 @@ struct kvm_memory_slot { ...@@ -409,6 +409,7 @@ struct kvm_memory_slot {
unsigned long *rmap; unsigned long *rmap;
unsigned long *dirty_bitmap; unsigned long *dirty_bitmap;
int user_alloc; /* user allocated memory */ int user_alloc; /* user allocated memory */
unsigned long userspace_addr;
}; };
struct kvm { struct kvm {
...@@ -570,6 +571,7 @@ extern struct page *bad_page; ...@@ -570,6 +571,7 @@ extern struct page *bad_page;
int is_error_page(struct page *page); int is_error_page(struct page *page);
gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn); gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn);
struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn); struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn);
void kvm_release_page(struct page *page);
int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
int len); int len);
int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len); int kvm_read_guest(struct kvm *kvm, gpa_t gpa, void *data, unsigned long len);
......
...@@ -300,19 +300,6 @@ static struct kvm *kvm_create_vm(void) ...@@ -300,19 +300,6 @@ static struct kvm *kvm_create_vm(void)
return kvm; return kvm;
} }
/*
 * Walk all free->npages entries of free->phys_mem and release every page
 * that is present (non-NULL) with page_cache_release().
 */
static void kvm_free_userspace_physmem(struct kvm_memory_slot *free)
{
int i;
for (i = 0; i < free->npages; ++i) {
if (free->phys_mem[i]) {
/* Non-reserved pages are flagged dirty before being released. */
if (!PageReserved(free->phys_mem[i]))
SetPageDirty(free->phys_mem[i]);
page_cache_release(free->phys_mem[i]);
}
}
}
static void kvm_free_kernel_physmem(struct kvm_memory_slot *free) static void kvm_free_kernel_physmem(struct kvm_memory_slot *free)
{ {
int i; int i;
...@@ -330,9 +317,7 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free, ...@@ -330,9 +317,7 @@ static void kvm_free_physmem_slot(struct kvm_memory_slot *free,
{ {
if (!dont || free->phys_mem != dont->phys_mem) if (!dont || free->phys_mem != dont->phys_mem)
if (free->phys_mem) { if (free->phys_mem) {
if (free->user_alloc) if (!free->user_alloc)
kvm_free_userspace_physmem(free);
else
kvm_free_kernel_physmem(free); kvm_free_kernel_physmem(free);
vfree(free->phys_mem); vfree(free->phys_mem);
} }
...@@ -361,7 +346,7 @@ static void free_pio_guest_pages(struct kvm_vcpu *vcpu) ...@@ -361,7 +346,7 @@ static void free_pio_guest_pages(struct kvm_vcpu *vcpu)
for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i) for (i = 0; i < ARRAY_SIZE(vcpu->pio.guest_pages); ++i)
if (vcpu->pio.guest_pages[i]) { if (vcpu->pio.guest_pages[i]) {
__free_page(vcpu->pio.guest_pages[i]); kvm_release_page(vcpu->pio.guest_pages[i]);
vcpu->pio.guest_pages[i] = NULL; vcpu->pio.guest_pages[i] = NULL;
} }
} }
...@@ -752,19 +737,8 @@ static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm, ...@@ -752,19 +737,8 @@ static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
memset(new.phys_mem, 0, npages * sizeof(struct page *)); memset(new.phys_mem, 0, npages * sizeof(struct page *));
memset(new.rmap, 0, npages * sizeof(*new.rmap)); memset(new.rmap, 0, npages * sizeof(*new.rmap));
if (user_alloc) { if (user_alloc) {
unsigned long pages_num;
new.user_alloc = 1; new.user_alloc = 1;
down_read(&current->mm->mmap_sem); new.userspace_addr = mem->userspace_addr;
pages_num = get_user_pages(current, current->mm,
mem->userspace_addr,
npages, 1, 1, new.phys_mem,
NULL);
up_read(&current->mm->mmap_sem);
if (pages_num != npages)
goto out_unlock;
} else { } else {
for (i = 0; i < npages; ++i) { for (i = 0; i < npages; ++i) {
new.phys_mem[i] = alloc_page(GFP_HIGHUSER new.phys_mem[i] = alloc_page(GFP_HIGHUSER
...@@ -1039,12 +1013,39 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn) ...@@ -1039,12 +1013,39 @@ struct page *gfn_to_page(struct kvm *kvm, gfn_t gfn)
gfn = unalias_gfn(kvm, gfn); gfn = unalias_gfn(kvm, gfn);
slot = __gfn_to_memslot(kvm, gfn); slot = __gfn_to_memslot(kvm, gfn);
if (!slot) if (!slot) {
get_page(bad_page);
return bad_page; return bad_page;
}
if (slot->user_alloc) {
struct page *page[1];
int npages;
down_read(&current->mm->mmap_sem);
npages = get_user_pages(current, current->mm,
slot->userspace_addr
+ (gfn - slot->base_gfn) * PAGE_SIZE, 1,
1, 1, page, NULL);
up_read(&current->mm->mmap_sem);
if (npages != 1) {
get_page(bad_page);
return bad_page;
}
return page[0];
}
get_page(slot->phys_mem[gfn - slot->base_gfn]);
return slot->phys_mem[gfn - slot->base_gfn]; return slot->phys_mem[gfn - slot->base_gfn];
} }
EXPORT_SYMBOL_GPL(gfn_to_page); EXPORT_SYMBOL_GPL(gfn_to_page);
/*
 * Release a page whose reference count was elevated by gfn_to_page():
 * flag the page dirty unless it is a reserved page, then drop one
 * reference with put_page().
 */
void kvm_release_page(struct page *page)
{
/* Reserved pages are never marked dirty. */
if (!PageReserved(page))
SetPageDirty(page);
put_page(page);
}
EXPORT_SYMBOL_GPL(kvm_release_page);
static int next_segment(unsigned long len, int offset) static int next_segment(unsigned long len, int offset)
{ {
if (len > PAGE_SIZE - offset) if (len > PAGE_SIZE - offset)
...@@ -1060,13 +1061,16 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, ...@@ -1060,13 +1061,16 @@ int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset,
struct page *page; struct page *page;
page = gfn_to_page(kvm, gfn); page = gfn_to_page(kvm, gfn);
if (is_error_page(page)) if (is_error_page(page)) {
kvm_release_page(page);
return -EFAULT; return -EFAULT;
}
page_virt = kmap_atomic(page, KM_USER0); page_virt = kmap_atomic(page, KM_USER0);
memcpy(data, page_virt + offset, len); memcpy(data, page_virt + offset, len);
kunmap_atomic(page_virt, KM_USER0); kunmap_atomic(page_virt, KM_USER0);
kvm_release_page(page);
return 0; return 0;
} }
EXPORT_SYMBOL_GPL(kvm_read_guest_page); EXPORT_SYMBOL_GPL(kvm_read_guest_page);
...@@ -1098,14 +1102,17 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data, ...@@ -1098,14 +1102,17 @@ int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn, const void *data,
struct page *page; struct page *page;
page = gfn_to_page(kvm, gfn); page = gfn_to_page(kvm, gfn);
if (is_error_page(page)) if (is_error_page(page)) {
kvm_release_page(page);
return -EFAULT; return -EFAULT;
}
page_virt = kmap_atomic(page, KM_USER0); page_virt = kmap_atomic(page, KM_USER0);
memcpy(page_virt + offset, data, len); memcpy(page_virt + offset, data, len);
kunmap_atomic(page_virt, KM_USER0); kunmap_atomic(page_virt, KM_USER0);
mark_page_dirty(kvm, gfn); mark_page_dirty(kvm, gfn);
kvm_release_page(page);
return 0; return 0;
} }
EXPORT_SYMBOL_GPL(kvm_write_guest_page); EXPORT_SYMBOL_GPL(kvm_write_guest_page);
...@@ -1136,13 +1143,16 @@ int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len) ...@@ -1136,13 +1143,16 @@ int kvm_clear_guest_page(struct kvm *kvm, gfn_t gfn, int offset, int len)
struct page *page; struct page *page;
page = gfn_to_page(kvm, gfn); page = gfn_to_page(kvm, gfn);
if (is_error_page(page)) if (is_error_page(page)) {
kvm_release_page(page);
return -EFAULT; return -EFAULT;
}
page_virt = kmap_atomic(page, KM_USER0); page_virt = kmap_atomic(page, KM_USER0);
memset(page_virt + offset, 0, len); memset(page_virt + offset, 0, len);
kunmap_atomic(page_virt, KM_USER0); kunmap_atomic(page_virt, KM_USER0);
kvm_release_page(page);
return 0; return 0;
} }
EXPORT_SYMBOL_GPL(kvm_clear_guest_page); EXPORT_SYMBOL_GPL(kvm_clear_guest_page);
...@@ -2070,8 +2080,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, ...@@ -2070,8 +2080,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
for (i = 0; i < nr_pages; ++i) { for (i = 0; i < nr_pages; ++i) {
mutex_lock(&vcpu->kvm->lock); mutex_lock(&vcpu->kvm->lock);
page = gva_to_page(vcpu, address + i * PAGE_SIZE); page = gva_to_page(vcpu, address + i * PAGE_SIZE);
if (page)
get_page(page);
vcpu->pio.guest_pages[i] = page; vcpu->pio.guest_pages[i] = page;
mutex_unlock(&vcpu->kvm->lock); mutex_unlock(&vcpu->kvm->lock);
if (!page) { if (!page) {
...@@ -3074,9 +3082,10 @@ static struct page *kvm_vm_nopage(struct vm_area_struct *vma, ...@@ -3074,9 +3082,10 @@ static struct page *kvm_vm_nopage(struct vm_area_struct *vma,
pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;
page = gfn_to_page(kvm, pgoff); page = gfn_to_page(kvm, pgoff);
if (is_error_page(page)) if (is_error_page(page)) {
kvm_release_page(page);
return NOPAGE_SIGBUS; return NOPAGE_SIGBUS;
get_page(page); }
if (type != NULL) if (type != NULL)
*type = VM_FAULT_MINOR; *type = VM_FAULT_MINOR;
......
...@@ -425,6 +425,8 @@ static void rmap_remove(struct kvm *kvm, u64 *spte) ...@@ -425,6 +425,8 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
if (!is_rmap_pte(*spte)) if (!is_rmap_pte(*spte))
return; return;
page = page_header(__pa(spte)); page = page_header(__pa(spte));
kvm_release_page(pfn_to_page((*spte & PT64_BASE_ADDR_MASK) >>
PAGE_SHIFT));
rmapp = gfn_to_rmap(kvm, page->gfns[spte - page->spt]); rmapp = gfn_to_rmap(kvm, page->gfns[spte - page->spt]);
if (!*rmapp) { if (!*rmapp) {
printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte);
...@@ -911,6 +913,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p) ...@@ -911,6 +913,8 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
PT_USER_MASK; PT_USER_MASK;
if (!was_rmapped) if (!was_rmapped)
rmap_add(vcpu, &table[index], v >> PAGE_SHIFT); rmap_add(vcpu, &table[index], v >> PAGE_SHIFT);
else
kvm_release_page(pfn_to_page(p >> PAGE_SHIFT));
return 0; return 0;
} }
...@@ -925,6 +929,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p) ...@@ -925,6 +929,7 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, hpa_t p)
1, 3, &table[index]); 1, 3, &table[index]);
if (!new_table) { if (!new_table) {
pgprintk("nonpaging_map: ENOMEM\n"); pgprintk("nonpaging_map: ENOMEM\n");
kvm_release_page(pfn_to_page(p >> PAGE_SHIFT));
return -ENOMEM; return -ENOMEM;
} }
...@@ -1039,8 +1044,11 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, ...@@ -1039,8 +1044,11 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
paddr = gpa_to_hpa(vcpu->kvm, addr & PT64_BASE_ADDR_MASK); paddr = gpa_to_hpa(vcpu->kvm, addr & PT64_BASE_ADDR_MASK);
if (is_error_hpa(paddr)) if (is_error_hpa(paddr)) {
kvm_release_page(pfn_to_page((paddr & PT64_BASE_ADDR_MASK)
>> PAGE_SHIFT));
return 1; return 1;
}
return nonpaging_map(vcpu, addr & PAGE_MASK, paddr); return nonpaging_map(vcpu, addr & PAGE_MASK, paddr);
} }
...@@ -1507,6 +1515,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, ...@@ -1507,6 +1515,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
} else { } else {
gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va); gpa_t gpa = vcpu->mmu.gva_to_gpa(vcpu, va);
hpa_t hpa = gpa_to_hpa(vcpu, gpa); hpa_t hpa = gpa_to_hpa(vcpu, gpa);
struct page *page;
if (is_shadow_present_pte(ent) if (is_shadow_present_pte(ent)
&& (ent & PT64_BASE_ADDR_MASK) != hpa) && (ent & PT64_BASE_ADDR_MASK) != hpa)
...@@ -1519,6 +1528,9 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte, ...@@ -1519,6 +1528,9 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
&& !is_error_hpa(hpa)) && !is_error_hpa(hpa))
printk(KERN_ERR "audit: (%s) notrap shadow," printk(KERN_ERR "audit: (%s) notrap shadow,"
" valid guest gva %lx\n", audit_msg, va); " valid guest gva %lx\n", audit_msg, va);
page = pfn_to_page((gpa & PT64_BASE_ADDR_MASK)
>> PAGE_SHIFT);
kvm_release_page(page);
} }
} }
......
...@@ -72,7 +72,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker, ...@@ -72,7 +72,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
struct kvm_vcpu *vcpu, gva_t addr, struct kvm_vcpu *vcpu, gva_t addr,
int write_fault, int user_fault, int fetch_fault) int write_fault, int user_fault, int fetch_fault)
{ {
struct page *page; struct page *page = NULL;
pt_element_t *table; pt_element_t *table;
pt_element_t pte; pt_element_t pte;
gfn_t table_gfn; gfn_t table_gfn;
...@@ -149,6 +149,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker, ...@@ -149,6 +149,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
walker->inherited_ar &= pte; walker->inherited_ar &= pte;
--walker->level; --walker->level;
kvm_release_page(page);
} }
if (write_fault && !is_dirty_pte(pte)) { if (write_fault && !is_dirty_pte(pte)) {
...@@ -162,6 +163,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker, ...@@ -162,6 +163,7 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte)); kvm_mmu_pte_write(vcpu, pte_gpa, (u8 *)&pte, sizeof(pte));
} }
kvm_release_page(page);
walker->pte = pte; walker->pte = pte;
pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)pte); pgprintk("%s: pte %llx\n", __FUNCTION__, (u64)pte);
return 1; return 1;
...@@ -180,6 +182,8 @@ static int FNAME(walk_addr)(struct guest_walker *walker, ...@@ -180,6 +182,8 @@ static int FNAME(walk_addr)(struct guest_walker *walker,
walker->error_code |= PFERR_USER_MASK; walker->error_code |= PFERR_USER_MASK;
if (fetch_fault) if (fetch_fault)
walker->error_code |= PFERR_FETCH_MASK; walker->error_code |= PFERR_FETCH_MASK;
if (page)
kvm_release_page(page);
return 0; return 0;
} }
...@@ -223,6 +227,8 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, ...@@ -223,6 +227,8 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
if (is_error_hpa(paddr)) { if (is_error_hpa(paddr)) {
set_shadow_pte(shadow_pte, set_shadow_pte(shadow_pte,
shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK); shadow_trap_nonpresent_pte | PT_SHADOW_IO_MARK);
kvm_release_page(pfn_to_page((paddr & PT64_BASE_ADDR_MASK)
>> PAGE_SHIFT));
return; return;
} }
...@@ -260,9 +266,20 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu, ...@@ -260,9 +266,20 @@ static void FNAME(set_pte_common)(struct kvm_vcpu *vcpu,
pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte); pgprintk("%s: setting spte %llx\n", __FUNCTION__, spte);
set_shadow_pte(shadow_pte, spte); set_shadow_pte(shadow_pte, spte);
page_header_update_slot(vcpu->kvm, shadow_pte, gaddr); page_header_update_slot(vcpu->kvm, shadow_pte, gaddr);
if (!was_rmapped) if (!was_rmapped) {
rmap_add(vcpu, shadow_pte, (gaddr & PT64_BASE_ADDR_MASK) rmap_add(vcpu, shadow_pte, (gaddr & PT64_BASE_ADDR_MASK)
>> PAGE_SHIFT); >> PAGE_SHIFT);
if (!is_rmap_pte(*shadow_pte)) {
struct page *page;
page = pfn_to_page((paddr & PT64_BASE_ADDR_MASK)
>> PAGE_SHIFT);
kvm_release_page(page);
}
}
else
kvm_release_page(pfn_to_page((paddr & PT64_BASE_ADDR_MASK)
>> PAGE_SHIFT));
if (!ptwrite || !*ptwrite) if (!ptwrite || !*ptwrite)
vcpu->last_pte_updated = shadow_pte; vcpu->last_pte_updated = shadow_pte;
} }
...@@ -486,19 +503,22 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu, ...@@ -486,19 +503,22 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
{ {
int i; int i;
pt_element_t *gpt; pt_element_t *gpt;
struct page *page;
if (sp->role.metaphysical || PTTYPE == 32) { if (sp->role.metaphysical || PTTYPE == 32) {
nonpaging_prefetch_page(vcpu, sp); nonpaging_prefetch_page(vcpu, sp);
return; return;
} }
gpt = kmap_atomic(gfn_to_page(vcpu->kvm, sp->gfn), KM_USER0); page = gfn_to_page(vcpu->kvm, sp->gfn);
gpt = kmap_atomic(page, KM_USER0);
for (i = 0; i < PT64_ENT_PER_PAGE; ++i) for (i = 0; i < PT64_ENT_PER_PAGE; ++i)
if (is_present_pte(gpt[i])) if (is_present_pte(gpt[i]))
sp->spt[i] = shadow_trap_nonpresent_pte; sp->spt[i] = shadow_trap_nonpresent_pte;
else else
sp->spt[i] = shadow_notrap_nonpresent_pte; sp->spt[i] = shadow_notrap_nonpresent_pte;
kunmap_atomic(gpt, KM_USER0); kunmap_atomic(gpt, KM_USER0);
kvm_release_page(page);
} }
#undef pt_element_t #undef pt_element_t
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment