Commit 3ee07371 authored by Hugh Dickins, committed by Linus Torvalds

[PATCH] vmtrunc: unmap_mapping dropping i_mmap_lock

vmtruncate (or more generally, unmap_mapping_range) has been observed to be
responsible for very high latencies: the lockbreak work in unmap_vmas is good
for munmap or exit_mmap, but of no use while mapping->i_mmap_lock is held to
keep our place in the prio_tree (or list) of a file's vmas.
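
(For context, the caller here is the truncate path; below is a minimal sketch, assuming
the usual 2.6-era shape of vmtruncate's shrink case, under a hypothetical name and with
error handling omitted, of how it reaches unmap_mapping_range:)

/*
 * Sketch only: hypothetical shrink_file_sketch() standing in for the
 * vmtruncate() shrink path, to show where the long i_mmap_lock hold
 * originates.  Details of the real mm/memory.c code may differ.
 */
static int shrink_file_sketch(struct inode *inode, loff_t offset)
{
        struct address_space *mapping = inode->i_mapping;

        i_size_write(inode, offset);
        /* takes mapping->i_mmap_lock and zaps every pte beyond new i_size */
        unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
        truncate_inode_pages(mapping, offset);
        return 0;
}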

Extend the zap_details block with an i_mmap_lock pointer, so unmap_vmas can
detect when that lock needs a lockbreak, and with break_addr, so it can report
where it left off.

Add unmap_mapping_range_vma, used from both the prio_tree and nonlinear list
handlers.  This is what now calls zap_page_range (above unmap_vmas), but it
handles the lockbreak and restart issues: letting unmap_mapping_range_tree or
unmap_mapping_range_list know when they need to start over because the lock
was dropped.
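
(In outline, the new per-vma loop looks like the sketch below: a simplified
version under a hypothetical name; the real unmap_mapping_range_vma in the diff
also handles restart_vma bookkeeping and marks completed vmas.)

/* Sketch only: simplified restart loop, restart_vma handling omitted */
static int zap_vma_sketch(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct zap_details *details)
{
        for (;;) {
                details->break_addr = end;
                zap_page_range(vma, start, end - start, details);
                if (details->break_addr >= end)
                        return 0;               /* whole range done */
                if (!need_resched() &&
                    !need_lockbreak(details->i_mmap_lock)) {
                        start = details->break_addr;
                        continue;               /* carry on under the same lock */
                }
                spin_unlock(details->i_mmap_lock);
                cond_resched();
                spin_lock(details->i_mmap_lock);
                return -EINTR;                  /* caller restarts its tree/list walk */
        }
}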

When restarting, of course there's a danger of never making progress.  Add a
vm_truncate_count field to vm_area_struct, update it to mapping->truncate_count
once the vma is fully scanned, and skip up-to-date vmas without a scan (and
without dropping i_mmap_lock).
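
(Collected in one place, since the two halves sit far apart in the diff: the
generation counter is bumped under i_mmap_lock, 0 is reserved as "stale" so a
wraparound forces a full reset, and the walkers skip any vma already stamped
with the current count.)

        /* in unmap_mapping_range(), under mapping->i_mmap_lock: */
        mapping->truncate_count++;
        if (unlikely(mapping->truncate_count == 0)) {
                mapping->truncate_count++;              /* never hand out 0 */
                reset_vma_truncate_counts(mapping);     /* mark every vma stale */
        }
        details.truncate_count = mapping->truncate_count;

        /* in the prio_tree and nonlinear-list walkers: */
        if (vma->vm_truncate_count == details->truncate_count)
                continue;       /* fully unmapped since our last restart */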

There is a further danger of never making progress if a vma is very large: when
breaking out, save restart_vma and restart_addr (and restart_pgoff to confirm,
in case the vma gets reused), to help continue where we left off.
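
(For example, with hypothetical numbers: if we broke out 64 pages into a
restart_vma whose vm_pgoff was 100, restart_pgoff is saved as 164; if that
vm_area_struct is later reused for a mapping of the same file at vm_pgoff 500,
linear_page_index(vma, restart_addr) no longer comes out at 164, so the vma is
simply marked completed - a genuinely new vma cannot be mapping the truncated
pages - rather than wrongly resumed from restart_addr.)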
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 25f5906c
@@ -105,6 +105,7 @@ struct vm_area_struct {
                                            units, *not* PAGE_CACHE_SIZE */
         struct file * vm_file;          /* File we map to (can be NULL). */
         void * vm_private_data;         /* was vm_pte (shared mem) */
+        unsigned int vm_truncate_count; /* compare mapping->truncate_count */
 
 #ifndef CONFIG_MMU
         atomic_t vm_usage;              /* refcount (VMAs shared if !MMU) */
@@ -577,7 +578,12 @@ struct zap_details {
         struct address_space *check_mapping;    /* Check page->mapping if set */
         pgoff_t first_index;                    /* Lowest page->index to unmap */
         pgoff_t last_index;                     /* Highest page->index to unmap */
-        int atomic;                             /* May not schedule() */
+        spinlock_t *i_mmap_lock;                /* For unmap_mapping_range: */
+        struct vm_area_struct *restart_vma;     /* Where lock was dropped */
+        pgoff_t restart_pgoff;                  /* File offset for restart */
+        unsigned long restart_addr;             /* Where we should restart */
+        unsigned long break_addr;               /* Where unmap_vmas stopped */
+        unsigned int truncate_count;            /* Compare vm_truncate_count */
 };
 
 void zap_page_range(struct vm_area_struct *vma, unsigned long address,
...
@@ -655,7 +655,8 @@ int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
         unsigned long tlb_start = 0;    /* For tlb_finish_mmu */
         int tlb_start_valid = 0;
         int ret = 0;
-        int atomic = details && details->atomic;
+        spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
+        int fullmm = tlb_is_full_mm(*tlbp);
 
         for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
                 unsigned long start;
@@ -694,16 +695,28 @@ int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
                         if ((long)zap_bytes > 0)
                                 continue;
-                        if (!atomic) {
-                                int fullmm = tlb_is_full_mm(*tlbp);
-                                tlb_finish_mmu(*tlbp, tlb_start, start);
-                                cond_resched_lock(&mm->page_table_lock);
-                                *tlbp = tlb_gather_mmu(mm, fullmm);
-                                tlb_start_valid = 0;
+
+                        tlb_finish_mmu(*tlbp, tlb_start, start);
+
+                        if (need_resched() ||
+                            need_lockbreak(&mm->page_table_lock) ||
+                            (i_mmap_lock && need_lockbreak(i_mmap_lock))) {
+                                if (i_mmap_lock) {
+                                        /* must reset count of rss freed */
+                                        *tlbp = tlb_gather_mmu(mm, fullmm);
+                                        details->break_addr = start;
+                                        goto out;
+                                }
+                                spin_unlock(&mm->page_table_lock);
+                                cond_resched();
+                                spin_lock(&mm->page_table_lock);
                         }
+
+                        *tlbp = tlb_gather_mmu(mm, fullmm);
+                        tlb_start_valid = 0;
                         zap_bytes = ZAP_BLOCK_SIZE;
                 }
         }
+out:
         return ret;
 }
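
The break test above is the heart of the unmap_vmas change; isolated as a
hypothetical helper (not part of the patch) it reads:

/* Hypothetical helper: when should unmap_vmas pause or bail out? */
static inline int zap_needs_break(struct mm_struct *mm,
                                  spinlock_t *i_mmap_lock)
{
        return need_resched() ||
               need_lockbreak(&mm->page_table_lock) ||
               (i_mmap_lock && need_lockbreak(i_mmap_lock));
}

When i_mmap_lock is set, unmap_vmas cannot simply cond_resched_lock() as
before: only its caller can safely drop that lock and revalidate its place in
the prio_tree or list, hence the goto out with break_addr recorded.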
@@ -1355,7 +1368,114 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
 /*
  * Helper functions for unmap_mapping_range().
- */
+ *
+ * __ Notes on dropping i_mmap_lock to reduce latency while unmapping __
+ *
+ * We have to restart searching the prio_tree whenever we drop the lock,
+ * since the iterator is only valid while the lock is held, and anyway
+ * a later vma might be split and reinserted earlier while lock dropped.
+ *
+ * The list of nonlinear vmas could be handled more efficiently, using
+ * a placeholder, but handle it in the same way until a need is shown.
+ * It is important to search the prio_tree before nonlinear list: a vma
+ * may become nonlinear and be shifted from prio_tree to nonlinear list
+ * while the lock is dropped; but never shifted from list to prio_tree.
+ *
+ * In order to make forward progress despite restarting the search,
+ * vm_truncate_count is used to mark a vma as now dealt with, so we can
+ * quickly skip it next time around.  Since the prio_tree search only
+ * shows us those vmas affected by unmapping the range in question, we
+ * can't efficiently keep all vmas in step with mapping->truncate_count:
+ * so instead reset them all whenever it wraps back to 0 (then go to 1).
+ * mapping->truncate_count and vma->vm_truncate_count are protected by
+ * i_mmap_lock.
+ *
+ * In order to make forward progress despite repeatedly restarting some
+ * large vma, note the break_addr set by unmap_vmas when it breaks out:
+ * and restart from that address when we reach that vma again (so long
+ * as another such was not inserted earlier while the lock was dropped).
+ *
+ * There is no guarantee that the restart_vma will remain intact when
+ * the lock is regained: but if it has been freed to some other use,
+ * it cannot then appear in the tree or list of vmas, so no harm done;
+ * if it has been reused for a new vma of the same mapping, nopage
+ * checks on i_size and truncate_count ensure it cannot be mapping any
+ * of the truncated pages, so the area below restart_addr is still safe
+ * to skip - but we must check pgoff to prevent spurious unmapping; or
+ * restart_vma may have been split or merged, shrunk or extended - but
+ * never shifted, so restart_addr and restart_pgoff remain in synch
+ * (even in the case of mremap move, which makes a copy of the vma).
+ */
+
+static void reset_vma_truncate_counts(struct address_space *mapping)
+{
+        struct vm_area_struct *vma;
+        struct prio_tree_iter iter;
+
+        vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
+                vma->vm_truncate_count = 0;
+        list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
+                vma->vm_truncate_count = 0;
+}
+
+static int unmap_mapping_range_vma(struct vm_area_struct *vma,
+                unsigned long start_addr, unsigned long end_addr,
+                struct zap_details *details)
+{
+        int need_break;
+
+        if (vma == details->restart_vma &&
+            start_addr < details->restart_addr) {
+                /*
+                 * Be careful: this vm_area_struct may have been reused
+                 * meanwhile.  If pgoff matches up, it's probably the
+                 * same one (perhaps shrunk or extended), but no harm is
+                 * done if actually it's new.  If pgoff does not match,
+                 * it would likely be wrong to unmap from restart_addr,
+                 * but it must be new, so we can just mark it completed.
+                 */
+                start_addr = details->restart_addr;
+                if (linear_page_index(vma, start_addr) !=
+                                details->restart_pgoff ||
+                    start_addr >= end_addr) {
+                        vma->vm_truncate_count = details->truncate_count;
+                        return 0;
+                }
+        }
+again:
+        details->break_addr = end_addr;
+        zap_page_range(vma, start_addr, end_addr - start_addr, details);
+
+        /*
+         * We cannot rely on the break test in unmap_vmas:
+         * on the one hand, we don't want to restart our loop
+         * just because that broke out for the page_table_lock;
+         * on the other hand, it does no test when vma is small.
+         */
+        need_break = need_resched() ||
+                        need_lockbreak(details->i_mmap_lock);
+
+        if (details->break_addr >= end_addr) {
+                /* We have now completed this vma: mark it so */
+                vma->vm_truncate_count = details->truncate_count;
+                if (!need_break)
+                        return 0;
+        } else {
+                if (!need_break) {
+                        start_addr = details->break_addr;
+                        goto again;
+                }
+                details->restart_vma = vma;
+                details->restart_pgoff =
+                        linear_page_index(vma, details->break_addr);
+                details->restart_addr = details->break_addr;
+        }
+
+        spin_unlock(details->i_mmap_lock);
+        cond_resched();
+        spin_lock(details->i_mmap_lock);
+        return -EINTR;
+}
+
 static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
                 struct zap_details *details)
 {
@@ -1363,8 +1483,13 @@ static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
         struct prio_tree_iter iter;
         pgoff_t vba, vea, zba, zea;
 
+restart:
         vma_prio_tree_foreach(vma, &iter, root,
                         details->first_index, details->last_index) {
+                /* Skip quickly over those we have already dealt with */
+                if (vma->vm_truncate_count == details->truncate_count)
+                        continue;
+
                 vba = vma->vm_pgoff;
                 vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1;
                 /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */
@@ -1374,9 +1499,12 @@ static inline void unmap_mapping_range_tree(struct prio_tree_root *root,
                         zea = details->last_index;
                 if (zea > vea)
                         zea = vea;
-                zap_page_range(vma,
+
+                if (unmap_mapping_range_vma(vma,
                         ((zba - vba) << PAGE_SHIFT) + vma->vm_start,
-                        (zea - zba + 1) << PAGE_SHIFT, details);
+                        ((zea - vba + 1) << PAGE_SHIFT) + vma->vm_start,
+                                details) < 0)
+                        goto restart;
         }
 }
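
A worked example of the index arithmetic above, with hypothetical numbers: if
vma->vm_pgoff is 100 and the vma spans 10 pages, then vba = 100 and vea = 109;
unmapping file pages 105 and up gives zba = 105 and zea = 109, so
unmap_mapping_range_vma is called with start vm_start + 5*PAGE_SIZE and end
vm_start + 10*PAGE_SIZE, i.e. the last five pages of the vma.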
@@ -1391,10 +1519,15 @@ static inline void unmap_mapping_range_list(struct list_head *head,
          * across *all* the pages in each nonlinear VMA, not just the pages
          * whose virtual address lies outside the file truncation point.
          */
+restart:
         list_for_each_entry(vma, head, shared.vm_set.list) {
+                /* Skip quickly over those we have already dealt with */
+                if (vma->vm_truncate_count == details->truncate_count)
+                        continue;
                 details->nonlinear_vma = vma;
-                zap_page_range(vma, vma->vm_start,
-                        vma->vm_end - vma->vm_start, details);
+                if (unmap_mapping_range_vma(vma, vma->vm_start,
+                        vma->vm_end, details) < 0)
+                        goto restart;
         }
 }
@@ -1433,13 +1566,20 @@ void unmap_mapping_range(struct address_space *mapping,
         details.nonlinear_vma = NULL;
         details.first_index = hba;
         details.last_index = hba + hlen - 1;
-        details.atomic = 1;     /* A spinlock is held */
         if (details.last_index < details.first_index)
                 details.last_index = ULONG_MAX;
+        details.i_mmap_lock = &mapping->i_mmap_lock;
+        details.restart_vma = NULL;
 
         spin_lock(&mapping->i_mmap_lock);
-        /* Protect against page fault */
+        /* Protect against page faults, and endless unmapping loops */
         mapping->truncate_count++;
+        if (unlikely(mapping->truncate_count == 0)) {
+                mapping->truncate_count++;
+                reset_vma_truncate_counts(mapping);
+        }
+        details.truncate_count = mapping->truncate_count;
 
         if (unlikely(!prio_tree_empty(&mapping->i_mmap)))
                 unmap_mapping_range_tree(&mapping->i_mmap, &details);
...