Commit 6f924b79 authored by Andrew Morton, committed by Linus Torvalds

[PATCH] fix unuse_pmd fixme

From: Hugh Dickins <hugh@veritas.com>

try_to_unuse now drops mmlist_lock across unuse_process (with a pretty
dance of atomic_incs and mmputs of various mmlist markers, and a polite
new cond_resched there), so that unuse_process can pte_chain_alloc(GFP_KERNEL)
and pass that down and down and down and down to unuse_pte: which cannot
succeed more than once on a given mm (made explicit by returning back up
once it has succeeded).  Preliminary checks are moved up from unuse_pte
to unuse_pmd, and done more efficiently (avoiding that extra pte_file
test added recently); swapoff spends far too long in here.  Updated
locking comments and references to try_to_swap_out.
parent 9549db1d
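
For readers skimming the diff below, the key trick is that try_to_unuse now
pins each mm_struct via mm_users before dropping mmlist_lock, so it is free
to sleep (the GFP_KERNEL pte_chain allocation in unuse_process, plus the new
cond_resched) while walking the global mmlist. The following is an
illustrative sketch of that walk pattern only, not part of the patch:
walk_mmlist() and visit_mm() are hypothetical names (visit_mm standing in
for unuse_process), and start_mm is assumed to be already pinned by the
caller, as it is in try_to_unuse.

/*
 * Sketch only (not from the patch): walk every mm on the global mmlist
 * while being allowed to sleep inside the loop.  visit_mm() is a
 * hypothetical stand-in for unuse_process(); start_mm is assumed to be
 * pinned by the caller, as in try_to_unuse().
 */
static void walk_mmlist(struct mm_struct *start_mm)
{
        struct list_head *p = &start_mm->mmlist;
        struct mm_struct *prev_mm = start_mm;
        struct mm_struct *mm;

        atomic_inc(&prev_mm->mm_users);         /* marker kept alive across unlock */
        spin_lock(&mmlist_lock);
        while ((p = p->next) != &start_mm->mmlist) {
                mm = list_entry(p, struct mm_struct, mmlist);
                atomic_inc(&mm->mm_users);      /* pin mm so it stays on the list */
                spin_unlock(&mmlist_lock);
                mmput(prev_mm);                 /* drop the previous marker */
                prev_mm = mm;
                cond_resched();                 /* lock dropped: sleeping is safe here */

                visit_mm(mm);                   /* e.g. unuse_process(mm, entry, page) */

                spin_lock(&mmlist_lock);        /* retake lock before advancing p */
        }
        spin_unlock(&mmlist_lock);
        mmput(prev_mm);                         /* drop the last marker */
}

The real hunk in try_to_unuse below follows the same pattern, with extra
bookkeeping for new_start_mm and for bailing out when unuse_process
returns -ENOMEM.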
@@ -377,41 +377,33 @@ void free_swap_and_cache(swp_entry_t entry)
  * share this swap entry, so be cautious and let do_wp_page work out
  * what to do if a write is requested later.
  */
-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
+/* vma->vm_mm->page_table_lock is held */
 static void
 unuse_pte(struct vm_area_struct *vma, unsigned long address, pte_t *dir,
         swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
 {
-        pte_t pte = *dir;
-
-        if (pte_file(pte))
-                return;
-        if (likely(pte_to_swp_entry(pte).val != entry.val))
-                return;
-        if (unlikely(pte_none(pte) || pte_present(pte)))
-                return;
+        vma->vm_mm->rss++;
         get_page(page);
         set_pte(dir, pte_mkold(mk_pte(page, vma->vm_page_prot)));
         *pte_chainp = page_add_rmap(page, dir, *pte_chainp);
         swap_free(entry);
-        ++vma->vm_mm->rss;
 }
 
-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
-static void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
+/* vma->vm_mm->page_table_lock is held */
+static int unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
         unsigned long address, unsigned long size, unsigned long offset,
-        swp_entry_t entry, struct page* page)
+        swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
 {
         pte_t * pte;
         unsigned long end;
-        struct pte_chain *pte_chain = NULL;
+        pte_t swp_pte = swp_entry_to_pte(entry);
 
         if (pmd_none(*dir))
-                return;
+                return 0;
         if (pmd_bad(*dir)) {
                 pmd_ERROR(*dir);
                 pmd_clear(dir);
-                return;
+                return 0;
         }
         pte = pte_offset_map(dir, address);
         offset += address & PMD_MASK;
@@ -421,33 +413,36 @@ static void unuse_pmd(struct vm_area_struct * vma, pmd_t *dir,
         end = PMD_SIZE;
         do {
                 /*
-                 * FIXME: handle pte_chain_alloc() failures
+                 * swapoff spends a _lot_ of time in this loop!
+                 * Test inline before going to call unuse_pte.
                  */
-                if (pte_chain == NULL)
-                        pte_chain = pte_chain_alloc(GFP_ATOMIC);
-                unuse_pte(vma, offset+address-vma->vm_start,
-                                pte, entry, page, &pte_chain);
+                if (unlikely(pte_same(*pte, swp_pte))) {
+                        unuse_pte(vma, offset + address, pte,
+                                        entry, page, pte_chainp);
+                        pte_unmap(pte);
+                        return 1;
+                }
                 address += PAGE_SIZE;
                 pte++;
         } while (address && (address < end));
         pte_unmap(pte - 1);
-        pte_chain_free(pte_chain);
+        return 0;
 }
 
-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
-static void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
+/* vma->vm_mm->page_table_lock is held */
+static int unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
         unsigned long address, unsigned long size,
-        swp_entry_t entry, struct page* page)
+        swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
 {
         pmd_t * pmd;
         unsigned long offset, end;
 
         if (pgd_none(*dir))
-                return;
+                return 0;
         if (pgd_bad(*dir)) {
                 pgd_ERROR(*dir);
                 pgd_clear(dir);
-                return;
+                return 0;
         }
         pmd = pmd_offset(dir, address);
         offset = address & PGDIR_MASK;
@@ -458,32 +453,42 @@ static void unuse_pgd(struct vm_area_struct * vma, pgd_t *dir,
         if (address >= end)
                 BUG();
         do {
-                unuse_pmd(vma, pmd, address, end - address, offset, entry,
-                        page);
+                if (unuse_pmd(vma, pmd, address, end - address,
+                                offset, entry, page, pte_chainp))
+                        return 1;
                 address = (address + PMD_SIZE) & PMD_MASK;
                 pmd++;
         } while (address && (address < end));
+        return 0;
 }
 
-/* mmlist_lock and vma->vm_mm->page_table_lock are held */
-static void unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
-        swp_entry_t entry, struct page* page)
+/* vma->vm_mm->page_table_lock is held */
+static int unuse_vma(struct vm_area_struct * vma, pgd_t *pgdir,
+        swp_entry_t entry, struct page *page, struct pte_chain **pte_chainp)
 {
         unsigned long start = vma->vm_start, end = vma->vm_end;
 
         if (start >= end)
                 BUG();
         do {
-                unuse_pgd(vma, pgdir, start, end - start, entry, page);
+                if (unuse_pgd(vma, pgdir, start, end - start,
+                                entry, page, pte_chainp))
+                        return 1;
                 start = (start + PGDIR_SIZE) & PGDIR_MASK;
                 pgdir++;
         } while (start && (start < end));
+        return 0;
 }
 
-static void unuse_process(struct mm_struct * mm,
+static int unuse_process(struct mm_struct * mm,
                         swp_entry_t entry, struct page* page)
 {
         struct vm_area_struct* vma;
+        struct pte_chain *pte_chain;
+
+        pte_chain = pte_chain_alloc(GFP_KERNEL);
+        if (!pte_chain)
+                return -ENOMEM;
 
         /*
          * Go through process' page directory.
@@ -491,10 +496,12 @@ static void unuse_process(struct mm_struct * mm,
         spin_lock(&mm->page_table_lock);
         for (vma = mm->mmap; vma; vma = vma->vm_next) {
                 pgd_t * pgd = pgd_offset(mm, vma->vm_start);
-                unuse_vma(vma, pgd, entry, page);
+                if (unuse_vma(vma, pgd, entry, page, &pte_chain))
+                        break;
         }
         spin_unlock(&mm->page_table_lock);
-        return;
+        pte_chain_free(pte_chain);
+        return 0;
 }
 
 /*
@@ -638,36 +645,54 @@ static int try_to_unuse(unsigned int type)
                         if (start_mm == &init_mm)
                                 shmem = shmem_unuse(entry, page);
                         else
-                                unuse_process(start_mm, entry, page);
+                                retval = unuse_process(start_mm, entry, page);
                 }
                 if (*swap_map > 1) {
                         int set_start_mm = (*swap_map >= swcount);
                         struct list_head *p = &start_mm->mmlist;
                         struct mm_struct *new_start_mm = start_mm;
+                        struct mm_struct *prev_mm = start_mm;
                         struct mm_struct *mm;
 
+                        atomic_inc(&new_start_mm->mm_users);
+                        atomic_inc(&prev_mm->mm_users);
                         spin_lock(&mmlist_lock);
-                        while (*swap_map > 1 &&
+                        while (*swap_map > 1 && !retval &&
                                         (p = p->next) != &start_mm->mmlist) {
                                 mm = list_entry(p, struct mm_struct, mmlist);
+                                atomic_inc(&mm->mm_users);
+                                spin_unlock(&mmlist_lock);
+                                mmput(prev_mm);
+                                prev_mm = mm;
+                                cond_resched();
+
                                 swcount = *swap_map;
-                                if (mm == &init_mm) {
+                                if (swcount <= 1)
+                                        ;
+                                else if (mm == &init_mm) {
                                         set_start_mm = 1;
-                                        spin_unlock(&mmlist_lock);
                                         shmem = shmem_unuse(entry, page);
-                                        spin_lock(&mmlist_lock);
                                 } else
-                                        unuse_process(mm, entry, page);
+                                        retval = unuse_process(mm, entry, page);
                                 if (set_start_mm && *swap_map < swcount) {
+                                        mmput(new_start_mm);
+                                        atomic_inc(&mm->mm_users);
                                         new_start_mm = mm;
                                         set_start_mm = 0;
                                 }
+                                spin_lock(&mmlist_lock);
                         }
+                        atomic_inc(&new_start_mm->mm_users);
                         spin_unlock(&mmlist_lock);
+                        mmput(prev_mm);
                         mmput(start_mm);
                         start_mm = new_start_mm;
                 }
+                if (retval) {
+                        unlock_page(page);
+                        page_cache_release(page);
+                        break;
+                }
 
                 /*
                  * How could swap count reach 0x7fff when the maximum
@@ -691,7 +716,7 @@ static int try_to_unuse(unsigned int type)
 
                 /*
                  * If a reference remains (rare), we would like to leave
-                 * the page in the swap cache; but try_to_swap_out could
+                 * the page in the swap cache; but try_to_unmap could
                  * then re-duplicate the entry once we drop page lock,
                  * so we might loop indefinitely; also, that page could
                  * not be swapped out to other storage meanwhile.  So:
@@ -727,7 +752,7 @@ static int try_to_unuse(unsigned int type)
                 /*
                  * So we could skip searching mms once swap count went
                  * to 1, we did not mark any present ptes as dirty: must
-                 * mark page dirty so try_to_swap_out will preserve it.
+                 * mark page dirty so shrink_list will preserve it.
                  */
                 SetPageDirty(page);
                 unlock_page(page);