Commit edcc56dc authored by Hugh Dickins, committed by Linus Torvalds

[PATCH] rmaplock: kill page_map_lock

The pte_chains rmap used pte_chain_lock (bit_spin_lock on PG_chainlock) to
lock its pte_chains.  We kept this (as page_map_lock: bit_spin_lock on
PG_maplock) when we moved to objrmap.  But the file objrmap locks its vma tree
with mapping->i_mmap_lock, and the anon objrmap locks its vma list with
anon_vma->lock: so isn't the page_map_lock superfluous?

Pretty much, yes.  The mapcount was protected by it, and needs to become an
atomic: starting at -1 like page _count, so nr_mapped can be tracked precisely
up and down.  The last page_remove_rmap can no longer clear an anon page's mapping,
because of races with page_add_rmap; some BUG_ONs have to go for the same reason,
but they have served their purpose.
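
The counting idiom is easier to see outside the kernel.  Below is a minimal,
self-contained userspace sketch (plain C11 atomics, not kernel code; all
sketch_* names are made up for illustration) of a counter that starts at -1,
so the first mapper and the last unmapper each detect their own transition
without any lock, which is what the hunks below do with atomic_inc_and_test()
and atomic_add_negative(-1, ...) on page->_mapcount:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_int  sketch_mapcount = -1;  /* plays the role of page->_mapcount */
static atomic_long sketch_nr_mapped;      /* plays the role of the nr_mapped page state */

static void sketch_add_rmap(void)         /* cf. page_add_anon_rmap / page_add_file_rmap */
{
	/* first mapping: -1 -> 0; exactly one caller sees the old value -1 */
	if (atomic_fetch_add(&sketch_mapcount, 1) == -1)
		atomic_fetch_add(&sketch_nr_mapped, 1);
}

static void sketch_remove_rmap(void)      /* cf. page_remove_rmap */
{
	/* last unmapping: 0 -> -1; exactly one caller sees the old value 0 */
	if (atomic_fetch_sub(&sketch_mapcount, 1) == 0)
		atomic_fetch_sub(&sketch_nr_mapped, 1);
}

static bool sketch_mapped(void)           /* cf. page_mapped(): >= 0 means mapped */
{
	return atomic_load(&sketch_mapcount) >= 0;
}

The kernel hunks below do the same with atomic_t: atomic_inc_and_test() fires
exactly on the -1 -> 0 transition and atomic_add_negative(-1, ...) on 0 -> -1,
which is also why page_mapcount() adds 1 back before reporting the count.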

vmscan decisions are naturally racy, so little changes there beyond removing
page_map_lock/unlock.  But to stabilize the file-backed page->mapping against
truncation while acquiring i_mmap_lock, page_referenced_file now needs the page
lock to be held even for refill_inactive_zone.  There's a similar issue in
acquiring anon_vma->lock, where the page lock doesn't help: this patch pretends
to handle that, but it really needs the next patch.
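
For the file-backed side, the new page_referenced() path (the hunk further
down) boils down to a trylock-and-recheck protocol.  Here is a self-contained
userspace analogy using a pthread mutex in place of the page lock; the
sketch_* names and struct are hypothetical and only illustrate the shape of
the protocol, not the kernel implementation:

#include <pthread.h>
#include <stddef.h>

struct sketch_page {
	pthread_mutex_t lock;   /* stands in for the page lock */
	void *mapping;          /* cleared by "truncation" under the lock */
};

/* hypothetical stand-in for page_referenced_file(): expects p->lock held */
static int sketch_referenced_file(struct sketch_page *p)
{
	return p->mapping != NULL;
}

static int sketch_referenced(struct sketch_page *p, int is_locked)
{
	int referenced = 0;

	if (is_locked)
		return sketch_referenced_file(p);

	/* reclaim must not sleep here: a contended lock just counts as referenced */
	if (pthread_mutex_trylock(&p->lock) != 0)
		return 1;

	/* recheck under the lock: the mapping may have vanished before we got it */
	if (p->mapping != NULL)
		referenced = sketch_referenced_file(p);

	pthread_mutex_unlock(&p->lock);
	return referenced;
}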

Roughly 10% cut off lmbench fork numbers on my 2*HT*P4.  Must confess my
testing failed to show the races even while they were knowingly exposed: would
benefit from testing on racier equipment.

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 6f055bc1
@@ -201,10 +201,9 @@ struct page {
page_flags_t flags; /* Atomic flags, some possibly
* updated asynchronously */
atomic_t _count; /* Usage count, see below. */
unsigned int mapcount; /* Count of ptes mapped in mms,
atomic_t _mapcount; /* Count of ptes mapped in mms,
* to show when page is mapped
* & limit reverse map searches,
* protected by PG_maplock.
* & limit reverse map searches.
*/
unsigned long private; /* Mapping-private opaque data:
* usually used for buffer_heads
@@ -477,12 +476,27 @@ static inline pgoff_t page_index(struct page *page)
return page->index;
}
/*
* The atomic page->_mapcount, like _count, starts from -1:
* so that transitions both from it and to it can be tracked,
* using atomic_inc_and_test and atomic_add_negative(-1).
*/
static inline void reset_page_mapcount(struct page *page)
{
atomic_set(&(page)->_mapcount, -1);
}
static inline int page_mapcount(struct page *page)
{
return atomic_read(&(page)->_mapcount) + 1;
}
/*
* Return true if this page is mapped into pagetables.
*/
static inline int page_mapped(struct page *page)
{
return page->mapcount != 0;
return atomic_read(&(page)->_mapcount) >= 0;
}
/*
......
@@ -69,12 +69,11 @@
#define PG_private 12 /* Has something at ->private */
#define PG_writeback 13 /* Page is under writeback */
#define PG_nosave 14 /* Used for system suspend/resume */
#define PG_maplock 15 /* Lock bit for rmap to ptes */
#define PG_compound 15 /* Part of a compound page */
#define PG_swapcache 16 /* Swap page: swp_entry_t in private */
#define PG_mappedtodisk 17 /* Has blocks allocated on-disk */
#define PG_reclaim 18 /* To be reclaimed asap */
#define PG_compound 19 /* Part of a compound page */
/*
......
@@ -9,11 +9,6 @@
#include <linux/slab.h>
#include <linux/spinlock.h>
#define page_map_lock(page) \
bit_spin_lock(PG_maplock, (unsigned long *)&(page)->flags)
#define page_map_unlock(page) \
bit_spin_unlock(PG_maplock, (unsigned long *)&(page)->flags)
/*
* The anon_vma heads a list of private "related" vmas, to scan if
* an anonymous page pointing to this anon_vma needs to be unmapped:
@@ -87,15 +82,13 @@ void page_remove_rmap(struct page *);
*/
static inline void page_dup_rmap(struct page *page)
{
page_map_lock(page);
page->mapcount++;
page_map_unlock(page);
atomic_inc(&page->_mapcount);
}
/*
* Called from mm/vmscan.c to handle paging out
*/
int page_referenced(struct page *);
int page_referenced(struct page *, int is_locked);
int try_to_unmap(struct page *);
#else /* !CONFIG_MMU */
@@ -104,7 +97,7 @@ int try_to_unmap(struct page *);
#define anon_vma_prepare(vma) (0)
#define anon_vma_link(vma) do {} while (0)
#define page_referenced(page) TestClearPageReferenced(page)
#define page_referenced(page,l) TestClearPageReferenced(page)
#define try_to_unmap(page) SWAP_FAIL
#endif /* CONFIG_MMU */
......
@@ -78,7 +78,7 @@ static void bad_page(const char *function, struct page *page)
function, current->comm, page);
printk(KERN_EMERG "flags:0x%08lx mapping:%p mapcount:%d count:%d\n",
(unsigned long)page->flags, page->mapping,
(int)page->mapcount, page_count(page));
page_mapcount(page), page_count(page));
printk(KERN_EMERG "Backtrace:\n");
dump_stack();
printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
@@ -87,12 +87,11 @@ static void bad_page(const char *function, struct page *page)
1 << PG_lru |
1 << PG_active |
1 << PG_dirty |
1 << PG_maplock |
1 << PG_swapcache |
1 << PG_writeback);
set_page_count(page, 0);
reset_page_mapcount(page);
page->mapping = NULL;
page->mapcount = 0;
}
#ifndef CONFIG_HUGETLB_PAGE
@@ -228,7 +227,6 @@ static inline void free_pages_check(const char *function, struct page *page)
1 << PG_active |
1 << PG_reclaim |
1 << PG_slab |
1 << PG_maplock |
1 << PG_swapcache |
1 << PG_writeback )))
bad_page(function, page);
@@ -350,7 +348,6 @@ static void prep_new_page(struct page *page, int order)
1 << PG_active |
1 << PG_dirty |
1 << PG_reclaim |
1 << PG_maplock |
1 << PG_swapcache |
1 << PG_writeback )))
bad_page(__FUNCTION__, page);
@@ -512,6 +509,8 @@ static void fastcall free_hot_cold_page(struct page *page, int cold)
kernel_map_pages(page, 1, 0);
inc_page_state(pgfree);
if (PageAnon(page))
page->mapping = NULL;
free_pages_check(__FUNCTION__, page);
pcp = &zone->pageset[get_cpu()].pcp[cold];
local_irq_save(flags);
@@ -1389,6 +1388,7 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
for (page = start; page < (start + size); page++) {
set_page_zone(page, NODEZONE(nid, zone));
set_page_count(page, 0);
reset_page_mapcount(page);
SetPageReserved(page);
INIT_LIST_HEAD(&page->lru);
#ifdef WANT_PAGE_VIRTUAL
......
@@ -163,13 +163,6 @@ void __init anon_vma_init(void)
sizeof(struct anon_vma), 0, SLAB_PANIC, anon_vma_ctor, NULL);
}
/* this needs the page->flags PG_maplock held */
static inline void clear_page_anon(struct page *page)
{
BUG_ON(!page->mapping);
page->mapping = NULL;
}
/*
* At what user virtual address is page expected in vma?
*/
@@ -242,15 +235,22 @@ static int page_referenced_one(struct page *page,
return referenced;
}
static inline int page_referenced_anon(struct page *page)
static int page_referenced_anon(struct page *page)
{
unsigned int mapcount = page->mapcount;
unsigned int mapcount;
struct anon_vma *anon_vma = (void *) page->mapping - PAGE_MAPPING_ANON;
struct vm_area_struct *vma;
int referenced = 0;
/*
* Recheck mapcount: it is not safe to take anon_vma->lock after
* last page_remove_rmap, since struct anon_vma might be reused.
*/
mapcount = page_mapcount(page);
if (!mapcount)
return referenced;
spin_lock(&anon_vma->lock);
BUG_ON(list_empty(&anon_vma->head));
list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
referenced += page_referenced_one(page, vma, &mapcount);
if (!mapcount)
@@ -274,18 +274,39 @@ static inline int page_referenced_anon(struct page *page)
* The spinlock address_space->i_mmap_lock is tried. If it can't be gotten,
* assume a reference count of 0, so try_to_unmap will then have a go.
*/
static inline int page_referenced_file(struct page *page)
static int page_referenced_file(struct page *page)
{
unsigned int mapcount = page->mapcount;
unsigned int mapcount;
struct address_space *mapping = page->mapping;
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
struct vm_area_struct *vma;
struct prio_tree_iter iter;
int referenced = 0;
/*
* The caller's checks on page->mapping and !PageAnon have made
* sure that this is a file page: the check for page->mapping
* excludes the case just before it gets set on an anon page.
*/
BUG_ON(PageAnon(page));
/*
* The page lock not only makes sure that page->mapping cannot
* suddenly be NULLified by truncation, it makes sure that the
* structure at mapping cannot be freed and reused yet,
* so we can safely take mapping->i_mmap_lock.
*/
BUG_ON(!PageLocked(page));
if (!spin_trylock(&mapping->i_mmap_lock))
return 0;
/*
* i_mmap_lock does not stabilize mapcount at all, but mapcount
* is more likely to be accurate if we note it after spinning.
*/
mapcount = page_mapcount(page);
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE))
== (VM_LOCKED|VM_MAYSHARE)) {
@@ -304,12 +325,12 @@ static inline int page_referenced_file(struct page *page)
/**
* page_referenced - test if the page was referenced
* @page: the page to test
* @is_locked: caller holds lock on the page
*
* Quick test_and_clear_referenced for all mappings to a page,
* returns the number of ptes which referenced the page.
* Caller needs to hold the rmap lock.
*/
int page_referenced(struct page *page)
int page_referenced(struct page *page, int is_locked)
{
int referenced = 0;
@@ -319,11 +340,17 @@ int page_referenced(struct page *page)
if (TestClearPageReferenced(page))
referenced++;
if (page->mapcount && page->mapping) {
if (page_mapped(page) && page->mapping) {
if (PageAnon(page))
referenced += page_referenced_anon(page);
else
else if (is_locked)
referenced += page_referenced_file(page);
else if (TestSetPageLocked(page))
referenced++;
else if (page->mapping) {
referenced += page_referenced_file(page);
unlock_page(page);
}
}
return referenced;
}
@@ -350,18 +377,12 @@ void page_add_anon_rmap(struct page *page,
index += vma->vm_pgoff;
index >>= PAGE_CACHE_SHIFT - PAGE_SHIFT;
page_map_lock(page);
if (!page->mapcount) {
BUG_ON(page->mapping);
if (atomic_inc_and_test(&page->_mapcount)) {
page->index = index;
page->mapping = (struct address_space *) anon_vma;
inc_page_state(nr_mapped);
} else {
BUG_ON(page->index != index);
BUG_ON(page->mapping != (struct address_space *) anon_vma);
}
page->mapcount++;
page_map_unlock(page);
/* else checking page index and mapping is racy */
}
/**
@@ -376,11 +397,8 @@ void page_add_file_rmap(struct page *page)
if (!pfn_valid(page_to_pfn(page)) || PageReserved(page))
return;
page_map_lock(page);
if (!page->mapcount)
if (atomic_inc_and_test(&page->_mapcount))
inc_page_state(nr_mapped);
page->mapcount++;
page_map_unlock(page);
}
/**
@@ -392,18 +410,20 @@ void page_add_file_rmap(struct page *page)
void page_remove_rmap(struct page *page)
{
BUG_ON(PageReserved(page));
BUG_ON(!page->mapcount);
page_map_lock(page);
page->mapcount--;
if (!page->mapcount) {
if (atomic_add_negative(-1, &page->_mapcount)) {
BUG_ON(page_mapcount(page) < 0);
/*
* It would be tidy to reset the PageAnon mapping here,
* but that might overwrite a racing page_add_anon_rmap
* which increments mapcount after us but sets mapping
* before us: so leave the reset to free_hot_cold_page,
* and remember that it's only reliable while mapped.
*/
if (page_test_and_clear_dirty(page))
set_page_dirty(page);
if (PageAnon(page))
clear_page_anon(page);
dec_page_state(nr_mapped);
}
page_map_unlock(page);
}
/*
@@ -475,7 +495,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
* ptes from being unmapped, so swapoff can make progress.
*/
if (PageSwapCache(page) &&
page_count(page) != page->mapcount + 2) {
page_count(page) != page_mapcount(page) + 2) {
ret = SWAP_FAIL;
goto out_unmap;
}
@@ -501,8 +521,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
}
mm->rss--;
BUG_ON(!page->mapcount);
page->mapcount--;
page_remove_rmap(page);
page_cache_release(page);
out_unmap:
@@ -613,17 +632,23 @@ static int try_to_unmap_cluster(unsigned long cursor,
return SWAP_AGAIN;
}
static inline int try_to_unmap_anon(struct page *page)
static int try_to_unmap_anon(struct page *page)
{
struct anon_vma *anon_vma = (void *) page->mapping - PAGE_MAPPING_ANON;
struct vm_area_struct *vma;
int ret = SWAP_AGAIN;
/*
* Recheck mapped: it is not safe to take anon_vma->lock after
* last page_remove_rmap, since struct anon_vma might be reused.
*/
if (!page_mapped(page))
return ret;
spin_lock(&anon_vma->lock);
BUG_ON(list_empty(&anon_vma->head));
list_for_each_entry(vma, &anon_vma->head, anon_vma_node) {
ret = try_to_unmap_one(page, vma);
if (ret == SWAP_FAIL || !page->mapcount)
if (ret == SWAP_FAIL || !page_mapped(page))
break;
}
spin_unlock(&anon_vma->lock);
@@ -642,7 +667,7 @@ static inline int try_to_unmap_anon(struct page *page)
* The spinlock address_space->i_mmap_lock is tried. If it can't be gotten,
* return a temporary error.
*/
static inline int try_to_unmap_file(struct page *page)
static int try_to_unmap_file(struct page *page)
{
struct address_space *mapping = page->mapping;
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
@@ -659,7 +684,7 @@ static inline int try_to_unmap_file(struct page *page)
vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) {
ret = try_to_unmap_one(page, vma);
if (ret == SWAP_FAIL || !page->mapcount)
if (ret == SWAP_FAIL || !page_mapped(page))
goto out;
}
@@ -688,8 +713,9 @@ static inline int try_to_unmap_file(struct page *page)
* The mapcount of the page we came in with is irrelevant,
* but even so use it as a guide to how hard we should try?
*/
mapcount = page->mapcount;
page_map_unlock(page);
mapcount = page_mapcount(page);
if (!mapcount)
goto out;
cond_resched_lock(&mapping->i_mmap_lock);
max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
@@ -712,7 +738,7 @@ static inline int try_to_unmap_file(struct page *page)
cursor += CLUSTER_SIZE;
vma->vm_private_data = (void *) cursor;
if ((int)mapcount <= 0)
goto relock;
goto out;
}
if (ret != SWAP_FAIL)
vma->vm_private_data =
@@ -733,8 +759,6 @@ static inline int try_to_unmap_file(struct page *page)
if (!(vma->vm_flags & VM_RESERVED))
vma->vm_private_data = NULL;
}
relock:
page_map_lock(page);
out:
spin_unlock(&mapping->i_mmap_lock);
return ret;
@@ -745,11 +769,11 @@ static inline int try_to_unmap_file(struct page *page)
* @page: the page to get unmapped
*
* Tries to remove all the page table entries which are mapping this
* page, used in the pageout path. Caller must hold the page lock
* and its rmap lock. Return values are:
* page, used in the pageout path. Caller must hold the page lock.
* Return values are:
*
* SWAP_SUCCESS - we succeeded in removing all mappings
* SWAP_AGAIN - we missed a trylock, try again later
* SWAP_AGAIN - we missed a mapping, try again later
* SWAP_FAIL - the page is unswappable
*/
int try_to_unmap(struct page *page)
@@ -758,20 +782,13 @@ int try_to_unmap(struct page *page)
BUG_ON(PageReserved(page));
BUG_ON(!PageLocked(page));
BUG_ON(!page->mapcount);
if (PageAnon(page))
ret = try_to_unmap_anon(page);
else
ret = try_to_unmap_file(page);
if (!page->mapcount) {
if (page_test_and_clear_dirty(page))
set_page_dirty(page);
if (PageAnon(page))
clear_page_anon(page);
dec_page_state(nr_mapped);
if (!page_mapped(page))
ret = SWAP_SUCCESS;
}
return ret;
}
@@ -219,7 +219,7 @@ static int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
return 0;
}
/* Must be called with page's rmap lock held. */
/* Called without lock on whether page is mapped, so answer is unstable */
static inline int page_mapping_inuse(struct page *page)
{
struct address_space *mapping;
@@ -377,26 +377,19 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
if (page_mapped(page) || PageSwapCache(page))
sc->nr_scanned++;
page_map_lock(page);
referenced = page_referenced(page);
if (referenced && page_mapping_inuse(page)) {
/* In active use or really unfreeable. Activate it. */
page_map_unlock(page);
referenced = page_referenced(page, 1);
/* In active use or really unfreeable? Activate it. */
if (referenced && page_mapping_inuse(page))
goto activate_locked;
}
#ifdef CONFIG_SWAP
/*
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
*
* XXX: implement swap clustering ?
*/
if (PageAnon(page) && !PageSwapCache(page)) {
page_map_unlock(page);
if (!add_to_swap(page))
goto activate_locked;
page_map_lock(page);
}
#endif /* CONFIG_SWAP */
@@ -411,16 +404,13 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
if (page_mapped(page) && mapping) {
switch (try_to_unmap(page)) {
case SWAP_FAIL:
page_map_unlock(page);
goto activate_locked;
case SWAP_AGAIN:
page_map_unlock(page);
goto keep_locked;
case SWAP_SUCCESS:
; /* try to free the page below */
}
}
page_map_unlock(page);
if (PageDirty(page)) {
if (referenced)
@@ -723,25 +713,12 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc)
page = lru_to_page(&l_hold);
list_del(&page->lru);
if (page_mapped(page)) {
if (!reclaim_mapped) {
list_add(&page->lru, &l_active);
continue;
}
page_map_lock(page);
if (page_referenced(page)) {
page_map_unlock(page);
if (!reclaim_mapped ||
(total_swap_pages == 0 && PageAnon(page)) ||
page_referenced(page, 0)) {
list_add(&page->lru, &l_active);
continue;
}
page_map_unlock(page);
}
/*
* FIXME: need to consider page_count(page) here if/when we
* reap orphaned pages via the LRU (Daniel's locking stuff)
*/
if (total_swap_pages == 0 && PageAnon(page)) {
list_add(&page->lru, &l_active);
continue;
}
list_add(&page->lru, &l_inactive);
}
......