Commit 4c21e2f2 authored by Hugh Dickins, committed by Linus Torvalds

[PATCH] mm: split page table lock

Christoph Lameter demonstrated very poor scalability on the SGI 512-way, with
a many-threaded application which concurrently initializes different parts of
a large anonymous area.

This patch corrects that, by using a separate spinlock per page table page, to
guard the page table entries in that page, instead of using the mm's single
page_table_lock.  (But even then, page_table_lock is still used to guard page
table allocation, and anon_vma allocation.)
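
To illustrate what that buys a caller (an editor's sketch, not code from this
patch): a fault path now takes whichever lock pte_offset_map_lock() hands back,
without caring whether it is the page table page's own lock or the mm-wide one.
The helper below, example_set_pte(), is a hypothetical caller written against
the macros this patch introduces in include/linux/mm.h; the kernel calls it
uses (pte_offset_map_lock, pte_none, set_pte_at, pte_unmap_unlock) are real.

static int example_set_pte(struct mm_struct *mm, pmd_t *pmd,
                           unsigned long addr, pte_t newval)
{
        spinlock_t *ptl;
        pte_t *pte;

        /* Takes the per-page-table lock when split, else page_table_lock */
        pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
        if (!pte_none(*pte)) {
                /* Someone else installed an entry first: back out */
                pte_unmap_unlock(pte, ptl);
                return -EBUSY;
        }
        set_pte_at(mm, addr, pte, newval);
        pte_unmap_unlock(pte, ptl);
        return 0;
}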

In this implementation, the spinlock is tucked inside the struct page of the
page table page: with a BUILD_BUG_ON in case it overflows - which it would in
the case of 32-bit PA-RISC with spinlock debugging enabled.
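
A minimal sketch of that compile-time guard, assuming the lock shares an
unsigned long slot with page->private as in the struct page hunk below; the
helper name split_ptlock_size_check() is hypothetical, and the patch's actual
BUILD_BUG_ON may be spelled or placed differently:

static inline void split_ptlock_size_check(void)
{
        /* Fail the build if a (possibly debug-bloated) spinlock_t would
         * overflow the word it shares with page->private. */
        BUILD_BUG_ON(sizeof(spinlock_t) > sizeof(unsigned long));
}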

Splitting the lock is not quite for free: another cacheline access.  Ideally,
I suppose we would use split ptlock only for multi-threaded processes on
multi-cpu machines; but deciding that dynamically would have its own costs.
So for now enable it by config, at some number of cpus - since the Kconfig
language doesn't support inequalities, let preprocessor compare that with
NR_CPUS.  But I don't think it's worth being user-configurable: for good
testing of both split and unsplit configs, split now at 4 cpus, and perhaps
change that to 8 later.
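
The resulting test is a plain preprocessor comparison; in sketch form (the
real test appears in the include/linux/mm.h hunk below, and the "4096"
defaults added to mm/Kconfig simply push it out of reach where split locks
are unwanted):

#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
        /* split: each page table page carries its own pte lock in struct page */
#else
        /* unsplit: every pte lock falls back to mm->page_table_lock */
#endif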

There is a benefit even for singly threaded processes: kswapd can be attacking
one part of the mm while another part is busy faulting.
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent b38c6845
@@ -229,6 +229,7 @@ void free_pgd_slow(pgd_t *pgd)
 	pte = pmd_page(*pmd);
 	pmd_clear(pmd);
 	dec_page_state(nr_page_table_pages);
+	pte_lock_deinit(pte);
 	pte_free(pte);
 	pmd_free(pmd);
 free:
...
@@ -87,14 +87,14 @@ static inline void pgd_list_add(pgd_t *pgd)
 	if (pgd_list)
 		pgd_list->private = (unsigned long) &page->index;
 	pgd_list = page;
-	page->private = (unsigned long) &pgd_list;
+	set_page_private(page, (unsigned long)&pgd_list);
 }
 static inline void pgd_list_del(pgd_t *pgd)
 {
 	struct page *next, **pprev, *page = virt_to_page(pgd);
 	next = (struct page *) page->index;
-	pprev = (struct page **) page->private;
+	pprev = (struct page **)page_private(page);
 	*pprev = next;
 	if (next)
 		next->private = (unsigned long) pprev;
...
@@ -188,19 +188,19 @@ static inline void pgd_list_add(pgd_t *pgd)
 	struct page *page = virt_to_page(pgd);
 	page->index = (unsigned long)pgd_list;
 	if (pgd_list)
-		pgd_list->private = (unsigned long)&page->index;
+		set_page_private(pgd_list, (unsigned long)&page->index);
 	pgd_list = page;
-	page->private = (unsigned long)&pgd_list;
+	set_page_private(page, (unsigned long)&pgd_list);
 }
 static inline void pgd_list_del(pgd_t *pgd)
 {
 	struct page *next, **pprev, *page = virt_to_page(pgd);
 	next = (struct page *)page->index;
-	pprev = (struct page **)page->private;
+	pprev = (struct page **)page_private(page);
 	*pprev = next;
 	if (next)
-		next->private = (unsigned long)pprev;
+		set_page_private(next, (unsigned long)pprev);
 }
 void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
...
@@ -144,6 +144,7 @@ void destroy_context_skas(struct mm_struct *mm)
 	if(!proc_mm || !ptrace_faultinfo){
 		free_page(mmu->id.stack);
+		pte_lock_deinit(virt_to_page(mmu->last_page_table));
 		pte_free_kernel((pte_t *) mmu->last_page_table);
 		dec_page_state(nr_page_table_pages);
 #ifdef CONFIG_3_LEVEL_PGTABLES
...
@@ -291,8 +291,8 @@ static int afs_file_releasepage(struct page *page, gfp_t gfp_flags)
 	cachefs_uncache_page(vnode->cache, page);
 #endif
-	pageio = (struct cachefs_page *) page->private;
-	page->private = 0;
+	pageio = (struct cachefs_page *) page_private(page);
+	set_page_private(page, 0);
 	ClearPagePrivate(page);
 	if (pageio)
...
@@ -96,7 +96,7 @@ static void
 __clear_page_buffers(struct page *page)
 {
 	ClearPagePrivate(page);
-	page->private = 0;
+	set_page_private(page, 0);
 	page_cache_release(page);
 }
...
@@ -86,7 +86,7 @@ struct meta_anchor {
 	atomic_t io_count;
 	struct metapage *mp[MPS_PER_PAGE];
 };
-#define mp_anchor(page) ((struct meta_anchor *)page->private)
+#define mp_anchor(page) ((struct meta_anchor *)page_private(page))
 static inline struct metapage *page_to_mp(struct page *page, uint offset)
 {
@@ -108,7 +108,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp)
 		if (!a)
 			return -ENOMEM;
 		memset(a, 0, sizeof(struct meta_anchor));
-		page->private = (unsigned long)a;
+		set_page_private(page, (unsigned long)a);
 		SetPagePrivate(page);
 		kmap(page);
 	}
@@ -136,7 +136,7 @@ static inline void remove_metapage(struct page *page, struct metapage *mp)
 		a->mp[index] = NULL;
 		if (--a->mp_count == 0) {
 			kfree(a);
-			page->private = 0;
+			set_page_private(page, 0);
 			ClearPagePrivate(page);
 			kunmap(page);
 		}
@@ -156,13 +156,13 @@ static inline void dec_io(struct page *page, void (*handler) (struct page *))
 #else
 static inline struct metapage *page_to_mp(struct page *page, uint offset)
 {
-	return PagePrivate(page) ? (struct metapage *)page->private : NULL;
+	return PagePrivate(page) ? (struct metapage *)page_private(page) : NULL;
 }
 static inline int insert_metapage(struct page *page, struct metapage *mp)
 {
 	if (mp) {
-		page->private = (unsigned long)mp;
+		set_page_private(page, (unsigned long)mp);
 		SetPagePrivate(page);
 		kmap(page);
 	}
@@ -171,7 +171,7 @@ static inline int insert_metapage(struct page *page, struct metapage *mp)
 static inline void remove_metapage(struct page *page, struct metapage *mp)
 {
-	page->private = 0;
+	set_page_private(page, 0);
 	ClearPagePrivate(page);
 	kunmap(page);
 }
...
@@ -181,8 +181,9 @@ set_page_region(
 	size_t offset,
 	size_t length)
 {
-	page->private |= page_region_mask(offset, length);
-	if (page->private == ~0UL)
+	set_page_private(page,
+		page_private(page) | page_region_mask(offset, length));
+	if (page_private(page) == ~0UL)
 		SetPageUptodate(page);
 }
@@ -194,7 +195,7 @@ test_page_region(
 {
 	unsigned long mask = page_region_mask(offset, length);
-	return (mask && (page->private & mask) == mask);
+	return (mask && (page_private(page) & mask) == mask);
 }
 /*
...
@@ -126,8 +126,8 @@ BUFFER_FNS(Eopnotsupp, eopnotsupp)
 /* If we *know* page->private refers to buffer_heads */
 #define page_buffers(page)				\
 	({						\
 		BUG_ON(!PagePrivate(page));		\
-		((struct buffer_head *)(page)->private);	\
+		((struct buffer_head *)page_private(page));	\
 	})
 #define page_has_buffers(page)	PagePrivate(page)
@@ -219,7 +219,7 @@ static inline void attach_page_buffers(struct page *page,
 {
 	page_cache_get(page);
 	SetPagePrivate(page);
-	page->private = (unsigned long)head;
+	set_page_private(page, (unsigned long)head);
 }
 static inline void get_bh(struct buffer_head *bh)
...
@@ -226,13 +226,18 @@ struct page {
 					 * to show when page is mapped
 					 * & limit reverse map searches.
 					 */
-	unsigned long private;		/* Mapping-private opaque data:
+	union {
+		unsigned long private;	/* Mapping-private opaque data:
 					 * usually used for buffer_heads
 					 * if PagePrivate set; used for
 					 * swp_entry_t if PageSwapCache
 					 * When page is free, this indicates
 					 * order in the buddy system.
 					 */
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+		spinlock_t ptl;
+#endif
+	} u;
 	struct address_space *mapping;	/* If low bit clear, points to
 					 * inode address_space, or NULL.
 					 * If page mapped as anonymous
@@ -260,6 +265,9 @@ struct page {
 #endif /* WANT_PAGE_VIRTUAL */
 };
+#define page_private(page)		((page)->u.private)
+#define set_page_private(page, v)	((page)->u.private = (v))
 /*
  * FIXME: take this include out, include page-flags.h in
  * files which need it (119 of them)
@@ -311,17 +319,17 @@ extern void FASTCALL(__page_cache_release(struct page *));
 #ifdef CONFIG_HUGETLB_PAGE
-static inline int page_count(struct page *p)
+static inline int page_count(struct page *page)
 {
-	if (PageCompound(p))
-		p = (struct page *)p->private;
-	return atomic_read(&(p)->_count) + 1;
+	if (PageCompound(page))
+		page = (struct page *)page_private(page);
+	return atomic_read(&page->_count) + 1;
 }
 static inline void get_page(struct page *page)
 {
 	if (unlikely(PageCompound(page)))
-		page = (struct page *)page->private;
+		page = (struct page *)page_private(page);
 	atomic_inc(&page->_count);
 }
@@ -587,7 +595,7 @@ static inline int PageAnon(struct page *page)
 static inline pgoff_t page_index(struct page *page)
 {
 	if (unlikely(PageSwapCache(page)))
-		return page->private;
+		return page_private(page);
 	return page->index;
 }
@@ -779,9 +787,31 @@ static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long a
 }
 #endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */
+#if NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS
+/*
+ * We tuck a spinlock to guard each pagetable page into its struct page,
+ * at page->private, with BUILD_BUG_ON to make sure that this will not
+ * overflow into the next struct page (as it might with DEBUG_SPINLOCK).
+ * When freeing, reset page->mapping so free_pages_check won't complain.
+ */
+#define __pte_lockptr(page)	&((page)->u.ptl)
+#define pte_lock_init(_page)	do {					\
+	spin_lock_init(__pte_lockptr(_page));				\
+} while (0)
+#define pte_lock_deinit(page)	((page)->mapping = NULL)
+#define pte_lockptr(mm, pmd)	({(void)(mm); __pte_lockptr(pmd_page(*(pmd)));})
+#else
+/*
+ * We use mm->page_table_lock to guard all pagetable pages of the mm.
+ */
+#define pte_lock_init(page)	do {} while (0)
+#define pte_lock_deinit(page)	do {} while (0)
+#define pte_lockptr(mm, pmd)	({(void)(pmd); &(mm)->page_table_lock;})
+#endif /* NR_CPUS < CONFIG_SPLIT_PTLOCK_CPUS */
 #define pte_offset_map_lock(mm, pmd, address, ptlp)	\
 ({							\
-	spinlock_t *__ptl = &(mm)->page_table_lock;	\
+	spinlock_t *__ptl = pte_lockptr(mm, pmd);	\
 	pte_t *__pte = pte_offset_map(pmd, address);	\
 	*(ptlp) = __ptl;				\
 	spin_lock(__ptl);				\
...
@@ -334,7 +334,7 @@ static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
 	if (pages) {
 		unsigned int count, i;
 		pages->mapping = NULL;
-		pages->private = order;
+		set_page_private(pages, order);
 		count = 1 << order;
 		for (i = 0; i < count; i++)
 			SetPageReserved(pages + i);
@@ -347,7 +347,7 @@ static void kimage_free_pages(struct page *page)
 {
 	unsigned int order, count, i;
-	order = page->private;
+	order = page_private(page);
 	count = 1 << order;
 	for (i = 0; i < count; i++)
 		ClearPageReserved(page + i);
...
@@ -111,3 +111,16 @@ config SPARSEMEM_STATIC
 config SPARSEMEM_EXTREME
 	def_bool y
 	depends on SPARSEMEM && !SPARSEMEM_STATIC
+# Heavily threaded applications may benefit from splitting the mm-wide
+# page_table_lock, so that faults on different parts of the user address
+# space can be handled with less contention: split it at this NR_CPUS.
+# Default to 4 for wider testing, though 8 might be more appropriate.
+# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
+# PA-RISC's debug spinlock_t is too large for the 32-bit struct page.
+#
+config SPLIT_PTLOCK_CPUS
+	int
+	default "4096" if ARM && !CPU_CACHE_VIPT
+	default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT
+	default "4"
@@ -152,7 +152,7 @@ static int sync_page(void *word)
 	 * in the ->sync_page() methods make essential use of the
 	 * page_mapping(), merely passing the page down to the backing
 	 * device's unplug functions when it's non-NULL, which in turn
-	 * ignore it for all cases but swap, where only page->private is
+	 * ignore it for all cases but swap, where only page_private(page) is
 	 * of interest. When page_mapping() does go NULL, the entire
 	 * call stack gracefully ignores the page and returns.
 	 * -- wli
...
@@ -114,6 +114,7 @@ static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
 {
 	struct page *page = pmd_page(*pmd);
 	pmd_clear(pmd);
+	pte_lock_deinit(page);
 	pte_free_tlb(tlb, page);
 	dec_page_state(nr_page_table_pages);
 	tlb->mm->nr_ptes--;
@@ -294,10 +295,12 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
 	if (!new)
 		return -ENOMEM;
+	pte_lock_init(new);
 	spin_lock(&mm->page_table_lock);
-	if (pmd_present(*pmd))		/* Another has populated it */
+	if (pmd_present(*pmd)) {	/* Another has populated it */
+		pte_lock_deinit(new);
 		pte_free(new);
-	else {
+	} else {
 		mm->nr_ptes++;
 		inc_page_state(nr_page_table_pages);
 		pmd_populate(mm, pmd, new);
@@ -432,7 +435,7 @@ static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	if (!dst_pte)
 		return -ENOMEM;
 	src_pte = pte_offset_map_nested(src_pmd, addr);
-	src_ptl = &src_mm->page_table_lock;
+	src_ptl = pte_lockptr(src_mm, src_pmd);
 	spin_lock(src_ptl);
 	do {
@@ -1194,15 +1197,16 @@ EXPORT_SYMBOL(remap_pfn_range);
  * (but do_wp_page is only called after already making such a check;
  * and do_anonymous_page and do_no_page can safely check later on).
  */
-static inline int pte_unmap_same(struct mm_struct *mm,
+static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
 				pte_t *page_table, pte_t orig_pte)
 {
 	int same = 1;
 #if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT)
 	if (sizeof(pte_t) > sizeof(unsigned long)) {
-		spin_lock(&mm->page_table_lock);
+		spinlock_t *ptl = pte_lockptr(mm, pmd);
+		spin_lock(ptl);
 		same = pte_same(*page_table, orig_pte);
-		spin_unlock(&mm->page_table_lock);
+		spin_unlock(ptl);
 	}
 #endif
 	pte_unmap(page_table);
@@ -1655,7 +1659,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t pte;
 	int ret = VM_FAULT_MINOR;
-	if (!pte_unmap_same(mm, page_table, orig_pte))
+	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
 		goto out;
 	entry = pte_to_swp_entry(orig_pte);
@@ -1773,7 +1777,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		page_cache_get(page);
 		entry = mk_pte(page, vma->vm_page_prot);
-		ptl = &mm->page_table_lock;
+		ptl = pte_lockptr(mm, pmd);
 		spin_lock(ptl);
 		if (!pte_none(*page_table))
 			goto release;
@@ -1934,7 +1938,7 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	pgoff_t pgoff;
 	int err;
-	if (!pte_unmap_same(mm, page_table, orig_pte))
+	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
 		return VM_FAULT_MINOR;
 	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
@@ -1992,7 +1996,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 					pte, pmd, write_access, entry);
 	}
-	ptl = &mm->page_table_lock;
+	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	if (unlikely(!pte_same(*pte, entry)))
 		goto unlock;
...
@@ -72,7 +72,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 	struct address_space *mapping = NULL;
 	struct mm_struct *mm = vma->vm_mm;
 	pte_t *old_pte, *new_pte, pte;
-	spinlock_t *old_ptl;
+	spinlock_t *old_ptl, *new_ptl;
 	if (vma->vm_file) {
 		/*
@@ -88,8 +88,15 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		new_vma->vm_truncate_count = 0;
 	}
+	/*
+	 * We don't have to worry about the ordering of src and dst
+	 * pte locks because exclusive mmap_sem prevents deadlock.
+	 */
 	old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
 	new_pte = pte_offset_map_nested(new_pmd, new_addr);
+	new_ptl = pte_lockptr(mm, new_pmd);
+	if (new_ptl != old_ptl)
+		spin_lock(new_ptl);
 	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
 				   new_pte++, new_addr += PAGE_SIZE) {
@@ -101,6 +108,8 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
 		set_pte_at(mm, new_addr, new_pte, pte);
 	}
+	if (new_ptl != old_ptl)
+		spin_unlock(new_ptl);
 	pte_unmap_nested(new_pte - 1);
 	pte_unmap_unlock(old_pte - 1, old_ptl);
 	if (mapping)
...
@@ -154,7 +154,7 @@ static void prep_compound_page(struct page *page, unsigned long order)
 		struct page *p = page + i;
 		SetPageCompound(p);
-		p->private = (unsigned long)page;
+		set_page_private(p, (unsigned long)page);
 	}
 }
@@ -174,7 +174,7 @@ static void destroy_compound_page(struct page *page, unsigned long order)
 		if (!PageCompound(p))
 			bad_page(__FUNCTION__, page);
-		if (p->private != (unsigned long)page)
+		if (page_private(p) != (unsigned long)page)
 			bad_page(__FUNCTION__, page);
 		ClearPageCompound(p);
 	}
@@ -187,18 +187,18 @@ static void destroy_compound_page(struct page *page, unsigned long order)
  * So, we don't need atomic page->flags operations here.
  */
 static inline unsigned long page_order(struct page *page) {
-	return page->private;
+	return page_private(page);
 }
 static inline void set_page_order(struct page *page, int order) {
-	page->private = order;
+	set_page_private(page, order);
 	__SetPagePrivate(page);
 }
 static inline void rmv_page_order(struct page *page)
 {
 	__ClearPagePrivate(page);
-	page->private = 0;
+	set_page_private(page, 0);
 }
 /*
@@ -238,7 +238,7 @@ __find_combined_index(unsigned long page_idx, unsigned int order)
  * (a) the buddy is free &&
 * (b) the buddy is on the buddy system &&
 * (c) a page and its buddy have the same order.
- * for recording page's order, we use page->private and PG_private.
+ * for recording page's order, we use page_private(page) and PG_private.
 *
 */
 static inline int page_is_buddy(struct page *page, int order)
@@ -264,7 +264,7 @@ static inline int page_is_buddy(struct page *page, int order)
 * parts of the VM system.
 * At each level, we keep a list of pages, which are heads of continuous
 * free pages of length of (1 << order) and marked with PG_Private.Page's
- * order is recorded in page->private field.
+ * order is recorded in page_private(page) field.
 * So when we are allocating or freeing one, we can derive the state of the
 * other. That is, if we allocate a small block, and both were
 * free, the remainder of the region must be split into blocks.
@@ -463,7 +463,7 @@ static void prep_new_page(struct page *page, int order)
 	page->flags &= ~(1 << PG_uptodate | 1 << PG_error |
 			1 << PG_referenced | 1 << PG_arch_1 |
 			1 << PG_checked | 1 << PG_mappedtodisk);
-	page->private = 0;
+	set_page_private(page, 0);
 	set_page_refs(page, order);
 	kernel_map_pages(page, 1 << order, 1);
 }
...
@@ -91,7 +91,8 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
 		unlock_page(page);
 		goto out;
 	}
-	bio = get_swap_bio(GFP_NOIO, page->private, page, end_swap_bio_write);
+	bio = get_swap_bio(GFP_NOIO, page_private(page), page,
+				end_swap_bio_write);
 	if (bio == NULL) {
 		set_page_dirty(page);
 		unlock_page(page);
@@ -115,7 +116,8 @@ int swap_readpage(struct file *file, struct page *page)
 	BUG_ON(!PageLocked(page));
 	ClearPageUptodate(page);
-	bio = get_swap_bio(GFP_KERNEL, page->private, page, end_swap_bio_read);
+	bio = get_swap_bio(GFP_KERNEL, page_private(page), page,
+				end_swap_bio_read);
 	if (bio == NULL) {
 		unlock_page(page);
 		ret = -ENOMEM;
...
@@ -274,7 +274,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
 		return NULL;
 	}
-	ptl = &mm->page_table_lock;
+	ptl = pte_lockptr(mm, pmd);
 	spin_lock(ptl);
 	if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
 		*ptlp = ptl;
@@ -550,7 +550,7 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma)
 	update_hiwater_rss(mm);
 	if (PageAnon(page)) {
-		swp_entry_t entry = { .val = page->private };
+		swp_entry_t entry = { .val = page_private(page) };
 		/*
 		 * Store the swap location in the pte.
 		 * See handle_pte_fault() ...
...
@@ -71,9 +71,6 @@
 /* Pretend that each entry is of this size in directory's i_size */
 #define BOGO_DIRENT_SIZE 20
-/* Keep swapped page count in private field of indirect struct page */
-#define nr_swapped	private
 /* Flag allocation requirements to shmem_getpage and shmem_swp_alloc */
 enum sgp_type {
 	SGP_QUICK,	/* don't try more than file page cache lookup */
@@ -324,8 +321,10 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns
 	entry->val = value;
 	info->swapped += incdec;
-	if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT)
-		kmap_atomic_to_page(entry)->nr_swapped += incdec;
+	if ((unsigned long)(entry - info->i_direct) >= SHMEM_NR_DIRECT) {
+		struct page *page = kmap_atomic_to_page(entry);
+		set_page_private(page, page_private(page) + incdec);
+	}
 }
 /*
@@ -368,9 +367,8 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long
 		spin_unlock(&info->lock);
 		page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping) | __GFP_ZERO);
-		if (page) {
-			page->nr_swapped = 0;
-		}
+		if (page)
+			set_page_private(page, 0);
 		spin_lock(&info->lock);
 		if (!page) {
@@ -561,7 +559,7 @@ static void shmem_truncate(struct inode *inode)
 			diroff = 0;
 		}
 		subdir = dir[diroff];
-		if (subdir && subdir->nr_swapped) {
+		if (subdir && page_private(subdir)) {
 			size = limit - idx;
 			if (size > ENTRIES_PER_PAGE)
 				size = ENTRIES_PER_PAGE;
@@ -572,10 +570,10 @@ static void shmem_truncate(struct inode *inode)
 			nr_swaps_freed += freed;
 			if (offset)
 				spin_lock(&info->lock);
-			subdir->nr_swapped -= freed;
+			set_page_private(subdir, page_private(subdir) - freed);
 			if (offset)
 				spin_unlock(&info->lock);
-			BUG_ON(subdir->nr_swapped > offset);
+			BUG_ON(page_private(subdir) > offset);
 		}
 		if (offset)
 			offset = 0;
@@ -743,7 +741,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, swp_entry_t entry, s
 			dir = shmem_dir_map(subdir);
 		}
 		subdir = *dir;
-		if (subdir && subdir->nr_swapped) {
+		if (subdir && page_private(subdir)) {
 			ptr = shmem_swp_map(subdir);
 			size = limit - idx;
 			if (size > ENTRIES_PER_PAGE)
...
@@ -39,7 +39,7 @@ int page_cluster;
 void put_page(struct page *page)
 {
 	if (unlikely(PageCompound(page))) {
-		page = (struct page *)page->private;
+		page = (struct page *)page_private(page);
 		if (put_page_testzero(page)) {
 			void (*dtor)(struct page *page);
...
@@ -83,7 +83,7 @@ static int __add_to_swap_cache(struct page *page, swp_entry_t entry,
 		page_cache_get(page);
 		SetPageLocked(page);
 		SetPageSwapCache(page);
-		page->private = entry.val;
+		set_page_private(page, entry.val);
 		total_swapcache_pages++;
 		pagecache_acct(1);
 	}
@@ -126,8 +126,8 @@ void __delete_from_swap_cache(struct page *page)
 	BUG_ON(PageWriteback(page));
 	BUG_ON(PagePrivate(page));
-	radix_tree_delete(&swapper_space.page_tree, page->private);
-	page->private = 0;
+	radix_tree_delete(&swapper_space.page_tree, page_private(page));
+	set_page_private(page, 0);
 	ClearPageSwapCache(page);
 	total_swapcache_pages--;
 	pagecache_acct(-1);
@@ -197,7 +197,7 @@ void delete_from_swap_cache(struct page *page)
 {
 	swp_entry_t entry;
-	entry.val = page->private;
+	entry.val = page_private(page);
 	write_lock_irq(&swapper_space.tree_lock);
 	__delete_from_swap_cache(page);
...
@@ -61,7 +61,7 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
 	swp_entry_t entry;
 	down_read(&swap_unplug_sem);
-	entry.val = page->private;
+	entry.val = page_private(page);
 	if (PageSwapCache(page)) {
 		struct block_device *bdev = swap_info[swp_type(entry)].bdev;
 		struct backing_dev_info *bdi;
@@ -69,8 +69,8 @@ void swap_unplug_io_fn(struct backing_dev_info *unused_bdi, struct page *page)
 		/*
 		 * If the page is removed from swapcache from under us (with a
 		 * racy try_to_unuse/swapoff) we need an additional reference
-		 * count to avoid reading garbage from page->private above. If
-		 * the WARN_ON triggers during a swapoff it maybe the race
+		 * count to avoid reading garbage from page_private(page) above.
+		 * If the WARN_ON triggers during a swapoff it maybe the race
 		 * condition and it's harmless. However if it triggers without
 		 * swapoff it signals a problem.
 		 */
@@ -294,7 +294,7 @@ static inline int page_swapcount(struct page *page)
 	struct swap_info_struct *p;
 	swp_entry_t entry;
-	entry.val = page->private;
+	entry.val = page_private(page);
 	p = swap_info_get(entry);
 	if (p) {
 		/* Subtract the 1 for the swap cache itself */
@@ -339,7 +339,7 @@ int remove_exclusive_swap_page(struct page *page)
 	if (page_count(page) != 2) /* 2: us + cache */
 		return 0;
-	entry.val = page->private;
+	entry.val = page_private(page);
 	p = swap_info_get(entry);
 	if (!p)
 		return 0;
@@ -1042,7 +1042,7 @@ int page_queue_congested(struct page *page)
 	BUG_ON(!PageLocked(page));	/* It pins the swap_info_struct */
 	if (PageSwapCache(page)) {
-		swp_entry_t entry = { .val = page->private };
+		swp_entry_t entry = { .val = page_private(page) };
 		struct swap_info_struct *sis;
 		sis = get_swap_info_struct(swp_type(entry));
...
@@ -521,7 +521,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc)
 #ifdef CONFIG_SWAP
 		if (PageSwapCache(page)) {
-			swp_entry_t swap = { .val = page->private };
+			swp_entry_t swap = { .val = page_private(page) };
 			__delete_from_swap_cache(page);
 			write_unlock_irq(&mapping->tree_lock);
 			swap_free(swap);
...