Commit 6beadb3b authored by Andrew Morton, committed by Linus Torvalds

[PATCH] re-slabify i386 pgd's and pmd's

From: William Lee Irwin III <wli@holomorphy.com>

The original pgd/pmd slabification patches had a critical bug on
non-PAE: modifications of pgd entries, whether detaching the pagetables
that had been attached for non-PSE mappings to return them to a PSE
state, or attaching pagetables to bring PSE mappings into a non-PSE
state, were not propagated to cached pgd's. PAE was immune to this
because of its shared kernel pmd.

The following patch against 2.5.69 restores the slabification that
caches preconstructed pagetables, this time with proper propagation of
conversions to and from PSE mappings into cached pgd's for the non-PAE
case.
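
For orientation before the diff: the propagation that was missing is exactly
what the reworked set_pmd_pte() below now performs whenever pageattr.c
rewrites a kernel pgd/pmd entry. A condensed sketch of that loop, using only
identifiers from the patch (the wrapper function name here is illustrative;
in the patch the loop sits inline in set_pmd_pte()):

/* After init_mm's kernel entry has been rewritten for a PSE <-> non-PSE
 * conversion, write the same entry into every cached or live pgd; on
 * non-PAE these all sit on pgd_list and are walked under pgd_lock. */
static void propagate_kernel_pmd(unsigned long address, pte_t pte)
{
        struct page *page;
        unsigned long flags;

        spin_lock_irqsave(&pgd_lock, flags);
        list_for_each_entry(page, &pgd_list, lru) {
                pgd_t *pgd = (pgd_t *)page_address(page) + pgd_index(address);
                pmd_t *pmd = pmd_offset(pgd, address);

                set_pte_atomic((pte_t *)pmd, pte);
        }
        spin_unlock_irqrestore(&pgd_lock, flags);
}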

This is an optimization to reduce the bitblitting overhead of spawning
small tasks (for larger ones, bottom-level pagetable copies dominate),
primarily on non-PAE; the PAE code change is largely about removing
#ifdefs and treating the two cases uniformly, though one of mbligh's
posts reported a small but positive performance improvement for PAE as
well. The non-PAE improvement has been observed on a box running a
script-heavy end-user workload as a large long-term reduction in
profile hit counts for pgd_alloc() and its relatives.
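
Concretely, the bitblitting being avoided is the per-allocation clear-and-copy
of the pgd. A condensed before/after of the non-PAE allocation path, lifted
from the pgd_alloc() hunk below (the old_/new_ prefixes are illustrative
only):

/* Before: every pgd_alloc() zeroes the user half and copies the kernel
 * half of the pgd from swapper_pg_dir. */
pgd_t *old_pgd_alloc(struct mm_struct *mm)
{
        pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL);

        if (pgd) {
                memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
                memcpy(pgd + USER_PTRS_PER_PGD,
                       swapper_pg_dir + USER_PTRS_PER_PGD,
                       (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
        }
        return pgd;
}

/* After: pgd_ctor() already did that work when the slab constructed the
 * object, so allocation is a plain cache hit; the copy is repaid only
 * when the cache itself grows. */
pgd_t *new_pgd_alloc(struct mm_struct *mm)
{
        return kmem_cache_alloc(pgd_cache, GFP_KERNEL);
}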

I would very much appreciate outside testers. Although I have verified
that this boots, runs properly, and survives several cycles of
restarting X on my non-PAE Thinkpad T21, that environment has never
been able to reproduce the bug; those with the graphics hardware needed
to prod the affected codepaths into action are best suited to verify
proper functionality. The patch also introduces some locking, so
performance verification on non-PAE SMP i386 targets with that hardware
(my SMP targets unfortunately all require PAE due to arch code
dependencies) would help determine whether one of the alternative
locking schemes that competed with the one shown here, in particular
the ticket-based scheme mentioned in the comments, is preferable.
parent 7bbf0e05
@@ -509,20 +509,30 @@ void __init mem_init(void)
 #endif
 }
 
-#ifdef CONFIG_X86_PAE
-
-struct kmem_cache_s *pae_pgd_cachep;
+kmem_cache_t *pgd_cache;
+kmem_cache_t *pmd_cache;
 
 void __init pgtable_cache_init(void)
 {
-        /*
-         * PAE pgds must be 16-byte aligned:
-         */
-        pae_pgd_cachep = kmem_cache_create("pae_pgd", 32, 0,
-                SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN, NULL, NULL);
-        if (!pae_pgd_cachep)
-                panic("init_pae(): Cannot alloc pae_pgd SLAB cache");
+        if (PTRS_PER_PMD > 1) {
+                pmd_cache = kmem_cache_create("pmd",
+                                        PTRS_PER_PMD*sizeof(pmd_t),
+                                        0,
+                                        SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
+                                        pmd_ctor,
+                                        NULL);
+                if (!pmd_cache)
+                        panic("pgtable_cache_init(): cannot create pmd cache");
+        }
+
+        pgd_cache = kmem_cache_create("pgd",
+                                PTRS_PER_PGD*sizeof(pgd_t),
+                                0,
+                                SLAB_HWCACHE_ALIGN | SLAB_MUST_HWCACHE_ALIGN,
+                                pgd_ctor,
+                                PTRS_PER_PMD == 1 ? pgd_dtor : NULL);
+        if (!pgd_cache)
+                panic("pgtable_cache_init(): Cannot create pgd cache");
 }
-#endif
 
 /*
  * This function cannot be __init, since exceptions don't work in that
...
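
The preconstruction pays off because of the slab object lifecycle: with the
2.5-era kmem_cache_create() used above, the constructor runs when the cache
populates a new slab page and the destructor when that slab is released, not
on every kmem_cache_alloc()/kmem_cache_free() pair. A minimal sketch of that
convention with a hypothetical cache (struct thing, thing_cache, and
thing_ctor are illustrative and not part of the patch):

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/slab.h>

struct thing {
        int payload[16];
};

static kmem_cache_t *thing_cache;

/* Runs once per object when the cache grows a new slab; objects handed
 * back via kmem_cache_free() stay constructed for the next allocation. */
static void thing_ctor(void *obj, kmem_cache_t *cache, unsigned long flags)
{
        memset(obj, 0, sizeof(struct thing));
}

static void __init thing_cache_init(void)
{
        thing_cache = kmem_cache_create("thing", sizeof(struct thing), 0,
                                        SLAB_HWCACHE_ALIGN, thing_ctor, NULL);
        if (!thing_cache)
                panic("thing_cache_init(): cannot create thing cache");
}

This is what lets pgd_ctor() afford the memcpy from swapper_pg_dir, and why
the non-PAE case also needs pgd_dtor(): a pgd stays on pgd_list for as long
as the slab owns the object, and is unlinked only when the slab finally
destroys it.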
@@ -67,19 +67,22 @@ static void flush_kernel_map(void *dummy)
 static void set_pmd_pte(pte_t *kpte, unsigned long address, pte_t pte)
 {
+        struct page *page;
+        unsigned long flags;
+
         set_pte_atomic(kpte, pte);      /* change init_mm */
-#ifndef CONFIG_X86_PAE
-        {
-                struct list_head *l;
-                spin_lock(&mmlist_lock);
-                list_for_each(l, &init_mm.mmlist) {
-                        struct mm_struct *mm = list_entry(l, struct mm_struct, mmlist);
-                        pmd_t *pmd = pmd_offset(pgd_offset(mm, address), address);
-                        set_pte_atomic((pte_t *)pmd, pte);
-                }
-                spin_unlock(&mmlist_lock);
-        }
-#endif
+        if (PTRS_PER_PMD > 1)
+                return;
+
+        spin_lock_irqsave(&pgd_lock, flags);
+        list_for_each_entry(page, &pgd_list, lru) {
+                pgd_t *pgd;
+                pmd_t *pmd;
+                pgd = (pgd_t *)page_address(page) + pgd_index(address);
+                pmd = pmd_offset(pgd, address);
+                set_pte_atomic((pte_t *)pmd, pte);
+        }
+        spin_unlock_irqrestore(&pgd_lock, flags);
 }
 
 /*
...
@@ -12,6 +12,7 @@
 #include <linux/highmem.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
+#include <linux/spinlock.h>
 
 #include <asm/system.h>
 #include <asm/pgtable.h>
@@ -151,61 +152,88 @@ struct page *pte_alloc_one(struct mm_struct *mm, unsigned long address)
         return pte;
 }
 
-#ifdef CONFIG_X86_PAE
-
-pgd_t *pgd_alloc(struct mm_struct *mm)
-{
-        int i;
-        pgd_t *pgd = kmem_cache_alloc(pae_pgd_cachep, GFP_KERNEL);
-
-        if (pgd) {
-                for (i = 0; i < USER_PTRS_PER_PGD; i++) {
-                        unsigned long pmd = __get_free_page(GFP_KERNEL);
-                        if (!pmd)
-                                goto out_oom;
-                        clear_page(pmd);
-                        set_pgd(pgd + i, __pgd(1 + __pa(pmd)));
-                }
-                memcpy(pgd + USER_PTRS_PER_PGD,
-                        swapper_pg_dir + USER_PTRS_PER_PGD,
-                        (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
-        }
-        return pgd;
-out_oom:
-        for (i--; i >= 0; i--)
-                free_page((unsigned long)__va(pgd_val(pgd[i])-1));
-        kmem_cache_free(pae_pgd_cachep, pgd);
-        return NULL;
-}
-
-void pgd_free(pgd_t *pgd)
-{
-        int i;
-
-        for (i = 0; i < USER_PTRS_PER_PGD; i++)
-                free_page((unsigned long)__va(pgd_val(pgd[i])-1));
-        kmem_cache_free(pae_pgd_cachep, pgd);
-}
-
-#else
-
-pgd_t *pgd_alloc(struct mm_struct *mm)
-{
-        pgd_t *pgd = (pgd_t *)__get_free_page(GFP_KERNEL);
-
-        if (pgd) {
-                memset(pgd, 0, USER_PTRS_PER_PGD * sizeof(pgd_t));
-                memcpy(pgd + USER_PTRS_PER_PGD,
-                        swapper_pg_dir + USER_PTRS_PER_PGD,
-                        (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
-        }
-        return pgd;
-}
-
-void pgd_free(pgd_t *pgd)
-{
-        free_page((unsigned long)pgd);
-}
-
-#endif /* CONFIG_X86_PAE */
+void pmd_ctor(void *pmd, kmem_cache_t *cache, unsigned long flags)
+{
+        memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t));
+}
+
+/*
+ * List of all pgd's needed for non-PAE so it can invalidate entries
+ * in both cached and uncached pgd's; not needed for PAE since the
+ * kernel pmd is shared. If PAE were not to share the pmd a similar
+ * tactic would be needed. This is essentially codepath-based locking
+ * against pageattr.c; it is the unique case in which a valid change
+ * of kernel pagetables can't be lazily synchronized by vmalloc faults.
+ * vmalloc faults work because attached pagetables are never freed.
+ * If the locking proves to be non-performant, a ticketing scheme with
+ * checks at dup_mmap(), exec(), and other mmlist addition points
+ * could be used. The locking scheme was chosen on the basis of
+ * manfred's recommendations and having no core impact whatsoever.
+ * -- wli
+ */
+spinlock_t pgd_lock = SPIN_LOCK_UNLOCKED;
+LIST_HEAD(pgd_list);
+
+void pgd_ctor(void *pgd, kmem_cache_t *cache, unsigned long unused)
+{
+        unsigned long flags;
+
+        if (PTRS_PER_PMD == 1)
+                spin_lock_irqsave(&pgd_lock, flags);
+
+        memcpy((pgd_t *)pgd + USER_PTRS_PER_PGD,
+                        swapper_pg_dir + USER_PTRS_PER_PGD,
+                        (PTRS_PER_PGD - USER_PTRS_PER_PGD) * sizeof(pgd_t));
+
+        if (PTRS_PER_PMD > 1)
+                return;
+
+        list_add(&virt_to_page(pgd)->lru, &pgd_list);
+        spin_unlock_irqrestore(&pgd_lock, flags);
+        memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t));
+}
+
+/* never called when PTRS_PER_PMD > 1 */
+void pgd_dtor(void *pgd, kmem_cache_t *cache, unsigned long unused)
+{
+        unsigned long flags; /* can be called from interrupt context */
+
+        spin_lock_irqsave(&pgd_lock, flags);
+        list_del(&virt_to_page(pgd)->lru);
+        spin_unlock_irqrestore(&pgd_lock, flags);
+}
+
+pgd_t *pgd_alloc(struct mm_struct *mm)
+{
+        int i;
+        pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL);
+
+        if (PTRS_PER_PMD == 1 || !pgd)
+                return pgd;
+
+        for (i = 0; i < USER_PTRS_PER_PGD; ++i) {
+                pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL);
+                if (!pmd)
+                        goto out_oom;
+                set_pgd(&pgd[i], __pgd(1 + __pa((u64)((u32)pmd))));
+        }
+        return pgd;
+
+out_oom:
+        for (i--; i >= 0; i--)
+                kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
+        kmem_cache_free(pgd_cache, pgd);
+        return NULL;
+}
+
+void pgd_free(pgd_t *pgd)
+{
+        int i;
+
+        /* in the PAE case user pgd entries are overwritten before usage */
+        if (PTRS_PER_PMD > 1)
+                for (i = 0; i < USER_PTRS_PER_PGD; ++i)
+                        kmem_cache_free(pmd_cache, (void *)__va(pgd_val(pgd[i])-1));
+        /* in the non-PAE case, clear_page_tables() clears user pgd entries */
+        kmem_cache_free(pgd_cache, pgd);
+}
@@ -123,6 +123,4 @@ static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot)
 #define pgoff_to_pte(off)       ((pte_t) { _PAGE_FILE, (off) })
 #define PTE_FILE_MAX_BITS       32
 
-extern struct kmem_cache_s *pae_pgd_cachep;
-
 #endif /* _I386_PGTABLE_3LEVEL_H */
@@ -21,15 +21,27 @@
 #include <asm/bitops.h>
 #endif
 
-extern pgd_t swapper_pg_dir[1024];
-extern void paging_init(void);
+#include <linux/slab.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
 
 /*
  * ZERO_PAGE is a global shared page that is always zero: used
  * for zero-mapped memory areas etc..
  */
-extern unsigned long empty_zero_page[1024];
 #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page))
+extern unsigned long empty_zero_page[1024];
+extern pgd_t swapper_pg_dir[1024];
+extern kmem_cache_t *pgd_cache;
+extern kmem_cache_t *pmd_cache;
+extern spinlock_t pgd_lock;
+extern struct list_head pgd_list;
+
+void pmd_ctor(void *, kmem_cache_t *, unsigned long);
+void pgd_ctor(void *, kmem_cache_t *, unsigned long);
+void pgd_dtor(void *, kmem_cache_t *, unsigned long);
+void pgtable_cache_init(void);
+void paging_init(void);
 
 #endif /* !__ASSEMBLY__ */
@@ -41,20 +53,8 @@ extern unsigned long empty_zero_page[1024];
 #ifndef __ASSEMBLY__
 #ifdef CONFIG_X86_PAE
 # include <asm/pgtable-3level.h>
-
-/*
- * Need to initialise the X86 PAE caches
- */
-extern void pgtable_cache_init(void);
-
 #else
 # include <asm/pgtable-2level.h>
-
-/*
- * No page table caches to initialise
- */
-#define pgtable_cache_init()    do { } while (0)
-
 #endif
 #endif
...