Commit 203b4fc9 authored by Linus Torvalds

Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull x86 mm updates from Thomas Gleixner:

 - Make lazy TLB mode even lazier to avoid pointless switch_mm()
   operations, which reduces CPU load by 1-2% for memcache workloads

 - Small cleanups and improvements all over the place

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mm: Remove redundant check for kmem_cache_create()
  arm/asm/tlb.h: Fix build error implicit func declaration
  x86/mm/tlb: Make clear_asid_other() static
  x86/mm/tlb: Skip atomic operations for 'init_mm' in switch_mm_irqs_off()
  x86/mm/tlb: Always use lazy TLB mode
  x86/mm/tlb: Only send page table free TLB flush to lazy TLB CPUs
  x86/mm/tlb: Make lazy TLB mode lazier
  x86/mm/tlb: Restructure switch_mm_irqs_off()
  x86/mm/tlb: Leave lazy TLB mode at page table free time
  mm: Allocate the mm_cpumask (mm->cpu_bitmap[]) dynamically based on nr_cpu_ids
  x86/mm: Add TLB purge to free pmd/pte page interfaces
  ioremap: Update pgtable free interfaces with addr
  x86/mm: Disable ioremap free page handling on x86-PAE
parents 7edcf0d3 765d28f1
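
The lazy TLB changes are the bulk of this branch: a CPU that switches to a kernel thread keeps the previous mm loaded and is only marked lazy, and a TLB flush for freed page tables is sent just to the CPUs sitting in that lazy state. A user-space toy model of that bookkeeping, with invented names (toy_mm, cpu_state, toy_free_page_table and so on) and a plain function call standing in for the IPI, might look like this; it is an illustration of the idea, not kernel code:

/*
 * Toy model of lazy TLB mode (illustration only, not kernel code).
 * Build: cc -std=c99 -o lazytlb lazytlb.c
 */
#include <stdio.h>
#include <stdbool.h>

#define NCPUS 4

struct toy_mm { const char *name; };

struct cpu_state {
	struct toy_mm *loaded_mm;  /* mm whose page tables are "in CR3" */
	bool is_lazy;              /* running a kernel thread, mm kept loaded */
};

static struct cpu_state cpus[NCPUS];

/* Switching to a kernel thread: keep the old mm, just mark the CPU lazy. */
static void toy_switch_to_kernel_thread(int cpu)
{
	cpus[cpu].is_lazy = true;
	printf("cpu%d: lazy, still holding mm '%s'\n",
	       cpu, cpus[cpu].loaded_mm->name);
}

/* Switching to a user task: load its mm and leave lazy mode. */
static void toy_switch_mm(int cpu, struct toy_mm *next)
{
	cpus[cpu].loaded_mm = next;
	cpus[cpu].is_lazy = false;
	printf("cpu%d: switched to mm '%s'\n", cpu, next->name);
}

/*
 * Freeing a page table of @mm: only CPUs that kept @mm loaded in lazy
 * mode can still hold stale translations, so only they need the "IPI"
 * (modelled here as a direct call that drops them out of lazy mode).
 */
static void toy_free_page_table(struct toy_mm *mm)
{
	for (int cpu = 0; cpu < NCPUS; cpu++) {
		if (cpus[cpu].is_lazy && cpus[cpu].loaded_mm == mm) {
			cpus[cpu].is_lazy = false;
			printf("cpu%d: flushed before freeing '%s' tables\n",
			       cpu, mm->name);
		}
	}
}

int main(void)
{
	struct toy_mm memcache = { "memcache" }, other = { "other" };

	for (int cpu = 0; cpu < NCPUS; cpu++)
		toy_switch_mm(cpu, &memcache);

	toy_switch_to_kernel_thread(0);   /* cpu0 goes lazy on 'memcache' */
	toy_switch_mm(1, &other);         /* cpu1 moved on, nothing stale */

	toy_free_page_table(&memcache);   /* only cpu0 needs attention */
	return 0;
}

In the real patches the flush side is tlb_flush_remove_tables()/tlb_flush_remove_tables_local(), wired up through the asm-generic/tlb.h and mm/memory.c hunks below.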
@@ -292,5 +292,13 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
 {
 }
 
+static inline void tlb_flush_remove_tables(struct mm_struct *mm)
+{
+}
+
+static inline void tlb_flush_remove_tables_local(void *arg)
+{
+}
+
 #endif /* CONFIG_MMU */
 #endif
@@ -977,12 +977,12 @@ int pmd_clear_huge(pmd_t *pmdp)
 	return 1;
 }
 
-int pud_free_pmd_page(pud_t *pud)
+int pud_free_pmd_page(pud_t *pud, unsigned long addr)
 {
 	return pud_none(*pud);
 }
 
-int pmd_free_pte_page(pmd_t *pmd)
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
 {
 	return pmd_none(*pmd);
 }
@@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
 #define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
 #endif
 
-static inline bool tlb_defer_switch_to_init_mm(void)
-{
-	/*
-	 * If we have PCID, then switching to init_mm is reasonably
-	 * fast. If we don't have PCID, then switching to init_mm is
-	 * quite slow, so we try to defer it in the hopes that we can
-	 * avoid it entirely. The latter approach runs the risk of
-	 * receiving otherwise unnecessary IPIs.
-	 *
-	 * This choice is just a heuristic. The tlb code can handle this
-	 * function returning true or false regardless of whether we have
-	 * PCID.
-	 */
-	return !static_cpu_has(X86_FEATURE_PCID);
-}
-
 struct tlb_context {
 	u64 ctx_id;
 	u64 tlb_gen;
@@ -554,4 +538,9 @@ extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
 	native_flush_tlb_others(mask, info)
 #endif
 
+extern void tlb_flush_remove_tables(struct mm_struct *mm);
+extern void tlb_flush_remove_tables_local(void *arg);
+
+#define HAVE_TLB_FLUSH_REMOVE_TABLES
+
 #endif /* _ASM_X86_TLBFLUSH_H */
@@ -329,9 +329,6 @@ static int __init pgd_cache_init(void)
 	 */
 	pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
 				      SLAB_PANIC, NULL);
-	if (!pgd_cache)
-		return -ENOMEM;
-
 	return 0;
 }
 core_initcall(pgd_cache_init);
@@ -719,28 +716,50 @@ int pmd_clear_huge(pmd_t *pmd)
 	return 0;
 }
 
+#ifdef CONFIG_X86_64
 /**
  * pud_free_pmd_page - Clear pud entry and free pmd page.
  * @pud: Pointer to a PUD.
+ * @addr: Virtual address associated with pud.
  *
- * Context: The pud range has been unmaped and TLB purged.
+ * Context: The pud range has been unmapped and TLB purged.
  * Return: 1 if clearing the entry succeeded. 0 otherwise.
+ *
+ * NOTE: Callers must allow a single page allocation.
  */
-int pud_free_pmd_page(pud_t *pud)
+int pud_free_pmd_page(pud_t *pud, unsigned long addr)
 {
-	pmd_t *pmd;
+	pmd_t *pmd, *pmd_sv;
+	pte_t *pte;
 	int i;
 
 	if (pud_none(*pud))
 		return 1;
 
 	pmd = (pmd_t *)pud_page_vaddr(*pud);
-
-	for (i = 0; i < PTRS_PER_PMD; i++)
-		if (!pmd_free_pte_page(&pmd[i]))
-			return 0;
+	pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
+	if (!pmd_sv)
+		return 0;
+
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		pmd_sv[i] = pmd[i];
+		if (!pmd_none(pmd[i]))
+			pmd_clear(&pmd[i]);
+	}
 
 	pud_clear(pud);
+
+	/* INVLPG to clear all paging-structure caches */
+	flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
+	for (i = 0; i < PTRS_PER_PMD; i++) {
+		if (!pmd_none(pmd_sv[i])) {
+			pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
+			free_page((unsigned long)pte);
+		}
+	}
+
+	free_page((unsigned long)pmd_sv);
 	free_page((unsigned long)pmd);
 
 	return 1;
@@ -749,11 +768,12 @@ int pud_free_pmd_page(pud_t *pud)
 /**
  * pmd_free_pte_page - Clear pmd entry and free pte page.
  * @pmd: Pointer to a PMD.
+ * @addr: Virtual address associated with pmd.
  *
- * Context: The pmd range has been unmaped and TLB purged.
+ * Context: The pmd range has been unmapped and TLB purged.
  * Return: 1 if clearing the entry succeeded. 0 otherwise.
 */
-int pmd_free_pte_page(pmd_t *pmd)
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
 {
 	pte_t *pte;
 
@@ -762,8 +782,30 @@ int pmd_free_pte_page(pmd_t *pmd)
 	pte = (pte_t *)pmd_page_vaddr(*pmd);
 	pmd_clear(pmd);
 
+	/* INVLPG to clear all paging-structure caches */
+	flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
+
 	free_page((unsigned long)pte);
 
 	return 1;
 }
 
+#else /* !CONFIG_X86_64 */
+
+int pud_free_pmd_page(pud_t *pud, unsigned long addr)
+{
+	return pud_none(*pud);
+}
+
+/*
+ * Disable free page handling on x86-PAE. This assures that ioremap()
+ * does not update sync'd pmd entries. See vmalloc_sync_one().
+ */
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
+{
+	return pmd_none(*pmd);
+}
+
+#endif /* CONFIG_X86_64 */
+
 #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
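
The ordering in the new pud_free_pmd_page() above is the key point: the PMD entries are copied into a scratch page, cleared, the range is flushed, and only then are the PTE pages (located through the saved copies) and the PMD page freed. The fragment below is a stand-alone toy that reproduces just that ordering with heap allocations; toy_flush() stands in for flush_tlb_kernel_range(), and all toy_* names are invented for illustration:

/*
 * Toy model of the clear -> flush -> free ordering used by
 * pud_free_pmd_page() (illustration only, not kernel code).
 */
#include <stdio.h>
#include <stdlib.h>

#define TOY_PTRS_PER_PMD 8

static void toy_flush(void)
{
	/* Stand-in for flush_tlb_kernel_range(addr, addr + PAGE_SIZE - 1). */
	printf("flush: no CPU may use the old entries past this point\n");
}

/* pmd[] is a table of pointers to "pte pages"; free it and what it maps. */
static int toy_free_pmd_table(void **pmd)
{
	/* 1. Save the entries, since we must clear them before freeing. */
	void **pmd_sv = calloc(TOY_PTRS_PER_PMD, sizeof(*pmd_sv));
	if (!pmd_sv)
		return 0;

	for (int i = 0; i < TOY_PTRS_PER_PMD; i++) {
		pmd_sv[i] = pmd[i];
		pmd[i] = NULL;          /* 2. Clear the live entries. */
	}

	/* 3. Flush: after this, nothing can reach the old pte pages. */
	toy_flush();

	/* 4. Only now free the pte pages, found via the saved copies. */
	for (int i = 0; i < TOY_PTRS_PER_PMD; i++)
		free(pmd_sv[i]);

	free(pmd_sv);
	free(pmd);
	return 1;
}

int main(void)
{
	void **pmd = calloc(TOY_PTRS_PER_PMD, sizeof(*pmd));
	if (!pmd)
		return 1;
	for (int i = 0; i < TOY_PTRS_PER_PMD; i += 2)  /* sparse, like real tables */
		pmd[i] = malloc(64);

	return toy_free_pmd_table(pmd) ? 0 : 1;
}

The scratch page is also why the kerneldoc now warns that callers must allow a single page allocation: once the live entries are cleared, the saved copies are the only way left to find the PTE pages.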
@@ -82,6 +82,7 @@ struct mm_struct efi_mm = {
 	.mmap_sem		= __RWSEM_INITIALIZER(efi_mm.mmap_sem),
 	.page_table_lock	= __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock),
 	.mmlist			= LIST_HEAD_INIT(efi_mm.mmlist),
+	.cpu_bitmap		= { [BITS_TO_LONGS(NR_CPUS)] = 0},
 };
 
 struct workqueue_struct *efi_rts_wq;
@@ -1019,8 +1019,8 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
 int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
 int pud_clear_huge(pud_t *pud);
 int pmd_clear_huge(pmd_t *pmd);
-int pud_free_pmd_page(pud_t *pud);
-int pmd_free_pte_page(pmd_t *pmd);
+int pud_free_pmd_page(pud_t *pud, unsigned long addr);
+int pmd_free_pte_page(pmd_t *pmd, unsigned long addr);
 #else	/* !CONFIG_HAVE_ARCH_HUGE_VMAP */
 static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
 {
@@ -1046,11 +1046,11 @@ static inline int pmd_clear_huge(pmd_t *pmd)
 {
 	return 0;
 }
-static inline int pud_free_pmd_page(pud_t *pud)
+static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr)
 {
 	return 0;
 }
-static inline int pmd_free_pte_page(pmd_t *pmd)
+static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
 {
 	return 0;
 }
@@ -303,4 +303,14 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
 
 #define tlb_migrate_finish(mm) do {} while (0)
 
+/*
+ * Used to flush the TLB when page tables are removed, when lazy
+ * TLB mode may cause a CPU to retain intermediate translations
+ * pointing to about-to-be-freed page table memory.
+ */
+#ifndef HAVE_TLB_FLUSH_REMOVE_TABLES
+#define tlb_flush_remove_tables(mm) do {} while (0)
+#define tlb_flush_remove_tables_local(mm) do {} while (0)
+#endif
+
 #endif /* _ASM_GENERIC__TLB_H */
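
The generic header above follows a common kernel idiom: supply no-op defaults that an architecture replaces by defining an opt-in macro, here HAVE_TLB_FLUSH_REMOVE_TABLES, which the x86 tlbflush.h hunk defines. A minimal stand-alone sketch of that idiom, using made-up TOY_* names rather than the kernel's, looks like this:

/*
 * Sketch of the opt-in hook pattern (made-up names, not the kernel's).
 * An "arch" that wants the hooks defines TOY_HAVE_REMOVE_TABLES_HOOKS
 * and supplies real functions; everyone else gets free no-ops.
 */
#include <stdio.h>

/* --- what an opted-in "arch" header provides --- */
#define TOY_HAVE_REMOVE_TABLES_HOOKS
static void toy_flush_remove_tables(const char *mm)
{
	printf("arch hook: flush lazy CPUs still holding %s\n", mm);
}

/* --- generic fallback, used only when the arch did not opt in --- */
#ifndef TOY_HAVE_REMOVE_TABLES_HOOKS
#define toy_flush_remove_tables(mm) do { } while (0)
#endif

int main(void)
{
	/* Generic code calls the hook unconditionally; it may be a no-op. */
	toy_flush_remove_tables("init_mm");
	return 0;
}

Architectures that do not define the macro pay nothing: the calls in the generic mm code compile down to the empty do {} while (0) statements.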
@@ -2276,6 +2276,8 @@ static void sighand_ctor(void *data)
 
 void __init proc_caches_init(void)
 {
+	unsigned int mm_size;
+
 	sighand_cachep = kmem_cache_create("sighand_cache",
 			sizeof(struct sighand_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
@@ -2292,15 +2294,16 @@ void __init proc_caches_init(void)
 			sizeof(struct fs_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			NULL);
+
 	/*
-	 * FIXME! The "sizeof(struct mm_struct)" currently includes the
-	 * whole struct cpumask for the OFFSTACK case. We could change
-	 * this to *only* allocate as much of it as required by the
-	 * maximum number of CPU's we can ever have. The cpumask_allocation
-	 * is at the end of the structure, exactly for that reason.
+	 * The mm_cpumask is located at the end of mm_struct, and is
+	 * dynamically sized based on the maximum CPU number this system
+	 * can have, taking hotplug into account (nr_cpu_ids).
 	 */
+	mm_size = sizeof(struct mm_struct) + cpumask_size();
+
 	mm_cachep = kmem_cache_create_usercopy("mm_struct",
-			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
+			mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
 			offsetof(struct mm_struct, saved_auxv),
 			sizeof_field(struct mm_struct, saved_auxv),
@@ -92,7 +92,7 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
 		if (ioremap_pmd_enabled() &&
 		    ((next - addr) == PMD_SIZE) &&
 		    IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
-		    pmd_free_pte_page(pmd)) {
+		    pmd_free_pte_page(pmd, addr)) {
 			if (pmd_set_huge(pmd, phys_addr + addr, prot))
 				continue;
 		}
@@ -119,7 +119,7 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
 		if (ioremap_pud_enabled() &&
 		    ((next - addr) == PUD_SIZE) &&
 		    IS_ALIGNED(phys_addr + addr, PUD_SIZE) &&
-		    pud_free_pmd_page(pud)) {
+		    pud_free_pmd_page(pud, addr)) {
 			if (pud_set_huge(pud, phys_addr + addr, prot))
 				continue;
 		}
@@ -15,6 +15,16 @@
 #define INIT_MM_CONTEXT(name)
 #endif
 
+/*
+ * For dynamically allocated mm_structs, there is a dynamically sized cpumask
+ * at the end of the structure, the size of which depends on the maximum CPU
+ * number the system can see. That way we allocate only as much memory for
+ * mm_cpumask() as needed for the hundreds, or thousands of processes that
+ * a system typically runs.
+ *
+ * Since there is only one init_mm in the entire system, keep it simple
+ * and size this cpu_bitmask to NR_CPUS.
+ */
 struct mm_struct init_mm = {
 	.mm_rb		= RB_ROOT,
 	.pgd		= swapper_pg_dir,
@@ -25,5 +35,6 @@ struct mm_struct init_mm = {
 	.arg_lock	= __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist),
 	.user_ns	= &init_user_ns,
+	.cpu_bitmap	= { [BITS_TO_LONGS(NR_CPUS)] = 0},
 	INIT_MM_CONTEXT(init_mm)
 };
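
The comment above summarizes the mm_cpumask change: the cpumask now lives in a flexible array at the tail of mm_struct, dynamically allocated mms get a bitmap sized for nr_cpu_ids, and the statically built init_mm and efi_mm simply reserve room for NR_CPUS through their initializers. A stand-alone sketch of the allocation pattern, with invented toy_* names, is:

/*
 * Sketch of a struct with a runtime-sized trailing bitmap
 * (invented toy_ names; mirrors the mm_struct/cpu_bitmap layout idea).
 */
#include <stdio.h>
#include <stdlib.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))
#define TOY_BITS_TO_LONGS(n) (((n) + BITS_PER_LONG - 1) / BITS_PER_LONG)

struct toy_mm {
	const char *name;
	/* Must stay last: sized at allocation time, like mm->cpu_bitmap[]. */
	unsigned long cpu_bitmap[];
};

/*
 * Dynamic instances get a bitmap sized for the CPUs this boot can see
 * (nr_cpu_ids in the kernel), not for the compile-time maximum NR_CPUS.
 */
static struct toy_mm *toy_mm_alloc(const char *name, unsigned int nr_cpu_ids)
{
	size_t size = sizeof(struct toy_mm) +
		      TOY_BITS_TO_LONGS(nr_cpu_ids) * sizeof(unsigned long);
	struct toy_mm *mm = calloc(1, size);

	if (mm)
		mm->name = name;
	return mm;
}

int main(void)
{
	/* e.g. an NR_CPUS=8192 build booted on an 8-CPU machine */
	struct toy_mm *mm = toy_mm_alloc("memcache", 8);

	if (!mm)
		return 1;
	printf("%s: %zu bitmap bytes instead of %zu\n", mm->name,
	       TOY_BITS_TO_LONGS(8) * sizeof(unsigned long),
	       TOY_BITS_TO_LONGS(8192) * sizeof(unsigned long));
	free(mm);
	return 0;
}

The .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0} initializers in the init_mm and efi_mm hunks rely on GCC's static initialization of flexible array members to reserve a worst-case bitmap directly in the image, since those structures never go through the slab allocator.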
@@ -326,16 +326,20 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
 
 #ifdef CONFIG_HAVE_RCU_TABLE_FREE
 
-/*
- * See the comment near struct mmu_table_batch.
- */
-
 static void tlb_remove_table_smp_sync(void *arg)
 {
-	/* Simply deliver the interrupt */
+	struct mm_struct __maybe_unused *mm = arg;
+	/*
+	 * On most architectures this does nothing. Simply delivering the
+	 * interrupt is enough to prevent races with software page table
+	 * walking like that done in get_user_pages_fast.
+	 *
+	 * See the comment near struct mmu_table_batch.
+	 */
+	tlb_flush_remove_tables_local(mm);
 }
 
-static void tlb_remove_table_one(void *table)
+static void tlb_remove_table_one(void *table, struct mmu_gather *tlb)
 {
 	/*
 	 * This isn't an RCU grace period and hence the page-tables cannot be
@@ -344,7 +348,7 @@ static void tlb_remove_table_one(void *table)
 	 * It is however sufficient for software page-table walkers that rely on
 	 * IRQ disabling. See the comment near struct mmu_table_batch.
 	 */
-	smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
+	smp_call_function(tlb_remove_table_smp_sync, tlb->mm, 1);
 	__tlb_remove_table(table);
 }
 
@@ -365,6 +369,8 @@ void tlb_table_flush(struct mmu_gather *tlb)
 {
 	struct mmu_table_batch **batch = &tlb->batch;
 
+	tlb_flush_remove_tables(tlb->mm);
+
 	if (*batch) {
 		call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
 		*batch = NULL;
@@ -387,7 +393,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
 	if (*batch == NULL) {
 		*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 		if (*batch == NULL) {
-			tlb_remove_table_one(table);
+			tlb_remove_table_one(table, tlb);
 			return;
 		}
 		(*batch)->nr = 0;