Commit 5af9c2e1 authored by Linus Torvalds

Merge branch 'akpm' (patches from Andrew)

Merge fixes from Andrew Morton:
 "22 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (22 commits)
  epoll: restrict EPOLLEXCLUSIVE to POLLIN and POLLOUT
  radix-tree: fix oops after radix_tree_iter_retry
  MAINTAINERS: trim the file triggers for ABI/API
  dax: dirty inode only if required
  thp: make deferred_split_scan() work again
  mm: replace vma_lock_anon_vma with anon_vma_lock_read/write
  ocfs2/dlm: clear refmap bit of recovery lock while doing local recovery cleanup
  um: asm/page.h: remove the pte_high member from struct pte_t
  mm, hugetlb: don't require CMA for runtime gigantic pages
  mm/hugetlb: fix gigantic page initialization/allocation
  mm: downgrade VM_BUG in isolate_lru_page() to warning
  mempolicy: do not try to queue pages from !vma_migratable()
  mm, vmstat: fix wrong WQ sleep when memory reclaim doesn't make any progress
  vmstat: make vmstat_update deferrable
  mm, vmstat: make quiet_vmstat lighter
  mm/Kconfig: correct description of DEFERRED_STRUCT_PAGE_INIT
  memblock: don't mark memblock_phys_mem_size() as __init
  dump_stack: avoid potential deadlocks
  mm: validate_mm browse_rb SMP race condition
  m32r: fix build failure due to SMP and MMU
  ...
parents 5d6a6a75 b6a515c8
@@ -223,9 +223,7 @@ F: drivers/scsi/aacraid/
 ABI/API
 L:	linux-api@vger.kernel.org
-F:	Documentation/ABI/
 F:	include/linux/syscalls.h
-F:	include/uapi/
 F:	kernel/sys_ni.c

 ABIT UGURU 1,2 HARDWARE MONITOR DRIVER
......
@@ -276,6 +276,7 @@ source "kernel/Kconfig.preempt"
 config SMP
 	bool "Symmetric multi-processing support"
+	depends on MMU
 	---help---
 	  This enables support for systems with more than one CPU. If you have
 	  a system with only one CPU, say N. If you have a system with more
......
@@ -34,21 +34,18 @@ struct page;
 #if defined(CONFIG_3_LEVEL_PGTABLES) && !defined(CONFIG_64BIT)
-typedef struct { unsigned long pte_low, pte_high; } pte_t;
+typedef struct { unsigned long pte; } pte_t;
 typedef struct { unsigned long pmd; } pmd_t;
 typedef struct { unsigned long pgd; } pgd_t;
-#define pte_val(x) ((x).pte_low | ((unsigned long long) (x).pte_high << 32))
-#define pte_get_bits(pte, bits) ((pte).pte_low & (bits))
-#define pte_set_bits(pte, bits) ((pte).pte_low |= (bits))
-#define pte_clear_bits(pte, bits) ((pte).pte_low &= ~(bits))
-#define pte_copy(to, from) ({ (to).pte_high = (from).pte_high; \
-			      smp_wmb(); \
-			      (to).pte_low = (from).pte_low; })
-#define pte_is_zero(pte) (!((pte).pte_low & ~_PAGE_NEWPAGE) && !(pte).pte_high)
-#define pte_set_val(pte, phys, prot) \
-	({ (pte).pte_high = (phys) >> 32; \
-	   (pte).pte_low = (phys) | pgprot_val(prot); })
+#define pte_val(p) ((p).pte)
+#define pte_get_bits(p, bits) ((p).pte & (bits))
+#define pte_set_bits(p, bits) ((p).pte |= (bits))
+#define pte_clear_bits(p, bits) ((p).pte &= ~(bits))
+#define pte_copy(to, from) ({ (to).pte = (from).pte; })
+#define pte_is_zero(p) (!((p).pte & ~_PAGE_NEWPAGE))
+#define pte_set_val(p, phys, prot) \
+	({ (p).pte = (phys) | pgprot_val(prot); })

 #define pmd_val(x)	((x).pmd)
 #define __pmd(x) ((pmd_t) { (x) } )
......
@@ -173,10 +173,10 @@ static __init int setup_hugepagesz(char *opt)
 }
 __setup("hugepagesz=", setup_hugepagesz);
-#ifdef CONFIG_CMA
+#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
 static __init int gigantic_pages_init(void)
 {
-	/* With CMA we can allocate gigantic pages at runtime */
+	/* With compaction or CMA we can allocate gigantic pages at runtime */
 	if (cpu_has_gbpages && !size_to_hstate(1UL << PUD_SHIFT))
 		hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT);
 	return 0;
......
@@ -1730,6 +1730,12 @@ static int blkdev_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	return __dax_fault(vma, vmf, blkdev_get_block, NULL);
 }
+static int blkdev_dax_pfn_mkwrite(struct vm_area_struct *vma,
+		struct vm_fault *vmf)
+{
+	return dax_pfn_mkwrite(vma, vmf);
+}
+
 static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 		pmd_t *pmd, unsigned int flags)
 {

@@ -1739,7 +1745,7 @@ static int blkdev_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr,
 static const struct vm_operations_struct blkdev_dax_vm_ops = {
 	.fault		= blkdev_dax_fault,
 	.pmd_fault	= blkdev_dax_pmd_fault,
-	.pfn_mkwrite	= blkdev_dax_fault,
+	.pfn_mkwrite	= blkdev_dax_pfn_mkwrite,
 };

 static const struct vm_operations_struct blkdev_default_vm_ops = {
......
@@ -358,7 +358,8 @@ static int dax_radix_entry(struct address_space *mapping, pgoff_t index,
 	void *entry;
 	WARN_ON_ONCE(pmd_entry && !dirty);
-	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+	if (dirty)
+		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 	spin_lock_irq(&mapping->tree_lock);
......
@@ -94,6 +94,11 @@
 /* Epoll private bits inside the event mask */
 #define EP_PRIVATE_BITS (EPOLLWAKEUP | EPOLLONESHOT | EPOLLET | EPOLLEXCLUSIVE)
+#define EPOLLINOUT_BITS (POLLIN | POLLOUT)
+
+#define EPOLLEXCLUSIVE_OK_BITS (EPOLLINOUT_BITS | POLLERR | POLLHUP | \
+				EPOLLWAKEUP | EPOLLET | EPOLLEXCLUSIVE)
+
 /* Maximum number of nesting allowed inside epoll sets */
 #define EP_MAX_NESTS 4

@@ -1068,7 +1073,22 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k
 	 * wait list.
 	 */
 	if (waitqueue_active(&ep->wq)) {
-		ewake = 1;
+		if ((epi->event.events & EPOLLEXCLUSIVE) &&
+					!((unsigned long)key & POLLFREE)) {
+			switch ((unsigned long)key & EPOLLINOUT_BITS) {
+			case POLLIN:
+				if (epi->event.events & POLLIN)
+					ewake = 1;
+				break;
+			case POLLOUT:
+				if (epi->event.events & POLLOUT)
+					ewake = 1;
+				break;
+			case 0:
+				ewake = 1;
+				break;
+			}
+		}
 		wake_up_locked(&ep->wq);
 	}
 	if (waitqueue_active(&ep->poll_wait))

@@ -1875,9 +1895,13 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 	 * so EPOLLEXCLUSIVE is not allowed for a EPOLL_CTL_MOD operation.
 	 * Also, we do not currently supported nested exclusive wakeups.
 	 */
-	if ((epds.events & EPOLLEXCLUSIVE) && (op == EPOLL_CTL_MOD ||
-		(op == EPOLL_CTL_ADD && is_file_epoll(tf.file))))
-		goto error_tgt_fput;
+	if (epds.events & EPOLLEXCLUSIVE) {
+		if (op == EPOLL_CTL_MOD)
+			goto error_tgt_fput;
+		if (op == EPOLL_CTL_ADD && (is_file_epoll(tf.file) ||
+				(epds.events & ~EPOLLEXCLUSIVE_OK_BITS)))
+			goto error_tgt_fput;
+	}
 	/*
 	 * At this point it is safe to assume that the "private_data" contains

@@ -1950,8 +1974,10 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 		break;
 	case EPOLL_CTL_MOD:
 		if (epi) {
-			epds.events |= POLLERR | POLLHUP;
-			error = ep_modify(ep, epi, &epds);
+			if (!(epi->event.events & EPOLLEXCLUSIVE)) {
+				epds.events |= POLLERR | POLLHUP;
+				error = ep_modify(ep, epi, &epds);
+			}
 		} else
 			error = -ENOENT;
 		break;
......
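Note on the epoll change above: from userspace, EPOLLEXCLUSIVE is now only honoured for read/write readiness, and EPOLL_CTL_ADD is expected to reject event masks that mix EPOLLEXCLUSIVE with bits outside EPOLLEXCLUSIVE_OK_BITS (EPOLL_CTL_MOD with EPOLLEXCLUSIVE was already rejected). A minimal userspace sketch, not part of the patch; the sockets and the EPOLLRDHUP bit are just illustrative, and the fallback define uses the UAPI value in case the libc headers do not yet expose EPOLLEXCLUSIVE:

/* Sketch of the new EPOLL_CTL_ADD rules after this patch. */
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/epoll.h>
#include <sys/socket.h>

#ifndef EPOLLEXCLUSIVE
#define EPOLLEXCLUSIVE (1u << 28)	/* value from include/uapi/linux/eventpoll.h */
#endif

int main(void)
{
	int epfd = epoll_create1(0);
	int a = socket(AF_INET, SOCK_STREAM, 0);
	int b = socket(AF_INET, SOCK_STREAM, 0);
	struct epoll_event ev = { 0 };

	/* Still allowed: exclusive wakeups on plain input readiness. */
	ev.events = EPOLLIN | EPOLLEXCLUSIVE;
	ev.data.fd = a;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, a, &ev))
		perror("ADD EPOLLIN|EPOLLEXCLUSIVE");

	/* Newly rejected: an event bit outside EPOLLEXCLUSIVE_OK_BITS (EPOLLRDHUP here). */
	ev.events = EPOLLIN | EPOLLRDHUP | EPOLLEXCLUSIVE;
	ev.data.fd = b;
	if (epoll_ctl(epfd, EPOLL_CTL_ADD, b, &ev))
		printf("ADD with EPOLLRDHUP|EPOLLEXCLUSIVE failed: %s\n",
		       strerror(errno));
	return 0;
}

With this kernel the second epoll_ctl() call is expected to fail (likely EINVAL), while the first succeeds.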
@@ -2367,6 +2367,8 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
 					break;
 				}
 			}
+			dlm_lockres_clear_refmap_bit(dlm, res,
+					dead_node);
 			spin_unlock(&res->spinlock);
 			continue;
 		}
......
@@ -547,16 +547,16 @@ static inline bool pm_suspended_storage(void)
 }
 #endif /* CONFIG_PM_SLEEP */
-#ifdef CONFIG_CMA
+#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
 /* The below functions must be run on a range from a single zone. */
 extern int alloc_contig_range(unsigned long start, unsigned long end,
 			      unsigned migratetype);
 extern void free_contig_range(unsigned long pfn, unsigned nr_pages);
+#endif

+#ifdef CONFIG_CMA
 /* CMA stuff */
 extern void init_cma_reserved_pageblock(struct page *page);
 #endif

 #endif /* __LINUX_GFP_H */

@@ -400,7 +400,7 @@ void **radix_tree_iter_retry(struct radix_tree_iter *iter)
 * @iter:	pointer to radix tree iterator
 * Returns:	current chunk size
 */
-static __always_inline unsigned
+static __always_inline long
 radix_tree_chunk_size(struct radix_tree_iter *iter)
 {
	return iter->next_index - iter->index;

@@ -434,9 +434,9 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags)
 			return slot + offset + 1;
 		}
 	} else {
-		unsigned size = radix_tree_chunk_size(iter) - 1;
-		while (size--) {
+		long size = radix_tree_chunk_size(iter);
+		while (--size > 0) {
 			slot++;
 			iter->index++;
 			if (likely(*slot))
......
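Why the signedness matters: radix_tree_iter_retry() sets iter->next_index equal to iter->index, so the chunk size becomes zero; with the old unsigned arithmetic, size = chunk_size - 1 wrapped around to UINT_MAX and radix_tree_next_slot() walked far past the end of the chunk, which is the oops this fixes. A standalone sketch of the wraparound (plain userspace C, not kernel code):

#include <stdio.h>

int main(void)
{
	/* State after radix_tree_iter_retry(): next_index == index. */
	unsigned long index = 42, next_index = 42;

	unsigned old_size = (unsigned)(next_index - index) - 1;	/* 0 - 1 wraps to UINT_MAX */
	long new_size = (long)(next_index - index);		/* stays 0 */

	printf("old: while (size--) would iterate %u times\n", old_size);
	printf("new: while (--size > 0) iterates %s\n",
	       --new_size > 0 ? "at least once" : "zero times");
	return 0;
}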
@@ -109,20 +109,6 @@ static inline void put_anon_vma(struct anon_vma *anon_vma)
 		__put_anon_vma(anon_vma);
 }
-static inline void vma_lock_anon_vma(struct vm_area_struct *vma)
-{
-	struct anon_vma *anon_vma = vma->anon_vma;
-	if (anon_vma)
-		down_write(&anon_vma->root->rwsem);
-}
-
-static inline void vma_unlock_anon_vma(struct vm_area_struct *vma)
-{
-	struct anon_vma *anon_vma = vma->anon_vma;
-	if (anon_vma)
-		up_write(&anon_vma->root->rwsem);
-}
-
 static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
 {
 	down_write(&anon_vma->root->rwsem);
......
@@ -3508,8 +3508,10 @@ static int sigsuspend(sigset_t *set)
 	current->saved_sigmask = current->blocked;
 	set_current_blocked(set);
-	__set_current_state(TASK_INTERRUPTIBLE);
-	schedule();
+	while (!signal_pending(current)) {
+		__set_current_state(TASK_INTERRUPTIBLE);
+		schedule();
+	}
 	set_restore_sigmask();
 	return -ERESTARTNOHAND;
 }
......
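The loop above restores the guarantee that sigsuspend() only returns to userspace after a signal has actually been delivered; a stray wakeup of the sleeping task now simply puts it back to sleep. A small userspace sketch, not part of the patch, of the behaviour callers rely on (signal choice and names are illustrative):

#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static void handler(int sig) { (void)sig; }

int main(void)
{
	sigset_t block, waitmask;

	signal(SIGUSR1, handler);

	sigemptyset(&block);
	sigaddset(&block, SIGUSR1);
	sigprocmask(SIG_BLOCK, &block, NULL);	/* hold SIGUSR1 until sigsuspend() */

	sigemptyset(&waitmask);
	printf("pid %d waiting for SIGUSR1...\n", (int)getpid());
	sigsuspend(&waitmask);			/* returns (with EINTR) only after a signal ran */
	printf("signal delivered\n");
	return 0;
}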
@@ -25,6 +25,7 @@ static atomic_t dump_lock = ATOMIC_INIT(-1);
 asmlinkage __visible void dump_stack(void)
 {
+	unsigned long flags;
 	int was_locked;
 	int old;
 	int cpu;

@@ -33,9 +34,8 @@ asmlinkage __visible void dump_stack(void)
 	 * Permit this cpu to perform nested stack dumps while serialising
 	 * against other CPUs
 	 */
-	preempt_disable();
 retry:
+	local_irq_save(flags);
 	cpu = smp_processor_id();
 	old = atomic_cmpxchg(&dump_lock, -1, cpu);
 	if (old == -1) {

@@ -43,6 +43,7 @@ asmlinkage __visible void dump_stack(void)
 	} else if (old == cpu) {
 		was_locked = 1;
 	} else {
+		local_irq_restore(flags);
 		cpu_relax();
 		goto retry;
 	}

@@ -52,7 +53,7 @@ asmlinkage __visible void dump_stack(void)
 	if (!was_locked)
 		atomic_set(&dump_lock, -1);
-	preempt_enable();
+	local_irq_restore(flags);
 }

 #else
 asmlinkage __visible void dump_stack(void)
......
@@ -624,7 +624,7 @@ config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
 	bool

 config DEFERRED_STRUCT_PAGE_INIT
-	bool "Defer initialisation of struct pages to kswapd"
+	bool "Defer initialisation of struct pages to kthreads"
 	default n
 	depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT
 	depends on MEMORY_HOTPLUG

@@ -633,9 +633,10 @@ config DEFERRED_STRUCT_PAGE_INIT
 	  single thread. On very large machines this can take a considerable
 	  amount of time. If this option is set, large machines will bring up
 	  a subset of memmap at boot and then initialise the rest in parallel
-	  when kswapd starts. This has a potential performance impact on
-	  processes running early in the lifetime of the systemm until kswapd
-	  finishes the initialisation.
+	  by starting one-off "pgdatinitX" kernel thread for each node X. This
+	  has a potential performance impact on processes running early in the
+	  lifetime of the system until these kthreads finish the
+	  initialisation.

 config IDLE_PAGE_TRACKING
 	bool "Enable idle page tracking"
......
@@ -989,7 +989,7 @@ long wait_iff_congested(struct zone *zone, int sync, long timeout)
 		 * here rather than calling cond_resched().
 		 */
 		if (current->flags & PF_WQ_WORKER)
-			schedule_timeout(1);
+			schedule_timeout_uninterruptible(1);
 		else
 			cond_resched();
......
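The point of the change above: schedule_timeout(1) only sleeps if the caller has already set the task state, so with the default TASK_RUNNING state it returned immediately and the worker never actually waited. The variant used by the fix sets the state itself before sleeping; paraphrased from the kernel's timer code (shown here only for context, not part of this patch):

/* Paraphrase of schedule_timeout_uninterruptible(): set the task state first,
 * so the caller really sleeps for the requested number of jiffies instead of
 * returning immediately in TASK_RUNNING state.
 */
signed long __sched schedule_timeout_uninterruptible(signed long timeout)
{
	__set_current_state(TASK_UNINTERRUPTIBLE);
	return schedule_timeout(timeout);
}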
@@ -3482,7 +3482,7 @@ static unsigned long deferred_split_scan(struct shrinker *shrink,
 	spin_lock_irqsave(&pgdata->split_queue_lock, flags);
 	/* Take pin on all head pages to avoid freeing them under us */
-	list_for_each_safe(pos, next, &list) {
+	list_for_each_safe(pos, next, &pgdata->split_queue) {
 		page = list_entry((void *)pos, struct page, mapping);
 		page = compound_head(page);
 		if (get_page_unless_zero(page)) {
......
@@ -1001,7 +1001,7 @@ static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
 		((node = hstate_next_node_to_free(hs, mask)) || 1);	\
 		nr_nodes--)
-#if defined(CONFIG_CMA) && defined(CONFIG_X86_64)
+#if defined(CONFIG_X86_64) && ((defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA))
 static void destroy_compound_gigantic_page(struct page *page,
 					unsigned int order)
 {

@@ -1214,8 +1214,8 @@ void free_huge_page(struct page *page)
 	set_page_private(page, 0);
 	page->mapping = NULL;
-	BUG_ON(page_count(page));
-	BUG_ON(page_mapcount(page));
+	VM_BUG_ON_PAGE(page_count(page), page);
+	VM_BUG_ON_PAGE(page_mapcount(page), page);
 	restore_reserve = PagePrivate(page);
 	ClearPagePrivate(page);

@@ -1286,6 +1286,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
 		set_page_count(p, 0);
 		set_compound_head(p, page);
 	}
+	atomic_set(compound_mapcount_ptr(page), -1);
 }

 /*
......
@@ -1448,7 +1448,7 @@ void __init __memblock_free_late(phys_addr_t base, phys_addr_t size)
 * Remaining API functions
 */
-phys_addr_t __init memblock_phys_mem_size(void)
+phys_addr_t __init_memblock memblock_phys_mem_size(void)
 {
	return memblock.memory.total_size;
 }
......
@@ -548,8 +548,7 @@ static int queue_pages_pte_range(pmd_t *pmd, unsigned long addr,
 			goto retry;
 		}
-		if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
-			migrate_page_add(page, qp->pagelist, flags);
+		migrate_page_add(page, qp->pagelist, flags);
 	}
 	pte_unmap_unlock(pte - 1, ptl);
 	cond_resched();

@@ -625,7 +624,7 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
 	unsigned long endvma = vma->vm_end;
 	unsigned long flags = qp->flags;
-	if (vma->vm_flags & VM_PFNMAP)
+	if (!vma_migratable(vma))
 		return 1;
 	if (endvma > end)

@@ -644,16 +643,13 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
 	if (flags & MPOL_MF_LAZY) {
 		/* Similar to task_numa_work, skip inaccessible VMAs */
-		if (vma_migratable(vma) &&
-			vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
+		if (vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE))
 			change_prot_numa(vma, start, endvma);
 		return 1;
 	}
-	if ((flags & MPOL_MF_STRICT) ||
-	    ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
-	     vma_migratable(vma)))
-		/* queue pages from current vma */
+	/* queue pages from current vma */
+	if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 		return 0;
 	return 1;
 }
......
@@ -390,8 +390,9 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma)
 }

 #ifdef CONFIG_DEBUG_VM_RB
-static int browse_rb(struct rb_root *root)
+static int browse_rb(struct mm_struct *mm)
 {
+	struct rb_root *root = &mm->mm_rb;
 	int i = 0, j, bug = 0;
 	struct rb_node *nd, *pn = NULL;
 	unsigned long prev = 0, pend = 0;

@@ -414,12 +415,14 @@ static int browse_rb(struct rb_root *root)
 				  vma->vm_start, vma->vm_end);
 			bug = 1;
 		}
+		spin_lock(&mm->page_table_lock);
 		if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) {
 			pr_emerg("free gap %lx, correct %lx\n",
 			       vma->rb_subtree_gap,
 			       vma_compute_subtree_gap(vma));
 			bug = 1;
 		}
+		spin_unlock(&mm->page_table_lock);
 		i++;
 		pn = nd;
 		prev = vma->vm_start;

@@ -456,12 +459,16 @@ static void validate_mm(struct mm_struct *mm)
 	struct vm_area_struct *vma = mm->mmap;

 	while (vma) {
+		struct anon_vma *anon_vma = vma->anon_vma;
 		struct anon_vma_chain *avc;
-		vma_lock_anon_vma(vma);
-		list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
-			anon_vma_interval_tree_verify(avc);
-		vma_unlock_anon_vma(vma);
+		if (anon_vma) {
+			anon_vma_lock_read(anon_vma);
+			list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
+				anon_vma_interval_tree_verify(avc);
+			anon_vma_unlock_read(anon_vma);
+		}
+
 		highest_address = vma->vm_end;
 		vma = vma->vm_next;
 		i++;

@@ -475,7 +482,7 @@ static void validate_mm(struct mm_struct *mm)
 			  mm->highest_vm_end, highest_address);
 		bug = 1;
 	}
-	i = browse_rb(&mm->mm_rb);
+	i = browse_rb(mm);
 	if (i != mm->map_count) {
 		if (i != -1)
 			pr_emerg("map_count %d rb %d\n", mm->map_count, i);

@@ -2142,32 +2149,27 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
 int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 {
 	struct mm_struct *mm = vma->vm_mm;
-	int error;
+	int error = 0;

 	if (!(vma->vm_flags & VM_GROWSUP))
 		return -EFAULT;

-	/*
-	 * We must make sure the anon_vma is allocated
-	 * so that the anon_vma locking is not a noop.
-	 */
+	/* Guard against wrapping around to address 0. */
+	if (address < PAGE_ALIGN(address+4))
+		address = PAGE_ALIGN(address+4);
+	else
+		return -ENOMEM;
+
+	/* We must make sure the anon_vma is allocated. */
 	if (unlikely(anon_vma_prepare(vma)))
 		return -ENOMEM;
-	vma_lock_anon_vma(vma);

 	/*
 	 * vma->vm_start/vm_end cannot change under us because the caller
 	 * is required to hold the mmap_sem in read mode.  We need the
 	 * anon_vma lock to serialize against concurrent expand_stacks.
-	 * Also guard against wrapping around to address 0.
 	 */
-	if (address < PAGE_ALIGN(address+4))
-		address = PAGE_ALIGN(address+4);
-	else {
-		vma_unlock_anon_vma(vma);
-		return -ENOMEM;
-	}
-	error = 0;
+	anon_vma_lock_write(vma->anon_vma);

 	/* Somebody else might have raced and expanded it already */
 	if (address > vma->vm_end) {

@@ -2185,7 +2187,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 				 * updates, but we only hold a shared mmap_sem
 				 * lock here, so we need to protect against
 				 * concurrent vma expansions.
-				 * vma_lock_anon_vma() doesn't help here, as
+				 * anon_vma_lock_write() doesn't help here, as
 				 * we don't guarantee that all growable vmas
 				 * in a mm share the same root anon vma.
 				 * So, we reuse mm->page_table_lock to guard

@@ -2208,7 +2210,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 			}
 		}
 	}
-	vma_unlock_anon_vma(vma);
+	anon_vma_unlock_write(vma->anon_vma);
 	khugepaged_enter_vma_merge(vma, vma->vm_flags);
 	validate_mm(mm);
 	return error;

@@ -2224,25 +2226,21 @@ int expand_downwards(struct vm_area_struct *vma,
 	struct mm_struct *mm = vma->vm_mm;
 	int error;
-	/*
-	 * We must make sure the anon_vma is allocated
-	 * so that the anon_vma locking is not a noop.
-	 */
-	if (unlikely(anon_vma_prepare(vma)))
-		return -ENOMEM;
-
 	address &= PAGE_MASK;
 	error = security_mmap_addr(address);
 	if (error)
 		return error;
-	vma_lock_anon_vma(vma);
+	/* We must make sure the anon_vma is allocated. */
+	if (unlikely(anon_vma_prepare(vma)))
+		return -ENOMEM;

 	/*
 	 * vma->vm_start/vm_end cannot change under us because the caller
 	 * is required to hold the mmap_sem in read mode.  We need the
 	 * anon_vma lock to serialize against concurrent expand_stacks.
 	 */
+	anon_vma_lock_write(vma->anon_vma);

 	/* Somebody else might have raced and expanded it already */
 	if (address < vma->vm_start) {

@@ -2260,7 +2258,7 @@ int expand_downwards(struct vm_area_struct *vma,
 				 * updates, but we only hold a shared mmap_sem
 				 * lock here, so we need to protect against
 				 * concurrent vma expansions.
-				 * vma_lock_anon_vma() doesn't help here, as
+				 * anon_vma_lock_write() doesn't help here, as
 				 * we don't guarantee that all growable vmas
 				 * in a mm share the same root anon vma.
 				 * So, we reuse mm->page_table_lock to guard

@@ -2281,7 +2279,7 @@ int expand_downwards(struct vm_area_struct *vma,
 			}
 		}
 	}
-	vma_unlock_anon_vma(vma);
+	anon_vma_unlock_write(vma->anon_vma);
 	khugepaged_enter_vma_merge(vma, vma->vm_flags);
 	validate_mm(mm);
 	return error;
......
@@ -6620,7 +6620,7 @@ bool is_pageblock_removable_nolock(struct page *page)
 	return !has_unmovable_pages(zone, page, 0, true);
 }
-#ifdef CONFIG_CMA
+#if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA)
 static unsigned long pfn_max_align_down(unsigned long pfn)
 {
......
@@ -1443,7 +1443,7 @@ int isolate_lru_page(struct page *page)
 	int ret = -EBUSY;
 	VM_BUG_ON_PAGE(!page_count(page), page);
-	VM_BUG_ON_PAGE(PageTail(page), page);
+	WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
 	if (PageLRU(page)) {
 		struct zone *zone = page_zone(page);
......
@@ -1396,10 +1396,15 @@ static void vmstat_update(struct work_struct *w)
 		 * Counters were updated so we expect more updates
 		 * to occur in the future. Keep on running the
 		 * update worker thread.
+		 * If we were marked on cpu_stat_off clear the flag
+		 * so that vmstat_shepherd doesn't schedule us again.
 		 */
-		queue_delayed_work_on(smp_processor_id(), vmstat_wq,
-			this_cpu_ptr(&vmstat_work),
-			round_jiffies_relative(sysctl_stat_interval));
+		if (!cpumask_test_and_clear_cpu(smp_processor_id(),
+						cpu_stat_off)) {
+			queue_delayed_work_on(smp_processor_id(), vmstat_wq,
+				this_cpu_ptr(&vmstat_work),
+				round_jiffies_relative(sysctl_stat_interval));
+		}
 	} else {
 		/*
 		 * We did not update any counters so the app may be in

@@ -1417,18 +1422,6 @@ static void vmstat_update(struct work_struct *w)
 * until the diffs stay at zero. The function is used by NOHZ and can only be
 * invoked when tick processing is not active.
 */
-void quiet_vmstat(void)
-{
-	if (system_state != SYSTEM_RUNNING)
-		return;
-
-	do {
-		if (!cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
-			cancel_delayed_work(this_cpu_ptr(&vmstat_work));
-	} while (refresh_cpu_vm_stats(false));
-}
-
 /*
 * Check if the diffs for a certain cpu indicate that
 * an update is needed.

@@ -1452,6 +1445,30 @@ static bool need_update(int cpu)
 	return false;
 }

+void quiet_vmstat(void)
+{
+	if (system_state != SYSTEM_RUNNING)
+		return;
+
+	/*
+	 * If we are already in hands of the shepherd then there
+	 * is nothing for us to do here.
+	 */
+	if (cpumask_test_and_set_cpu(smp_processor_id(), cpu_stat_off))
+		return;
+
+	if (!need_update(smp_processor_id()))
+		return;
+
+	/*
+	 * Just refresh counters and do not care about the pending delayed
+	 * vmstat_update. It doesn't fire that often to matter and canceling
+	 * it would be too expensive from this path.
+	 * vmstat_shepherd will take care about that for us.
+	 */
+	refresh_cpu_vm_stats(false);
+}
+
 /*
 * Shepherd worker thread that checks the

@@ -1469,18 +1486,25 @@ static void vmstat_shepherd(struct work_struct *w)
 	get_online_cpus();
 	/* Check processors whose vmstat worker threads have been disabled */
-	for_each_cpu(cpu, cpu_stat_off)
-		if (need_update(cpu) &&
-			cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
-			queue_delayed_work_on(cpu, vmstat_wq,
-				&per_cpu(vmstat_work, cpu), 0);
+	for_each_cpu(cpu, cpu_stat_off) {
+		struct delayed_work *dw = &per_cpu(vmstat_work, cpu);
+
+		if (need_update(cpu)) {
+			if (cpumask_test_and_clear_cpu(cpu, cpu_stat_off))
+				queue_delayed_work_on(cpu, vmstat_wq, dw, 0);
+		} else {
+			/*
+			 * Cancel the work if quiet_vmstat has put this
+			 * cpu on cpu_stat_off because the work item might
+			 * be still scheduled
+			 */
+			cancel_delayed_work(dw);
+		}
+	}
 	put_online_cpus();

 	schedule_delayed_work(&shepherd,
 		round_jiffies_relative(sysctl_stat_interval));
 }

 static void __init start_shepherd_timer(void)

@@ -1488,7 +1512,7 @@ static void __init start_shepherd_timer(void)
 	int cpu;

 	for_each_possible_cpu(cpu)
-		INIT_DELAYED_WORK(per_cpu_ptr(&vmstat_work, cpu),
+		INIT_DEFERRABLE_WORK(per_cpu_ptr(&vmstat_work, cpu),
 			vmstat_update);

 	if (!alloc_cpumask_var(&cpu_stat_off, GFP_KERNEL))
......
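On the INIT_DELAYED_WORK -> INIT_DEFERRABLE_WORK switch in the last hunk ("vmstat: make vmstat_update deferrable"): a deferrable delayed work item is backed by a deferrable timer, so an idle CPU is not woken up just to run vmstat housekeeping; the work simply runs once the CPU wakes up for some other reason. An out-of-tree module sketch, not from the patch, illustrating the API (the demo_* names are made up for the example):

/* Minimal module sketch: a deferrable delayed work item that will not wake an
 * idle CPU on its own; it runs once the CPU becomes active again anyway.
 */
#include <linux/module.h>
#include <linux/workqueue.h>

static struct delayed_work demo_work;

static void demo_fn(struct work_struct *work)
{
	pr_info("deferrable demo work ran\n");
}

static int __init demo_init(void)
{
	INIT_DEFERRABLE_WORK(&demo_work, demo_fn);	/* vs. INIT_DELAYED_WORK */
	schedule_delayed_work(&demo_work, 10 * HZ);	/* may run later than 10s on an idle CPU */
	return 0;
}

static void __exit demo_exit(void)
{
	cancel_delayed_work_sync(&demo_work);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");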