Commit 58d4e450 authored by Linus Torvalds

Merge branch 'akpm' (patches from Andrew)

Merge misc fixes from Andrew Morton:
 "14 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm: revert x86_64 and arm64 ELF_ET_DYN_BASE base changes
  mm/vmalloc.c: don't unconditionally use __GFP_HIGHMEM
  mm/mempolicy: fix use after free when calling get_mempolicy
  mm/cma_debug.c: fix stack corruption due to sprintf usage
  signal: don't remove SIGNAL_UNKILLABLE for traced tasks.
  mm, oom: fix potential data corruption when oom_reaper races with writer
  mm: fix double mmap_sem unlock on MMF_UNSTABLE enforced SIGBUS
  slub: fix per memcg cache leak on css offline
  mm: discard memblock data later
  test_kmod: fix description for -s and -c parameters
  kmod: fix wait on recursive loop
  wait: add wait_event_killable_timeout()
  kernel/watchdog: fix Kconfig constraints for perf hardlockup watchdog
  mm: memcontrol: fix NULL pointer crash in test_clear_page_writeback()
parents cc28fcdc c715b72c
@@ -114,10 +114,10 @@
 /*
  * This is the base location for PIE (ET_DYN with INTERP) loads. On
- * 64-bit, this is raised to 4GB to leave the entire 32-bit address
+ * 64-bit, this is above 4GB to leave the entire 32-bit address
  * space open for things that want to use the area for 32-bit pointers.
  */
-#define ELF_ET_DYN_BASE		0x100000000UL
+#define ELF_ET_DYN_BASE		(2 * TASK_SIZE_64 / 3)
 
 #ifndef __ASSEMBLY__
......
@@ -199,7 +199,7 @@ config PPC
 	select HAVE_OPTPROBES			if PPC64
 	select HAVE_PERF_EVENTS
 	select HAVE_PERF_EVENTS_NMI		if PPC64
-	select HAVE_HARDLOCKUP_DETECTOR_PERF	if HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
+	select HAVE_HARDLOCKUP_DETECTOR_PERF	if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_RCU_TABLE_FREE		if SMP
......
@@ -163,7 +163,7 @@ config X86
 	select HAVE_PCSPKR_PLATFORM
 	select HAVE_PERF_EVENTS
 	select HAVE_PERF_EVENTS_NMI
-	select HAVE_HARDLOCKUP_DETECTOR_PERF	if HAVE_PERF_EVENTS_NMI
+	select HAVE_HARDLOCKUP_DETECTOR_PERF	if PERF_EVENTS && HAVE_PERF_EVENTS_NMI
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_REGS_AND_STACK_ACCESS_API
......
@@ -247,11 +247,11 @@ extern int force_personality32;
 /*
  * This is the base location for PIE (ET_DYN with INTERP) loads. On
- * 64-bit, this is raised to 4GB to leave the entire 32-bit address
+ * 64-bit, this is above 4GB to leave the entire 32-bit address
  * space open for things that want to use the area for 32-bit pointers.
  */
 #define ELF_ET_DYN_BASE		(mmap_is_ia32() ? 0x000400000UL : \
-						  0x100000000UL)
+						  (TASK_SIZE / 3 * 2))
 
 /* This yields a mask that user programs can use to figure out what
    instruction set this CPU supports.  This could be done in user space,
......
@@ -61,6 +61,7 @@ extern int memblock_debug;
 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
 #define __init_memblock __meminit
 #define __initdata_memblock __meminitdata
+void memblock_discard(void);
 #else
 #define __init_memblock
 #define __initdata_memblock
@@ -74,8 +75,6 @@ phys_addr_t memblock_find_in_range_node(phys_addr_t size, phys_addr_t align,
 					int nid, ulong flags);
 phys_addr_t memblock_find_in_range(phys_addr_t start, phys_addr_t end,
 				   phys_addr_t size, phys_addr_t align);
-phys_addr_t get_allocated_memblock_reserved_regions_info(phys_addr_t *addr);
-phys_addr_t get_allocated_memblock_memory_regions_info(phys_addr_t *addr);
 void memblock_allow_resize(void);
 int memblock_add_node(phys_addr_t base, phys_addr_t size, int nid);
 int memblock_add(phys_addr_t base, phys_addr_t size);
@@ -110,6 +109,9 @@ void __next_mem_range_rev(u64 *idx, int nid, ulong flags,
 void __next_reserved_mem_region(u64 *idx, phys_addr_t *out_start,
 			       phys_addr_t *out_end);
 
+void __memblock_free_early(phys_addr_t base, phys_addr_t size);
+void __memblock_free_late(phys_addr_t base, phys_addr_t size);
+
 /**
  * for_each_mem_range - iterate through memblock areas from type_a and not
  * included in type_b. Or just type_a if type_b is NULL.
......
@@ -484,7 +484,8 @@ bool mem_cgroup_oom_synchronize(bool wait);
 extern int do_swap_account;
 #endif
 
-void lock_page_memcg(struct page *page);
+struct mem_cgroup *lock_page_memcg(struct page *page);
+void __unlock_page_memcg(struct mem_cgroup *memcg);
 void unlock_page_memcg(struct page *page);
 
 static inline unsigned long memcg_page_state(struct mem_cgroup *memcg,
@@ -809,7 +810,12 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 {
 }
 
-static inline void lock_page_memcg(struct page *page)
+static inline struct mem_cgroup *lock_page_memcg(struct page *page)
+{
+	return NULL;
+}
+
+static inline void __unlock_page_memcg(struct mem_cgroup *memcg)
 {
 }
......
@@ -6,6 +6,8 @@
 #include <linux/types.h>
 #include <linux/nodemask.h>
 #include <uapi/linux/oom.h>
+#include <linux/sched/coredump.h> /* MMF_* */
+#include <linux/mm.h> /* VM_FAULT* */
 
 struct zonelist;
 struct notifier_block;
@@ -63,6 +65,26 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk)
 	return tsk->signal->oom_mm;
 }
 
+/*
+ * Checks whether a page fault on the given mm is still reliable.
+ * This is no longer true if the oom reaper started to reap the
+ * address space which is reflected by MMF_UNSTABLE flag set in
+ * the mm. At that moment any !shared mapping would lose the content
+ * and could cause a memory corruption (zero pages instead of the
+ * original content).
+ *
+ * User should call this before establishing a page table entry for
+ * a !shared mapping and under the proper page table lock.
+ *
+ * Return 0 when the PF is safe VM_FAULT_SIGBUS otherwise.
+ */
+static inline int check_stable_address_space(struct mm_struct *mm)
+{
+	if (unlikely(test_bit(MMF_UNSTABLE, &mm->flags)))
+		return VM_FAULT_SIGBUS;
+	return 0;
+}
+
 extern unsigned long oom_badness(struct task_struct *p,
 		struct mem_cgroup *memcg, const nodemask_t *nodemask,
 		unsigned long totalpages);
......
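The helper above is consumed by the page-fault paths changed further down in this series (see the mm/memory.c and mm/huge_memory.c hunks). As a hedged illustration only, the intended call pattern is roughly the following fragment; vmf, vma and entry are assumed fault-handler locals and the fragment is not part of the patch:

	/* Sketch only: check under the page table lock, right before
	 * installing a PTE for a private (!VM_SHARED) mapping. */
	vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl);
	if (check_stable_address_space(vma->vm_mm)) {
		/* the oom reaper already tore down this mm; refuse the fault */
		pte_unmap_unlock(vmf->pte, vmf->ptl);
		return VM_FAULT_SIGBUS;
	}
	set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
	pte_unmap_unlock(vmf->pte, vmf->ptl);
	return 0;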
@@ -757,6 +757,43 @@ extern int do_wait_intr_irq(wait_queue_head_t *, wait_queue_entry_t *);
 	__ret;								\
 })
 
+#define __wait_event_killable_timeout(wq_head, condition, timeout)	\
+	___wait_event(wq_head, ___wait_cond_timeout(condition),		\
+		      TASK_KILLABLE, 0, timeout,			\
+		      __ret = schedule_timeout(__ret))
+
+/**
+ * wait_event_killable_timeout - sleep until a condition gets true or a timeout elapses
+ * @wq_head: the waitqueue to wait on
+ * @condition: a C expression for the event to wait for
+ * @timeout: timeout, in jiffies
+ *
+ * The process is put to sleep (TASK_KILLABLE) until the
+ * @condition evaluates to true or a kill signal is received.
+ * The @condition is checked each time the waitqueue @wq_head is woken up.
+ *
+ * wake_up() has to be called after changing any variable that could
+ * change the result of the wait condition.
+ *
+ * Returns:
+ * 0 if the @condition evaluated to %false after the @timeout elapsed,
+ * 1 if the @condition evaluated to %true after the @timeout elapsed,
+ * the remaining jiffies (at least 1) if the @condition evaluated
+ * to %true before the @timeout elapsed, or -%ERESTARTSYS if it was
+ * interrupted by a kill signal.
+ *
+ * Only kill signals interrupt this process.
+ */
+#define wait_event_killable_timeout(wq_head, condition, timeout)	\
+({									\
+	long __ret = timeout;						\
+	might_sleep();							\
+	if (!___wait_cond_timeout(condition))				\
+		__ret = __wait_event_killable_timeout(wq_head,		\
+						condition, timeout);	\
+	__ret;								\
+})
+
 #define __wait_event_lock_irq(wq_head, condition, lock, cmd)		\
 	(void)___wait_event(wq_head, condition, TASK_UNINTERRUPTIBLE, 0, 0,	\
......
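A hedged usage sketch of the new helper (not from the patch; my_wq and my_done are hypothetical names): wait up to five seconds for a condition while still honouring SIGKILL, and map the documented return values onto errno-style results.

	static DECLARE_WAIT_QUEUE_HEAD(my_wq);	/* hypothetical wait queue */
	static bool my_done;			/* hypothetical condition, set elsewhere followed by wake_up(&my_wq) */

	static int wait_for_my_done(void)
	{
		long ret = wait_event_killable_timeout(my_wq, my_done, 5 * HZ);

		if (ret == 0)
			return -ETIMEDOUT;	/* condition still false after the timeout */
		if (ret == -ERESTARTSYS)
			return ret;		/* a fatal (kill) signal arrived */
		return 0;			/* condition became true (ret >= 1) */
	}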
@@ -70,6 +70,18 @@ static DECLARE_RWSEM(umhelper_sem);
 static atomic_t kmod_concurrent_max = ATOMIC_INIT(MAX_KMOD_CONCURRENT);
 static DECLARE_WAIT_QUEUE_HEAD(kmod_wq);
 
+/*
+ * This is a restriction on having *all* MAX_KMOD_CONCURRENT threads
+ * running at the same time without returning. When this happens we
+ * believe you've somehow ended up with a recursive module dependency
+ * creating a loop.
+ *
+ * We have no option but to fail.
+ *
+ * Userspace should proactively try to detect and prevent these.
+ */
+#define MAX_KMOD_ALL_BUSY_TIMEOUT 5
+
 /*
 	modprobe_path is set via /proc/sys.
 */
@@ -167,8 +179,17 @@ int __request_module(bool wait, const char *fmt, ...)
 		pr_warn_ratelimited("request_module: kmod_concurrent_max (%u) close to 0 (max_modprobes: %u), for module %s, throttling...",
 				    atomic_read(&kmod_concurrent_max),
 				    MAX_KMOD_CONCURRENT, module_name);
-		wait_event_interruptible(kmod_wq,
-					 atomic_dec_if_positive(&kmod_concurrent_max) >= 0);
+		ret = wait_event_killable_timeout(kmod_wq,
+						  atomic_dec_if_positive(&kmod_concurrent_max) >= 0,
+						  MAX_KMOD_ALL_BUSY_TIMEOUT * HZ);
+		if (!ret) {
+			pr_warn_ratelimited("request_module: modprobe %s cannot be processed, kmod busy with %d threads for more than %d seconds now",
+					    module_name, MAX_KMOD_CONCURRENT, MAX_KMOD_ALL_BUSY_TIMEOUT);
+			return -ETIME;
+		} else if (ret == -ERESTARTSYS) {
+			pr_warn_ratelimited("request_module: sigkill sent for modprobe %s, giving up", module_name);
+			return ret;
+		}
 	}
 
 	trace_module_request(module_name, wait, _RET_IP_);
......
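From a caller's point of view, request_module() can now come back with two new failures when the modprobe slots are saturated. A hedged sketch only (try_load_widget() and the module name are hypothetical, not from the patch):

	static int try_load_widget(void)
	{
		int err = request_module("widget-driver");	/* hypothetical module name */

		if (err == -ETIME)
			/* all MAX_KMOD_CONCURRENT slots stayed busy for the whole timeout;
			 * likely a recursive module dependency loop */
			pr_warn("widget: kmod saturated, giving up\n");
		else if (err == -ERESTARTSYS)
			/* the requesting task received a fatal signal while throttled */
			pr_info("widget: module request interrupted\n");

		return err;
	}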
@@ -1194,7 +1194,11 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t)
 			recalc_sigpending_and_wake(t);
 		}
 	}
-	if (action->sa.sa_handler == SIG_DFL)
+	/*
+	 * Don't clear SIGNAL_UNKILLABLE for traced tasks, users won't expect
+	 * debugging to leave init killable.
+	 */
+	if (action->sa.sa_handler == SIG_DFL && !t->ptrace)
 		t->signal->flags &= ~SIGNAL_UNKILLABLE;
 	ret = specific_send_sig_info(sig, info, t);
 	spin_unlock_irqrestore(&t->sighand->siglock, flags);
......
@@ -167,7 +167,7 @@ static void cma_debugfs_add_one(struct cma *cma, int idx)
 	char name[16];
 	int u32s;
 
-	sprintf(name, "cma-%s", cma->name);
+	scnprintf(name, sizeof(name), "cma-%s", cma->name);
 
 	tmp = debugfs_create_dir(name, cma_debugfs_root);
......
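The one-liner matters because name[] is only 16 bytes while CMA area names can be longer. A hedged illustration of the difference (cma_name is a hypothetical example string, not from the patch):

	char name[16];
	const char *cma_name = "a-rather-long-cma-area-name";	/* hypothetical CMA name */

	/* scnprintf() truncates to the buffer size and always NUL-terminates,
	 * so at most 15 characters plus '\0' are written into name[]. */
	scnprintf(name, sizeof(name), "cma-%s", cma_name);

	/* sprintf(name, "cma-%s", cma_name) would write 32 bytes into the
	 * 16-byte stack buffer and corrupt the stack. */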
@@ -32,6 +32,7 @@
 #include <linux/userfaultfd_k.h>
 #include <linux/page_idle.h>
 #include <linux/shmem_fs.h>
+#include <linux/oom.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -550,6 +551,7 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 	struct mem_cgroup *memcg;
 	pgtable_t pgtable;
 	unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
+	int ret = 0;
 
 	VM_BUG_ON_PAGE(!PageCompound(page), page);
@@ -561,9 +563,8 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 	pgtable = pte_alloc_one(vma->vm_mm, haddr);
 	if (unlikely(!pgtable)) {
-		mem_cgroup_cancel_charge(page, memcg, true);
-		put_page(page);
-		return VM_FAULT_OOM;
+		ret = VM_FAULT_OOM;
+		goto release;
 	}
 
 	clear_huge_page(page, haddr, HPAGE_PMD_NR);
@@ -576,13 +577,14 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 	vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
 	if (unlikely(!pmd_none(*vmf->pmd))) {
-		spin_unlock(vmf->ptl);
-		mem_cgroup_cancel_charge(page, memcg, true);
-		put_page(page);
-		pte_free(vma->vm_mm, pgtable);
+		goto unlock_release;
 	} else {
 		pmd_t entry;
 
+		ret = check_stable_address_space(vma->vm_mm);
+		if (ret)
+			goto unlock_release;
+
 		/* Deliver the page fault to userland */
 		if (userfaultfd_missing(vma)) {
 			int ret;
@@ -610,6 +612,15 @@ static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page,
 	}
 
 	return 0;
+unlock_release:
+	spin_unlock(vmf->ptl);
+release:
+	if (pgtable)
+		pte_free(vma->vm_mm, pgtable);
+	mem_cgroup_cancel_charge(page, memcg, true);
+	put_page(page);
+	return ret;
 }
 
 /*
@@ -688,7 +699,10 @@ int do_huge_pmd_anonymous_page(struct vm_fault *vmf)
 		ret = 0;
 		set = false;
 		if (pmd_none(*vmf->pmd)) {
-			if (userfaultfd_missing(vma)) {
+			ret = check_stable_address_space(vma->vm_mm);
+			if (ret) {
+				spin_unlock(vmf->ptl);
+			} else if (userfaultfd_missing(vma)) {
 				spin_unlock(vmf->ptl);
 				ret = handle_userfault(vmf, VM_UFFD_MISSING);
 				VM_BUG_ON(ret & VM_FAULT_FALLBACK);
......
@@ -285,31 +285,27 @@ static void __init_memblock memblock_remove_region(struct memblock_type *type, u
 }
 
 #ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
-
-phys_addr_t __init_memblock get_allocated_memblock_reserved_regions_info(
-					phys_addr_t *addr)
-{
-	if (memblock.reserved.regions == memblock_reserved_init_regions)
-		return 0;
-
-	*addr = __pa(memblock.reserved.regions);
-
-	return PAGE_ALIGN(sizeof(struct memblock_region) *
-			  memblock.reserved.max);
-}
-
-phys_addr_t __init_memblock get_allocated_memblock_memory_regions_info(
-					phys_addr_t *addr)
-{
-	if (memblock.memory.regions == memblock_memory_init_regions)
-		return 0;
-
-	*addr = __pa(memblock.memory.regions);
-
-	return PAGE_ALIGN(sizeof(struct memblock_region) *
-			  memblock.memory.max);
-}
-
+/**
+ * Discard memory and reserved arrays if they were allocated
+ */
+void __init memblock_discard(void)
+{
+	phys_addr_t addr, size;
+
+	if (memblock.reserved.regions != memblock_reserved_init_regions) {
+		addr = __pa(memblock.reserved.regions);
+		size = PAGE_ALIGN(sizeof(struct memblock_region) *
+				  memblock.reserved.max);
+		__memblock_free_late(addr, size);
+	}
+
+	if (memblock.memory.regions == memblock_memory_init_regions) {
+		addr = __pa(memblock.memory.regions);
+		size = PAGE_ALIGN(sizeof(struct memblock_region) *
+				  memblock.memory.max);
+		__memblock_free_late(addr, size);
+	}
+}
 #endif
 
 /**
......
@@ -1611,9 +1611,13 @@ bool mem_cgroup_oom_synchronize(bool handle)
  * @page: the page
  *
  * This function protects unlocked LRU pages from being moved to
- * another cgroup and stabilizes their page->mem_cgroup binding.
+ * another cgroup.
+ *
+ * It ensures lifetime of the returned memcg. Caller is responsible
+ * for the lifetime of the page; __unlock_page_memcg() is available
+ * when @page might get freed inside the locked section.
  */
-void lock_page_memcg(struct page *page)
+struct mem_cgroup *lock_page_memcg(struct page *page)
 {
 	struct mem_cgroup *memcg;
 	unsigned long flags;
@@ -1622,18 +1626,24 @@ void lock_page_memcg(struct page *page)
 	 * The RCU lock is held throughout the transaction.  The fast
 	 * path can get away without acquiring the memcg->move_lock
 	 * because page moving starts with an RCU grace period.
+	 *
+	 * The RCU lock also protects the memcg from being freed when
+	 * the page state that is going to change is the only thing
+	 * preventing the page itself from being freed. E.g. writeback
+	 * doesn't hold a page reference and relies on PG_writeback to
+	 * keep off truncation, migration and so forth.
 	 */
 	rcu_read_lock();
 
 	if (mem_cgroup_disabled())
-		return;
+		return NULL;
 again:
 	memcg = page->mem_cgroup;
 	if (unlikely(!memcg))
-		return;
+		return NULL;
 
 	if (atomic_read(&memcg->moving_account) <= 0)
-		return;
+		return memcg;
 
 	spin_lock_irqsave(&memcg->move_lock, flags);
 	if (memcg != page->mem_cgroup) {
@@ -1649,18 +1659,18 @@ void lock_page_memcg(struct page *page)
 	memcg->move_lock_task = current;
 	memcg->move_lock_flags = flags;
 
-	return;
+	return memcg;
 }
 EXPORT_SYMBOL(lock_page_memcg);
 
 /**
- * unlock_page_memcg - unlock a page->mem_cgroup binding
- * @page: the page
+ * __unlock_page_memcg - unlock and unpin a memcg
+ * @memcg: the memcg
+ *
+ * Unlock and unpin a memcg returned by lock_page_memcg().
  */
-void unlock_page_memcg(struct page *page)
+void __unlock_page_memcg(struct mem_cgroup *memcg)
 {
-	struct mem_cgroup *memcg = page->mem_cgroup;
-
 	if (memcg && memcg->move_lock_task == current) {
 		unsigned long flags = memcg->move_lock_flags;
@@ -1672,6 +1682,15 @@ void unlock_page_memcg(struct page *page)
 	rcu_read_unlock();
 }
+
+/**
+ * unlock_page_memcg - unlock a page->mem_cgroup binding
+ * @page: the page
+ */
+void unlock_page_memcg(struct page *page)
+{
+	__unlock_page_memcg(page->mem_cgroup);
+}
 EXPORT_SYMBOL(unlock_page_memcg);
 
 /*
......
@@ -68,6 +68,7 @@
 #include <linux/debugfs.h>
 #include <linux/userfaultfd_k.h>
 #include <linux/dax.h>
+#include <linux/oom.h>
 
 #include <asm/io.h>
 #include <asm/mmu_context.h>
@@ -2893,6 +2894,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
 	struct vm_area_struct *vma = vmf->vma;
 	struct mem_cgroup *memcg;
 	struct page *page;
+	int ret = 0;
 	pte_t entry;
 
 	/* File mapping without ->vm_ops ? */
@@ -2925,6 +2927,9 @@ static int do_anonymous_page(struct vm_fault *vmf)
 				vmf->address, &vmf->ptl);
 		if (!pte_none(*vmf->pte))
 			goto unlock;
+		ret = check_stable_address_space(vma->vm_mm);
+		if (ret)
+			goto unlock;
 		/* Deliver the page fault to userland, check inside PT lock */
 		if (userfaultfd_missing(vma)) {
 			pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2959,6 +2964,10 @@ static int do_anonymous_page(struct vm_fault *vmf)
 	if (!pte_none(*vmf->pte))
 		goto release;
 
+	ret = check_stable_address_space(vma->vm_mm);
+	if (ret)
+		goto release;
+
 	/* Deliver the page fault to userland, check inside PT lock */
 	if (userfaultfd_missing(vma)) {
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -2978,7 +2987,7 @@ static int do_anonymous_page(struct vm_fault *vmf)
 	update_mmu_cache(vma, vmf->address, vmf->pte);
 unlock:
 	pte_unmap_unlock(vmf->pte, vmf->ptl);
-	return 0;
+	return ret;
 release:
 	mem_cgroup_cancel_charge(page, memcg, false);
 	put_page(page);
@@ -3252,7 +3261,7 @@ int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
 int finish_fault(struct vm_fault *vmf)
 {
 	struct page *page;
-	int ret;
+	int ret = 0;
 
 	/* Did we COW the page? */
 	if ((vmf->flags & FAULT_FLAG_WRITE) &&
@@ -3260,6 +3269,14 @@ int finish_fault(struct vm_fault *vmf)
 		page = vmf->cow_page;
 	else
 		page = vmf->page;
-	ret = alloc_set_pte(vmf, vmf->memcg, page);
+
+	/*
+	 * check even for read faults because we might have lost our CoWed
+	 * page
+	 */
+	if (!(vmf->vma->vm_flags & VM_SHARED))
+		ret = check_stable_address_space(vmf->vma->vm_mm);
+
+	if (!ret)
+		ret = alloc_set_pte(vmf, vmf->memcg, page);
 	if (vmf->pte)
 		pte_unmap_unlock(vmf->pte, vmf->ptl);
@@ -3900,19 +3917,6 @@ int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
 			mem_cgroup_oom_synchronize(false);
 	}
 
-	/*
-	 * This mm has been already reaped by the oom reaper and so the
-	 * refault cannot be trusted in general. Anonymous refaults would
-	 * lose data and give a zero page instead e.g. This is especially
-	 * problem for use_mm() because regular tasks will just die and
-	 * the corrupted data will not be visible anywhere while kthread
-	 * will outlive the oom victim and potentially propagate the date
-	 * further.
-	 */
-	if (unlikely((current->flags & PF_KTHREAD) && !(ret & VM_FAULT_ERROR)
-				&& test_bit(MMF_UNSTABLE, &vma->vm_mm->flags)))
-		ret = VM_FAULT_SIGBUS;
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(handle_mm_fault);
......
@@ -861,11 +861,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 		*policy |= (pol->flags & MPOL_MODE_FLAGS);
 	}
 
-	if (vma) {
-		up_read(&current->mm->mmap_sem);
-		vma = NULL;
-	}
-
 	err = 0;
 	if (nmask) {
 		if (mpol_store_user_nodemask(pol)) {
......
@@ -146,22 +146,6 @@ static unsigned long __init free_low_memory_core_early(void)
 				NULL)
 		count += __free_memory_core(start, end);
 
-#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
-	{
-		phys_addr_t size;
-
-		/* Free memblock.reserved array if it was allocated */
-		size = get_allocated_memblock_reserved_regions_info(&start);
-		if (size)
-			count += __free_memory_core(start, start + size);
-
-		/* Free memblock.memory array if it was allocated */
-		size = get_allocated_memblock_memory_regions_info(&start);
-		if (size)
-			count += __free_memory_core(start, start + size);
-	}
-#endif
-
 	return count;
 }
......
@@ -2724,9 +2724,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
 int test_clear_page_writeback(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
+	struct mem_cgroup *memcg;
+	struct lruvec *lruvec;
 	int ret;
 
-	lock_page_memcg(page);
+	memcg = lock_page_memcg(page);
+	lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
 	if (mapping && mapping_use_writeback_tags(mapping)) {
 		struct inode *inode = mapping->host;
 		struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -2754,12 +2757,18 @@ int test_clear_page_writeback(struct page *page)
 	} else {
 		ret = TestClearPageWriteback(page);
 	}
+	/*
+	 * NOTE: Page might be free now! Writeback doesn't hold a page
+	 * reference on its own, it relies on truncation to wait for
+	 * the clearing of PG_writeback. The below can only access
+	 * page state that is static across allocation cycles.
+	 */
 	if (ret) {
-		dec_lruvec_page_state(page, NR_WRITEBACK);
+		dec_lruvec_state(lruvec, NR_WRITEBACK);
 		dec_zone_page_state(page, NR_ZONE_WRITE_PENDING);
 		inc_node_page_state(page, NR_WRITTEN);
 	}
-	unlock_page_memcg(page);
+	__unlock_page_memcg(memcg);
 	return ret;
 }
......
@@ -1584,6 +1584,10 @@ void __init page_alloc_init_late(void)
 	/* Reinit limits that are based on free pages after the kernel is up */
 	files_maxfiles_init();
 #endif
+#ifdef CONFIG_ARCH_DISCARD_MEMBLOCK
+	/* Discard memblock private memory */
+	memblock_discard();
+#endif
 
 	for_each_populated_zone(zone)
 		set_zone_contiguous(zone);
......
@@ -5642,13 +5642,14 @@ static void sysfs_slab_remove_workfn(struct work_struct *work)
 		 * A cache is never shut down before deactivation is
 		 * complete, so no need to worry about synchronization.
 		 */
-		return;
+		goto out;
 
 #ifdef CONFIG_MEMCG
 	kset_unregister(s->memcg_kset);
 #endif
 	kobject_uevent(&s->kobj, KOBJ_REMOVE);
 	kobject_del(&s->kobj);
+out:
 	kobject_put(&s->kobj);
 }
......
@@ -1671,7 +1671,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	struct page **pages;
 	unsigned int nr_pages, array_size, i;
 	const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
-	const gfp_t alloc_mask = gfp_mask | __GFP_HIGHMEM | __GFP_NOWARN;
+	const gfp_t alloc_mask = gfp_mask | __GFP_NOWARN;
+	const gfp_t highmem_mask = (gfp_mask & (GFP_DMA | GFP_DMA32)) ?
+					0 :
+					__GFP_HIGHMEM;
 
 	nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
 	array_size = (nr_pages * sizeof(struct page *));
@@ -1679,7 +1682,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	area->nr_pages = nr_pages;
 	/* Please note that the recursion is strictly bounded. */
 	if (array_size > PAGE_SIZE) {
-		pages = __vmalloc_node(array_size, 1, nested_gfp|__GFP_HIGHMEM,
+		pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
 				PAGE_KERNEL, node, area->caller);
 	} else {
 		pages = kmalloc_node(array_size, nested_gfp, node);
@@ -1700,9 +1703,9 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 		}
 
 		if (node == NUMA_NO_NODE)
-			page = alloc_page(alloc_mask);
+			page = alloc_page(alloc_mask|highmem_mask);
 		else
-			page = alloc_pages_node(node, alloc_mask, 0);
+			page = alloc_pages_node(node, alloc_mask|highmem_mask, 0);
 
 		if (unlikely(!page)) {
 			/* Successfully allocated i pages, free them in __vunmap() */
@@ -1710,7 +1713,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 			goto fail;
 		}
 		area->pages[i] = page;
-		if (gfpflags_allow_blocking(gfp_mask))
+		if (gfpflags_allow_blocking(gfp_mask|highmem_mask))
 			cond_resched();
 	}
......
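The point of the new highmem_mask is that __GFP_HIGHMEM conflicts with zone-restricted requests: a caller asking for GFP_DMA or GFP_DMA32 memory cannot be served from highmem. A small hedged sketch of how the ternary behaves (the gfp values shown are illustrative, not from the patch):

	gfp_t dma_req    = GFP_DMA | __GFP_ZERO;	/* hypothetical zone-restricted caller */
	gfp_t normal_req = GFP_KERNEL;			/* hypothetical ordinary caller */

	/* zone-restricted request: highmem stays out of the allocation mask */
	gfp_t dma_highmem    = (dma_req & (GFP_DMA | GFP_DMA32)) ? 0 : __GFP_HIGHMEM;	/* == 0 */
	/* ordinary request: old behaviour is kept, highmem pages are allowed */
	gfp_t normal_highmem = (normal_req & (GFP_DMA | GFP_DMA32)) ? 0 : __GFP_HIGHMEM;	/* == __GFP_HIGHMEM */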
@@ -473,8 +473,8 @@ usage()
 	echo "    all     Runs all tests (default)"
 	echo "    -t      Run test ID the number amount of times is recommended"
 	echo "    -w      Watch test ID run until it runs into an error"
-	echo "    -c      Run test ID once"
-	echo "    -s      Run test ID x test-count number of times"
+	echo "    -s      Run test ID once"
+	echo "    -c      Run test ID x test-count number of times"
 	echo "    -l      List all test ID list"
 	echo " -h|--help  Help"
 	echo
......