Commit b3d574ae authored by Linus Torvalds

Merge branch 'akpm' (patches from Andrew)

Merge misc fixes from Andrew Morton:
 "12 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm, vmscan: prevent kswapd livelock due to pfmemalloc-throttled process being killed
  memcg: fix destination cgroup leak on task charges migration
  mm: memcontrol: switch soft limit default back to infinity
  mm/debug_pagealloc: remove obsolete Kconfig options
  vfs: renumber FMODE_NONOTIFY and add to uniqueness check
  arch/blackfin/mach-bf533/boards/stamp.c: add linux/delay.h
  ocfs2: fix the wrong directory passed to ocfs2_lookup_ino_from_name() when link file
  MAINTAINERS: update rydberg's addresses
  mm: protect set_page_dirty() from ongoing truncation
  mm: prevent endless growth of anon_vma hierarchy
  exit: fix race between wait_consider_task() and wait_task_zombie()
  ocfs2: remove bogus check in dlm_process_recovery_data
parents 11c8f01b 9e5e3661
--- a/.mailmap
+++ b/.mailmap
@@ -51,6 +51,7 @@ Greg Kroah-Hartman <gregkh@suse.de>
 Greg Kroah-Hartman <greg@kroah.com>
 Henk Vergonet <Henk.Vergonet@gmail.com>
 Henrik Kretzschmar <henne@nachtwindheim.de>
+Henrik Rydberg <rydberg@bitmath.org>
 Herbert Xu <herbert@gondor.apana.org.au>
 Jacob Shin <Jacob.Shin@amd.com>
 James Bottomley <jejb@mulgrave.(none)>
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -724,15 +724,15 @@ F: include/uapi/linux/apm_bios.h
 F: drivers/char/apm-emulation.c
 
 APPLE BCM5974 MULTITOUCH DRIVER
-M: Henrik Rydberg <rydberg@euromail.se>
+M: Henrik Rydberg <rydberg@bitmath.org>
 L: linux-input@vger.kernel.org
-S: Maintained
+S: Odd fixes
 F: drivers/input/mouse/bcm5974.c
 
 APPLE SMC DRIVER
-M: Henrik Rydberg <rydberg@euromail.se>
+M: Henrik Rydberg <rydberg@bitmath.org>
 L: lm-sensors@lm-sensors.org
-S: Maintained
+S: Odd fixes
 F: drivers/hwmon/applesmc.c
 
 APPLETALK NETWORK LAYER
@@ -4940,10 +4940,10 @@ F: include/uapi/linux/input.h
 F: include/linux/input/
 
 INPUT MULTITOUCH (MT) PROTOCOL
-M: Henrik Rydberg <rydberg@euromail.se>
+M: Henrik Rydberg <rydberg@bitmath.org>
 L: linux-input@vger.kernel.org
 T: git git://git.kernel.org/pub/scm/linux/kernel/git/rydberg/input-mt.git
-S: Maintained
+S: Odd fixes
 F: Documentation/input/multi-touch-protocol.txt
 F: drivers/input/input-mt.c
 K: \b(ABS|SYN)_MT_
--- a/arch/blackfin/mach-bf533/boards/stamp.c
+++ b/arch/blackfin/mach-bf533/boards/stamp.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/device.h>
+#include <linux/delay.h>
 #include <linux/platform_device.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/partitions.h>
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -740,14 +740,15 @@ static int __init fcntl_init(void)
	 * Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
	 * is defined as O_NONBLOCK on some platforms and not on others.
	 */
-	BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
+	BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
		O_RDONLY | O_WRONLY | O_RDWR |
		O_CREAT | O_EXCL | O_NOCTTY |
		O_TRUNC | O_APPEND | /* O_NONBLOCK | */
		__O_SYNC | O_DSYNC | FASYNC |
		O_DIRECT | O_LARGEFILE | O_DIRECTORY |
		O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
-		__FMODE_EXEC | O_PATH | __O_TMPFILE
+		__FMODE_EXEC | O_PATH | __O_TMPFILE |
+		__FMODE_NONOTIFY
		));
 
	fasync_cache = kmem_cache_create("fasync_cache",
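The BUILD_BUG_ON() above works because HWEIGHT32() counts set bits: when every open flag occupies its own bit, the population count of their bitwise OR equals the number of flags (21 with __FMODE_NONOTIFY added, minus one because O_RDONLY is 0). A minimal user-space sketch of the same uniqueness check, using made-up flag values rather than the kernel's definitions:

    #include <assert.h>
    #include <stdio.h>

    /* Illustrative flag values only; not the kernel's definitions. */
    #define FLAG_A 0x01u
    #define FLAG_B 0x02u
    #define FLAG_C 0x04u
    #define FLAG_D 0x08u   /* change this to 0x04u and the assert fires */

    int main(void)
    {
            /* __builtin_popcount plays the role of HWEIGHT32(): if any two
             * flags shared a bit, the OR would have fewer set bits than
             * there are flags. */
            unsigned int all = FLAG_A | FLAG_B | FLAG_C | FLAG_D;

            assert(__builtin_popcount(all) == 4);
            printf("all 4 flags occupy distinct bits\n");
            return 0;
    }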
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2023,11 +2023,8 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
			dlm_lockres_drop_inflight_ref(dlm, res);
			spin_unlock(&res->spinlock);
 
-			if (ret < 0) {
+			if (ret < 0)
				mlog_errno(ret);
-				if (newlock)
-					dlm_lock_put(newlock);
-			}
 
			return ret;
		}
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -94,6 +94,14 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
				     struct inode *inode,
				     const char *symname);
 
+static int ocfs2_double_lock(struct ocfs2_super *osb,
+			     struct buffer_head **bh1,
+			     struct inode *inode1,
+			     struct buffer_head **bh2,
+			     struct inode *inode2,
+			     int rename);
+
+static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2);
 /* An orphan dir name is an 8 byte value, printed as a hex string */
 #define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
@@ -678,8 +686,10 @@ static int ocfs2_link(struct dentry *old_dentry,
 {
	handle_t *handle;
	struct inode *inode = old_dentry->d_inode;
+	struct inode *old_dir = old_dentry->d_parent->d_inode;
	int err;
	struct buffer_head *fe_bh = NULL;
+	struct buffer_head *old_dir_bh = NULL;
	struct buffer_head *parent_fe_bh = NULL;
	struct ocfs2_dinode *fe = NULL;
	struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
@@ -696,19 +706,33 @@ static int ocfs2_link(struct dentry *old_dentry,
	dquot_initialize(dir);
 
-	err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT);
+	err = ocfs2_double_lock(osb, &old_dir_bh, old_dir,
+				&parent_fe_bh, dir, 0);
	if (err < 0) {
		if (err != -ENOENT)
			mlog_errno(err);
		return err;
	}
 
+	/* make sure both dirs have bhs
+	 * get an extra ref on old_dir_bh if old==new */
+	if (!parent_fe_bh) {
+		if (old_dir_bh) {
+			parent_fe_bh = old_dir_bh;
+			get_bh(parent_fe_bh);
+		} else {
+			mlog(ML_ERROR, "%s: no old_dir_bh!\n", osb->uuid_str);
+			err = -EIO;
+			goto out;
+		}
+	}
+
	if (!dir->i_nlink) {
		err = -ENOENT;
		goto out;
	}
 
-	err = ocfs2_lookup_ino_from_name(dir, old_dentry->d_name.name,
+	err = ocfs2_lookup_ino_from_name(old_dir, old_dentry->d_name.name,
			old_dentry->d_name.len, &old_de_ino);
	if (err) {
		err = -ENOENT;
@@ -801,10 +825,11 @@ static int ocfs2_link(struct dentry *old_dentry,
	ocfs2_inode_unlock(inode, 1);
 
 out:
-	ocfs2_inode_unlock(dir, 1);
+	ocfs2_double_unlock(old_dir, dir);
 
	brelse(fe_bh);
	brelse(parent_fe_bh);
+	brelse(old_dir_bh);
 
	ocfs2_free_dir_lookup_result(&lookup);
@@ -1072,14 +1097,15 @@ static int ocfs2_check_if_ancestor(struct ocfs2_super *osb,
 }
 
 /*
- * The only place this should be used is rename!
+ * The only place this should be used is rename and link!
  * if they have the same id, then the 1st one is the only one locked.
  */
 static int ocfs2_double_lock(struct ocfs2_super *osb,
			     struct buffer_head **bh1,
			     struct inode *inode1,
			     struct buffer_head **bh2,
-			     struct inode *inode2)
+			     struct inode *inode2,
+			     int rename)
 {
	int status;
	int inode1_is_ancestor, inode2_is_ancestor;
@@ -1127,7 +1153,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
		}
		/* lock id2 */
		status = ocfs2_inode_lock_nested(inode2, bh2, 1,
-						 OI_LS_RENAME1);
+				rename == 1 ? OI_LS_RENAME1 : OI_LS_PARENT);
		if (status < 0) {
			if (status != -ENOENT)
				mlog_errno(status);
@@ -1136,7 +1162,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
	}
 
	/* lock id1 */
-	status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_RENAME2);
+	status = ocfs2_inode_lock_nested(inode1, bh1, 1,
+			rename == 1 ? OI_LS_RENAME2 : OI_LS_PARENT);
	if (status < 0) {
		/*
		 * An error return must mean that no cluster locks
@@ -1252,7 +1279,7 @@ static int ocfs2_rename(struct inode *old_dir,
	/* if old and new are the same, this'll just do one lock. */
	status = ocfs2_double_lock(osb, &old_dir_bh, old_dir,
-				   &new_dir_bh, new_dir);
+				   &new_dir_bh, new_dir, 1);
	if (status < 0) {
		mlog_errno(status);
		goto bail;
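The link path now goes through the same ocfs2_double_lock()/ocfs2_double_unlock() pair as rename, so old_dir and dir are cluster-locked together, and only once when they are the same directory. A generic user-space sketch of that "lock two objects, once if identical, in a fixed order" pattern, using pthread mutexes and illustrative names (ocfs2 itself orders by lock id and ancestry, which this simplifies):

    #include <pthread.h>
    #include <stdio.h>

    struct obj {
            pthread_mutex_t lock;
            int id;
    };

    /* Lock two objects without deadlocking: take the lock once if they
     * are the same object, otherwise always lock the lower id first so
     * every caller uses the same order. */
    static void double_lock(struct obj *a, struct obj *b)
    {
            if (a == b) {
                    pthread_mutex_lock(&a->lock);
                    return;
            }
            if (a->id > b->id) {
                    struct obj *tmp = a;
                    a = b;
                    b = tmp;
            }
            pthread_mutex_lock(&a->lock);
            pthread_mutex_lock(&b->lock);
    }

    static void double_unlock(struct obj *a, struct obj *b)
    {
            pthread_mutex_unlock(&a->lock);
            if (a != b)
                    pthread_mutex_unlock(&b->lock);
    }

    int main(void)
    {
            struct obj x = { PTHREAD_MUTEX_INITIALIZER, 1 };
            struct obj y = { PTHREAD_MUTEX_INITIALIZER, 2 };

            double_lock(&x, &y);
            printf("both locked\n");
            double_unlock(&x, &y);
            return 0;
    }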
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -135,7 +135,7 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
 #define FMODE_CAN_WRITE ((__force fmode_t)0x40000)
 
 /* File was opened by fanotify and shouldn't generate fanotify events */
-#define FMODE_NONOTIFY ((__force fmode_t)0x1000000)
+#define FMODE_NONOTIFY ((__force fmode_t)0x4000000)
 
 /*
  * Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -36,6 +36,16 @@ struct anon_vma {
	 */
	atomic_t refcount;
 
+	/*
+	 * Count of child anon_vmas and VMAs which points to this anon_vma.
+	 *
+	 * This counter is used for making decision about reusing anon_vma
+	 * instead of forking new one. See comments in function anon_vma_clone.
+	 */
+	unsigned degree;
+
+	struct anon_vma *parent;	/* Parent of this anon_vma */
+
	/*
	 * NOTE: the LSB of the rb_root.rb_node is set by
	 * mm_take_all_locks() _after_ taking the above lock. So the
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -177,7 +177,6 @@ int write_cache_pages(struct address_space *mapping,
		      struct writeback_control *wbc, writepage_t writepage,
		      void *data);
 int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
-void set_page_dirty_balance(struct page *page);
 void writeback_set_ratelimit(void);
 void tag_pages_for_writeback(struct address_space *mapping,
			     pgoff_t start, pgoff_t end);
--- a/include/uapi/asm-generic/fcntl.h
+++ b/include/uapi/asm-generic/fcntl.h
@@ -5,7 +5,7 @@
 
 /*
  * FMODE_EXEC is 0x20
- * FMODE_NONOTIFY is 0x1000000
+ * FMODE_NONOTIFY is 0x4000000
  * These cannot be used by userspace O_* until internal and external open
  * flags are split.
  * -Eric Paris
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1287,9 +1287,15 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
 static int wait_consider_task(struct wait_opts *wo, int ptrace,
				struct task_struct *p)
 {
+	/*
+	 * We can race with wait_task_zombie() from another thread.
+	 * Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
+	 * can't confuse the checks below.
+	 */
+	int exit_state = ACCESS_ONCE(p->exit_state);
	int ret;
 
-	if (unlikely(p->exit_state == EXIT_DEAD))
+	if (unlikely(exit_state == EXIT_DEAD))
		return 0;
 
	ret = eligible_child(wo, p);
@@ -1310,7 +1316,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
		return 0;
	}
 
-	if (unlikely(p->exit_state == EXIT_TRACE)) {
+	if (unlikely(exit_state == EXIT_TRACE)) {
		/*
		 * ptrace == 0 means we are the natural parent. In this case
		 * we should clear notask_error, debugger will notify us.
@@ -1337,7 +1343,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
	}
 
	/* slay zombie? */
-	if (p->exit_state == EXIT_ZOMBIE) {
+	if (exit_state == EXIT_ZOMBIE) {
		/* we don't reap group leaders with subthreads */
		if (!delay_group_leader(p)) {
			/*
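The fix reads p->exit_state once through ACCESS_ONCE() so that every comparison in wait_consider_task() sees the same snapshot even if wait_task_zombie() changes the field concurrently. A rough user-space illustration of that read-once-into-a-local pattern, using C11 atomics in place of the kernel macro (names and values are illustrative, not the kernel's):

    #include <stdatomic.h>
    #include <stdio.h>

    enum { EXIT_ZOMBIE = 1, EXIT_DEAD = 2, EXIT_TRACE = 3 };

    /* May be updated by another thread at any time. */
    static _Atomic int exit_state_shared = EXIT_ZOMBIE;

    static int consider(void)
    {
            /* Load the shared state exactly once; every check below then
             * operates on the same snapshot, so a concurrent transition
             * (e.g. ZOMBIE -> DEAD) cannot make the checks disagree. */
            int exit_state = atomic_load_explicit(&exit_state_shared,
                                                  memory_order_relaxed);

            if (exit_state == EXIT_DEAD)
                    return 0;
            if (exit_state == EXIT_TRACE)
                    return 1;
            if (exit_state == EXIT_ZOMBIE)
                    return 2;
            return -1;
    }

    int main(void)
    {
            printf("%d\n", consider());
            return 0;
    }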
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -14,7 +14,6 @@ config DEBUG_PAGEALLOC
	depends on !KMEMCHECK
	select PAGE_EXTENSION
	select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
-	select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC
	---help---
	  Unmap pages from the kernel linear mapping after free_pages().
	  This results in a large slowdown, but helps to find certain types
@@ -27,13 +26,5 @@ config DEBUG_PAGEALLOC
	  that would result in incorrect warnings of memory corruption after
	  a resume because free pages are not saved to the suspend image.
 
-config WANT_PAGE_DEBUG_FLAGS
-	bool
-
 config PAGE_POISONING
	bool
-	select WANT_PAGE_DEBUG_FLAGS
-
-config PAGE_GUARD
-	bool
-	select WANT_PAGE_DEBUG_FLAGS
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3043,18 +3043,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
	if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
		mem_cgroup_swap_statistics(from, false);
		mem_cgroup_swap_statistics(to, true);
-		/*
-		 * This function is only called from task migration context now.
-		 * It postpones page_counter and refcount handling till the end
-		 * of task migration(mem_cgroup_clear_mc()) for performance
-		 * improvement. But we cannot postpone css_get(to) because if
-		 * the process that has been moved to @to does swap-in, the
-		 * refcount of @to might be decreased to 0.
-		 *
-		 * We are in attach() phase, so the cgroup is guaranteed to be
-		 * alive, so we can just call css_get().
-		 */
-		css_get(&to->css);
		return 0;
	}
	return -EINVAL;
@@ -4679,6 +4667,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
	if (parent_css == NULL) {
		root_mem_cgroup = memcg;
		page_counter_init(&memcg->memory, NULL);
+		memcg->soft_limit = PAGE_COUNTER_MAX;
		page_counter_init(&memcg->memsw, NULL);
		page_counter_init(&memcg->kmem, NULL);
	}
@@ -4724,6 +4713,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
	if (parent->use_hierarchy) {
		page_counter_init(&memcg->memory, &parent->memory);
+		memcg->soft_limit = PAGE_COUNTER_MAX;
		page_counter_init(&memcg->memsw, &parent->memsw);
		page_counter_init(&memcg->kmem, &parent->kmem);
@@ -4733,6 +4723,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
		 */
	} else {
		page_counter_init(&memcg->memory, NULL);
+		memcg->soft_limit = PAGE_COUNTER_MAX;
		page_counter_init(&memcg->memsw, NULL);
		page_counter_init(&memcg->kmem, NULL);
		/*
@@ -4807,7 +4798,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
	mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
	mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
	memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
-	memcg->soft_limit = 0;
+	memcg->soft_limit = PAGE_COUNTER_MAX;
 }
 
 #ifdef CONFIG_MMU
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2137,17 +2137,24 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
	if (!dirty_page)
		return ret;
 
-	/*
-	 * Yes, Virginia, this is actually required to prevent a race
-	 * with clear_page_dirty_for_io() from clearing the page dirty
-	 * bit after it clear all dirty ptes, but before a racing
-	 * do_wp_page installs a dirty pte.
-	 *
-	 * do_shared_fault is protected similarly.
-	 */
	if (!page_mkwrite) {
-		wait_on_page_locked(dirty_page);
-		set_page_dirty_balance(dirty_page);
+		struct address_space *mapping;
+		int dirtied;
+
+		lock_page(dirty_page);
+		dirtied = set_page_dirty(dirty_page);
+		VM_BUG_ON_PAGE(PageAnon(dirty_page), dirty_page);
+		mapping = dirty_page->mapping;
+		unlock_page(dirty_page);
+
+		if (dirtied && mapping) {
+			/*
+			 * Some device drivers do not set page.mapping
+			 * but still dirty their pages
+			 */
+			balance_dirty_pages_ratelimited(mapping);
+		}
+
		/* file_update_time outside page_lock */
		if (vma->vm_file)
			file_update_time(vma->vm_file);
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -1541,16 +1541,6 @@ static void balance_dirty_pages(struct address_space *mapping,
		bdi_start_background_writeback(bdi);
 }
 
-void set_page_dirty_balance(struct page *page)
-{
-	if (set_page_dirty(page)) {
-		struct address_space *mapping = page_mapping(page);
-
-		if (mapping)
-			balance_dirty_pages_ratelimited(mapping);
-	}
-}
-
 static DEFINE_PER_CPU(int, bdp_ratelimits);
 
 /*
@@ -2123,32 +2113,25 @@ EXPORT_SYMBOL(account_page_dirtied);
  * page dirty in that case, but not all the buffers. This is a "bottom-up"
  * dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
  *
- * Most callers have locked the page, which pins the address_space in memory.
- * But zap_pte_range() does not lock the page, however in that case the
- * mapping is pinned by the vma's ->vm_file reference.
- *
- * We take care to handle the case where the page was truncated from the
- * mapping by re-checking page_mapping() inside tree_lock.
+ * The caller must ensure this doesn't race with truncation. Most will simply
+ * hold the page lock, but e.g. zap_pte_range() calls with the page mapped and
+ * the pte lock held, which also locks out truncation.
  */
 int __set_page_dirty_nobuffers(struct page *page)
 {
	if (!TestSetPageDirty(page)) {
		struct address_space *mapping = page_mapping(page);
-		struct address_space *mapping2;
		unsigned long flags;
 
		if (!mapping)
			return 1;
 
		spin_lock_irqsave(&mapping->tree_lock, flags);
-		mapping2 = page_mapping(page);
-		if (mapping2) { /* Race with truncate? */
-			BUG_ON(mapping2 != mapping);
-			WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
-			account_page_dirtied(page, mapping);
-			radix_tree_tag_set(&mapping->page_tree,
-				page_index(page), PAGECACHE_TAG_DIRTY);
-		}
+		BUG_ON(page_mapping(page) != mapping);
+		WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
+		account_page_dirtied(page, mapping);
+		radix_tree_tag_set(&mapping->page_tree, page_index(page),
+				   PAGECACHE_TAG_DIRTY);
		spin_unlock_irqrestore(&mapping->tree_lock, flags);
		if (mapping->host) {
			/* !PageAnon && !swapper_space */
@@ -2306,11 +2289,9 @@ int clear_page_dirty_for_io(struct page *page)
		 * We carefully synchronise fault handlers against
		 * installing a dirty pte and marking the page dirty
		 * at this point. We do this by having them hold the
-		 * page lock at some point after installing their
-		 * pte, but before marking the page dirty.
-		 * Pages are always locked coming in here, so we get
-		 * the desired exclusion. See mm/memory.c:do_wp_page()
-		 * for more comments.
+		 * page lock while dirtying the page, and pages are
+		 * always locked coming in here, so we get the desired
+		 * exclusion.
		 */
		if (TestClearPageDirty(page)) {
			dec_zone_page_state(page, NR_FILE_DIRTY);
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -72,6 +72,8 @@ static inline struct anon_vma *anon_vma_alloc(void)
	anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
	if (anon_vma) {
		atomic_set(&anon_vma->refcount, 1);
+		anon_vma->degree = 1;	/* Reference for first vma */
+		anon_vma->parent = anon_vma;
		/*
		 * Initialise the anon_vma root to point to itself. If called
		 * from fork, the root will be reset to the parents anon_vma.
@@ -188,6 +190,8 @@ int anon_vma_prepare(struct vm_area_struct *vma)
		if (likely(!vma->anon_vma)) {
			vma->anon_vma = anon_vma;
			anon_vma_chain_link(vma, avc, anon_vma);
+			/* vma reference or self-parent link for new root */
+			anon_vma->degree++;
			allocated = NULL;
			avc = NULL;
		}
@@ -236,6 +240,14 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
 /*
  * Attach the anon_vmas from src to dst.
  * Returns 0 on success, -ENOMEM on failure.
+ *
+ * If dst->anon_vma is NULL this function tries to find and reuse existing
+ * anon_vma which has no vmas and only one child anon_vma. This prevents
+ * degradation of anon_vma hierarchy to endless linear chain in case of
+ * constantly forking task. On the other hand, an anon_vma with more than one
+ * child isn't reused even if there was no alive vma, thus rmap walker has a
+ * good chance of avoiding scanning the whole hierarchy when it searches where
+ * page is mapped.
  */
 int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
 {
@@ -256,7 +268,21 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
		anon_vma = pavc->anon_vma;
		root = lock_anon_vma_root(root, anon_vma);
		anon_vma_chain_link(dst, avc, anon_vma);
+
+		/*
+		 * Reuse existing anon_vma if its degree lower than two,
+		 * that means it has no vma and only one anon_vma child.
+		 *
+		 * Do not chose parent anon_vma, otherwise first child
+		 * will always reuse it. Root anon_vma is never reused:
+		 * it has self-parent reference and at least one child.
+		 */
+		if (!dst->anon_vma && anon_vma != src->anon_vma &&
+				anon_vma->degree < 2)
+			dst->anon_vma = anon_vma;
	}
+	if (dst->anon_vma)
+		dst->anon_vma->degree++;
	unlock_anon_vma_root(root);
	return 0;
@@ -280,6 +306,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
	if (!pvma->anon_vma)
		return 0;
 
+	/* Drop inherited anon_vma, we'll reuse existing or allocate new. */
+	vma->anon_vma = NULL;
+
	/*
	 * First, attach the new VMA to the parent VMA's anon_vmas,
	 * so rmap can find non-COWed pages in child processes.
@@ -288,6 +317,10 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
	if (error)
		return error;
 
+	/* An existing anon_vma has been reused, all done then. */
+	if (vma->anon_vma)
+		return 0;
+
	/* Then add our own anon_vma. */
	anon_vma = anon_vma_alloc();
	if (!anon_vma)
@@ -301,6 +334,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
	 * lock any of the anon_vmas in this anon_vma tree.
	 */
	anon_vma->root = pvma->anon_vma->root;
+	anon_vma->parent = pvma->anon_vma;
	/*
	 * With refcounts, an anon_vma can stay around longer than the
	 * process it belongs to. The root anon_vma needs to be pinned until
@@ -311,6 +345,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
	vma->anon_vma = anon_vma;
	anon_vma_lock_write(anon_vma);
	anon_vma_chain_link(vma, avc, anon_vma);
+	anon_vma->parent->degree++;
	anon_vma_unlock_write(anon_vma);
 
	return 0;
@@ -341,12 +376,16 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
		 * Leave empty anon_vmas on the list - we'll need
		 * to free them outside the lock.
		 */
-		if (RB_EMPTY_ROOT(&anon_vma->rb_root))
+		if (RB_EMPTY_ROOT(&anon_vma->rb_root)) {
+			anon_vma->parent->degree--;
			continue;
+		}
 
		list_del(&avc->same_vma);
		anon_vma_chain_free(avc);
	}
+	if (vma->anon_vma)
+		vma->anon_vma->degree--;
	unlock_anon_vma_root(root);
 
	/*
@@ -357,6 +396,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
	list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
		struct anon_vma *anon_vma = avc->anon_vma;
 
+		BUG_ON(anon_vma->degree);
		put_anon_vma(anon_vma);
 
		list_del(&avc->same_vma);
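The reuse rule added to anon_vma_clone() above is: give the child VMA an existing anon_vma only if the child has none yet, the candidate is not the parent VMA's own anon_vma, and the candidate's degree is below two (no live VMA and at most one child). A toy user-space model of just that decision, with an illustrative struct rather than the kernel's data structures:

    #include <stdbool.h>
    #include <stdio.h>

    /* Toy model of the reuse rule only; not the kernel's anon_vma. */
    struct toy_anon_vma {
            unsigned degree;        /* child anon_vmas + VMAs pointing at it */
    };

    /* Mirrors the condition added to anon_vma_clone(): reuse an ancestor
     * anon_vma only if the destination has none yet, the candidate is not
     * the source VMA's own anon_vma, and its degree is below two. */
    static bool can_reuse(const struct toy_anon_vma *candidate,
                          const struct toy_anon_vma *src_own,
                          const struct toy_anon_vma *dst_current)
    {
            return dst_current == NULL && candidate != src_own &&
                   candidate->degree < 2;
    }

    int main(void)
    {
            struct toy_anon_vma grandparent = { .degree = 1 };
            struct toy_anon_vma parent = { .degree = 2 };

            printf("reuse grandparent: %d\n",
                   can_reuse(&grandparent, &parent, NULL));   /* 1 */
            printf("reuse parent:      %d\n",
                   can_reuse(&parent, &parent, NULL));        /* 0 */
            return 0;
    }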
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2921,18 +2921,20 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
		return false;
 
	/*
-	 * There is a potential race between when kswapd checks its watermarks
-	 * and a process gets throttled. There is also a potential race if
-	 * processes get throttled, kswapd wakes, a large process exits therby
-	 * balancing the zones that causes kswapd to miss a wakeup. If kswapd
-	 * is going to sleep, no process should be sleeping on pfmemalloc_wait
-	 * so wake them now if necessary. If necessary, processes will wake
-	 * kswapd and get throttled again
+	 * The throttled processes are normally woken up in balance_pgdat() as
+	 * soon as pfmemalloc_watermark_ok() is true. But there is a potential
+	 * race between when kswapd checks the watermarks and a process gets
+	 * throttled. There is also a potential race if processes get
+	 * throttled, kswapd wakes, a large process exits thereby balancing the
+	 * zones, which causes kswapd to exit balance_pgdat() before reaching
+	 * the wake up checks. If kswapd is going to sleep, no process should
+	 * be sleeping on pfmemalloc_wait, so wake them now if necessary. If
+	 * the wake up is premature, processes will wake kswapd and get
+	 * throttled again. The difference from wake ups in balance_pgdat() is
+	 * that here we are under prepare_to_wait().
	 */
-	if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
-		wake_up(&pgdat->pfmemalloc_wait);
-		return false;
-	}
+	if (waitqueue_active(&pgdat->pfmemalloc_wait))
+		wake_up_all(&pgdat->pfmemalloc_wait);
 
	return pgdat_balanced(pgdat, order, classzone_idx);
 }