Commit b3d574ae authored by Linus Torvalds's avatar Linus Torvalds

Merge branch 'akpm' (patches from Andrew)

Merge misc fixes from Andrew Morton:
 "12 fixes"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm, vmscan: prevent kswapd livelock due to pfmemalloc-throttled process being killed
  memcg: fix destination cgroup leak on task charges migration
  mm: memcontrol: switch soft limit default back to infinity
  mm/debug_pagealloc: remove obsolete Kconfig options
  vfs: renumber FMODE_NONOTIFY and add to uniqueness check
  arch/blackfin/mach-bf533/boards/stamp.c: add linux/delay.h
  ocfs2: fix the wrong directory passed to ocfs2_lookup_ino_from_name() when link file
  MAINTAINERS: update rydberg's addresses
  mm: protect set_page_dirty() from ongoing truncation
  mm: prevent endless growth of anon_vma hierarchy
  exit: fix race between wait_consider_task() and wait_task_zombie()
  ocfs2: remove bogus check in dlm_process_recovery_data
parents 11c8f01b 9e5e3661
......@@ -51,6 +51,7 @@ Greg Kroah-Hartman <gregkh@suse.de>
Greg Kroah-Hartman <greg@kroah.com>
Henk Vergonet <Henk.Vergonet@gmail.com>
Henrik Kretzschmar <henne@nachtwindheim.de>
Henrik Rydberg <rydberg@bitmath.org>
Herbert Xu <herbert@gondor.apana.org.au>
Jacob Shin <Jacob.Shin@amd.com>
James Bottomley <jejb@mulgrave.(none)>
......
......@@ -724,15 +724,15 @@ F: include/uapi/linux/apm_bios.h
F: drivers/char/apm-emulation.c
APPLE BCM5974 MULTITOUCH DRIVER
M: Henrik Rydberg <rydberg@euromail.se>
M: Henrik Rydberg <rydberg@bitmath.org>
L: linux-input@vger.kernel.org
S: Maintained
S: Odd fixes
F: drivers/input/mouse/bcm5974.c
APPLE SMC DRIVER
M: Henrik Rydberg <rydberg@euromail.se>
M: Henrik Rydberg <rydberg@bitmath.org>
L: lm-sensors@lm-sensors.org
S: Maintained
S: Odd fixes
F: drivers/hwmon/applesmc.c
APPLETALK NETWORK LAYER
......@@ -4940,10 +4940,10 @@ F: include/uapi/linux/input.h
F: include/linux/input/
INPUT MULTITOUCH (MT) PROTOCOL
M: Henrik Rydberg <rydberg@euromail.se>
M: Henrik Rydberg <rydberg@bitmath.org>
L: linux-input@vger.kernel.org
T: git git://git.kernel.org/pub/scm/linux/kernel/git/rydberg/input-mt.git
S: Maintained
S: Odd fixes
F: Documentation/input/multi-touch-protocol.txt
F: drivers/input/input-mt.c
K: \b(ABS|SYN)_MT_
......
......@@ -7,6 +7,7 @@
*/
#include <linux/device.h>
#include <linux/delay.h>
#include <linux/platform_device.h>
#include <linux/mtd/mtd.h>
#include <linux/mtd/partitions.h>
......
......@@ -740,14 +740,15 @@ static int __init fcntl_init(void)
* Exceptions: O_NONBLOCK is a two bit define on parisc; O_NDELAY
* is defined as O_NONBLOCK on some platforms and not on others.
*/
BUILD_BUG_ON(20 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
BUILD_BUG_ON(21 - 1 /* for O_RDONLY being 0 */ != HWEIGHT32(
O_RDONLY | O_WRONLY | O_RDWR |
O_CREAT | O_EXCL | O_NOCTTY |
O_TRUNC | O_APPEND | /* O_NONBLOCK | */
__O_SYNC | O_DSYNC | FASYNC |
O_DIRECT | O_LARGEFILE | O_DIRECTORY |
O_NOFOLLOW | O_NOATIME | O_CLOEXEC |
__FMODE_EXEC | O_PATH | __O_TMPFILE
__FMODE_EXEC | O_PATH | __O_TMPFILE |
__FMODE_NONOTIFY
));
fasync_cache = kmem_cache_create("fasync_cache",
......
......@@ -2023,11 +2023,8 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
dlm_lockres_drop_inflight_ref(dlm, res);
spin_unlock(&res->spinlock);
if (ret < 0) {
if (ret < 0)
mlog_errno(ret);
if (newlock)
dlm_lock_put(newlock);
}
return ret;
}
......
......@@ -94,6 +94,14 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
struct inode *inode,
const char *symname);
static int ocfs2_double_lock(struct ocfs2_super *osb,
struct buffer_head **bh1,
struct inode *inode1,
struct buffer_head **bh2,
struct inode *inode2,
int rename);
static void ocfs2_double_unlock(struct inode *inode1, struct inode *inode2);
/* An orphan dir name is an 8 byte value, printed as a hex string */
#define OCFS2_ORPHAN_NAMELEN ((int)(2 * sizeof(u64)))
......@@ -678,8 +686,10 @@ static int ocfs2_link(struct dentry *old_dentry,
{
handle_t *handle;
struct inode *inode = old_dentry->d_inode;
struct inode *old_dir = old_dentry->d_parent->d_inode;
int err;
struct buffer_head *fe_bh = NULL;
struct buffer_head *old_dir_bh = NULL;
struct buffer_head *parent_fe_bh = NULL;
struct ocfs2_dinode *fe = NULL;
struct ocfs2_super *osb = OCFS2_SB(dir->i_sb);
......@@ -696,19 +706,33 @@ static int ocfs2_link(struct dentry *old_dentry,
dquot_initialize(dir);
err = ocfs2_inode_lock_nested(dir, &parent_fe_bh, 1, OI_LS_PARENT);
err = ocfs2_double_lock(osb, &old_dir_bh, old_dir,
&parent_fe_bh, dir, 0);
if (err < 0) {
if (err != -ENOENT)
mlog_errno(err);
return err;
}
/* make sure both dirs have bhs
* get an extra ref on old_dir_bh if old==new */
if (!parent_fe_bh) {
if (old_dir_bh) {
parent_fe_bh = old_dir_bh;
get_bh(parent_fe_bh);
} else {
mlog(ML_ERROR, "%s: no old_dir_bh!\n", osb->uuid_str);
err = -EIO;
goto out;
}
}
if (!dir->i_nlink) {
err = -ENOENT;
goto out;
}
err = ocfs2_lookup_ino_from_name(dir, old_dentry->d_name.name,
err = ocfs2_lookup_ino_from_name(old_dir, old_dentry->d_name.name,
old_dentry->d_name.len, &old_de_ino);
if (err) {
err = -ENOENT;
......@@ -801,10 +825,11 @@ static int ocfs2_link(struct dentry *old_dentry,
ocfs2_inode_unlock(inode, 1);
out:
ocfs2_inode_unlock(dir, 1);
ocfs2_double_unlock(old_dir, dir);
brelse(fe_bh);
brelse(parent_fe_bh);
brelse(old_dir_bh);
ocfs2_free_dir_lookup_result(&lookup);
......@@ -1072,14 +1097,15 @@ static int ocfs2_check_if_ancestor(struct ocfs2_super *osb,
}
/*
* The only place this should be used is rename!
* The only place this should be used is rename and link!
* if they have the same id, then the 1st one is the only one locked.
*/
static int ocfs2_double_lock(struct ocfs2_super *osb,
struct buffer_head **bh1,
struct inode *inode1,
struct buffer_head **bh2,
struct inode *inode2)
struct inode *inode2,
int rename)
{
int status;
int inode1_is_ancestor, inode2_is_ancestor;
......@@ -1127,7 +1153,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
}
/* lock id2 */
status = ocfs2_inode_lock_nested(inode2, bh2, 1,
OI_LS_RENAME1);
rename == 1 ? OI_LS_RENAME1 : OI_LS_PARENT);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
......@@ -1136,7 +1162,8 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
}
/* lock id1 */
status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_RENAME2);
status = ocfs2_inode_lock_nested(inode1, bh1, 1,
rename == 1 ? OI_LS_RENAME2 : OI_LS_PARENT);
if (status < 0) {
/*
* An error return must mean that no cluster locks
......@@ -1252,7 +1279,7 @@ static int ocfs2_rename(struct inode *old_dir,
/* if old and new are the same, this'll just do one lock. */
status = ocfs2_double_lock(osb, &old_dir_bh, old_dir,
&new_dir_bh, new_dir);
&new_dir_bh, new_dir, 1);
if (status < 0) {
mlog_errno(status);
goto bail;
......
......@@ -135,7 +135,7 @@ typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset,
#define FMODE_CAN_WRITE ((__force fmode_t)0x40000)
/* File was opened by fanotify and shouldn't generate fanotify events */
#define FMODE_NONOTIFY ((__force fmode_t)0x1000000)
#define FMODE_NONOTIFY ((__force fmode_t)0x4000000)
/*
* Flag for rw_copy_check_uvector and compat_rw_copy_check_uvector
......
......@@ -36,6 +36,16 @@ struct anon_vma {
*/
atomic_t refcount;
/*
* Count of child anon_vmas and VMAs which points to this anon_vma.
*
* This counter is used for making decision about reusing anon_vma
* instead of forking new one. See comments in function anon_vma_clone.
*/
unsigned degree;
struct anon_vma *parent; /* Parent of this anon_vma */
/*
* NOTE: the LSB of the rb_root.rb_node is set by
* mm_take_all_locks() _after_ taking the above lock. So the
......
......@@ -177,7 +177,6 @@ int write_cache_pages(struct address_space *mapping,
struct writeback_control *wbc, writepage_t writepage,
void *data);
int do_writepages(struct address_space *mapping, struct writeback_control *wbc);
void set_page_dirty_balance(struct page *page);
void writeback_set_ratelimit(void);
void tag_pages_for_writeback(struct address_space *mapping,
pgoff_t start, pgoff_t end);
......
......@@ -5,7 +5,7 @@
/*
* FMODE_EXEC is 0x20
* FMODE_NONOTIFY is 0x1000000
* FMODE_NONOTIFY is 0x4000000
* These cannot be used by userspace O_* until internal and external open
* flags are split.
* -Eric Paris
......
......@@ -1287,9 +1287,15 @@ static int wait_task_continued(struct wait_opts *wo, struct task_struct *p)
static int wait_consider_task(struct wait_opts *wo, int ptrace,
struct task_struct *p)
{
/*
* We can race with wait_task_zombie() from another thread.
* Ensure that EXIT_ZOMBIE -> EXIT_DEAD/EXIT_TRACE transition
* can't confuse the checks below.
*/
int exit_state = ACCESS_ONCE(p->exit_state);
int ret;
if (unlikely(p->exit_state == EXIT_DEAD))
if (unlikely(exit_state == EXIT_DEAD))
return 0;
ret = eligible_child(wo, p);
......@@ -1310,7 +1316,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
return 0;
}
if (unlikely(p->exit_state == EXIT_TRACE)) {
if (unlikely(exit_state == EXIT_TRACE)) {
/*
* ptrace == 0 means we are the natural parent. In this case
* we should clear notask_error, debugger will notify us.
......@@ -1337,7 +1343,7 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
}
/* slay zombie? */
if (p->exit_state == EXIT_ZOMBIE) {
if (exit_state == EXIT_ZOMBIE) {
/* we don't reap group leaders with subthreads */
if (!delay_group_leader(p)) {
/*
......
......@@ -14,7 +14,6 @@ config DEBUG_PAGEALLOC
depends on !KMEMCHECK
select PAGE_EXTENSION
select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
select PAGE_GUARD if ARCH_SUPPORTS_DEBUG_PAGEALLOC
---help---
Unmap pages from the kernel linear mapping after free_pages().
This results in a large slowdown, but helps to find certain types
......@@ -27,13 +26,5 @@ config DEBUG_PAGEALLOC
that would result in incorrect warnings of memory corruption after
a resume because free pages are not saved to the suspend image.
config WANT_PAGE_DEBUG_FLAGS
bool
config PAGE_POISONING
bool
select WANT_PAGE_DEBUG_FLAGS
config PAGE_GUARD
bool
select WANT_PAGE_DEBUG_FLAGS
......@@ -3043,18 +3043,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
mem_cgroup_swap_statistics(from, false);
mem_cgroup_swap_statistics(to, true);
/*
* This function is only called from task migration context now.
* It postpones page_counter and refcount handling till the end
* of task migration(mem_cgroup_clear_mc()) for performance
* improvement. But we cannot postpone css_get(to) because if
* the process that has been moved to @to does swap-in, the
* refcount of @to might be decreased to 0.
*
* We are in attach() phase, so the cgroup is guaranteed to be
* alive, so we can just call css_get().
*/
css_get(&to->css);
return 0;
}
return -EINVAL;
......@@ -4679,6 +4667,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (parent_css == NULL) {
root_mem_cgroup = memcg;
page_counter_init(&memcg->memory, NULL);
memcg->soft_limit = PAGE_COUNTER_MAX;
page_counter_init(&memcg->memsw, NULL);
page_counter_init(&memcg->kmem, NULL);
}
......@@ -4724,6 +4713,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
if (parent->use_hierarchy) {
page_counter_init(&memcg->memory, &parent->memory);
memcg->soft_limit = PAGE_COUNTER_MAX;
page_counter_init(&memcg->memsw, &parent->memsw);
page_counter_init(&memcg->kmem, &parent->kmem);
......@@ -4733,6 +4723,7 @@ mem_cgroup_css_online(struct cgroup_subsys_state *css)
*/
} else {
page_counter_init(&memcg->memory, NULL);
memcg->soft_limit = PAGE_COUNTER_MAX;
page_counter_init(&memcg->memsw, NULL);
page_counter_init(&memcg->kmem, NULL);
/*
......@@ -4807,7 +4798,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
mem_cgroup_resize_limit(memcg, PAGE_COUNTER_MAX);
mem_cgroup_resize_memsw_limit(memcg, PAGE_COUNTER_MAX);
memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
memcg->soft_limit = 0;
memcg->soft_limit = PAGE_COUNTER_MAX;
}
#ifdef CONFIG_MMU
......
......@@ -2137,17 +2137,24 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (!dirty_page)
return ret;
/*
* Yes, Virginia, this is actually required to prevent a race
* with clear_page_dirty_for_io() from clearing the page dirty
* bit after it clear all dirty ptes, but before a racing
* do_wp_page installs a dirty pte.
*
* do_shared_fault is protected similarly.
*/
if (!page_mkwrite) {
wait_on_page_locked(dirty_page);
set_page_dirty_balance(dirty_page);
struct address_space *mapping;
int dirtied;
lock_page(dirty_page);
dirtied = set_page_dirty(dirty_page);
VM_BUG_ON_PAGE(PageAnon(dirty_page), dirty_page);
mapping = dirty_page->mapping;
unlock_page(dirty_page);
if (dirtied && mapping) {
/*
* Some device drivers do not set page.mapping
* but still dirty their pages
*/
balance_dirty_pages_ratelimited(mapping);
}
/* file_update_time outside page_lock */
if (vma->vm_file)
file_update_time(vma->vm_file);
......
......@@ -1541,16 +1541,6 @@ static void balance_dirty_pages(struct address_space *mapping,
bdi_start_background_writeback(bdi);
}
void set_page_dirty_balance(struct page *page)
{
if (set_page_dirty(page)) {
struct address_space *mapping = page_mapping(page);
if (mapping)
balance_dirty_pages_ratelimited(mapping);
}
}
static DEFINE_PER_CPU(int, bdp_ratelimits);
/*
......@@ -2123,32 +2113,25 @@ EXPORT_SYMBOL(account_page_dirtied);
* page dirty in that case, but not all the buffers. This is a "bottom-up"
* dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
*
* Most callers have locked the page, which pins the address_space in memory.
* But zap_pte_range() does not lock the page, however in that case the
* mapping is pinned by the vma's ->vm_file reference.
*
* We take care to handle the case where the page was truncated from the
* mapping by re-checking page_mapping() inside tree_lock.
* The caller must ensure this doesn't race with truncation. Most will simply
* hold the page lock, but e.g. zap_pte_range() calls with the page mapped and
* the pte lock held, which also locks out truncation.
*/
int __set_page_dirty_nobuffers(struct page *page)
{
if (!TestSetPageDirty(page)) {
struct address_space *mapping = page_mapping(page);
struct address_space *mapping2;
unsigned long flags;
if (!mapping)
return 1;
spin_lock_irqsave(&mapping->tree_lock, flags);
mapping2 = page_mapping(page);
if (mapping2) { /* Race with truncate? */
BUG_ON(mapping2 != mapping);
WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
account_page_dirtied(page, mapping);
radix_tree_tag_set(&mapping->page_tree,
page_index(page), PAGECACHE_TAG_DIRTY);
}
BUG_ON(page_mapping(page) != mapping);
WARN_ON_ONCE(!PagePrivate(page) && !PageUptodate(page));
account_page_dirtied(page, mapping);
radix_tree_tag_set(&mapping->page_tree, page_index(page),
PAGECACHE_TAG_DIRTY);
spin_unlock_irqrestore(&mapping->tree_lock, flags);
if (mapping->host) {
/* !PageAnon && !swapper_space */
......@@ -2305,12 +2288,10 @@ int clear_page_dirty_for_io(struct page *page)
/*
* We carefully synchronise fault handlers against
* installing a dirty pte and marking the page dirty
* at this point. We do this by having them hold the
* page lock at some point after installing their
* pte, but before marking the page dirty.
* Pages are always locked coming in here, so we get
* the desired exclusion. See mm/memory.c:do_wp_page()
* for more comments.
* at this point. We do this by having them hold the
* page lock while dirtying the page, and pages are
* always locked coming in here, so we get the desired
* exclusion.
*/
if (TestClearPageDirty(page)) {
dec_zone_page_state(page, NR_FILE_DIRTY);
......
......@@ -72,6 +72,8 @@ static inline struct anon_vma *anon_vma_alloc(void)
anon_vma = kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL);
if (anon_vma) {
atomic_set(&anon_vma->refcount, 1);
anon_vma->degree = 1; /* Reference for first vma */
anon_vma->parent = anon_vma;
/*
* Initialise the anon_vma root to point to itself. If called
* from fork, the root will be reset to the parents anon_vma.
......@@ -188,6 +190,8 @@ int anon_vma_prepare(struct vm_area_struct *vma)
if (likely(!vma->anon_vma)) {
vma->anon_vma = anon_vma;
anon_vma_chain_link(vma, avc, anon_vma);
/* vma reference or self-parent link for new root */
anon_vma->degree++;
allocated = NULL;
avc = NULL;
}
......@@ -236,6 +240,14 @@ static inline void unlock_anon_vma_root(struct anon_vma *root)
/*
* Attach the anon_vmas from src to dst.
* Returns 0 on success, -ENOMEM on failure.
*
* If dst->anon_vma is NULL this function tries to find and reuse existing
* anon_vma which has no vmas and only one child anon_vma. This prevents
* degradation of anon_vma hierarchy to endless linear chain in case of
* constantly forking task. On the other hand, an anon_vma with more than one
* child isn't reused even if there was no alive vma, thus rmap walker has a
* good chance of avoiding scanning the whole hierarchy when it searches where
* page is mapped.
*/
int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
{
......@@ -256,7 +268,21 @@ int anon_vma_clone(struct vm_area_struct *dst, struct vm_area_struct *src)
anon_vma = pavc->anon_vma;
root = lock_anon_vma_root(root, anon_vma);
anon_vma_chain_link(dst, avc, anon_vma);
/*
* Reuse existing anon_vma if its degree lower than two,
* that means it has no vma and only one anon_vma child.
*
* Do not chose parent anon_vma, otherwise first child
* will always reuse it. Root anon_vma is never reused:
* it has self-parent reference and at least one child.
*/
if (!dst->anon_vma && anon_vma != src->anon_vma &&
anon_vma->degree < 2)
dst->anon_vma = anon_vma;
}
if (dst->anon_vma)
dst->anon_vma->degree++;
unlock_anon_vma_root(root);
return 0;
......@@ -280,6 +306,9 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
if (!pvma->anon_vma)
return 0;
/* Drop inherited anon_vma, we'll reuse existing or allocate new. */
vma->anon_vma = NULL;
/*
* First, attach the new VMA to the parent VMA's anon_vmas,
* so rmap can find non-COWed pages in child processes.
......@@ -288,6 +317,10 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
if (error)
return error;
/* An existing anon_vma has been reused, all done then. */
if (vma->anon_vma)
return 0;
/* Then add our own anon_vma. */
anon_vma = anon_vma_alloc();
if (!anon_vma)
......@@ -301,6 +334,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
* lock any of the anon_vmas in this anon_vma tree.
*/
anon_vma->root = pvma->anon_vma->root;
anon_vma->parent = pvma->anon_vma;
/*
* With refcounts, an anon_vma can stay around longer than the
* process it belongs to. The root anon_vma needs to be pinned until
......@@ -311,6 +345,7 @@ int anon_vma_fork(struct vm_area_struct *vma, struct vm_area_struct *pvma)
vma->anon_vma = anon_vma;
anon_vma_lock_write(anon_vma);
anon_vma_chain_link(vma, avc, anon_vma);
anon_vma->parent->degree++;
anon_vma_unlock_write(anon_vma);
return 0;
......@@ -341,12 +376,16 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
* Leave empty anon_vmas on the list - we'll need
* to free them outside the lock.
*/
if (RB_EMPTY_ROOT(&anon_vma->rb_root))
if (RB_EMPTY_ROOT(&anon_vma->rb_root)) {
anon_vma->parent->degree--;
continue;
}
list_del(&avc->same_vma);
anon_vma_chain_free(avc);
}
if (vma->anon_vma)
vma->anon_vma->degree--;
unlock_anon_vma_root(root);
/*
......@@ -357,6 +396,7 @@ void unlink_anon_vmas(struct vm_area_struct *vma)
list_for_each_entry_safe(avc, next, &vma->anon_vma_chain, same_vma) {
struct anon_vma *anon_vma = avc->anon_vma;
BUG_ON(anon_vma->degree);
put_anon_vma(anon_vma);
list_del(&avc->same_vma);
......
......@@ -2921,18 +2921,20 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
return false;
/*
* There is a potential race between when kswapd checks its watermarks
* and a process gets throttled. There is also a potential race if
* processes get throttled, kswapd wakes, a large process exits therby
* balancing the zones that causes kswapd to miss a wakeup. If kswapd
* is going to sleep, no process should be sleeping on pfmemalloc_wait
* so wake them now if necessary. If necessary, processes will wake
* kswapd and get throttled again
* The throttled processes are normally woken up in balance_pgdat() as
* soon as pfmemalloc_watermark_ok() is true. But there is a potential
* race between when kswapd checks the watermarks and a process gets
* throttled. There is also a potential race if processes get
* throttled, kswapd wakes, a large process exits thereby balancing the
* zones, which causes kswapd to exit balance_pgdat() before reaching
* the wake up checks. If kswapd is going to sleep, no process should
* be sleeping on pfmemalloc_wait, so wake them now if necessary. If
* the wake up is premature, processes will wake kswapd and get
* throttled again. The difference from wake ups in balance_pgdat() is
* that here we are under prepare_to_wait().
*/
if (waitqueue_active(&pgdat->pfmemalloc_wait)) {
wake_up(&pgdat->pfmemalloc_wait);
return false;
}
if (waitqueue_active(&pgdat->pfmemalloc_wait))
wake_up_all(&pgdat->pfmemalloc_wait);
return pgdat_balanced(pgdat, order, classzone_idx);
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment