Commit d7365e78 authored by Johannes Weiner, committed by Linus Torvalds

mm: memcontrol: fix missed end-writeback page accounting

Commit 0a31bc97 ("mm: memcontrol: rewrite uncharge API") changed
page migration to uncharge the old page right away.  The page is locked,
unmapped, truncated, and off the LRU, but it could race with writeback
ending, which then doesn't unaccount the page properly:

test_clear_page_writeback()              migration
                                           wait_on_page_writeback()
  TestClearPageWriteback()
                                           mem_cgroup_migrate()
                                             clear PCG_USED
  mem_cgroup_update_page_stat()
    if (PageCgroupUsed(pc))
      decrease memcg pages under writeback

  release pc->mem_cgroup->move_lock

The per-page statistics interface is heavily optimized to avoid a
function call and a lookup_page_cgroup() in the file unmap fast path,
which means it doesn't verify whether a page is still charged before
clearing PageWriteback() and it has to do it in the stat update later.

Rework it so that it looks up the page's memcg once at the beginning of
the transaction and then uses it throughout.  The charge will be
verified before clearing PageWriteback() and migration can't uncharge
the page as long as that is still set.  The RCU lock will protect the
memcg past uncharge.
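
In caller terms, the new transaction pattern looks like this (lifted from
the kernel-doc example the patch adds for mem_cgroup_begin_page_stat();
TestClearPageState() and "state" are placeholders for a real test-and-clear
operation and stat index, e.g. TestClearPageWriteback() and
MEM_CGROUP_STAT_WRITEBACK):

  struct mem_cgroup *memcg;
  unsigned long flags;
  bool locked;

  /* pins the memcg (RCU) and takes memcg->move_lock only on the slowpath */
  memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
  if (TestClearPageState(page))
          mem_cgroup_update_page_stat(memcg, state, -1);  /* NULL-safe */
  /* drops move_lock if it was taken, then rcu_read_unlock() */
  mem_cgroup_end_page_stat(memcg, locked, flags);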

As far as losing the optimization goes, the following test results are
from a microbenchmark that maps, faults, and unmaps a 4GB sparse file
three times in a nested fashion, so that there are two negative passes
that don't account but still go through the new transaction overhead.
There is no actual difference:

 old:     33.195102545 seconds time elapsed       ( +-  0.01% )
 new:     33.199231369 seconds time elapsed       ( +-  0.03% )

The time spent in page_remove_rmap()'s callees still adds up to the
same, but the time spent in the function itself seems reduced:

     # Children      Self  Command        Shared Object       Symbol
 old:     0.12%     0.11%  filemapstress  [kernel.kallsyms]   [k] page_remove_rmap
 new:     0.12%     0.08%  filemapstress  [kernel.kallsyms]   [k] page_remove_rmap
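
For illustration, here is a minimal userspace sketch of such a benchmark.
The tool in the profile above is named filemapstress; this is not its
source, and the file name, page size, and lack of error handling are
simplifying assumptions:

  #include <fcntl.h>
  #include <sys/mman.h>
  #include <unistd.h>

  #define FILESIZE (4UL << 30)            /* 4GB sparse file */
  #define PAGESIZE 4096UL                 /* assumed page size */

  static volatile char sink;

  /*
   * map + fault + unmap; the recursion nests the mappings so that the
   * inner unmaps are the "negative" passes: _mapcount stays elevated,
   * so page_remove_rmap() skips the accounting but still goes through
   * the page_stat transaction.
   */
  static void pass(int fd, int depth)
  {
          char *map = mmap(NULL, FILESIZE, PROT_READ, MAP_SHARED, fd, 0);
          unsigned long off;

          for (off = 0; off < FILESIZE; off += PAGESIZE)
                  sink = map[off];        /* fault: page_add_file_rmap() */

          if (depth > 1)
                  pass(fd, depth - 1);    /* nested mapping of same range */

          munmap(map, FILESIZE);          /* unmap: page_remove_rmap() */
  }

  int main(void)
  {
          int fd = open("sparsefile", O_RDWR | O_CREAT, 0600);

          ftruncate(fd, FILESIZE);        /* hole-only file, stays sparse */
          pass(fd, 3);
          close(fd);
          return 0;
  }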
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: <stable@vger.kernel.org>	[3.17.x]
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 3a3c02ec
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -139,48 +139,23 @@ static inline bool mem_cgroup_disabled(void)
 	return false;
 }
 
-void __mem_cgroup_begin_update_page_stat(struct page *page, bool *locked,
-					 unsigned long *flags);
-
-extern atomic_t memcg_moving;
-
-static inline void mem_cgroup_begin_update_page_stat(struct page *page,
-					bool *locked, unsigned long *flags)
-{
-	if (mem_cgroup_disabled())
-		return;
-	rcu_read_lock();
-	*locked = false;
-	if (atomic_read(&memcg_moving))
-		__mem_cgroup_begin_update_page_stat(page, locked, flags);
-}
-
-void __mem_cgroup_end_update_page_stat(struct page *page,
-				unsigned long *flags);
-static inline void mem_cgroup_end_update_page_stat(struct page *page,
-					bool *locked, unsigned long *flags)
-{
-	if (mem_cgroup_disabled())
-		return;
-	if (*locked)
-		__mem_cgroup_end_update_page_stat(page, flags);
-	rcu_read_unlock();
-}
+struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page, bool *locked,
+					      unsigned long *flags);
+void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked,
+			      unsigned long flags);
 
-void mem_cgroup_update_page_stat(struct page *page,
-				 enum mem_cgroup_stat_index idx,
-				 int val);
+void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
+				 enum mem_cgroup_stat_index idx, int val);
 
-static inline void mem_cgroup_inc_page_stat(struct page *page,
+static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg,
 					    enum mem_cgroup_stat_index idx)
 {
-	mem_cgroup_update_page_stat(page, idx, 1);
+	mem_cgroup_update_page_stat(memcg, idx, 1);
 }
 
-static inline void mem_cgroup_dec_page_stat(struct page *page,
+static inline void mem_cgroup_dec_page_stat(struct mem_cgroup *memcg,
 					    enum mem_cgroup_stat_index idx)
 {
-	mem_cgroup_update_page_stat(page, idx, -1);
+	mem_cgroup_update_page_stat(memcg, idx, -1);
 }
 
 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
@@ -315,13 +290,14 @@ mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 {
 }
 
-static inline void mem_cgroup_begin_update_page_stat(struct page *page,
+static inline struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
 					bool *locked, unsigned long *flags)
 {
+	return NULL;
 }
 
-static inline void mem_cgroup_end_update_page_stat(struct page *page,
-					bool *locked, unsigned long *flags)
+static inline void mem_cgroup_end_page_stat(struct mem_cgroup *memcg,
+					bool locked, unsigned long flags)
 {
 }
 
@@ -343,12 +319,12 @@ static inline bool mem_cgroup_oom_synchronize(bool wait)
 	return false;
 }
 
-static inline void mem_cgroup_inc_page_stat(struct page *page,
+static inline void mem_cgroup_inc_page_stat(struct mem_cgroup *memcg,
 					    enum mem_cgroup_stat_index idx)
 {
 }
 
-static inline void mem_cgroup_dec_page_stat(struct page *page,
+static inline void mem_cgroup_dec_page_stat(struct mem_cgroup *memcg,
 					    enum mem_cgroup_stat_index idx)
 {
 }

--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1536,12 +1536,8 @@ int mem_cgroup_swappiness(struct mem_cgroup *memcg)
  * start move here.
  */
 
-/* for quick checking without looking up memcg */
-atomic_t memcg_moving __read_mostly;
-
 static void mem_cgroup_start_move(struct mem_cgroup *memcg)
 {
-	atomic_inc(&memcg_moving);
 	atomic_inc(&memcg->moving_account);
 	synchronize_rcu();
 }
@@ -1552,10 +1548,8 @@ static void mem_cgroup_end_move(struct mem_cgroup *memcg)
 	 * Now, mem_cgroup_clear_mc() may call this function with NULL.
 	 * We check NULL in callee rather than caller.
 	 */
-	if (memcg) {
-		atomic_dec(&memcg_moving);
+	if (memcg)
 		atomic_dec(&memcg->moving_account);
-	}
 }
 
 /*
@@ -2204,41 +2198,52 @@ bool mem_cgroup_oom_synchronize(bool handle)
 	return true;
 }
 
-/*
- * Used to update mapped file or writeback or other statistics.
+/**
+ * mem_cgroup_begin_page_stat - begin a page state statistics transaction
+ * @page: page that is going to change accounted state
+ * @locked: &memcg->move_lock slowpath was taken
+ * @flags: IRQ-state flags for &memcg->move_lock
  *
- * Notes: Race condition
+ * This function must mark the beginning of an accounted page state
+ * change to prevent double accounting when the page is concurrently
+ * being moved to another memcg:
  *
- * Charging occurs during page instantiation, while the page is
- * unmapped and locked in page migration, or while the page table is
- * locked in THP migration.  No race is possible.
+ *   memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
+ *   if (TestClearPageState(page))
+ *     mem_cgroup_update_page_stat(memcg, state, -1);
+ *   mem_cgroup_end_page_stat(memcg, locked, flags);
  *
- * Uncharge happens to pages with zero references, no race possible.
+ * The RCU lock is held throughout the transaction.  The fast path can
+ * get away without acquiring the memcg->move_lock (@locked is false)
+ * because page moving starts with an RCU grace period.
  *
- * Charge moving between groups is protected by checking mm->moving
- * account and taking the move_lock in the slowpath.
+ * The RCU lock also protects the memcg from being freed when the page
+ * state that is going to change is the only thing preventing the page
+ * from being uncharged.  E.g. end-writeback clearing PageWriteback(),
+ * which allows migration to go ahead and uncharge the page before the
+ * account transaction might be complete.
  */
-
-void __mem_cgroup_begin_update_page_stat(struct page *page,
-				bool *locked, unsigned long *flags)
+struct mem_cgroup *mem_cgroup_begin_page_stat(struct page *page,
+					      bool *locked,
+					      unsigned long *flags)
 {
 	struct mem_cgroup *memcg;
 	struct page_cgroup *pc;
 
+	rcu_read_lock();
+
+	if (mem_cgroup_disabled())
+		return NULL;
+
 	pc = lookup_page_cgroup(page);
 again:
 	memcg = pc->mem_cgroup;
 	if (unlikely(!memcg || !PageCgroupUsed(pc)))
-		return;
-	/*
-	 * If this memory cgroup is not under account moving, we don't
-	 * need to take move_lock_mem_cgroup(). Because we already hold
-	 * rcu_read_lock(), any calls to move_account will be delayed until
-	 * rcu_read_unlock().
-	 */
-	VM_BUG_ON(!rcu_read_lock_held());
+		return NULL;
+
+	*locked = false;
 	if (atomic_read(&memcg->moving_account) <= 0)
-		return;
+		return memcg;
 
 	move_lock_mem_cgroup(memcg, flags);
 	if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) {
@@ -2246,36 +2251,40 @@ void __mem_cgroup_begin_update_page_stat(struct page *page,
 		goto again;
 	}
 	*locked = true;
+
+	return memcg;
 }
 
-void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags)
+/**
+ * mem_cgroup_end_page_stat - finish a page state statistics transaction
+ * @memcg: the memcg that was accounted against
+ * @locked: value received from mem_cgroup_begin_page_stat()
+ * @flags: value received from mem_cgroup_begin_page_stat()
+ */
+void mem_cgroup_end_page_stat(struct mem_cgroup *memcg, bool locked,
+			      unsigned long flags)
 {
-	struct page_cgroup *pc = lookup_page_cgroup(page);
+	if (memcg && locked)
+		move_unlock_mem_cgroup(memcg, &flags);
 
-	/*
-	 * It's guaranteed that pc->mem_cgroup never changes while
-	 * lock is held because a routine modifies pc->mem_cgroup
-	 * should take move_lock_mem_cgroup().
-	 */
-	move_unlock_mem_cgroup(pc->mem_cgroup, flags);
+	rcu_read_unlock();
 }
 
-void mem_cgroup_update_page_stat(struct page *page,
+/**
+ * mem_cgroup_update_page_stat - update page state statistics
+ * @memcg: memcg to account against
+ * @idx: page state item to account
+ * @val: number of pages (positive or negative)
+ *
+ * See mem_cgroup_begin_page_stat() for locking requirements.
+ */
+void mem_cgroup_update_page_stat(struct mem_cgroup *memcg,
 				 enum mem_cgroup_stat_index idx, int val)
 {
-	struct mem_cgroup *memcg;
-	struct page_cgroup *pc = lookup_page_cgroup(page);
-	unsigned long uninitialized_var(flags);
-
-	if (mem_cgroup_disabled())
-		return;
-
 	VM_BUG_ON(!rcu_read_lock_held());
-	memcg = pc->mem_cgroup;
-	if (unlikely(!memcg || !PageCgroupUsed(pc)))
-		return;
 
-	this_cpu_add(memcg->stat->count[idx], val);
+	if (memcg)
+		this_cpu_add(memcg->stat->count[idx], val);
 }
 
 /*

--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2327,11 +2327,12 @@ EXPORT_SYMBOL(clear_page_dirty_for_io);
 int test_clear_page_writeback(struct page *page)
 {
 	struct address_space *mapping = page_mapping(page);
-	int ret;
-	bool locked;
 	unsigned long memcg_flags;
+	struct mem_cgroup *memcg;
+	bool locked;
+	int ret;
 
-	mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
+	memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags);
 	if (mapping) {
 		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		unsigned long flags;
@@ -2352,22 +2353,23 @@ int test_clear_page_writeback(struct page *page)
 		ret = TestClearPageWriteback(page);
 	}
 	if (ret) {
-		mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
+		mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
 		dec_zone_page_state(page, NR_WRITEBACK);
 		inc_zone_page_state(page, NR_WRITTEN);
 	}
-	mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
+	mem_cgroup_end_page_stat(memcg, locked, memcg_flags);
 	return ret;
 }
 
 int __test_set_page_writeback(struct page *page, bool keep_write)
 {
 	struct address_space *mapping = page_mapping(page);
-	int ret;
-	bool locked;
 	unsigned long memcg_flags;
+	struct mem_cgroup *memcg;
+	bool locked;
+	int ret;
 
-	mem_cgroup_begin_update_page_stat(page, &locked, &memcg_flags);
+	memcg = mem_cgroup_begin_page_stat(page, &locked, &memcg_flags);
 	if (mapping) {
 		struct backing_dev_info *bdi = mapping->backing_dev_info;
 		unsigned long flags;
@@ -2394,10 +2396,10 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
 		ret = TestSetPageWriteback(page);
 	}
 	if (!ret) {
-		mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_WRITEBACK);
+		mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
 		inc_zone_page_state(page, NR_WRITEBACK);
 	}
-	mem_cgroup_end_update_page_stat(page, &locked, &memcg_flags);
+	mem_cgroup_end_page_stat(memcg, locked, memcg_flags);
 	return ret;
 }

--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -1042,15 +1042,16 @@ void page_add_new_anon_rmap(struct page *page,
  */
 void page_add_file_rmap(struct page *page)
 {
-	bool locked;
+	struct mem_cgroup *memcg;
 	unsigned long flags;
+	bool locked;
 
-	mem_cgroup_begin_update_page_stat(page, &locked, &flags);
+	memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
 	if (atomic_inc_and_test(&page->_mapcount)) {
 		__inc_zone_page_state(page, NR_FILE_MAPPED);
-		mem_cgroup_inc_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
+		mem_cgroup_inc_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
 	}
-	mem_cgroup_end_update_page_stat(page, &locked, &flags);
+	mem_cgroup_end_page_stat(memcg, locked, flags);
 }
 
 /**
@@ -1061,9 +1062,10 @@ void page_add_file_rmap(struct page *page)
  */
 void page_remove_rmap(struct page *page)
 {
+	struct mem_cgroup *uninitialized_var(memcg);
 	bool anon = PageAnon(page);
-	bool locked;
 	unsigned long flags;
+	bool locked;
 
 	/*
 	 * The anon case has no mem_cgroup page_stat to update; but may
@@ -1071,7 +1073,7 @@ void page_remove_rmap(struct page *page)
 	 * we hold the lock against page_stat move: so avoid it on anon.
 	 */
 	if (!anon)
-		mem_cgroup_begin_update_page_stat(page, &locked, &flags);
+		memcg = mem_cgroup_begin_page_stat(page, &locked, &flags);
 
 	/* page still mapped by someone else? */
 	if (!atomic_add_negative(-1, &page->_mapcount))
@@ -1096,8 +1098,7 @@ void page_remove_rmap(struct page *page)
 			-hpage_nr_pages(page));
 	} else {
 		__dec_zone_page_state(page, NR_FILE_MAPPED);
-		mem_cgroup_dec_page_stat(page, MEM_CGROUP_STAT_FILE_MAPPED);
-		mem_cgroup_end_update_page_stat(page, &locked, &flags);
+		mem_cgroup_dec_page_stat(memcg, MEM_CGROUP_STAT_FILE_MAPPED);
 	}
 	if (unlikely(PageMlocked(page)))
 		clear_page_mlock(page);
@@ -1110,10 +1111,9 @@ void page_remove_rmap(struct page *page)
 	 * Leaving it set also helps swapoff to reinstate ptes
 	 * faster for those pages still in swapcache.
 	 */
-	return;
 out:
 	if (!anon)
-		mem_cgroup_end_update_page_stat(page, &locked, &flags);
+		mem_cgroup_end_page_stat(memcg, locked, flags);
 }
 
 /*