Commit 0add0c77, authored by Shakeel Butt, committed by Linus Torvalds

memcg: charge before adding to swapcache on swapin

Currently the kernel adds the page, allocated for swapin, to the
swapcache before charging the page.  This is fine, but now we want a
per-memcg swapcache stat, which is essential for folks who want to
transparently migrate from cgroup v1's memsw to cgroup v2's memory and
swap counters.  In addition, charging a page before exposing it to other
parts of the kernel is a step in the right direction.

To correctly maintain the per-memcg swapcache stat, this patch charges
the page before adding it to the swapcache.  One challenge with this
approach is the failure case of add_to_swap_cache(), in which we need to
undo the mem_cgroup_charge().  Specifically, undoing
mem_cgroup_uncharge_swap() is not simple.

To resolve the issue, this patch decouples the charging for swapin pages
from mem_cgroup_charge().  Two new functions are introduced,
mem_cgroup_swapin_charge_page() for just charging the swapin page and
mem_cgroup_swapin_uncharge_swap() for uncharging the swap slot once the
page has been successfully added to the swapcache.

[shakeelb@google.com: set page->private before calling swap_readpage]
  Link: https://lkml.kernel.org/r/20210318015959.2986837-1-shakeelb@google.com

Link: https://lkml.kernel.org/r/20210305212639.775498-1-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Roman Gushchin <guro@fb.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Hugh Dickins <hughd@google.com>
Tested-by: Heiko Carstens <hca@linux.ibm.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 4bbcc5a4
...@@ -609,6 +609,9 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg) ...@@ -609,6 +609,9 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg)
} }
int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask); int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask);
int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
gfp_t gfp, swp_entry_t entry);
void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
void mem_cgroup_uncharge(struct page *page); void mem_cgroup_uncharge(struct page *page);
void mem_cgroup_uncharge_list(struct list_head *page_list); void mem_cgroup_uncharge_list(struct list_head *page_list);
...@@ -1112,6 +1115,16 @@ static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm, ...@@ -1112,6 +1115,16 @@ static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
return 0; return 0;
} }
/*
 * Stub for mem_cgroup_swapin_charge_page(): ignores all of its arguments and
 * always returns 0, so callers see the charge as having succeeded.
 * NOTE(review): presumably the variant built when memcg support is configured
 * out -- confirm against the enclosing #ifdef, which is not visible here.
 */
static inline int mem_cgroup_swapin_charge_page(struct page *page,
struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
{
return 0;
}
/*
 * Stub for mem_cgroup_swapin_uncharge_swap(): empty body, @entry is ignored.
 * NOTE(review): presumably the variant built when memcg support is configured
 * out -- confirm against the enclosing #ifdef, which is not visible here.
 */
static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
{
}
static inline void mem_cgroup_uncharge(struct page *page) static inline void mem_cgroup_uncharge(struct page *page)
{ {
} }
......
...@@ -6644,6 +6644,27 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root, ...@@ -6644,6 +6644,27 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
atomic_long_read(&parent->memory.children_low_usage))); atomic_long_read(&parent->memory.children_low_usage)));
} }
/*
 * __mem_cgroup_charge - charge @page to an already-selected @memcg
 * @page: page to charge; thp_nr_pages(@page) gives the number of pages charged
 * @memcg: cgroup that receives the charge
 * @gfp: gfp flags handed through to try_charge()
 *
 * Common charging step called by both mem_cgroup_charge() and
 * mem_cgroup_swapin_charge_page() once they have picked the target cgroup.
 *
 * Returns 0 on success. Otherwise the non-zero value from try_charge() is
 * returned and none of the later steps run.
 */
static int __mem_cgroup_charge(struct page *page, struct mem_cgroup *memcg,
gfp_t gfp)
{
unsigned int nr_pages = thp_nr_pages(page);
int ret;
ret = try_charge(memcg, gfp, nr_pages);
if (ret)
goto out;
/*
 * Take an additional css reference before committing the charge; the
 * callers in this file drop their own reference with css_put() after
 * this function returns.
 * NOTE(review): presumably this is the reference held on behalf of the
 * charged page -- confirm against commit_charge() and the uncharge path.
 */
css_get(&memcg->css);
commit_charge(page, memcg);
/* Statistics update and event check run with local interrupts disabled. */
local_irq_disable();
mem_cgroup_charge_statistics(memcg, page, nr_pages);
memcg_check_events(memcg, page);
local_irq_enable();
out:
return ret;
}
/** /**
* mem_cgroup_charge - charge a newly allocated page to a cgroup * mem_cgroup_charge - charge a newly allocated page to a cgroup
* @page: page to charge * @page: page to charge
...@@ -6653,55 +6674,71 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root, ...@@ -6653,55 +6674,71 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
* Try to charge @page to the memcg that @mm belongs to, reclaiming * Try to charge @page to the memcg that @mm belongs to, reclaiming
* pages according to @gfp_mask if necessary. * pages according to @gfp_mask if necessary.
* *
* Do not use this for pages allocated for swapin.
*
* Returns 0 on success. Otherwise, an error code is returned. * Returns 0 on success. Otherwise, an error code is returned.
*/ */
int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
{ {
unsigned int nr_pages = thp_nr_pages(page); struct mem_cgroup *memcg;
struct mem_cgroup *memcg = NULL; int ret;
int ret = 0;
if (mem_cgroup_disabled()) if (mem_cgroup_disabled())
goto out; return 0;
if (PageSwapCache(page)) { memcg = get_mem_cgroup_from_mm(mm);
swp_entry_t ent = { .val = page_private(page), }; ret = __mem_cgroup_charge(page, memcg, gfp_mask);
unsigned short id; css_put(&memcg->css);
/* return ret;
* Every swap fault against a single page tries to charge the }
* page, bail as early as possible. shmem_unuse() encounters
* already charged pages, too. page and memcg binding is
* protected by the page lock, which serializes swap cache
* removal, which in turn serializes uncharging.
*/
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (page_memcg(compound_head(page)))
goto out;
id = lookup_swap_cgroup_id(ent); /**
rcu_read_lock(); * mem_cgroup_swapin_charge_page - charge a newly allocated page for swapin
memcg = mem_cgroup_from_id(id); * @page: page to charge
if (memcg && !css_tryget_online(&memcg->css)) * @mm: mm context of the victim
memcg = NULL; * @gfp: reclaim mode
rcu_read_unlock(); * @entry: swap entry for which the page is allocated
} *
* This function charges a page allocated for swapin. Please call this before
* adding the page to the swapcache.
*
* Returns 0 on success. Otherwise, an error code is returned.
*/
int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
gfp_t gfp, swp_entry_t entry)
{
struct mem_cgroup *memcg;
unsigned short id;
int ret;
if (!memcg) if (mem_cgroup_disabled())
return 0;
id = lookup_swap_cgroup_id(entry);
rcu_read_lock();
memcg = mem_cgroup_from_id(id);
if (!memcg || !css_tryget_online(&memcg->css))
memcg = get_mem_cgroup_from_mm(mm); memcg = get_mem_cgroup_from_mm(mm);
rcu_read_unlock();
ret = try_charge(memcg, gfp_mask, nr_pages); ret = __mem_cgroup_charge(page, memcg, gfp);
if (ret)
goto out_put;
css_get(&memcg->css); css_put(&memcg->css);
commit_charge(page, memcg); return ret;
}
local_irq_disable();
mem_cgroup_charge_statistics(memcg, page, nr_pages);
memcg_check_events(memcg, page);
local_irq_enable();
/*
* mem_cgroup_swapin_uncharge_swap - uncharge swap slot
* @entry: swap entry for which the page is charged
*
* Call this function after successfully adding the charged page to swapcache.
*
* Note: This function assumes the page for which swap slot is being uncharged
* is order 0 page.
*/
void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
{
/* /*
* Cgroup1's unified memory+swap counter has been charged with the * Cgroup1's unified memory+swap counter has been charged with the
* new swapcache page, finish the transfer by uncharging the swap * new swapcache page, finish the transfer by uncharging the swap
...@@ -6714,20 +6751,14 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) ...@@ -6714,20 +6751,14 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
* correspond 1:1 to page and swap slot lifetimes: we charge the * correspond 1:1 to page and swap slot lifetimes: we charge the
* page to memory here, and uncharge swap when the slot is freed. * page to memory here, and uncharge swap when the slot is freed.
*/ */
if (do_memsw_account() && PageSwapCache(page)) { if (!mem_cgroup_disabled() && do_memsw_account()) {
swp_entry_t entry = { .val = page_private(page) };
/* /*
* The swap entry might not get freed for a long time, * The swap entry might not get freed for a long time,
* let's not wait for it. The page already received a * let's not wait for it. The page already received a
* memory+swap charge, drop the swap entry duplicate. * memory+swap charge, drop the swap entry duplicate.
*/ */
mem_cgroup_uncharge_swap(entry, nr_pages); mem_cgroup_uncharge_swap(entry, 1);
} }
out_put:
css_put(&memcg->css);
out:
return ret;
} }
struct uncharge_gather { struct uncharge_gather {
......
...@@ -3309,28 +3309,26 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) ...@@ -3309,28 +3309,26 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
vmf->address); vmf->address);
if (page) { if (page) {
int err;
__SetPageLocked(page); __SetPageLocked(page);
__SetPageSwapBacked(page); __SetPageSwapBacked(page);
set_page_private(page, entry.val);
/* Tell memcg to use swap ownership records */ if (mem_cgroup_swapin_charge_page(page,
SetPageSwapCache(page); vma->vm_mm, GFP_KERNEL, entry)) {
err = mem_cgroup_charge(page, vma->vm_mm,
GFP_KERNEL);
ClearPageSwapCache(page);
if (err) {
ret = VM_FAULT_OOM; ret = VM_FAULT_OOM;
goto out_page; goto out_page;
} }
mem_cgroup_swapin_uncharge_swap(entry);
shadow = get_shadow_from_swap_cache(entry); shadow = get_shadow_from_swap_cache(entry);
if (shadow) if (shadow)
workingset_refault(page, shadow); workingset_refault(page, shadow);
lru_cache_add(page); lru_cache_add(page);
/* To provide entry to swap_readpage() */
set_page_private(page, entry.val);
swap_readpage(page, true); swap_readpage(page, true);
set_page_private(page, 0);
} }
} else { } else {
page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
......
...@@ -497,16 +497,14 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, ...@@ -497,16 +497,14 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
__SetPageLocked(page); __SetPageLocked(page);
__SetPageSwapBacked(page); __SetPageSwapBacked(page);
/* May fail (-ENOMEM) if XArray node allocation failed. */ if (mem_cgroup_swapin_charge_page(page, NULL, gfp_mask, entry))
if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) {
put_swap_page(page, entry);
goto fail_unlock; goto fail_unlock;
}
if (mem_cgroup_charge(page, NULL, gfp_mask)) { /* May fail (-ENOMEM) if XArray node allocation failed. */
delete_from_swap_cache(page); if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
goto fail_unlock; goto fail_unlock;
}
mem_cgroup_swapin_uncharge_swap(entry);
if (shadow) if (shadow)
workingset_refault(page, shadow); workingset_refault(page, shadow);
...@@ -517,6 +515,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, ...@@ -517,6 +515,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
return page; return page;
fail_unlock: fail_unlock:
put_swap_page(page, entry);
unlock_page(page); unlock_page(page);
put_page(page); put_page(page);
return NULL; return NULL;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment