memcg: charge before adding to swapcache on swapin

Currently the kernel adds the page, allocated for swapin, to the swapcache before charging the page. This is fine, but now we want a per-memcg swapcache stat, which is essential for folks who want to transparently migrate from cgroup v1's memsw to cgroup v2's memory and swap counters. In addition, charging a page before exposing it to other parts of the kernel is a step in the right direction. To correctly maintain the per-memcg swapcache stat, this patch charges the page before adding it to the swapcache. One challenge with this approach is the failure case of add_to_swap_cache(), in which we need to undo the mem_cgroup_charge(). Specifically, undoing mem_cgroup_uncharge_swap() is not simple. To resolve the issue, this patch decouples the charging for swapin pages from mem_cgroup_charge(). Two new functions are introduced: mem_cgroup_swapin_charge_page() for just charging the swapin page, and mem_cgroup_swapin_uncharge_swap() for uncharging the swap slot once the page has been successfully added to the swapcache. [shakeelb@google.com: set page->private before calling swap_readpage] Link: https://lkml.kernel.org/r/20210318015959.2986837-1-shakeelb@google.com Link: https://lkml.kernel.org/r/20210305212639.775498-1-shakeelb@google.com Signed-off-by: Shakeel Butt <shakeelb@google.com> Acked-by: Roman Gushchin <guro@fb.com> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Acked-by: Hugh Dickins <hughd@google.com> Tested-by: Heiko Carstens <hca@linux.ibm.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Stephen Rothwell <sfr@canb.auug.org.au> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

memcg: charge before adding to swapcache on swapin
Currently the kernel adds the page, allocated for swapin, to the swapcache before charging the page. This is fine, but now we want a per-memcg swapcache stat, which is essential for folks who want to transparently migrate from cgroup v1's memsw to cgroup v2's memory and swap counters. In addition, charging a page before exposing it to other parts of the kernel is a step in the right direction. To correctly maintain the per-memcg swapcache stat, this patch charges the page before adding it to the swapcache. One challenge with this approach is the failure case of add_to_swap_cache(), in which we need to undo the mem_cgroup_charge(). Specifically, undoing mem_cgroup_uncharge_swap() is not simple. To resolve the issue, this patch decouples the charging for swapin pages from mem_cgroup_charge(). Two new functions are introduced: mem_cgroup_swapin_charge_page() for just charging the swapin page, and mem_cgroup_swapin_uncharge_swap() for uncharging the swap slot once the page has been successfully added to the swapcache. [shakeelb@google.com: set page->private before calling swap_readpage] Link: https://lkml.kernel.org/r/20210318015959.2986837-1-shakeelb@google.com Link: https://lkml.kernel.org/r/20210305212639.775498-1-shakeelb@google.com Signed-off-by: Shakeel Butt <shakeelb@google.com> Acked-by: Roman Gushchin <guro@fb.com> Acked-by: Johannes Weiner <hannes@cmpxchg.org> Acked-by: Hugh Dickins <hughd@google.com> Tested-by: Heiko Carstens <hca@linux.ibm.com> Cc: Michal Hocko <mhocko@kernel.org> Cc: Stephen Rothwell <sfr@canb.auug.org.au> Cc: Christian Borntraeger <borntraeger@de.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
0add0c77 · Shakeel Butt · Linus Torvalds · 4bbcc5a4 · 0add0c77 · 0add0c77
Commit 0add0c77 authored Apr 29, 2021 by Shakeel Butt Committed by Linus Torvalds Apr 30, 2021
Showing with 100 additions and 59 deletions

include/linux/memcontrol.h include/linux/memcontrol.h +13 -0

mm/memcontrol.c mm/memcontrol.c +74 -43

mm/memory.c mm/memory.c +7 -9

mm/swap_state.c mm/swap_state.c +6 -7

No files found.
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -609,6 +609,9 @@ static inline bool mem_cgroup_below_min(struct mem_cgroup *memcg)
 }
 int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask);
+int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
+				  gfp_t gfp, swp_entry_t entry);
+void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry);
 void mem_cgroup_uncharge(struct page *page);
 void mem_cgroup_uncharge_list(struct list_head *page_list);
@@ -1112,6 +1115,16 @@ static inline int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
 	return 0;
 }
+static inline int mem_cgroup_swapin_charge_page(struct page *page,
+			struct mm_struct *mm, gfp_t gfp, swp_entry_t entry)
+{
+	return 0;
+}
+static inline void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
+{
+}
 static inline void mem_cgroup_uncharge(struct page *page)
 {
 }

--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6644,6 +6644,27 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
 			atomic_long_read(&parent->memory.children_low_usage)));
 }
+static int __mem_cgroup_charge(struct page *page, struct mem_cgroup *memcg,
+			       gfp_t gfp)
+{
+	unsigned int nr_pages = thp_nr_pages(page);
+	int ret;
+	ret = try_charge(memcg, gfp, nr_pages);
+	if (ret)
+		goto out;
+	css_get(&memcg->css);
+	commit_charge(page, memcg);
+	local_irq_disable();
+	mem_cgroup_charge_statistics(memcg, page, nr_pages);
+	memcg_check_events(memcg, page);
+	local_irq_enable();
+out:
+	return ret;
+}
 /**
 * mem_cgroup_charge - charge a newly allocated page to a cgroup
 * @page: page to charge
@@ -6653,55 +6674,71 @@ void mem_cgroup_calculate_protection(struct mem_cgroup *root,
 * Try to charge @page to the memcg that @mm belongs to, reclaiming
 * pages according to @gfp_mask if necessary.
 *
+ * Do not use this for pages allocated for swapin.
+ *
 * Returns 0 on success. Otherwise, an error code is returned.
 */
 int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
 {
-	unsigned int nr_pages = thp_nr_pages(page);
+	struct mem_cgroup *memcg;
-	struct mem_cgroup *memcg = NULL;
+	int ret;
-	int ret = 0;
 	if (mem_cgroup_disabled())
-		goto out;
+		return 0;
-	if (PageSwapCache(page)) {
+	memcg = get_mem_cgroup_from_mm(mm);
-		swp_entry_t ent = { .val = page_private(page), };
+	ret = __mem_cgroup_charge(page, memcg, gfp_mask);
-		unsigned short id;
+	css_put(&memcg->css);
-		/*
+	return ret;
-		 * Every swap fault against a single page tries to charge the
+}
-		 * page, bail as early as possible.  shmem_unuse() encounters
-		 * already charged pages, too.  page and memcg binding is
+/**
-		 * protected by the page lock, which serializes swap cache
+ * mem_cgroup_swapin_charge_page - charge a newly allocated page for swapin
-		 * removal, which in turn serializes uncharging.
+ * @page: page to charge
+ * @mm: mm context of the victim
+ * @gfp: reclaim mode
+ * @entry: swap entry for which the page is allocated
+ *
+ * This function charges a page allocated for swapin. Please call this before
+ * adding the page to the swapcache.
+ *
+ * Returns 0 on success. Otherwise, an error code is returned.
 */
-		VM_BUG_ON_PAGE(!PageLocked(page), page);
+int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm,
-		if (page_memcg(compound_head(page)))
+				  gfp_t gfp, swp_entry_t entry)
-			goto out;
+{
+	struct mem_cgroup *memcg;
+	unsigned short id;
+	int ret;
-		id = lookup_swap_cgroup_id(ent);
+	if (mem_cgroup_disabled())
+		return 0;
+	id = lookup_swap_cgroup_id(entry);
 	rcu_read_lock();
 	memcg = mem_cgroup_from_id(id);
-		if (memcg && !css_tryget_online(&memcg->css))
+	if (!memcg || !css_tryget_online(&memcg->css))
-			memcg = NULL;
-		rcu_read_unlock();
-	}
-	if (!memcg)
 		memcg = get_mem_cgroup_from_mm(mm);
+	rcu_read_unlock();
-	ret = try_charge(memcg, gfp_mask, nr_pages);
+	ret = __mem_cgroup_charge(page, memcg, gfp);
-	if (ret)
-		goto out_put;
-	css_get(&memcg->css);
-	commit_charge(page, memcg);
-	local_irq_disable();
+	css_put(&memcg->css);
-	mem_cgroup_charge_statistics(memcg, page, nr_pages);
+	return ret;
-	memcg_check_events(memcg, page);
+}
-	local_irq_enable();
+/*
+ * mem_cgroup_swapin_uncharge_swap - uncharge swap slot
+ * @entry: swap entry for which the page is charged
+ *
+ * Call this function after successfully adding the charged page to swapcache.
+ *
+ * Note: This function assumes the page for which swap slot is being uncharged
+ * is order 0 page.
+ */
+void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry)
+{
 	/*
 	 * Cgroup1's unified memory+swap counter has been charged with the
 	 * new swapcache page, finish the transfer by uncharging the swap
@@ -6714,20 +6751,14 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
 	 * correspond 1:1 to page and swap slot lifetimes: we charge the
 	 * page to memory here, and uncharge swap when the slot is freed.
 	 */
-	if (do_memsw_account() && PageSwapCache(page)) {
+	if (!mem_cgroup_disabled() && do_memsw_account()) {
-		swp_entry_t entry = { .val = page_private(page) };
 		/*
 		 * The swap entry might not get freed for a long time,
 		 * let's not wait for it.  The page already received a
 		 * memory+swap charge, drop the swap entry duplicate.
 		 */
-		mem_cgroup_uncharge_swap(entry, nr_pages);
+		mem_cgroup_uncharge_swap(entry, 1);
 	}
-out_put:
-	css_put(&memcg->css);
-out:
-	return ret;
 }
 struct uncharge_gather {

--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3309,28 +3309,26 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
 			page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
 							vmf->address);
 			if (page) {
-				int err;
 				__SetPageLocked(page);
 				__SetPageSwapBacked(page);
-				set_page_private(page, entry.val);
-				/* Tell memcg to use swap ownership records */
+				if (mem_cgroup_swapin_charge_page(page,
-				SetPageSwapCache(page);
+					vma->vm_mm, GFP_KERNEL, entry)) {
-				err = mem_cgroup_charge(page, vma->vm_mm,
-							GFP_KERNEL);
-				ClearPageSwapCache(page);
-				if (err) {
 					ret = VM_FAULT_OOM;
 					goto out_page;
 				}
+				mem_cgroup_swapin_uncharge_swap(entry);
 				shadow = get_shadow_from_swap_cache(entry);
 				if (shadow)
 					workingset_refault(page, shadow);
 				lru_cache_add(page);
+				/* To provide entry to swap_readpage() */
+				set_page_private(page, entry.val);
 				swap_readpage(page, true);
+				set_page_private(page, 0);
 			}
 		} else {
 			page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,

--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -497,16 +497,14 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 	__SetPageLocked(page);
 	__SetPageSwapBacked(page);
-	/* May fail (-ENOMEM) if XArray node allocation failed. */
+	if (mem_cgroup_swapin_charge_page(page, NULL, gfp_mask, entry))
-	if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) {
-		put_swap_page(page, entry);
 		goto fail_unlock;
-	}
-	if (mem_cgroup_charge(page, NULL, gfp_mask)) {
+	/* May fail (-ENOMEM) if XArray node allocation failed. */
-		delete_from_swap_cache(page);
+	if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow))
 		goto fail_unlock;
-	}
+	mem_cgroup_swapin_uncharge_swap(entry);
 	if (shadow)
 		workingset_refault(page, shadow);
@@ -517,6 +515,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 	return page;
 fail_unlock:
+	put_swap_page(page, entry);
 	unlock_page(page);
 	put_page(page);
 	return NULL;