Commit a65b0e76 authored by Domenico Cerasuolo, committed by Andrew Morton

zswap: make shrinking memcg-aware

Currently, we only have a single global LRU for zswap.  This makes it
impossible to perform workload-specific shrinking - a memcg cannot
determine which pages in the pool it owns, and often ends up writing
back pages from other memcgs.  This issue has been observed in practice
before and was mitigated by simply disabling memcg-initiated shrinking:

https://lore.kernel.org/all/20230530232435.3097106-1-nphamcs@gmail.com/T/#u

This patch fully resolves the issue by replacing the global zswap LRU
with memcg- and NUMA-specific LRUs, and by modifying the reclaim logic:

a) When a store attempt hits a memcg limit, it now triggers a
   synchronous reclaim attempt that, if successful, allows the new,
   hotter page to be accepted by zswap (a rough sketch of this flow
   follows after this list).
b) If the store attempt instead hits the global zswap limit, it will
   trigger an asynchronous reclaim attempt, in which a memcg is
   selected for reclaim in a round-robin-like fashion.
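
For illustration, the store-side flow in (a) could look like the sketch
below.  This is a simplified, hedged sketch rather than the literal patch:
it assumes the zswap LRU is a memcg- and NUMA-aware list_lru, and the names
zswap_list_lru, shrink_memcg() and shrink_memcg_cb() are illustrative
placeholders rather than guaranteed identifiers from mm/zswap.c.

/* Per-entry callback (elided here): writes one zswap entry back to swap. */
static enum lru_status shrink_memcg_cb(struct list_head *item,
				       struct list_lru_one *l,
				       spinlock_t *lock, void *arg);

/* Synchronously write back one of @memcg's own zswap entries per node. */
static int shrink_memcg(struct mem_cgroup *memcg)
{
	int nid, shrunk = 0;

	for_each_node_state(nid, N_NORMAL_MEMORY) {
		unsigned long nr_to_walk = 1;

		/* Walk only this memcg's entries on this node's LRU. */
		shrunk += list_lru_walk_one(&zswap_list_lru, nid, memcg,
					    &shrink_memcg_cb, NULL,
					    &nr_to_walk);
	}
	return shrunk ? 0 : -EAGAIN;
}

	/* In zswap_store(), when the cgroup is over its zswap limit: */
	if (objcg && !obj_cgroup_may_zswap(objcg)) {
		memcg = get_mem_cgroup_from_objcg(objcg);
		if (shrink_memcg(memcg)) {
			/* Reclaim failed: reject the incoming page. */
			mem_cgroup_put(memcg);
			goto reject;
		}
		mem_cgroup_put(memcg);
	}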

[nphamcs@gmail.com: use correct function for the onlineness check, use mem_cgroup_iter_break()]
  Link: https://lkml.kernel.org/r/20231205195419.2563217-1-nphamcs@gmail.com
[nphamcs@gmail.com: drop the pool's reference at the end of the writeback step]
  Link: https://lkml.kernel.org/r/20231206030627.4155634-1-nphamcs@gmail.com
Link: https://lkml.kernel.org/r/20231130194023.4102148-4-nphamcs@gmail.com
Signed-off-by: Domenico Cerasuolo <cerasuolodomenico@gmail.com>
Co-developed-by: Nhat Pham <nphamcs@gmail.com>
Signed-off-by: Nhat Pham <nphamcs@gmail.com>
Tested-by: Bagas Sanjaya <bagasdotme@gmail.com>
Cc: Chris Li <chrisl@kernel.org>
Cc: Dan Streetman <ddstreet@ieee.org>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Seth Jennings <sjenning@redhat.com>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Shuah Khan <shuah@kernel.org>
Cc: Vitaly Wool <vitaly.wool@konsulko.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent fdc4161f

include/linux/memcontrol.h
@@ -1192,6 +1192,11 @@ static inline struct mem_cgroup *page_memcg_check(struct page *page)
 	return NULL;
 }
 
+static inline struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
+{
+	return NULL;
+}
+
 static inline bool folio_memcg_kmem(struct folio *folio)
 {
 	return false;

include/linux/zswap.h
@@ -15,6 +15,7 @@ bool zswap_load(struct folio *folio);
 void zswap_invalidate(int type, pgoff_t offset);
 void zswap_swapon(int type);
 void zswap_swapoff(int type);
+void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg);
 
 #else
@@ -31,6 +32,7 @@ static inline bool zswap_load(struct folio *folio)
 static inline void zswap_invalidate(int type, pgoff_t offset) {}
 static inline void zswap_swapon(int type) {}
 static inline void zswap_swapoff(int type) {}
+static inline void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg) {}
 
 #endif

mm/memcontrol.c
@@ -5614,6 +5614,8 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
 	page_counter_set_min(&memcg->memory, 0);
 	page_counter_set_low(&memcg->memory, 0);
 
+	zswap_memcg_offline_cleanup(memcg);
+
 	memcg_offline_kmem(memcg);
 	reparent_shrinker_deferred(memcg);
 	wb_memcg_offline(memcg);
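
For context on the hook added above: the asynchronous shrinker in (b) walks
memcgs round-robin, so zswap must advance its iteration cursor when a memcg
goes offline, or it would keep the dying cgroup pinned.  A minimal sketch of
such a cleanup follows; the zswap_pools list and the next_shrink cursor field
are assumptions used for illustration, not necessarily the exact mm/zswap.c
layout.

void zswap_memcg_offline_cleanup(struct mem_cgroup *memcg)
{
	struct zswap_pool *pool;

	/* Keep the pool list stable while we scan the reclaim cursors. */
	spin_lock(&zswap_pools_lock);
	list_for_each_entry(pool, &zswap_pools, list) {
		/* Advance any cursor that points at the memcg being offlined. */
		if (pool->next_shrink == memcg)
			pool->next_shrink = mem_cgroup_iter(NULL, memcg, NULL);
	}
	spin_unlock(&zswap_pools_lock);
}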

mm/swap.h
@@ -51,7 +51,8 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 				   struct swap_iocb **plug);
 struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 				     struct mempolicy *mpol, pgoff_t ilx,
-				     bool *new_page_allocated);
+				     bool *new_page_allocated,
+				     bool skip_if_exists);
 struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t flag,
 				    struct mempolicy *mpol, pgoff_t ilx);
 struct page *swapin_readahead(swp_entry_t entry, gfp_t flag,

mm/swap_state.c
@@ -412,7 +412,8 @@ struct folio *filemap_get_incore_folio(struct address_space *mapping,
 struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 				     struct mempolicy *mpol, pgoff_t ilx,
-				     bool *new_page_allocated)
+				     bool *new_page_allocated,
+				     bool skip_if_exists)
 {
 	struct swap_info_struct *si;
 	struct folio *folio;
@@ -470,6 +471,17 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		if (err != -EEXIST)
 			goto fail_put_swap;
 
+		/*
+		 * Protect against a recursive call to __read_swap_cache_async()
+		 * on the same entry waiting forever here because SWAP_HAS_CACHE
+		 * is set but the folio is not in the swap cache yet. This can
+		 * happen today if mem_cgroup_swapin_charge_folio() below
+		 * triggers reclaim through zswap, which may call
+		 * __read_swap_cache_async() in the writeback path.
+		 */
+		if (skip_if_exists)
+			goto fail_put_swap;
+
 		/*
 		 * We might race against __delete_from_swap_cache(), and
 		 * stumble across a swap_map entry whose SWAP_HAS_CACHE
@@ -537,7 +549,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 	mpol = get_vma_policy(vma, addr, 0, &ilx);
 	page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
-					&page_allocated);
+					&page_allocated, false);
 	mpol_cond_put(mpol);
 
 	if (page_allocated)
@@ -654,7 +666,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 		/* Ok, do the async read-ahead now */
 		page = __read_swap_cache_async(
 				swp_entry(swp_type(entry), offset),
-				gfp_mask, mpol, ilx, &page_allocated);
+				gfp_mask, mpol, ilx, &page_allocated, false);
 		if (!page)
 			continue;
 		if (page_allocated) {
@@ -672,7 +684,7 @@ struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
 skip:
 	/* The page was likely read above, so no need for plugging here */
 	page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
-					&page_allocated);
+					&page_allocated, false);
 	if (unlikely(page_allocated))
 		swap_readpage(page, false, NULL);
 	return page;
@@ -827,7 +839,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
 		pte_unmap(pte);
 		pte = NULL;
 		page = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
-						&page_allocated);
+						&page_allocated, false);
 		if (!page)
 			continue;
 		if (page_allocated) {
@@ -847,7 +859,7 @@ static struct page *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
 skip:
 	/* The page was likely read above, so no need for plugging here */
 	page = __read_swap_cache_async(targ_entry, gfp_mask, mpol, targ_ilx,
-					&page_allocated);
+					&page_allocated, false);
 	if (unlikely(page_allocated))
 		swap_readpage(page, false, NULL);
 	return page;
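
The skip_if_exists=true case is intended for zswap's writeback path (in
mm/zswap.c, whose hunks are not expanded on this page); every existing
caller above keeps the old behaviour by passing false.  Below is a hedged
sketch of what the new call site looks like, with the surrounding writeback
function and its locals (swpentry, mpol, ilx, page_was_allocated) elided.

	/*
	 * Grab or create the swap cache page for the entry being written
	 * back, but bail out instead of waiting if another task already set
	 * SWAP_HAS_CACHE - this avoids the recursion deadlock described in
	 * the comment added to __read_swap_cache_async() above.
	 */
	page = __read_swap_cache_async(swpentry, GFP_KERNEL, mpol, ilx,
				       &page_was_allocated, true);
	if (!page)
		return -ENOMEM;
	if (!page_was_allocated) {
		/* Already in the swap cache: another task owns this entry. */
		put_page(page);
		return -EEXIST;
	}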
(The remainder of the diff is collapsed and not shown here.)