Commit f95bdb70 authored by Qi Zheng, committed by Andrew Morton

mm: vmscan: make global slab shrink lockless

The shrinker_rwsem is a global read-write lock in shrinkers subsystem,
which protects most operations such as slab shrink, registration and
unregistration of shrinkers, etc.  This can easily cause problems in the
following cases.

1) When the memory pressure is high and there are many
   filesystems mounted or unmounted at the same time,
   slab shrink will be affected (down_read_trylock()
   failed).

   Such as the real workload mentioned by Kirill Tkhai:

   ```
   One of the real workloads from my experience is start
   of an overcommitted node containing many starting
   containers after node crash (or many resuming containers
   after reboot for kernel update). In these cases memory
   pressure is huge, and the node goes round in long reclaim.
   ```

2) If a shrinker is blocked (such as the case mentioned
   in [1]) and a writer comes in (such as mounting a fs),
   then this writer will be blocked and cause all
   subsequent shrinker-related operations to be blocked.

Even if there is no competitor when shrinking slab, there may still be a
problem.  If we have a long shrinker list and we do not reclaim enough
memory with each shrinker, then the down_read_trylock() may be called with
high frequency.  Because of the poor multicore scalability of atomic
operations, this can lead to a significant drop in IPC (instructions per
cycle).

So many times in history ([2],[3],[4],[5]), some people wanted to replace
shrinker_rwsem trylock with SRCU in the slab shrink, but all these patches
were abandoned because SRCU was not unconditionally enabled.

But now, since commit 1cd0bd06093c ("rcu: Remove CONFIG_SRCU"), the SRCU
is unconditionally enabled.  So it's time to use SRCU to protect readers
who previously held shrinker_rwsem.

This commit uses SRCU to make global slab shrink lockless,
the memcg slab shrink is handled in the subsequent patch.

[1]. https://lore.kernel.org/lkml/20191129214541.3110-1-ptikhomirov@virtuozzo.com/
[2]. https://lore.kernel.org/all/1437080113.3596.2.camel@stgolabs.net/
[3]. https://lore.kernel.org/lkml/1510609063-3327-1-git-send-email-penguin-kernel@I-love.SAKURA.ne.jp/
[4]. https://lore.kernel.org/lkml/153365347929.19074.12509495712735843805.stgit@localhost.localdomain/
[5]. https://lore.kernel.org/lkml/20210927074823.5825-1-sultan@kerneltoast.com/

Link: https://lkml.kernel.org/r/20230313112819.38938-3-zhengqi.arch@bytedance.com
Signed-off-by: Qi Zheng <zhengqi.arch@bytedance.com>
Acked-by: Vlastimil Babka <vbabka@suse.cz>
Acked-by: Kirill Tkhai <tkhai@ya.ru>
Acked-by: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Christian König <christian.koenig@amd.com>
Cc: David Hildenbrand <david@redhat.com>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Muchun Song <muchun.song@linux.dev>
Cc: Paul E. McKenney <paulmck@kernel.org>
Cc: Shakeel Butt <shakeelb@google.com>
Cc: Yang Shi <shy828301@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent 42c9db39
...@@ -57,6 +57,7 @@ ...@@ -57,6 +57,7 @@
#include <linux/khugepaged.h> #include <linux/khugepaged.h>
#include <linux/rculist_nulls.h> #include <linux/rculist_nulls.h>
#include <linux/random.h> #include <linux/random.h>
#include <linux/srcu.h>
#include <asm/tlbflush.h> #include <asm/tlbflush.h>
#include <asm/div64.h> #include <asm/div64.h>
...@@ -202,6 +203,7 @@ static void set_task_reclaim_state(struct task_struct *task, ...@@ -202,6 +203,7 @@ static void set_task_reclaim_state(struct task_struct *task,
LIST_HEAD(shrinker_list); LIST_HEAD(shrinker_list);
DECLARE_RWSEM(shrinker_rwsem); DECLARE_RWSEM(shrinker_rwsem);
DEFINE_SRCU(shrinker_srcu);
#ifdef CONFIG_MEMCG #ifdef CONFIG_MEMCG
static int shrinker_nr_max; static int shrinker_nr_max;
...@@ -700,7 +702,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker) ...@@ -700,7 +702,7 @@ void free_prealloced_shrinker(struct shrinker *shrinker)
void register_shrinker_prepared(struct shrinker *shrinker) void register_shrinker_prepared(struct shrinker *shrinker)
{ {
down_write(&shrinker_rwsem); down_write(&shrinker_rwsem);
list_add_tail(&shrinker->list, &shrinker_list); list_add_tail_rcu(&shrinker->list, &shrinker_list);
shrinker->flags |= SHRINKER_REGISTERED; shrinker->flags |= SHRINKER_REGISTERED;
shrinker_debugfs_add(shrinker); shrinker_debugfs_add(shrinker);
up_write(&shrinker_rwsem); up_write(&shrinker_rwsem);
...@@ -754,13 +756,15 @@ void unregister_shrinker(struct shrinker *shrinker) ...@@ -754,13 +756,15 @@ void unregister_shrinker(struct shrinker *shrinker)
return; return;
down_write(&shrinker_rwsem); down_write(&shrinker_rwsem);
list_del(&shrinker->list); list_del_rcu(&shrinker->list);
shrinker->flags &= ~SHRINKER_REGISTERED; shrinker->flags &= ~SHRINKER_REGISTERED;
if (shrinker->flags & SHRINKER_MEMCG_AWARE) if (shrinker->flags & SHRINKER_MEMCG_AWARE)
unregister_memcg_shrinker(shrinker); unregister_memcg_shrinker(shrinker);
debugfs_entry = shrinker_debugfs_remove(shrinker); debugfs_entry = shrinker_debugfs_remove(shrinker);
up_write(&shrinker_rwsem); up_write(&shrinker_rwsem);
synchronize_srcu(&shrinker_srcu);
debugfs_remove_recursive(debugfs_entry); debugfs_remove_recursive(debugfs_entry);
kfree(shrinker->nr_deferred); kfree(shrinker->nr_deferred);
...@@ -780,6 +784,7 @@ void synchronize_shrinkers(void) ...@@ -780,6 +784,7 @@ void synchronize_shrinkers(void)
{ {
down_write(&shrinker_rwsem); down_write(&shrinker_rwsem);
up_write(&shrinker_rwsem); up_write(&shrinker_rwsem);
synchronize_srcu(&shrinker_srcu);
} }
EXPORT_SYMBOL(synchronize_shrinkers); EXPORT_SYMBOL(synchronize_shrinkers);
...@@ -990,6 +995,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, ...@@ -990,6 +995,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
{ {
unsigned long ret, freed = 0; unsigned long ret, freed = 0;
struct shrinker *shrinker; struct shrinker *shrinker;
int srcu_idx;
/* /*
* The root memcg might be allocated even though memcg is disabled * The root memcg might be allocated even though memcg is disabled
...@@ -1001,10 +1007,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, ...@@ -1001,10 +1007,10 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg)) if (!mem_cgroup_disabled() && !mem_cgroup_is_root(memcg))
return shrink_slab_memcg(gfp_mask, nid, memcg, priority); return shrink_slab_memcg(gfp_mask, nid, memcg, priority);
if (!down_read_trylock(&shrinker_rwsem)) srcu_idx = srcu_read_lock(&shrinker_srcu);
goto out;
list_for_each_entry(shrinker, &shrinker_list, list) { list_for_each_entry_srcu(shrinker, &shrinker_list, list,
srcu_read_lock_held(&shrinker_srcu)) {
struct shrink_control sc = { struct shrink_control sc = {
.gfp_mask = gfp_mask, .gfp_mask = gfp_mask,
.nid = nid, .nid = nid,
...@@ -1015,19 +1021,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, ...@@ -1015,19 +1021,9 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
if (ret == SHRINK_EMPTY) if (ret == SHRINK_EMPTY)
ret = 0; ret = 0;
freed += ret; freed += ret;
/*
* Bail out if someone want to register a new shrinker to
* prevent the registration from being stalled for long periods
* by parallel ongoing shrinking.
*/
if (rwsem_is_contended(&shrinker_rwsem)) {
freed = freed ? : 1;
break;
}
} }
up_read(&shrinker_rwsem); srcu_read_unlock(&shrinker_srcu, srcu_idx);
out:
cond_resched(); cond_resched();
return freed; return freed;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment