Commit 503c358c authored by Vladimir Davydov, committed by Linus Torvalds

list_lru: introduce list_lru_shrink_{count,walk}

Kmem accounting of memcg is unusable now, because it lacks slab shrinker
support.  That means that when we hit the limit, we get ENOMEM without any
chance to recover.  What we should do then is call shrink_slab, which
would reclaim old inode/dentry caches from this cgroup.  This is what
this patch set is intended to do.

Basically, it does two things.  First, it introduces the notion of a
per-memcg slab shrinker.  A shrinker that wants to reclaim objects per
cgroup should mark itself as SHRINKER_MEMCG_AWARE.  Then it will be
passed the memory cgroup to scan from in shrink_control->memcg.  For
such shrinkers, shrink_slab iterates over the whole cgroup subtree under
the target cgroup and calls the shrinker for each kmem-active memory
cgroup.
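
For illustration only (this sketch is not code from this patch;
SHRINKER_MEMCG_AWARE is added later in the series, and the callback
names are hypothetical), registering such a shrinker would look
roughly like this:

	static struct shrinker my_shrinker = {
		.count_objects	= my_shrink_count,	/* hypothetical callback */
		.scan_objects	= my_shrink_scan,	/* hypothetical callback */
		.seeks		= DEFAULT_SEEKS,
		/* opt in to both per-node and per-memcg targeting */
		.flags		= SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE,
	};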

Secondly, this patch set makes the list_lru structure per-memcg.  It's
done transparently to list_lru users - all they have to do is tell
list_lru_init that they want a memcg-aware list_lru.  Then the list_lru
will automatically distribute objects among per-memcg lists based on
which cgroup the object is accounted to.  This way, to make FS shrinkers
(icache, dcache) memcg-aware, we only need to make them use a
memcg-aware list_lru, and this is what this patch set does.

As before, this patch set only enables per-memcg kmem reclaim when the
pressure comes from memory.limit, not from memory.kmem.limit.  Handling
memory.kmem.limit is going to be tricky due to GFP_NOFS allocations, and
it is still unclear whether we will have this knob in the unified
hierarchy.

This patch (of 9):

NUMA aware slab shrinkers use the list_lru structure to distribute
objects coming from different NUMA nodes to different lists.  Whenever
such a shrinker needs to count or scan objects from a particular node,
it issues commands like this:

        count = list_lru_count_node(lru, sc->nid);
        freed = list_lru_walk_node(lru, sc->nid, isolate_func,
                                   isolate_arg, &sc->nr_to_scan);

where sc is an instance of the shrink_control structure passed to it
from vmscan.

To simplify this, let's add special list_lru functions to be used by
shrinkers, list_lru_shrink_count() and list_lru_shrink_walk(), which
consolidate the nid and nr_to_scan arguments in the shrink_control
structure.

This will also allow us to avoid patching shrinkers that use list_lru
when we make shrink_slab() per-memcg - all we will have to do is extend
the shrink_control structure to include the target memcg and make
list_lru_shrink_{count,walk} handle this appropriately.
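
As a rough sketch of the intended usage (illustrative only, with the
hypothetical names my_lru and my_isolate standing in for a shrinker's
own list_lru and isolate callback), a node-aware shrinker's callbacks
reduce to:

	static unsigned long my_shrink_count(struct shrinker *shrink,
					     struct shrink_control *sc)
	{
		/* nid is taken from sc->nid inside the helper */
		return list_lru_shrink_count(&my_lru, sc);
	}

	static unsigned long my_shrink_scan(struct shrinker *shrink,
					    struct shrink_control *sc)
	{
		LIST_HEAD(dispose);
		unsigned long freed;

		/* nid and nr_to_scan are both read from sc by the helper */
		freed = list_lru_shrink_walk(&my_lru, sc, my_isolate, &dispose);
		/* free the objects isolated onto the dispose list here */
		return freed;
	}
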
Signed-off-by: Vladimir Davydov <vdavydov@parallels.com>
Suggested-by: Dave Chinner <david@fromorbit.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Greg Thelen <gthelen@google.com>
Cc: Glauber Costa <glommer@gmail.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 10c1045f
@@ -930,24 +930,22 @@ dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
 /**
  * prune_dcache_sb - shrink the dcache
  * @sb: superblock
- * @nr_to_scan : number of entries to try to free
- * @nid: which node to scan for freeable entities
+ * @sc: shrink control, passed to list_lru_shrink_walk()
  *
- * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is
- * done when we need more memory an called from the superblock shrinker
+ * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This
+ * is done when we need more memory and called from the superblock shrinker
  * function.
  *
  * This function may fail to free any resources if all the dentries are in
  * use.
  */
-long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
-		     int nid)
+long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
 {
 	LIST_HEAD(dispose);
 	long freed;

-	freed = list_lru_walk_node(&sb->s_dentry_lru, nid, dentry_lru_isolate,
-				   &dispose, &nr_to_scan);
+	freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc,
+				     dentry_lru_isolate, &dispose);
 	shrink_dentry_list(&dispose);
 	return freed;
 }
@@ -171,8 +171,8 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
 	if (!(sc->gfp_mask & __GFP_FS))
 		return SHRINK_STOP;

-	freed = list_lru_walk_node(&gfs2_qd_lru, sc->nid, gfs2_qd_isolate,
-				   &dispose, &sc->nr_to_scan);
+	freed = list_lru_shrink_walk(&gfs2_qd_lru, sc,
+				     gfs2_qd_isolate, &dispose);

 	gfs2_qd_dispose(&dispose);

@@ -182,7 +182,7 @@ static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
 static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
 					  struct shrink_control *sc)
 {
-	return vfs_pressure_ratio(list_lru_count_node(&gfs2_qd_lru, sc->nid));
+	return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc));
 }

 struct shrinker gfs2_qd_shrinker = {
@@ -751,14 +751,13 @@ inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
  * to trim from the LRU. Inodes to be freed are moved to a temporary list and
  * then are freed outside inode_lock by dispose_list().
  */
-long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan,
-		     int nid)
+long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
 {
 	LIST_HEAD(freeable);
 	long freed;

-	freed = list_lru_walk_node(&sb->s_inode_lru, nid, inode_lru_isolate,
-				   &freeable, &nr_to_scan);
+	freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
+				     inode_lru_isolate, &freeable);
 	dispose_list(&freeable);
 	return freed;
 }
@@ -14,6 +14,7 @@ struct file_system_type;
 struct linux_binprm;
 struct path;
 struct mount;
+struct shrink_control;

 /*
  * block_dev.c

@@ -111,8 +112,7 @@ extern int open_check_o_direct(struct file *f);
  * inode.c
  */
 extern spinlock_t inode_sb_list_lock;
-extern long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan,
-			    int nid);
+extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
 extern void inode_add_lru(struct inode *inode);

 /*

@@ -129,8 +129,7 @@ extern int invalidate_inodes(struct super_block *, bool);
  */
 extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
 extern int d_set_mounted(struct dentry *dentry);
-extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
-			    int nid);
+extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc);

 /*
  * read_write.c
@@ -77,8 +77,8 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
 	if (sb->s_op->nr_cached_objects)
 		fs_objects = sb->s_op->nr_cached_objects(sb, sc->nid);

-	inodes = list_lru_count_node(&sb->s_inode_lru, sc->nid);
-	dentries = list_lru_count_node(&sb->s_dentry_lru, sc->nid);
+	inodes = list_lru_shrink_count(&sb->s_inode_lru, sc);
+	dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc);
 	total_objects = dentries + inodes + fs_objects + 1;
 	if (!total_objects)
 		total_objects = 1;

@@ -86,20 +86,20 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
 	/* proportion the scan between the caches */
 	dentries = mult_frac(sc->nr_to_scan, dentries, total_objects);
 	inodes = mult_frac(sc->nr_to_scan, inodes, total_objects);
+	fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects);

 	/*
 	 * prune the dcache first as the icache is pinned by it, then
 	 * prune the icache, followed by the filesystem specific caches
 	 */
-	freed = prune_dcache_sb(sb, dentries, sc->nid);
-	freed += prune_icache_sb(sb, inodes, sc->nid);
+	sc->nr_to_scan = dentries;
+	freed = prune_dcache_sb(sb, sc);
+	sc->nr_to_scan = inodes;
+	freed += prune_icache_sb(sb, sc);

-	if (fs_objects) {
-		fs_objects = mult_frac(sc->nr_to_scan, fs_objects,
-				       total_objects);
+	if (fs_objects)
 		freed += sb->s_op->free_cached_objects(sb, fs_objects,
 						       sc->nid);
-	}

 	drop_super(sb);
 	return freed;

@@ -118,17 +118,15 @@ static unsigned long super_cache_count(struct shrinker *shrink,
 	 * scalability bottleneck. The counts could get updated
 	 * between super_cache_count and super_cache_scan anyway.
 	 * Call to super_cache_count with shrinker_rwsem held
-	 * ensures the safety of call to list_lru_count_node() and
+	 * ensures the safety of call to list_lru_shrink_count() and
 	 * s_op->nr_cached_objects().
 	 */
 	if (sb->s_op && sb->s_op->nr_cached_objects)
 		total_objects = sb->s_op->nr_cached_objects(sb,
 						 sc->nid);

-	total_objects += list_lru_count_node(&sb->s_dentry_lru,
-						 sc->nid);
-	total_objects += list_lru_count_node(&sb->s_inode_lru,
-						 sc->nid);
+	total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc);
+	total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc);

 	total_objects = vfs_pressure_ratio(total_objects);
 	return total_objects;
@@ -1583,10 +1583,9 @@ xfs_buftarg_shrink_scan(
 					struct xfs_buftarg, bt_shrinker);
 	LIST_HEAD(dispose);
 	unsigned long		freed;
-	unsigned long		nr_to_scan = sc->nr_to_scan;

-	freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate,
-				   &dispose, &nr_to_scan);
+	freed = list_lru_shrink_walk(&btp->bt_lru, sc,
+				     xfs_buftarg_isolate, &dispose);

 	while (!list_empty(&dispose)) {
 		struct xfs_buf *bp;

@@ -1605,7 +1604,7 @@ xfs_buftarg_shrink_count(
 {
 	struct xfs_buftarg	*btp = container_of(shrink,
 					struct xfs_buftarg, bt_shrinker);
-	return list_lru_count_node(&btp->bt_lru, sc->nid);
+	return list_lru_shrink_count(&btp->bt_lru, sc);
 }

 void
@@ -523,7 +523,6 @@ xfs_qm_shrink_scan(
 	struct xfs_qm_isolate	isol;
 	unsigned long		freed;
 	int			error;
-	unsigned long		nr_to_scan = sc->nr_to_scan;

 	if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
 		return 0;

@@ -531,8 +530,8 @@ xfs_qm_shrink_scan(
 	INIT_LIST_HEAD(&isol.buffers);
 	INIT_LIST_HEAD(&isol.dispose);

-	freed = list_lru_walk_node(&qi->qi_lru, sc->nid, xfs_qm_dquot_isolate, &isol,
-					&nr_to_scan);
+	freed = list_lru_shrink_walk(&qi->qi_lru, sc,
+				     xfs_qm_dquot_isolate, &isol);

 	error = xfs_buf_delwri_submit(&isol.buffers);
 	if (error)

@@ -557,7 +556,7 @@ xfs_qm_shrink_count(
 	struct xfs_quotainfo	*qi = container_of(shrink,
 					struct xfs_quotainfo, qi_shrinker);

-	return list_lru_count_node(&qi->qi_lru, sc->nid);
+	return list_lru_shrink_count(&qi->qi_lru, sc);
 }

 /*
@@ -9,6 +9,7 @@

 #include <linux/list.h>
 #include <linux/nodemask.h>
+#include <linux/shrinker.h>

 /* list_lru_walk_cb has to always return one of those */
 enum lru_status {

@@ -81,6 +82,13 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item);
  * Callers that want such a guarantee need to provide an outer lock.
  */
 unsigned long list_lru_count_node(struct list_lru *lru, int nid);
+
+static inline unsigned long list_lru_shrink_count(struct list_lru *lru,
+						  struct shrink_control *sc)
+{
+	return list_lru_count_node(lru, sc->nid);
+}
+
 static inline unsigned long list_lru_count(struct list_lru *lru)
 {
 	long count = 0;

@@ -119,6 +127,14 @@ unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
 				 list_lru_walk_cb isolate, void *cb_arg,
 				 unsigned long *nr_to_walk);

+static inline unsigned long
+list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
+		     list_lru_walk_cb isolate, void *cb_arg)
+{
+	return list_lru_walk_node(lru, sc->nid, isolate, cb_arg,
+				  &sc->nr_to_scan);
+}
+
 static inline unsigned long
 list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate,
 	      void *cb_arg, unsigned long nr_to_walk)
@@ -275,7 +275,7 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
 	/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
 	local_irq_disable();
-	shadow_nodes = list_lru_count_node(&workingset_shadow_nodes, sc->nid);
+	shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
 	local_irq_enable();

 	pages = node_present_pages(sc->nid);

@@ -376,8 +376,8 @@ static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
 	/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
 	local_irq_disable();
-	ret = list_lru_walk_node(&workingset_shadow_nodes, sc->nid,
-				 shadow_lru_isolate, NULL, &sc->nr_to_scan);
+	ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc,
+				   shadow_lru_isolate, NULL);
 	local_irq_enable();

 	return ret;
 }