Commit 12a5d395, authored by Mina Almasry, committed by Andrew Morton

mm: add nodes= arg to memory.reclaim

The nodes= arg instructs the kernel to only scan the given nodes for
proactive reclaim.  For example use cases, consider a 2 tier memory
system:

nodes 0,1 -> top tier
nodes 2,3 -> second tier

$ echo "1m nodes=0" > memory.reclaim

This instructs the kernel to attempt to reclaim 1m memory from node 0. 
Since node 0 is a top tier node, demotion will be attempted first.  This
is useful to direct proactive reclaim to specific nodes that are under
pressure.

$ echo "1m nodes=2,3" > memory.reclaim

This instructs the kernel to attempt to reclaim 1m memory in the second
tier.  Since this tier of memory has no demotion targets, the memory will
be reclaimed.

$ echo "1m nodes=0,1" > memory.reclaim

Instructs the kernel to reclaim memory from the top tier nodes, which can
be desirable according to the userspace policy if there is pressure on the
top tiers.  Since these nodes have demotion targets, the kernel will
attempt demotion first.

Since commit 3f1509c5 ("Revert "mm/vmscan: never demote for memcg
reclaim""), the proactive reclaim interface memory.reclaim does both
reclaim and demotion.  Reclaim and demotion incur different latency costs
to the jobs in the cgroup.  Demoted memory would still be addressable by
the userspace at a higher latency, but reclaimed memory would need to
incur a pagefault.

The 'nodes' arg is useful to allow the userspace to control demotion and
reclaim independently according to its policy: if the memory.reclaim is
called on a node with demotion targets, it will attempt demotion first; if
it is called on a node without demotion targets, it will only attempt
reclaim.

Link: https://lkml.kernel.org/r/20221202223533.1785418-1-almasrymina@google.com
Signed-off-by: Mina Almasry <almasrymina@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Acked-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Muchun Song <songmuchun@bytedance.com>
Cc: Bagas Sanjaya <bagasdotme@gmail.com>
Cc: "Huang, Ying" <ying.huang@intel.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Cc: Roman Gushchin <roman.gushchin@linux.dev>
Cc: Tejun Heo <tj@kernel.org>
Cc: Wei Xu <weixugc@google.com>
Cc: Yang Shi <yang.shi@linux.alibaba.com>
Cc: Yosry Ahmed <yosryahmed@google.com>
Cc: zefan li <lizefan.x@bytedance.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
parent 6b426d07
...@@ -1245,17 +1245,13 @@ PAGE_SIZE multiple when read back. ...@@ -1245,17 +1245,13 @@ PAGE_SIZE multiple when read back.
This is a simple interface to trigger memory reclaim in the This is a simple interface to trigger memory reclaim in the
target cgroup. target cgroup.
This file accepts a single key, the number of bytes to reclaim. This file accepts a string which contains the number of bytes to
No nested keys are currently supported. reclaim.
Example:: Example::
echo "1G" > memory.reclaim echo "1G" > memory.reclaim
The interface can be later extended with nested keys to
configure the reclaim behavior. For example, specify the
type of memory to reclaim from (anon, file, ..).
Please note that the kernel can over or under reclaim from Please note that the kernel can over or under reclaim from
the target cgroup. If less bytes are reclaimed than the the target cgroup. If less bytes are reclaimed than the
specified amount, -EAGAIN is returned. specified amount, -EAGAIN is returned.
...@@ -1267,6 +1263,13 @@ PAGE_SIZE multiple when read back. ...@@ -1267,6 +1263,13 @@ PAGE_SIZE multiple when read back.
This means that the networking layer will not adapt based on This means that the networking layer will not adapt based on
reclaim induced by memory.reclaim. reclaim induced by memory.reclaim.
This file also allows the user to specify the nodes to reclaim from,
via the 'nodes=' key, for example::
echo "1G nodes=0,1" > memory.reclaim
The above instructs the kernel to reclaim memory from nodes 0,1.
memory.peak memory.peak
A read-only single value file which exists on non-root A read-only single value file which exists on non-root
cgroups. cgroups.
......
...@@ -418,7 +418,8 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, ...@@ -418,7 +418,8 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages, unsigned long nr_pages,
gfp_t gfp_mask, gfp_t gfp_mask,
unsigned int reclaim_options); unsigned int reclaim_options,
nodemask_t *nodemask);
extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap, gfp_t gfp_mask, bool noswap,
pg_data_t *pgdat, pg_data_t *pgdat,
......
...@@ -63,6 +63,7 @@ ...@@ -63,6 +63,7 @@
#include <linux/resume_user_mode.h> #include <linux/resume_user_mode.h>
#include <linux/psi.h> #include <linux/psi.h>
#include <linux/seq_buf.h> #include <linux/seq_buf.h>
#include <linux/parser.h>
#include "internal.h" #include "internal.h"
#include <net/sock.h> #include <net/sock.h>
#include <net/ip.h> #include <net/ip.h>
...@@ -2392,7 +2393,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg, ...@@ -2392,7 +2393,8 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
psi_memstall_enter(&pflags); psi_memstall_enter(&pflags);
nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
gfp_mask, gfp_mask,
MEMCG_RECLAIM_MAY_SWAP); MEMCG_RECLAIM_MAY_SWAP,
NULL);
psi_memstall_leave(&pflags); psi_memstall_leave(&pflags);
} while ((memcg = parent_mem_cgroup(memcg)) && } while ((memcg = parent_mem_cgroup(memcg)) &&
!mem_cgroup_is_root(memcg)); !mem_cgroup_is_root(memcg));
...@@ -2683,7 +2685,8 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, ...@@ -2683,7 +2685,8 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
psi_memstall_enter(&pflags); psi_memstall_enter(&pflags);
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
gfp_mask, reclaim_options); gfp_mask, reclaim_options,
NULL);
psi_memstall_leave(&pflags); psi_memstall_leave(&pflags);
if (mem_cgroup_margin(mem_over_limit) >= nr_pages) if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
...@@ -3503,7 +3506,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg, ...@@ -3503,7 +3506,8 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
} }
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) { memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP,
NULL)) {
ret = -EBUSY; ret = -EBUSY;
break; break;
} }
...@@ -3614,7 +3618,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) ...@@ -3614,7 +3618,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
return -EINTR; return -EINTR;
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
MEMCG_RECLAIM_MAY_SWAP)) MEMCG_RECLAIM_MAY_SWAP,
NULL))
nr_retries--; nr_retries--;
} }
...@@ -6418,7 +6423,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, ...@@ -6418,7 +6423,8 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
} }
reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP); GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP,
NULL);
if (!reclaimed && !nr_retries--) if (!reclaimed && !nr_retries--)
break; break;
...@@ -6467,7 +6473,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, ...@@ -6467,7 +6473,8 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (nr_reclaims) { if (nr_reclaims) {
if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP)) GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP,
NULL))
nr_reclaims--; nr_reclaims--;
continue; continue;
} }
...@@ -6590,21 +6597,54 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of, ...@@ -6590,21 +6597,54 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
return nbytes; return nbytes;
} }
enum {
MEMORY_RECLAIM_NODES = 0,
MEMORY_RECLAIM_NULL,
};
/*
 * Pattern table for the optional keys accepted by memory.reclaim.
 * memory_reclaim() splits the input on spaces and hands each non-empty
 * word to match_token() together with this table.
 */
static const match_table_t if_tokens = {
/* "nodes=<list>": memory_reclaim() feeds the captured string to nodelist_parse(). */
{ MEMORY_RECLAIM_NODES, "nodes=%s" },
/*
 * Last entry, with a NULL pattern.
 * NOTE(review): presumably the terminator match_token() expects - confirm.
 */
{ MEMORY_RECLAIM_NULL, NULL },
};
static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off) size_t nbytes, loff_t off)
{ {
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
unsigned int nr_retries = MAX_RECLAIM_RETRIES; unsigned int nr_retries = MAX_RECLAIM_RETRIES;
unsigned long nr_to_reclaim, nr_reclaimed = 0; unsigned long nr_to_reclaim, nr_reclaimed = 0;
unsigned int reclaim_options; unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP |
int err; MEMCG_RECLAIM_PROACTIVE;
char *old_buf, *start;
substring_t args[MAX_OPT_ARGS];
int token;
char value[256];
nodemask_t nodemask = NODE_MASK_ALL;
buf = strstrip(buf); buf = strstrip(buf);
err = page_counter_memparse(buf, "", &nr_to_reclaim);
if (err)
return err;
reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE; old_buf = buf;
nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
if (buf == old_buf)
return -EINVAL;
buf = strstrip(buf);
while ((start = strsep(&buf, " ")) != NULL) {
if (!strlen(start))
continue;
token = match_token(start, if_tokens, args);
match_strlcpy(value, args, sizeof(value));
switch (token) {
case MEMORY_RECLAIM_NODES:
if (nodelist_parse(value, nodemask) < 0)
return -EINVAL;
break;
default:
return -EINVAL;
}
}
while (nr_reclaimed < nr_to_reclaim) { while (nr_reclaimed < nr_to_reclaim) {
unsigned long reclaimed; unsigned long reclaimed;
...@@ -6621,7 +6661,8 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, ...@@ -6621,7 +6661,8 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
reclaimed = try_to_free_mem_cgroup_pages(memcg, reclaimed = try_to_free_mem_cgroup_pages(memcg,
nr_to_reclaim - nr_reclaimed, nr_to_reclaim - nr_reclaimed,
GFP_KERNEL, reclaim_options); GFP_KERNEL, reclaim_options,
&nodemask);
if (!reclaimed && !nr_retries--) if (!reclaimed && !nr_retries--)
return -EAGAIN; return -EAGAIN;
......
...@@ -6758,7 +6758,8 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, ...@@ -6758,7 +6758,8 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages, unsigned long nr_pages,
gfp_t gfp_mask, gfp_t gfp_mask,
unsigned int reclaim_options) unsigned int reclaim_options,
nodemask_t *nodemask)
{ {
unsigned long nr_reclaimed; unsigned long nr_reclaimed;
unsigned int noreclaim_flag; unsigned int noreclaim_flag;
...@@ -6773,6 +6774,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, ...@@ -6773,6 +6774,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.may_unmap = 1, .may_unmap = 1,
.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
.nodemask = nodemask,
}; };
/* /*
* Traverse the ZONELIST_FALLBACK zonelist of the current node to put * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment