Commit 8de7ecc6 authored by Shakeel Butt, committed by Linus Torvalds

memcg: reduce memcg tree traversals for stats collection

Currently cgroup-v1's memcg_stat_show traverses the memcg tree ~17 times
to collect the stats, while cgroup-v2's memory_stat_show traverses the
memcg tree thrice.  On a large machine, a couple of thousand memcgs is
quite normal, and if the churn is high and memcgs stick around for
various reasons, the memcg tree can grow to tens of thousands of nodes.
This patch refactors the stat collection code so that it is shared
between cgroup-v1 and cgroup-v2, reducing the tree traversal to just one.
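
For intuition, here is a minimal user-space sketch of the pattern, with a
hypothetical node type and a flattened pre-order "next" link standing in
for mem_cgroup and for_each_mem_cgroup_tree(); it is not the kernel code:

  /* Illustrative sketch only, not kernel code. */
  #define NR_COUNTERS	17

  struct node {
  	unsigned long counter[NR_COUNTERS];
  	struct node *next;		/* pre-order successor in the tree */
  };

  /* Before: one full tree walk per counter -> NR_COUNTERS walks. */
  static unsigned long sum_one(const struct node *root, int i)
  {
  	const struct node *n;
  	unsigned long val = 0;

  	for (n = root; n; n = n->next)
  		val += n->counter[i];
  	return val;
  }

  /* After: a single walk accumulates every counter at each node. */
  static void sum_all(const struct node *root, unsigned long *acc)
  {
  	const struct node *n;
  	int i;

  	for (n = root; n; n = n->next)
  		for (i = 0; i < NR_COUNTERS; i++)
  			acc[i] += n->counter[i];
  }

The per-node arithmetic is the same either way; the saving comes from
performing the (comparatively expensive) tree iteration once instead of
once per counter.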

I ran a simple benchmark which reads the root_mem_cgroup's stat file
1000 times in the presence of 2500 memcgs on cgroup-v1. The results are:

Without the patch:
$ time ./read-root-stat-1000-times

real    0m1.663s
user    0m0.000s
sys     0m1.660s

With the patch:
$ time ./read-root-stat-1000-times

real    0m0.468s
user    0m0.000s
sys     0m0.467s
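
The benchmark source is not part of the commit; a hypothetical
reconstruction of such a reader, assuming the v1 memory controller is
mounted at /sys/fs/cgroup/memory, might look like:

  /* Hypothetical reconstruction of read-root-stat-1000-times. */
  #include <fcntl.h>
  #include <stdio.h>
  #include <unistd.h>

  int main(void)
  {
  	char buf[8192];
  	int i;

  	for (i = 0; i < 1000; i++) {
  		int fd = open("/sys/fs/cgroup/memory/memory.stat", O_RDONLY);

  		if (fd < 0) {
  			perror("open");
  			return 1;
  		}
  		/* Each read regenerates the stats, i.e. triggers the
  		 * memcg tree traversal(s) this patch reduces. */
  		while (read(fd, buf, sizeof(buf)) > 0)
  			;
  		close(fd);
  	}
  	return 0;
  }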

Link: http://lkml.kernel.org/r/20180724224635.143944-1-shakeelb@google.com
Signed-off-by: Shakeel Butt <shakeelb@google.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Greg Thelen <gthelen@google.com>
Cc: Bruce Merry <bmerry@ska.ac.za>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 1c4c3b99
@@ -2899,29 +2899,34 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
 	return retval;
 }
 
-static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
-{
-	struct mem_cgroup *iter;
-	int i;
-
-	memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT);
+struct accumulated_stats {
+	unsigned long stat[MEMCG_NR_STAT];
+	unsigned long events[NR_VM_EVENT_ITEMS];
+	unsigned long lru_pages[NR_LRU_LISTS];
+	const unsigned int *stats_array;
+	const unsigned int *events_array;
+	int stats_size;
+	int events_size;
+};
 
-	for_each_mem_cgroup_tree(iter, memcg) {
-		for (i = 0; i < MEMCG_NR_STAT; i++)
-			stat[i] += memcg_page_state(iter, i);
-	}
-}
-
-static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
+static void accumulate_memcg_tree(struct mem_cgroup *memcg,
+				  struct accumulated_stats *acc)
 {
-	struct mem_cgroup *iter;
+	struct mem_cgroup *mi;
 	int i;
 
-	memset(events, 0, sizeof(*events) * NR_VM_EVENT_ITEMS);
-
-	for_each_mem_cgroup_tree(iter, memcg) {
-		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
-			events[i] += memcg_sum_events(iter, i);
+	for_each_mem_cgroup_tree(mi, memcg) {
+		for (i = 0; i < acc->stats_size; i++)
+			acc->stat[i] += memcg_page_state(mi,
+				acc->stats_array ? acc->stats_array[i] : i);
+
+		for (i = 0; i < acc->events_size; i++)
+			acc->events[i] += memcg_sum_events(mi,
+				acc->events_array ? acc->events_array[i] : i);
+
+		for (i = 0; i < NR_LRU_LISTS; i++)
+			acc->lru_pages[i] +=
+				mem_cgroup_nr_lru_pages(mi, BIT(i));
 	}
 }
 
@@ -3332,6 +3337,7 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 	unsigned long memory, memsw;
 	struct mem_cgroup *mi;
 	unsigned int i;
+	struct accumulated_stats acc;
 
 	BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
 	BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
@@ -3364,32 +3370,27 @@ static int memcg_stat_show(struct seq_file *m, void *v)
 	seq_printf(m, "hierarchical_memsw_limit %llu\n",
 		   (u64)memsw * PAGE_SIZE);
 
-	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
-		unsigned long long val = 0;
+	memset(&acc, 0, sizeof(acc));
+	acc.stats_size = ARRAY_SIZE(memcg1_stats);
+	acc.stats_array = memcg1_stats;
+	acc.events_size = ARRAY_SIZE(memcg1_events);
+	acc.events_array = memcg1_events;
+	accumulate_memcg_tree(memcg, &acc);
 
+	for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
 		if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
 			continue;
-		for_each_mem_cgroup_tree(mi, memcg)
-			val += memcg_page_state(mi, memcg1_stats[i]) *
-			PAGE_SIZE;
-		seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val);
+		seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
+			   (u64)acc.stat[i] * PAGE_SIZE);
 	}
 
-	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) {
-		unsigned long long val = 0;
-
-		for_each_mem_cgroup_tree(mi, memcg)
-			val += memcg_sum_events(mi, memcg1_events[i]);
-		seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], val);
-	}
-
-	for (i = 0; i < NR_LRU_LISTS; i++) {
-		unsigned long long val = 0;
-
-		for_each_mem_cgroup_tree(mi, memcg)
-			val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
-		seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
-	}
+	for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
+		seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
+			   (u64)acc.events[i]);
+
+	for (i = 0; i < NR_LRU_LISTS; i++)
+		seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
+			   (u64)acc.lru_pages[i] * PAGE_SIZE);
 
 #ifdef CONFIG_DEBUG_VM
 	{
@@ -5486,8 +5487,7 @@ static int memory_events_show(struct seq_file *m, void *v)
 static int memory_stat_show(struct seq_file *m, void *v)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
-	unsigned long stat[MEMCG_NR_STAT];
-	unsigned long events[NR_VM_EVENT_ITEMS];
+	struct accumulated_stats acc;
 	int i;
 
 	/*
@@ -5501,66 +5501,62 @@ static int memory_stat_show(struct seq_file *m, void *v)
 	 * Current memory state:
 	 */
 
-	tree_stat(memcg, stat);
-	tree_events(memcg, events);
+	memset(&acc, 0, sizeof(acc));
+	acc.stats_size = MEMCG_NR_STAT;
+	acc.events_size = NR_VM_EVENT_ITEMS;
+	accumulate_memcg_tree(memcg, &acc);
 
 	seq_printf(m, "anon %llu\n",
-		   (u64)stat[MEMCG_RSS] * PAGE_SIZE);
+		   (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
 	seq_printf(m, "file %llu\n",
-		   (u64)stat[MEMCG_CACHE] * PAGE_SIZE);
+		   (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
 	seq_printf(m, "kernel_stack %llu\n",
-		   (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
+		   (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
 	seq_printf(m, "slab %llu\n",
-		   (u64)(stat[NR_SLAB_RECLAIMABLE] +
-			 stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
+		   (u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
+			 acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
 	seq_printf(m, "sock %llu\n",
-		   (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
+		   (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
 
 	seq_printf(m, "shmem %llu\n",
-		   (u64)stat[NR_SHMEM] * PAGE_SIZE);
+		   (u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
 	seq_printf(m, "file_mapped %llu\n",
-		   (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE);
+		   (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
 	seq_printf(m, "file_dirty %llu\n",
-		   (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE);
+		   (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
 	seq_printf(m, "file_writeback %llu\n",
-		   (u64)stat[NR_WRITEBACK] * PAGE_SIZE);
+		   (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
 
-	for (i = 0; i < NR_LRU_LISTS; i++) {
-		struct mem_cgroup *mi;
-		unsigned long val = 0;
-
-		for_each_mem_cgroup_tree(mi, memcg)
-			val += mem_cgroup_nr_lru_pages(mi, BIT(i));
-		seq_printf(m, "%s %llu\n",
-			   mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
-	}
+	for (i = 0; i < NR_LRU_LISTS; i++)
+		seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
+			   (u64)acc.lru_pages[i] * PAGE_SIZE);
 
 	seq_printf(m, "slab_reclaimable %llu\n",
-		   (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
+		   (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
 	seq_printf(m, "slab_unreclaimable %llu\n",
-		   (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
+		   (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
 
 	/* Accumulated memory events */
 
-	seq_printf(m, "pgfault %lu\n", events[PGFAULT]);
-	seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]);
+	seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
+	seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
 
-	seq_printf(m, "pgrefill %lu\n", events[PGREFILL]);
-	seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] +
-		   events[PGSCAN_DIRECT]);
-	seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] +
-		   events[PGSTEAL_DIRECT]);
-	seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]);
-	seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]);
-	seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]);
-	seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]);
+	seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
+	seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
+		   acc.events[PGSCAN_DIRECT]);
+	seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
+		   acc.events[PGSTEAL_DIRECT]);
+	seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
+	seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
+	seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
+	seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
 
 	seq_printf(m, "workingset_refault %lu\n",
-		   stat[WORKINGSET_REFAULT]);
+		   acc.stat[WORKINGSET_REFAULT]);
 	seq_printf(m, "workingset_activate %lu\n",
-		   stat[WORKINGSET_ACTIVATE]);
+		   acc.stat[WORKINGSET_ACTIVATE]);
 	seq_printf(m, "workingset_nodereclaim %lu\n",
-		   stat[WORKINGSET_NODERECLAIM]);
+		   acc.stat[WORKINGSET_NODERECLAIM]);
 
 	return 0;
 }