Commit 97b27821 authored by Tejun Heo's avatar Tejun Heo Committed by Jens Axboe

writeback, memcg: Implement foreign dirty flushing

There's an inherent mismatch between memcg and writeback.  The former
trackes ownership per-page while the latter per-inode.  This was a
deliberate design decision because honoring per-page ownership in the
writeback path is complicated, may lead to higher CPU and IO overheads
and deemed unnecessary given that write-sharing an inode across
different cgroups isn't a common use-case.

Combined with inode majority-writer ownership switching, this works
well enough in most cases but there are some pathological cases.  For
example, let's say there are two cgroups A and B which keep writing to
different but confined parts of the same inode.  B owns the inode and
A's memory is limited far below B's.  A's dirty ratio can rise enough
to trigger balance_dirty_pages() sleeps but B's can be low enough to
avoid triggering background writeback.  A will be slowed down without
a way to make writeback of the dirty pages happen.

This patch implements foreign dirty recording and foreign mechanism so
that when a memcg encounters a condition as above it can trigger
flushes on bdi_writebacks which can clean its pages.  Please see the
comment on top of mem_cgroup_track_foreign_dirty_slowpath() for
details.

A reproducer follows.

write-range.c::

  #include <stdio.h>
  #include <stdlib.h>
  #include <unistd.h>
  #include <fcntl.h>
  #include <sys/types.h>

  static const char *usage = "write-range FILE START SIZE\n";

  int main(int argc, char **argv)
  {
	  int fd;
	  unsigned long start, size, end, pos;
	  char *endp;
	  char buf[4096];

	  if (argc < 4) {
		  fprintf(stderr, usage);
		  return 1;
	  }

	  fd = open(argv[1], O_WRONLY);
	  if (fd < 0) {
		  perror("open");
		  return 1;
	  }

	  start = strtoul(argv[2], &endp, 0);
	  if (*endp != '\0') {
		  fprintf(stderr, usage);
		  return 1;
	  }

	  size = strtoul(argv[3], &endp, 0);
	  if (*endp != '\0') {
		  fprintf(stderr, usage);
		  return 1;
	  }

	  end = start + size;

	  while (1) {
		  for (pos = start; pos < end; ) {
			  long bread, bwritten = 0;

			  if (lseek(fd, pos, SEEK_SET) < 0) {
				  perror("lseek");
				  return 1;
			  }

			  bread = read(0, buf, sizeof(buf) < end - pos ?
					       sizeof(buf) : end - pos);
			  if (bread < 0) {
				  perror("read");
				  return 1;
			  }
			  if (bread == 0)
				  return 0;

			  while (bwritten < bread) {
				  long this;

				  this = write(fd, buf + bwritten,
					       bread - bwritten);
				  if (this < 0) {
					  perror("write");
					  return 1;
				  }

				  bwritten += this;
				  pos += bwritten;
			  }
		  }
	  }
  }

repro.sh::

  #!/bin/bash

  set -e
  set -x

  sysctl -w vm.dirty_expire_centisecs=300000
  sysctl -w vm.dirty_writeback_centisecs=300000
  sysctl -w vm.dirtytime_expire_seconds=300000
  echo 3 > /proc/sys/vm/drop_caches

  TEST=/sys/fs/cgroup/test
  A=$TEST/A
  B=$TEST/B

  mkdir -p $A $B
  echo "+memory +io" > $TEST/cgroup.subtree_control
  echo $((1<<30)) > $A/memory.high
  echo $((32<<30)) > $B/memory.high

  rm -f testfile
  touch testfile
  fallocate -l 4G testfile

  echo "Starting B"

  (echo $BASHPID > $B/cgroup.procs
   pv -q --rate-limit 70M < /dev/urandom | ./write-range testfile $((2<<30)) $((2<<30))) &

  echo "Waiting 10s to ensure B claims the testfile inode"
  sleep 5
  sync
  sleep 5
  sync
  echo "Starting A"

  (echo $BASHPID > $A/cgroup.procs
   pv < /dev/urandom | ./write-range testfile 0 $((2<<30)))

v2: Added comments explaining why the specific intervals are being used.

v3: Use 0 @nr when calling cgroup_writeback_by_id() to use best-effort
    flushing while avoding possible livelocks.

v4: Use get_jiffies_64() and time_before/after64() instead of raw
    jiffies_64 and arthimetic comparisons as suggested by Jan.
Reviewed-by: default avatarJan Kara <jack@suse.cz>
Signed-off-by: default avatarTejun Heo <tj@kernel.org>
Signed-off-by: default avatarJens Axboe <axboe@kernel.dk>
parent d62241c7
...@@ -63,6 +63,7 @@ enum wb_reason { ...@@ -63,6 +63,7 @@ enum wb_reason {
* so it has a mismatch name. * so it has a mismatch name.
*/ */
WB_REASON_FORKER_THREAD, WB_REASON_FORKER_THREAD,
WB_REASON_FOREIGN_FLUSH,
WB_REASON_MAX, WB_REASON_MAX,
}; };
......
...@@ -183,6 +183,23 @@ struct memcg_padding { ...@@ -183,6 +183,23 @@ struct memcg_padding {
#define MEMCG_PADDING(name) #define MEMCG_PADDING(name)
#endif #endif
/*
* Remember four most recent foreign writebacks with dirty pages in this
* cgroup. Inode sharing is expected to be uncommon and, even if we miss
* one in a given round, we're likely to catch it later if it keeps
* foreign-dirtying, so a fairly low count should be enough.
*
* See mem_cgroup_track_foreign_dirty_slowpath() for details.
*/
#define MEMCG_CGWB_FRN_CNT 4
struct memcg_cgwb_frn {
u64 bdi_id; /* bdi->id of the foreign inode */
int memcg_id; /* memcg->css.id of foreign inode */
u64 at; /* jiffies_64 at the time of dirtying */
struct wb_completion done; /* tracks in-flight foreign writebacks */
};
/* /*
* The memory controller data structure. The memory controller controls both * The memory controller data structure. The memory controller controls both
* page cache and RSS per cgroup. We would eventually like to provide * page cache and RSS per cgroup. We would eventually like to provide
...@@ -307,6 +324,7 @@ struct mem_cgroup { ...@@ -307,6 +324,7 @@ struct mem_cgroup {
#ifdef CONFIG_CGROUP_WRITEBACK #ifdef CONFIG_CGROUP_WRITEBACK
struct list_head cgwb_list; struct list_head cgwb_list;
struct wb_domain cgwb_domain; struct wb_domain cgwb_domain;
struct memcg_cgwb_frn cgwb_frn[MEMCG_CGWB_FRN_CNT];
#endif #endif
/* List of events which userspace want to receive */ /* List of events which userspace want to receive */
...@@ -1218,6 +1236,18 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, ...@@ -1218,6 +1236,18 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
unsigned long *pheadroom, unsigned long *pdirty, unsigned long *pheadroom, unsigned long *pdirty,
unsigned long *pwriteback); unsigned long *pwriteback);
void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
struct bdi_writeback *wb);
static inline void mem_cgroup_track_foreign_dirty(struct page *page,
struct bdi_writeback *wb)
{
if (unlikely(&page->mem_cgroup->css != wb->memcg_css))
mem_cgroup_track_foreign_dirty_slowpath(page, wb);
}
void mem_cgroup_flush_foreign(struct bdi_writeback *wb);
#else /* CONFIG_CGROUP_WRITEBACK */ #else /* CONFIG_CGROUP_WRITEBACK */
static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb) static inline struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
...@@ -1233,6 +1263,15 @@ static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb, ...@@ -1233,6 +1263,15 @@ static inline void mem_cgroup_wb_stats(struct bdi_writeback *wb,
{ {
} }
static inline void mem_cgroup_track_foreign_dirty(struct page *page,
struct bdi_writeback *wb)
{
}
static inline void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
{
}
#endif /* CONFIG_CGROUP_WRITEBACK */ #endif /* CONFIG_CGROUP_WRITEBACK */
struct sock; struct sock;
......
...@@ -87,6 +87,10 @@ int do_swap_account __read_mostly; ...@@ -87,6 +87,10 @@ int do_swap_account __read_mostly;
#define do_swap_account 0 #define do_swap_account 0
#endif #endif
#ifdef CONFIG_CGROUP_WRITEBACK
static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
#endif
/* Whether legacy memory+swap accounting is active */ /* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void) static bool do_memsw_account(void)
{ {
...@@ -4145,6 +4149,127 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, ...@@ -4145,6 +4149,127 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
} }
} }
/*
* Foreign dirty flushing
*
* There's an inherent mismatch between memcg and writeback. The former
* trackes ownership per-page while the latter per-inode. This was a
* deliberate design decision because honoring per-page ownership in the
* writeback path is complicated, may lead to higher CPU and IO overheads
* and deemed unnecessary given that write-sharing an inode across
* different cgroups isn't a common use-case.
*
* Combined with inode majority-writer ownership switching, this works well
* enough in most cases but there are some pathological cases. For
* example, let's say there are two cgroups A and B which keep writing to
* different but confined parts of the same inode. B owns the inode and
* A's memory is limited far below B's. A's dirty ratio can rise enough to
* trigger balance_dirty_pages() sleeps but B's can be low enough to avoid
* triggering background writeback. A will be slowed down without a way to
* make writeback of the dirty pages happen.
*
* Conditions like the above can lead to a cgroup getting repatedly and
* severely throttled after making some progress after each
* dirty_expire_interval while the underyling IO device is almost
* completely idle.
*
* Solving this problem completely requires matching the ownership tracking
* granularities between memcg and writeback in either direction. However,
* the more egregious behaviors can be avoided by simply remembering the
* most recent foreign dirtying events and initiating remote flushes on
* them when local writeback isn't enough to keep the memory clean enough.
*
* The following two functions implement such mechanism. When a foreign
* page - a page whose memcg and writeback ownerships don't match - is
* dirtied, mem_cgroup_track_foreign_dirty() records the inode owning
* bdi_writeback on the page owning memcg. When balance_dirty_pages()
* decides that the memcg needs to sleep due to high dirty ratio, it calls
* mem_cgroup_flush_foreign() which queues writeback on the recorded
* foreign bdi_writebacks which haven't expired. Both the numbers of
* recorded bdi_writebacks and concurrent in-flight foreign writebacks are
* limited to MEMCG_CGWB_FRN_CNT.
*
* The mechanism only remembers IDs and doesn't hold any object references.
* As being wrong occasionally doesn't matter, updates and accesses to the
* records are lockless and racy.
*/
void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
struct bdi_writeback *wb)
{
struct mem_cgroup *memcg = page->mem_cgroup;
struct memcg_cgwb_frn *frn;
u64 now = get_jiffies_64();
u64 oldest_at = now;
int oldest = -1;
int i;
/*
* Pick the slot to use. If there is already a slot for @wb, keep
* using it. If not replace the oldest one which isn't being
* written out.
*/
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
frn = &memcg->cgwb_frn[i];
if (frn->bdi_id == wb->bdi->id &&
frn->memcg_id == wb->memcg_css->id)
break;
if (time_before64(frn->at, oldest_at) &&
atomic_read(&frn->done.cnt) == 1) {
oldest = i;
oldest_at = frn->at;
}
}
if (i < MEMCG_CGWB_FRN_CNT) {
/*
* Re-using an existing one. Update timestamp lazily to
* avoid making the cacheline hot. We want them to be
* reasonably up-to-date and significantly shorter than
* dirty_expire_interval as that's what expires the record.
* Use the shorter of 1s and dirty_expire_interval / 8.
*/
unsigned long update_intv =
min_t(unsigned long, HZ,
msecs_to_jiffies(dirty_expire_interval * 10) / 8);
if (time_before64(frn->at, now - update_intv))
frn->at = now;
} else if (oldest >= 0) {
/* replace the oldest free one */
frn = &memcg->cgwb_frn[oldest];
frn->bdi_id = wb->bdi->id;
frn->memcg_id = wb->memcg_css->id;
frn->at = now;
}
}
/* issue foreign writeback flushes for recorded foreign dirtying events */
void mem_cgroup_flush_foreign(struct bdi_writeback *wb)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10);
u64 now = jiffies_64;
int i;
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) {
struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i];
/*
* If the record is older than dirty_expire_interval,
* writeback on it has already started. No need to kick it
* off again. Also, don't start a new one if there's
* already one in flight.
*/
if (time_after64(frn->at, now - intv) &&
atomic_read(&frn->done.cnt) == 1) {
frn->at = 0;
cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0,
WB_REASON_FOREIGN_FLUSH,
&frn->done);
}
}
}
#else /* CONFIG_CGROUP_WRITEBACK */ #else /* CONFIG_CGROUP_WRITEBACK */
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
...@@ -4661,6 +4786,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) ...@@ -4661,6 +4786,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
struct mem_cgroup *memcg; struct mem_cgroup *memcg;
unsigned int size; unsigned int size;
int node; int node;
int __maybe_unused i;
size = sizeof(struct mem_cgroup); size = sizeof(struct mem_cgroup);
size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
...@@ -4704,6 +4830,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void) ...@@ -4704,6 +4830,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
#endif #endif
#ifdef CONFIG_CGROUP_WRITEBACK #ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list); INIT_LIST_HEAD(&memcg->cgwb_list);
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
memcg->cgwb_frn[i].done =
__WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq);
#endif #endif
idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
return memcg; return memcg;
...@@ -4833,7 +4962,12 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css) ...@@ -4833,7 +4962,12 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
static void mem_cgroup_css_free(struct cgroup_subsys_state *css) static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
{ {
struct mem_cgroup *memcg = mem_cgroup_from_css(css); struct mem_cgroup *memcg = mem_cgroup_from_css(css);
int __maybe_unused i;
#ifdef CONFIG_CGROUP_WRITEBACK
for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++)
wb_wait_for_completion(&memcg->cgwb_frn[i].done);
#endif
if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
static_branch_dec(&memcg_sockets_enabled_key); static_branch_dec(&memcg_sockets_enabled_key);
......
...@@ -1667,6 +1667,8 @@ static void balance_dirty_pages(struct bdi_writeback *wb, ...@@ -1667,6 +1667,8 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
if (unlikely(!writeback_in_progress(wb))) if (unlikely(!writeback_in_progress(wb)))
wb_start_background_writeback(wb); wb_start_background_writeback(wb);
mem_cgroup_flush_foreign(wb);
/* /*
* Calculate global domain's pos_ratio and select the * Calculate global domain's pos_ratio and select the
* global dtc by default. * global dtc by default.
...@@ -2427,6 +2429,8 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) ...@@ -2427,6 +2429,8 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
task_io_account_write(PAGE_SIZE); task_io_account_write(PAGE_SIZE);
current->nr_dirtied++; current->nr_dirtied++;
this_cpu_inc(bdp_ratelimits); this_cpu_inc(bdp_ratelimits);
mem_cgroup_track_foreign_dirty(page, wb);
} }
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment