Commit e30b5dca authored by Yan, Zheng's avatar Yan, Zheng Committed by Theodore Ts'o

ext4: fix fio regression

We (Linux Kernel Performance project) found a regression introduced
by commit:

  f7fec032 ext4: track all extent status in extent status tree

The commit causes about 20% performance decrease in fio random write
test. Profiler shows that rb_next() uses a lot of CPU time. The call
stack is:

  rb_next
  ext4_es_find_delayed_extent
  ext4_map_blocks
  _ext4_get_block
  ext4_get_block_write
  __blockdev_direct_IO
  ext4_direct_IO
  generic_file_direct_write
  __generic_file_aio_write
  ext4_file_write
  aio_rw_vect_retry
  aio_run_iocb
  do_io_submit
  sys_io_submit
  system_call_fastpath
  io_submit
  td_io_getevents
  io_u_queued_complete
  thread_main
  main
  __libc_start_main

The cause is that ext4_es_find_delayed_extent() doesn't have an
upper bound, it keeps searching until a delayed extent is found.
When there are a lots of non-delayed entries in the extent state
tree, ext4_es_find_delayed_extent() may uses a lot of CPU time.
Reported-by: default avatarLKP project <lkp@linux.intel.com>
Signed-off-by: default avatarYan, Zheng <zheng.z.yan@intel.com>
Signed-off-by: default avatarZheng Liu <wenqing.lz@taobao.com>
Cc: "Theodore Ts'o" <tytso@mit.edu>
parent 0d606e2c
...@@ -3642,7 +3642,7 @@ int ext4_find_delalloc_range(struct inode *inode, ...@@ -3642,7 +3642,7 @@ int ext4_find_delalloc_range(struct inode *inode,
{ {
struct extent_status es; struct extent_status es;
ext4_es_find_delayed_extent(inode, lblk_start, &es); ext4_es_find_delayed_extent_range(inode, lblk_start, lblk_end, &es);
if (es.es_len == 0) if (es.es_len == 0)
return 0; /* there is no delay extent in this tree */ return 0; /* there is no delay extent in this tree */
else if (es.es_lblk <= lblk_start && else if (es.es_lblk <= lblk_start &&
...@@ -4608,9 +4608,10 @@ static int ext4_find_delayed_extent(struct inode *inode, ...@@ -4608,9 +4608,10 @@ static int ext4_find_delayed_extent(struct inode *inode,
struct extent_status es; struct extent_status es;
ext4_lblk_t block, next_del; ext4_lblk_t block, next_del;
ext4_es_find_delayed_extent(inode, newes->es_lblk, &es);
if (newes->es_pblk == 0) { if (newes->es_pblk == 0) {
ext4_es_find_delayed_extent_range(inode, newes->es_lblk,
newes->es_lblk + newes->es_len - 1, &es);
/* /*
* No extent in extent-tree contains block @newes->es_pblk, * No extent in extent-tree contains block @newes->es_pblk,
* then the block may stay in 1)a hole or 2)delayed-extent. * then the block may stay in 1)a hole or 2)delayed-extent.
...@@ -4630,7 +4631,7 @@ static int ext4_find_delayed_extent(struct inode *inode, ...@@ -4630,7 +4631,7 @@ static int ext4_find_delayed_extent(struct inode *inode,
} }
block = newes->es_lblk + newes->es_len; block = newes->es_lblk + newes->es_len;
ext4_es_find_delayed_extent(inode, block, &es); ext4_es_find_delayed_extent_range(inode, block, EXT_MAX_BLOCKS, &es);
if (es.es_len == 0) if (es.es_len == 0)
next_del = EXT_MAX_BLOCKS; next_del = EXT_MAX_BLOCKS;
else else
......
...@@ -232,14 +232,16 @@ static struct extent_status *__es_tree_search(struct rb_root *root, ...@@ -232,14 +232,16 @@ static struct extent_status *__es_tree_search(struct rb_root *root,
} }
/* /*
* ext4_es_find_delayed_extent: find the 1st delayed extent covering @es->lblk * ext4_es_find_delayed_extent_range: find the 1st delayed extent covering
* if it exists, otherwise, the next extent after @es->lblk. * @es->lblk if it exists, otherwise, the next extent after @es->lblk.
* *
* @inode: the inode which owns delayed extents * @inode: the inode which owns delayed extents
* @lblk: the offset where we start to search * @lblk: the offset where we start to search
* @end: the offset where we stop to search
* @es: delayed extent that we found * @es: delayed extent that we found
*/ */
void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, void ext4_es_find_delayed_extent_range(struct inode *inode,
ext4_lblk_t lblk, ext4_lblk_t end,
struct extent_status *es) struct extent_status *es)
{ {
struct ext4_es_tree *tree = NULL; struct ext4_es_tree *tree = NULL;
...@@ -247,7 +249,8 @@ void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, ...@@ -247,7 +249,8 @@ void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
struct rb_node *node; struct rb_node *node;
BUG_ON(es == NULL); BUG_ON(es == NULL);
trace_ext4_es_find_delayed_extent_enter(inode, lblk); BUG_ON(end < lblk);
trace_ext4_es_find_delayed_extent_range_enter(inode, lblk);
read_lock(&EXT4_I(inode)->i_es_lock); read_lock(&EXT4_I(inode)->i_es_lock);
tree = &EXT4_I(inode)->i_es_tree; tree = &EXT4_I(inode)->i_es_tree;
...@@ -270,6 +273,10 @@ void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, ...@@ -270,6 +273,10 @@ void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
if (es1 && !ext4_es_is_delayed(es1)) { if (es1 && !ext4_es_is_delayed(es1)) {
while ((node = rb_next(&es1->rb_node)) != NULL) { while ((node = rb_next(&es1->rb_node)) != NULL) {
es1 = rb_entry(node, struct extent_status, rb_node); es1 = rb_entry(node, struct extent_status, rb_node);
if (es1->es_lblk > end) {
es1 = NULL;
break;
}
if (ext4_es_is_delayed(es1)) if (ext4_es_is_delayed(es1))
break; break;
} }
...@@ -285,7 +292,7 @@ void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, ...@@ -285,7 +292,7 @@ void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk,
read_unlock(&EXT4_I(inode)->i_es_lock); read_unlock(&EXT4_I(inode)->i_es_lock);
ext4_es_lru_add(inode); ext4_es_lru_add(inode);
trace_ext4_es_find_delayed_extent_exit(inode, es); trace_ext4_es_find_delayed_extent_range_exit(inode, es);
} }
static struct extent_status * static struct extent_status *
......
...@@ -62,7 +62,8 @@ extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, ...@@ -62,7 +62,8 @@ extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
unsigned long long status); unsigned long long status);
extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len); ext4_lblk_t len);
extern void ext4_es_find_delayed_extent(struct inode *inode, ext4_lblk_t lblk, extern void ext4_es_find_delayed_extent_range(struct inode *inode,
ext4_lblk_t lblk, ext4_lblk_t end,
struct extent_status *es); struct extent_status *es);
extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status *es); struct extent_status *es);
......
...@@ -464,7 +464,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) ...@@ -464,7 +464,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
* If there is a delay extent at this offset, * If there is a delay extent at this offset,
* it will be as a data. * it will be as a data.
*/ */
ext4_es_find_delayed_extent(inode, last, &es); ext4_es_find_delayed_extent_range(inode, last, last, &es);
if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
if (last != start) if (last != start)
dataoff = last << blkbits; dataoff = last << blkbits;
...@@ -547,7 +547,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) ...@@ -547,7 +547,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
* If there is a delay extent at this offset, * If there is a delay extent at this offset,
* we will skip this extent. * we will skip this extent.
*/ */
ext4_es_find_delayed_extent(inode, last, &es); ext4_es_find_delayed_extent_range(inode, last, last, &es);
if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) { if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
last = es.es_lblk + es.es_len; last = es.es_lblk + es.es_len;
holeoff = last << blkbits; holeoff = last << blkbits;
......
...@@ -2139,7 +2139,7 @@ TRACE_EVENT(ext4_es_remove_extent, ...@@ -2139,7 +2139,7 @@ TRACE_EVENT(ext4_es_remove_extent,
__entry->lblk, __entry->len) __entry->lblk, __entry->len)
); );
TRACE_EVENT(ext4_es_find_delayed_extent_enter, TRACE_EVENT(ext4_es_find_delayed_extent_range_enter,
TP_PROTO(struct inode *inode, ext4_lblk_t lblk), TP_PROTO(struct inode *inode, ext4_lblk_t lblk),
TP_ARGS(inode, lblk), TP_ARGS(inode, lblk),
...@@ -2161,7 +2161,7 @@ TRACE_EVENT(ext4_es_find_delayed_extent_enter, ...@@ -2161,7 +2161,7 @@ TRACE_EVENT(ext4_es_find_delayed_extent_enter,
(unsigned long) __entry->ino, __entry->lblk) (unsigned long) __entry->ino, __entry->lblk)
); );
TRACE_EVENT(ext4_es_find_delayed_extent_exit, TRACE_EVENT(ext4_es_find_delayed_extent_range_exit,
TP_PROTO(struct inode *inode, struct extent_status *es), TP_PROTO(struct inode *inode, struct extent_status *es),
TP_ARGS(inode, es), TP_ARGS(inode, es),
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment