Commit d5633b0d authored by Qu Wenruo, committed by David Sterba

btrfs: defrag: bring back the old file extent search behavior

For defrag, we don't really want to use btrfs_get_extent() to iterate
all extent maps of an inode.

The reasons are:

- btrfs_get_extent() can merge extent maps
  And the resulting em has the higher generation of the two, causing
  defrag to mark unnecessary parts of such a merged large extent map
  (see the sketch after this list).

  This can in fact result in extra IO for autodefrag in v5.16+ kernels.

  However this patch is not going to completely solve the problem, as
  one can still use read() to trigger extent map reading and get them
  merged.

  The complete solution for the extent map merging generation problem
  will come as a standalone fix.

- btrfs_get_extent() caches the extent map result
  Normally it's fine, but for defrag the target range may not get
  another read/write for a long long time.
  Such cache would only increase the memory usage.

- btrfs_get_extent() doesn't skip older extent maps
  Unlike the old find_new_extent(), which used btrfs_search_forward() to
  skip older subtrees, btrfs_get_extent() will pick up unnecessary extent
  maps.
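
To make the first point concrete, here is a minimal stand-alone C sketch
(nothing below is btrfs code; the struct and the merge policy are
simplified assumptions) of why a merged extent map defeats the
newer_than filter:

    /*
     * Stand-alone illustration, NOT btrfs code: a simplified extent map
     * and a merge policy that keeps the higher generation, mimicking how
     * merged extent maps make old data look new to defrag.
     */
    #include <stdio.h>

    struct toy_em {
            unsigned long long start;
            unsigned long long len;
            unsigned long long generation;
    };

    /* Assumed merge policy: the merged map inherits the max generation. */
    static struct toy_em merge(struct toy_em a, struct toy_em b)
    {
            struct toy_em m = {
                    .start = a.start,
                    .len = a.len + b.len,
                    .generation = a.generation > b.generation ?
                                  a.generation : b.generation,
            };
            return m;
    }

    int main(void)
    {
            struct toy_em old_part = { 0, 16384, 5 };     /* written long ago */
            struct toy_em new_part = { 16384, 4096, 30 }; /* written recently */
            struct toy_em merged = merge(old_part, new_part);
            unsigned long long newer_than = 10;

            /* Defrag sees one map whose generation passes the filter. */
            if (merged.generation >= newer_than)
                    printf("would defrag %llu bytes, though only %llu are new\n",
                           merged.len, new_part.len);
            return 0;
    }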

This patch will fix the regression by introducing defrag_get_extent() to
replace the btrfs_get_extent() call.

This helper will:

- Not cache the file extent we found
  It will search the file extent item and manually convert it to an em.

- Use btrfs_search_forward() to skip entire ranges that were last
  modified in past transactions (see the toy sketch further below)

This should reduce the IO for autodefrag.
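
To illustrate the btrfs_search_forward() point, here is a toy user-space
model (the node layout and recursion are assumptions for the sketch, not
the real B-tree code) of skipping subtrees whose recorded generation is
below the threshold:

    /*
     * Toy model, NOT the real B-tree code: every node records the highest
     * generation in its subtree, so a search given a minimum generation
     * can skip whole subtrees, like btrfs_search_forward() does.
     */
    #include <stdio.h>

    struct toy_node {
            unsigned long long gen;     /* max generation in this subtree */
            unsigned long long key;     /* leaf only: file offset */
            int nchildren;              /* 0 for a leaf */
            struct toy_node *child[4];
    };

    static void search_forward(struct toy_node *node, unsigned long long min_gen)
    {
            if (node->gen < min_gen)
                    return;             /* whole subtree too old: skip it */
            if (node->nchildren == 0) {
                    printf("visit extent at offset %llu (gen %llu)\n",
                           node->key, node->gen);
                    return;
            }
            for (int i = 0; i < node->nchildren; i++)
                    search_forward(node->child[i], min_gen);
    }

    int main(void)
    {
            struct toy_node old_leaf = { .gen = 5, .key = 0 };
            struct toy_node new_leaf = { .gen = 30, .key = 16384 };
            struct toy_node root = {
                    .gen = 30, .nchildren = 2,
                    .child = { &old_leaf, &new_leaf },
            };

            /* Only the new leaf is visited; the old one is never read. */
            search_forward(&root, 10);
            return 0;
    }
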
Reported-by: Filipe Manana <fdmanana@suse.com>
Fixes: 7b508037 ("btrfs: defrag: use defrag_one_cluster() to implement btrfs_defrag_file()")
Reviewed-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
parent 550f133f
@@ -1012,8 +1012,155 @@ static noinline int btrfs_mksnapshot(const struct path *parent,
 	return ret;
 }
 
+/*
+ * Defrag specific helper to get an extent map.
+ *
+ * Differences between this and btrfs_get_extent() are:
+ *
+ * - No extent_map will be added to inode->extent_tree
+ *   To reduce memory usage in the long run.
+ *
+ * - Extra optimization to skip file extents older than @newer_than
+ *   By using btrfs_search_forward() we can skip entire file ranges that
+ *   have extents created in past transactions, because btrfs_search_forward()
+ *   will not visit leaves and nodes with a generation smaller than the given
+ *   minimal generation threshold (@newer_than).
+ *
+ * Return a valid em if we find a file extent matching the requirement.
+ * Return NULL if we can not find a file extent matching the requirement.
+ *
+ * Return ERR_PTR() for error.
+ */
+static struct extent_map *defrag_get_extent(struct btrfs_inode *inode,
+					    u64 start, u64 newer_than)
+{
+	struct btrfs_root *root = inode->root;
+	struct btrfs_file_extent_item *fi;
+	struct btrfs_path path = { 0 };
+	struct extent_map *em;
+	struct btrfs_key key;
+	u64 ino = btrfs_ino(inode);
+	int ret;
+
+	em = alloc_extent_map();
+	if (!em) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	key.objectid = ino;
+	key.type = BTRFS_EXTENT_DATA_KEY;
+	key.offset = start;
+
+	if (newer_than) {
+		ret = btrfs_search_forward(root, &key, &path, newer_than);
+		if (ret < 0)
+			goto err;
+		/* Can't find anything newer */
+		if (ret > 0)
+			goto not_found;
+	} else {
+		ret = btrfs_search_slot(NULL, root, &key, &path, 0, 0);
+		if (ret < 0)
+			goto err;
+	}
+	if (path.slots[0] >= btrfs_header_nritems(path.nodes[0])) {
+		/*
+		 * If btrfs_search_slot() makes the path point beyond nritems,
+		 * we should not have an empty leaf, as this inode must at
+		 * least have its INODE_ITEM.
+		 */
+		ASSERT(btrfs_header_nritems(path.nodes[0]));
+		path.slots[0] = btrfs_header_nritems(path.nodes[0]) - 1;
+	}
+	btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+	/* Perfect match, no need to go one slot back */
+	if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY &&
+	    key.offset == start)
+		goto iterate;
+
+	/* We didn't find a perfect match, need to go one slot back */
+	if (path.slots[0] > 0) {
+		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+		if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
+			path.slots[0]--;
+	}
+
+iterate:
+	/* Iterate through the path to find a file extent covering @start */
+	while (true) {
+		u64 extent_end;
+
+		if (path.slots[0] >= btrfs_header_nritems(path.nodes[0]))
+			goto next;
+
+		btrfs_item_key_to_cpu(path.nodes[0], &key, path.slots[0]);
+
+		/*
+		 * We may go one slot back to INODE_REF/XATTR item, then
+		 * need to go forward until we reach an EXTENT_DATA.
+		 * But we should still have the correct ino as key.objectid.
+		 */
+		if (WARN_ON(key.objectid < ino) || key.type < BTRFS_EXTENT_DATA_KEY)
+			goto next;
+
+		/* It's beyond our target range, definitely no extent found */
+		if (key.objectid > ino || key.type > BTRFS_EXTENT_DATA_KEY)
+			goto not_found;
+
+		/*
+		 * |	|<- File extent ->|
+		 * \- start
+		 *
+		 * This means there is a hole between start and key.offset.
+		 */
+		if (key.offset > start) {
+			em->start = start;
+			em->orig_start = start;
+			em->block_start = EXTENT_MAP_HOLE;
+			em->len = key.offset - start;
+			break;
+		}
+
+		fi = btrfs_item_ptr(path.nodes[0], path.slots[0],
+				    struct btrfs_file_extent_item);
+		extent_end = btrfs_file_extent_end(&path);
+
+		/*
+		 * |<- file extent ->|	|
+		 *		     \- start
+		 *
+		 * We haven't reached start, search next slot.
+		 */
+		if (extent_end <= start)
+			goto next;
+
+		/* Now this extent covers @start, convert it to em */
+		btrfs_extent_item_to_extent_map(inode, &path, fi, false, em);
+		break;
+next:
+		ret = btrfs_next_item(root, &path);
+		if (ret < 0)
+			goto err;
+		if (ret > 0)
+			goto not_found;
+	}
+	btrfs_release_path(&path);
+	return em;
+
+not_found:
+	btrfs_release_path(&path);
+	free_extent_map(em);
+	return NULL;
+
+err:
+	btrfs_release_path(&path);
+	free_extent_map(em);
+	return ERR_PTR(ret);
+}
+
 static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
-					       bool locked)
+					       u64 newer_than, bool locked)
 {
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -1035,7 +1182,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
 	/* get the big lock and read metadata off disk */
 	if (!locked)
 		lock_extent_bits(io_tree, start, end, &cached);
-	em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, sectorsize);
+	em = defrag_get_extent(BTRFS_I(inode), start, newer_than);
 	if (!locked)
 		unlock_extent_cached(io_tree, start, end, &cached);
@@ -1063,7 +1210,12 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
 	if (em->start + em->len >= i_size_read(inode))
 		return false;
 
-	next = defrag_lookup_extent(inode, em->start + em->len, locked);
+	/*
+	 * We want to check if the next extent can be merged with the current
+	 * one, which can be an extent created in a past generation, so we pass
+	 * a minimum generation of 0 to defrag_lookup_extent().
+	 */
+	next = defrag_lookup_extent(inode, em->start + em->len, 0, locked);
 	/* No more em or hole */
 	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
 		goto out;
@@ -1214,7 +1366,8 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
 		u64 range_len;
 
 		last_is_target = false;
-		em = defrag_lookup_extent(&inode->vfs_inode, cur, locked);
+		em = defrag_lookup_extent(&inode->vfs_inode, cur,
+					  newer_than, locked);
 		if (!em)
 			break;