Commit f26c9238 authored by Qu Wenruo, committed by David Sterba

btrfs: remove reada infrastructure

Currently there is only one user for btrfs metadata readahead, and
that's scrub.

But even for that single user, it does not provide the functionality
scrub actually needs: scrub wants readahead on the commit root, which
the current readahead code can't provide (although such a feature would
be pretty easy to add).

On top of that, metadata readahead has some extra problems:

- Duplicated functionality with btrfs_path::reada (a sketch of that
  mechanism follows the proposal below)

- Partly duplicated functionality of btrfs_fs_info::buffer_radix
  Btrfs already caches its metadata in buffer_radix, while readahead
  reads a tree block regardless of whether it is already cached (see
  the sketch right after this list)

- Poor layer separation
  Metadata readahead works mostly at the device level. That is the
  wrong layer: metadata lives in the btrfs logical address space and
  should not involve devices at all.

  This gives bugs extra opportunities to sneak in and adds unnecessary
  complexity.

- Dead code
  At the very beginning of reada.c we have #undef DEBUG, rendering all
  the debug-related code useless and impossible to test.
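
To illustrate the buffer_radix point above, here is a minimal sketch
(illustrative only, not code from this patch) of how a cached tree
block lookup behaves; the readahead code skipped this check and issued
reads unconditionally:

	struct extent_buffer *eb;

	/* find_extent_buffer() is the buffer_radix lookup. */
	eb = find_extent_buffer(fs_info, logical);
	if (eb) {
		/* Block is already cached; readahead I/O is wasted. */
		free_extent_buffer(eb);
		return 0;
	}
	/* Only on a cache miss is a read (or readahead) useful. */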

Thus I propose to remove the metadata readahead mechanism completely.
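
What scrub keeps using is the per-path readahead visible in the
scrub_stripe() hunk below. A minimal sketch of that pattern
(illustrative, not verbatim scrub.c code):

	struct btrfs_path *path;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	/* Scrub searches the commit root and skips tree locking. */
	path->search_commit_root = 1;
	path->skip_locking = 1;
	/* Per-path hint: prefetch tree blocks ahead of the search. */
	path->reada = READA_FORWARD;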

[BENCHMARK]
There is a full benchmark of the scrub performance difference between
the old btrfs_reada_add() and plain btrfs_path::reada.

For the worst case (no dirty metadata, slow HDD) there can be a ~5%
performance drop for scrub.
For other cases (even a SATA SSD) there is no distinguishable
performance difference.

The numbers are the reported scrub speed, in MiB/s.
Their accuracy is limited by the reported duration, which has a
resolution of only one second.

	Old		New		Diff
SSD	455.3		466.332		+2.42%
HDD	103.927		98.012		-5.69%

Comprehensive test methodology is in the cover letter of the patch.
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
parent dcf62b20
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -27,7 +27,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
+	   backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
 	   uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
 	   block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
 	   subpage.o tree-mod-log.o
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -821,7 +821,6 @@ struct btrfs_fs_info {
 	struct btrfs_workqueue *endio_write_workers;
 	struct btrfs_workqueue *endio_freespace_worker;
 	struct btrfs_workqueue *caching_workers;
-	struct btrfs_workqueue *readahead_workers;
 
 	/*
 	 * fixup workers take dirty pages that didn't properly go through
@@ -958,13 +957,6 @@ struct btrfs_fs_info {
 	struct btrfs_delayed_root *delayed_root;
 
-	/* readahead tree */
-	spinlock_t reada_lock;
-	struct radix_tree_root reada_tree;
-
-	/* readahead works cnt */
-	atomic_t reada_works_cnt;
-
 	/* Extent buffer radix tree */
 	spinlock_t buffer_lock;
 	/* Entries are eb->start / sectorsize */
@@ -3807,23 +3799,6 @@ static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
 	btrfs_bio_counter_sub(fs_info, 1);
 }
 
-/* reada.c */
-struct reada_control {
-	struct btrfs_fs_info *fs_info;		/* tree to prefetch */
-	struct btrfs_key key_start;
-	struct btrfs_key key_end;	/* exclusive */
-	atomic_t elems;
-	struct kref refcnt;
-	wait_queue_head_t wait;
-};
-struct reada_control *btrfs_reada_add(struct btrfs_root *root,
-			      struct btrfs_key *start, struct btrfs_key *end);
-int btrfs_reada_wait(void *handle);
-void btrfs_reada_detach(void *handle);
-int btree_readahead_hook(struct extent_buffer *eb, int err);
-void btrfs_reada_remove_dev(struct btrfs_device *dev);
-void btrfs_reada_undo_remove_dev(struct btrfs_device *dev);
-
 static inline int is_fstree(u64 rootid)
 {
 	if (rootid == BTRFS_FS_TREE_OBJECTID ||
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -906,9 +906,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	}
 	btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, (u64)-1);
 
-	if (!scrub_ret)
-		btrfs_reada_remove_dev(src_device);
-
 	/*
 	 * We have to use this loop approach because at this point src_device
 	 * has to be available for transaction commit to complete, yet new
@@ -917,7 +914,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	while (1) {
 		trans = btrfs_start_transaction(root, 0);
 		if (IS_ERR(trans)) {
-			btrfs_reada_undo_remove_dev(src_device);
 			mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
 			return PTR_ERR(trans);
 		}
@@ -968,7 +964,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
 	up_write(&dev_replace->rwsem);
 	mutex_unlock(&fs_info->chunk_mutex);
 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);
-	btrfs_reada_undo_remove_dev(src_device);
 	btrfs_rm_dev_replace_blocked(fs_info);
 	if (tgt_device)
 		btrfs_destroy_dev_replace_tgtdev(tgt_device);
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -665,9 +665,6 @@ static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
 	if (ret < 0)
 		goto err;
 
-	if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
-		btree_readahead_hook(eb, ret);
-
 	set_extent_buffer_uptodate(eb);
 
 	free_extent_buffer(eb);
@@ -715,10 +712,6 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
 	}
 	ret = validate_extent_buffer(eb);
 err:
-	if (reads_done &&
-	    test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags))
-		btree_readahead_hook(eb, ret);
-
 	if (ret) {
 		/*
 		 * our io error hook is going to dec the io pages
@@ -2232,7 +2225,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
 	btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
 	btrfs_destroy_workqueue(fs_info->delayed_workers);
 	btrfs_destroy_workqueue(fs_info->caching_workers);
-	btrfs_destroy_workqueue(fs_info->readahead_workers);
 	btrfs_destroy_workqueue(fs_info->flush_workers);
 	btrfs_destroy_workqueue(fs_info->qgroup_rescan_workers);
 	if (fs_info->discard_ctl.discard_workers)
@@ -2445,9 +2437,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
 	fs_info->delayed_workers =
 		btrfs_alloc_workqueue(fs_info, "delayed-meta", flags,
 				      max_active, 0);
-	fs_info->readahead_workers =
-		btrfs_alloc_workqueue(fs_info, "readahead", flags,
-				      max_active, 2);
 	fs_info->qgroup_rescan_workers =
 		btrfs_alloc_workqueue(fs_info, "qgroup-rescan", flags, 1, 0);
 	fs_info->discard_ctl.discard_workers =
@@ -2459,9 +2448,8 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
 	      fs_info->endio_meta_write_workers &&
 	      fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
 	      fs_info->endio_freespace_worker && fs_info->rmw_workers &&
-	      fs_info->caching_workers && fs_info->readahead_workers &&
-	      fs_info->fixup_workers && fs_info->delayed_workers &&
-	      fs_info->qgroup_rescan_workers &&
+	      fs_info->caching_workers && fs_info->fixup_workers &&
+	      fs_info->delayed_workers && fs_info->qgroup_rescan_workers &&
 	      fs_info->discard_ctl.discard_workers)) {
 		return -ENOMEM;
 	}
@@ -3091,7 +3079,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	atomic_set(&fs_info->async_delalloc_pages, 0);
 	atomic_set(&fs_info->defrag_running, 0);
-	atomic_set(&fs_info->reada_works_cnt, 0);
 	atomic_set(&fs_info->nr_delayed_iputs, 0);
 	atomic64_set(&fs_info->tree_mod_seq, 0);
 	fs_info->global_root_tree = RB_ROOT;
@@ -3102,9 +3089,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	fs_info->tree_mod_log = RB_ROOT;
 	fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
 	fs_info->avg_delayed_ref_runtime = NSEC_PER_SEC >> 6; /* div by 64 */
-	/* readahead state */
-	INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
-	spin_lock_init(&fs_info->reada_lock);
 	btrfs_init_ref_verify(fs_info);
 
 	fs_info->thread_pool_size = min_t(unsigned long,
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3087,9 +3087,6 @@ static void end_bio_extent_readpage(struct bio *bio)
 			set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags);
 			eb->read_mirror = mirror;
 			atomic_dec(&eb->io_pages);
-			if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD,
-					       &eb->bflags))
-				btree_readahead_hook(eb, -EIO);
 		}
 readpage_ok:
 		if (likely(uptodate)) {
(diff collapsed: fs/btrfs/reada.c is deleted by this commit)
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3188,10 +3188,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 	u64 physical_end;
 	u64 generation;
 	int mirror_num;
-	struct reada_control *reada1;
-	struct reada_control *reada2;
 	struct btrfs_key key;
-	struct btrfs_key key_end;
 	u64 increment = map->stripe_len;
 	u64 offset;
 	u64 extent_logical;
...@@ -3242,11 +3239,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, ...@@ -3242,11 +3239,6 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
path->skip_locking = 1; path->skip_locking = 1;
path->reada = READA_FORWARD; path->reada = READA_FORWARD;
/*
* trigger the readahead for extent tree csum tree and wait for
* completion. During readahead, the scrub is officially paused
* to not hold off transaction commits
*/
logical = base + offset; logical = base + offset;
physical_end = physical + nstripes * map->stripe_len; physical_end = physical + nstripes * map->stripe_len;
if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
...@@ -3261,36 +3253,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, ...@@ -3261,36 +3253,8 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
scrub_blocked_if_needed(fs_info); scrub_blocked_if_needed(fs_info);
root = btrfs_extent_root(fs_info, logical); root = btrfs_extent_root(fs_info, logical);
/* FIXME it might be better to start readahead at commit root */
key.objectid = logical;
key.type = BTRFS_EXTENT_ITEM_KEY;
key.offset = (u64)0;
key_end.objectid = logic_end;
key_end.type = BTRFS_METADATA_ITEM_KEY;
key_end.offset = (u64)-1;
reada1 = btrfs_reada_add(root, &key, &key_end);
csum_root = btrfs_csum_root(fs_info, logical); csum_root = btrfs_csum_root(fs_info, logical);
if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
key.type = BTRFS_EXTENT_CSUM_KEY;
key.offset = logical;
key_end.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
key_end.type = BTRFS_EXTENT_CSUM_KEY;
key_end.offset = logic_end;
reada2 = btrfs_reada_add(csum_root, &key, &key_end);
} else {
reada2 = NULL;
}
if (!IS_ERR(reada1))
btrfs_reada_wait(reada1);
if (!IS_ERR_OR_NULL(reada2))
btrfs_reada_wait(reada2);
/* /*
* collect all data csums for the stripe to avoid seeking during * collect all data csums for the stripe to avoid seeking during
* the scrub. This might currently (crc32) end up to be about 1MB * the scrub. This might currently (crc32) end up to be about 1MB
......
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -1842,7 +1842,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
 	btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
-	btrfs_workqueue_set_max(fs_info->readahead_workers, new_pool_size);
 	btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
 				new_pool_size);
 }
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1166,7 +1166,6 @@ static void btrfs_close_one_device(struct btrfs_device *device)
 	ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
 	ASSERT(list_empty(&device->dev_alloc_list));
 	ASSERT(list_empty(&device->post_commit_list));
-	ASSERT(atomic_read(&device->reada_in_flight) == 0);
 }
 
 static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
...@@ -2150,8 +2149,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, ...@@ -2150,8 +2149,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
} }
ret = btrfs_shrink_device(device, 0); ret = btrfs_shrink_device(device, 0);
if (!ret)
btrfs_reada_remove_dev(device);
if (ret) if (ret)
goto error_undo; goto error_undo;
@@ -2249,7 +2246,6 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info,
 	return ret;
 
 error_undo:
-	btrfs_reada_undo_remove_dev(device);
 	if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
 		mutex_lock(&fs_info->chunk_mutex);
 		list_add(&device->dev_alloc_list,
@@ -6980,11 +6976,8 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
 	INIT_LIST_HEAD(&dev->dev_alloc_list);
 	INIT_LIST_HEAD(&dev->post_commit_list);
 
-	atomic_set(&dev->reada_in_flight, 0);
 	atomic_set(&dev->dev_stats_ccnt, 0);
 	btrfs_device_data_ordered_init(dev);
-	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
-	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
 	extent_io_tree_init(fs_info, &dev->alloc_state,
 			    IO_TREE_DEVICE_ALLOC_STATE, NULL);
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -123,13 +123,6 @@ struct btrfs_device {
 	/* per-device scrub information */
 	struct scrub_ctx *scrub_ctx;
 
-	/* readahead state */
-	atomic_t reada_in_flight;
-	u64 reada_next;
-	struct reada_zone *reada_curr_zone;
-	struct radix_tree_root reada_zones;
-	struct radix_tree_root reada_extents;
-
 	/* disk I/O failure stats. For detailed description refer to
 	 * enum btrfs_dev_stat_values in ioctl.h */
 	int dev_stats_valid;