Commit 9ed74f2d authored by Josef Bacik, committed by Chris Mason

Btrfs: proper -ENOSPC handling

At the start of a transaction we do a btrfs_reserve_metadata_space() and
specify how many items we plan on modifying.  Then once we've done our
modifications and such, just call btrfs_unreserve_metadata_space() for
the same number of items we reserved.

For keeping track of metadata needed for data I've had to add an extent_io op
for when we merge extents.  This lets us track space properly when we are doing
sequential writes, so we don't end up reserving way more metadata space than
what we need.

The only place where the metadata space accounting is not done is in the
relocation code.  This is because Yan is going to be reworking that code in the
near future, so running btrfs-vol -b could still possibly result in an ENOSPC
related panic.  This patch also turns off the metadata_ratio stuff in order to
allow users to more efficiently use their disk space.

This patch makes it so we track how much metadata we need for an inode's
delayed allocation extents by tracking how many extents are currently
waiting for allocation.  It introduces two new callbacks for the
extent_io trees, merge_extent_hook and split_extent_hook.  These help
us keep track of when we merge delalloc extents together and split them
up.  Reservations are made before any actual dirtying occurs,
and then we unreserve after we dirty.

btrfs_unreserve_metadata_for_delalloc() will make the appropriate
unreservations as needed based on the number of reservations we
currently have and the number of extents we currently have.  Doing the
reservation outside of doing any of the actual dirty'ing lets us do
things like filemap_flush() the inode to try and force delalloc to
happen, or as a last resort actually start allocation on all delalloc
inodes in the fs.  This has survived dbench, fs_mark and an fsx torture
test.
Signed-off-by: Josef Bacik <jbacik@redhat.com>
Signed-off-by: Chris Mason <chris.mason@oracle.com>
parent c65ddb52
......@@ -127,6 +127,14 @@ struct btrfs_inode {
*/
u64 last_unlink_trans;
/*
* These two counters are for delalloc metadata reservations. We keep
* track of how many extents we've accounted for vs how many extents we
* have.
*/
int delalloc_reserved_extents;
int delalloc_extents;
/*
* ordered_data_close is set by truncate when a file that used
* to have good data has been truncated to zero. When it is set
......
......@@ -675,18 +675,19 @@ struct btrfs_space_info {
current allocations */
u64 bytes_readonly; /* total bytes that are read only */
u64 bytes_super; /* total bytes reserved for the super blocks */
/* delalloc accounting */
u64 bytes_delalloc; /* number of bytes reserved for allocation,
this space is not necessarily reserved yet
by the allocator */
u64 bytes_root; /* the number of bytes needed to commit a
transaction */
u64 bytes_may_use; /* number of bytes that may be used for
delalloc */
delalloc/allocations */
u64 bytes_delalloc; /* number of bytes currently reserved for
delayed allocation */
int full; /* indicates that we cannot allocate any more
chunks for this space */
int force_alloc; /* set if we need to force a chunk alloc for
this space */
int force_delalloc; /* make people start doing filemap_flush until
we're under a threshold */
struct list_head list;
......@@ -695,6 +696,9 @@ struct btrfs_space_info {
spinlock_t lock;
struct rw_semaphore groups_sem;
atomic_t caching_threads;
int allocating_chunk;
wait_queue_head_t wait;
};
/*
......@@ -2022,7 +2026,12 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags);
void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
int btrfs_check_metadata_free_space(struct btrfs_root *root);
int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items);
int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items);
int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
struct inode *inode, int num_items);
int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
struct inode *inode, int num_items);
int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
u64 bytes);
void btrfs_free_reserved_data_space(struct btrfs_root *root,
......
......@@ -1629,7 +1629,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
fs_info->sb = sb;
fs_info->max_extent = (u64)-1;
fs_info->max_inline = 8192 * 1024;
fs_info->metadata_ratio = 8;
fs_info->metadata_ratio = 0;
fs_info->thread_pool_size = min_t(unsigned long,
num_online_cpus() + 2, 8);
......
......@@ -68,6 +68,8 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
struct extent_buffer **must_clean);
static int find_next_key(struct btrfs_path *path, int level,
struct btrfs_key *key);
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int dump_block_groups);
static noinline int
block_group_cache_done(struct btrfs_block_group_cache *cache)
......@@ -2764,67 +2766,346 @@ void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode)
alloc_target);
}
static u64 calculate_bytes_needed(struct btrfs_root *root, int num_items)
{
u64 num_bytes;
int level;
level = BTRFS_MAX_LEVEL - 2;
/*
* NOTE: these calculations are absolutely the worst possible case.
* This assumes that _every_ item we insert will require a new leaf, and
* that the tree has grown to its maximum level size.
*/
/*
* for every item we insert we could insert both an extent item and a
* extent ref item. Then for ever item we insert, we will need to cow
* both the original leaf, plus the leaf to the left and right of it.
*
* Unless we are talking about the extent root, then we just want the
* number of items * 2, since we just need the extent item plus its ref.
*/
if (root == root->fs_info->extent_root)
num_bytes = num_items * 2;
else
num_bytes = (num_items + (2 * num_items)) * 3;
/*
* num_bytes is total number of leaves we could need times the leaf
* size, and then for every leaf we could end up cow'ing 2 nodes per
* level, down to the leaf level.
*/
num_bytes = (num_bytes * root->leafsize) +
(num_bytes * (level * 2)) * root->nodesize;
return num_bytes;
}
/*
* for now this just makes sure we have at least 5% of our metadata space free
* for use.
* Unreserve metadata space for delalloc. If we have less reserved credits than
* we have extents, this function does nothing.
*/
int btrfs_check_metadata_free_space(struct btrfs_root *root)
int btrfs_unreserve_metadata_for_delalloc(struct btrfs_root *root,
struct inode *inode, int num_items)
{
struct btrfs_fs_info *info = root->fs_info;
struct btrfs_space_info *meta_sinfo;
u64 alloc_target, thresh;
int committed = 0, ret;
u64 num_bytes;
u64 alloc_target;
bool bug = false;
/* get the space info for where the metadata will live */
alloc_target = btrfs_get_alloc_profile(root, 0);
meta_sinfo = __find_space_info(info, alloc_target);
if (!meta_sinfo)
goto alloc;
again:
num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
num_items);
spin_lock(&meta_sinfo->lock);
if (!meta_sinfo->full)
thresh = meta_sinfo->total_bytes * 80;
else
thresh = meta_sinfo->total_bytes * 95;
if (BTRFS_I(inode)->delalloc_reserved_extents <=
BTRFS_I(inode)->delalloc_extents) {
spin_unlock(&meta_sinfo->lock);
return 0;
}
BTRFS_I(inode)->delalloc_reserved_extents--;
BUG_ON(BTRFS_I(inode)->delalloc_reserved_extents < 0);
if (meta_sinfo->bytes_delalloc < num_bytes) {
bug = true;
meta_sinfo->bytes_delalloc = 0;
} else {
meta_sinfo->bytes_delalloc -= num_bytes;
}
spin_unlock(&meta_sinfo->lock);
BUG_ON(bug);
return 0;
}
/*
 * Update meta_sinfo->force_delalloc.
 *
 * The space not already accounted for by used, reserved, pinned, readonly,
 * super, root and may_use bytes is computed, and the flag is set when
 * bytes_delalloc has reached 80% of that remainder; otherwise it is cleared.
 * The callers in this file invoke this with meta_sinfo->lock held.
 */
static void check_force_delalloc(struct btrfs_space_info *meta_sinfo)
{
	u64 accounted;
	u64 limit;

	accounted = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
		meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
		meta_sinfo->bytes_super + meta_sinfo->bytes_root +
		meta_sinfo->bytes_may_use;

	/* 80% of whatever is left over */
	limit = meta_sinfo->total_bytes - accounted;
	limit *= 80;
	do_div(limit, 100);

	meta_sinfo->force_delalloc =
		(limit <= meta_sinfo->bytes_delalloc) ? 1 : 0;
}
if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
meta_sinfo->bytes_super > thresh) {
struct btrfs_trans_handle *trans;
if (!meta_sinfo->full) {
meta_sinfo->force_alloc = 1;
/*
 * Try to add one more chunk to the given space info.
 *
 * Must be entered with info->lock held; the lock is dropped on every path
 * before returning.
 *
 * Nothing is allocated when the space info has already reached the cap
 * (the smaller of 5GB and 5% of the super block's total bytes) or is marked
 * full.  Only one task allocates at a time: a task that finds
 * allocating_chunk already set sleeps on info->wait until it is cleared.
 *
 * Returns 1 if the caller should retry its reservation (a chunk allocation
 * succeeded, or we waited for another task's allocation to finish), and 0
 * if no allocation was attempted or the attempt failed.
 */
static int maybe_allocate_chunk(struct btrfs_root *root,
				struct btrfs_space_info *info)
{
	struct btrfs_super_block *disk_super = &root->fs_info->super_copy;
	struct btrfs_trans_handle *trans;
	u64 fs_bytes = btrfs_super_total_bytes(disk_super);
	u64 metadata_cap;
	int err;

	/*
	 * we allow the metadata to grow to a max of either 5gb or 5% of the
	 * space in the volume.
	 */
	metadata_cap = min((u64)5 * 1024 * 1024 * 1024,
			   div64_u64(fs_bytes * 5, 100));

	if (info->total_bytes >= metadata_cap || info->full) {
		spin_unlock(&info->lock);
		return 0;
	}

	/* somebody else is already allocating, wait for them to finish */
	if (info->allocating_chunk) {
		spin_unlock(&info->lock);
		wait_event(info->wait,
			   !info->allocating_chunk);
		return 1;
	}

	/* we are the allocator */
	info->force_alloc = 1;
	info->allocating_chunk = 1;
	init_waitqueue_head(&info->wait);
	spin_unlock(&info->lock);

	trans = btrfs_start_transaction(root, 1);
	if (trans) {
		err = do_chunk_alloc(trans, root->fs_info->extent_root,
				     4096 + 2 * 1024 * 1024,
				     info->flags, 0);
		btrfs_end_transaction(trans, root);
	} else {
		err = -ENOMEM;
	}

	/* let any waiters go, whether or not the allocation worked */
	spin_lock(&info->lock);
	info->allocating_chunk = 0;
	spin_unlock(&info->lock);
	wake_up(&info->wait);

	return err ? 0 : 1;
}
/*
* Reserve metadata space for delalloc.
*/
int btrfs_reserve_metadata_for_delalloc(struct btrfs_root *root,
struct inode *inode, int num_items)
{
struct btrfs_fs_info *info = root->fs_info;
struct btrfs_space_info *meta_sinfo;
u64 num_bytes;
u64 used;
u64 alloc_target;
int flushed = 0;
int force_delalloc;
/* get the space info for where the metadata will live */
alloc_target = btrfs_get_alloc_profile(root, 0);
meta_sinfo = __find_space_info(info, alloc_target);
num_bytes = calculate_bytes_needed(root->fs_info->extent_root,
num_items);
again:
spin_lock(&meta_sinfo->lock);
force_delalloc = meta_sinfo->force_delalloc;
if (unlikely(!meta_sinfo->bytes_root))
meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
if (!flushed)
meta_sinfo->bytes_delalloc += num_bytes;
used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
meta_sinfo->bytes_super + meta_sinfo->bytes_root +
meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
if (used > meta_sinfo->total_bytes) {
flushed++;
if (flushed == 1) {
if (maybe_allocate_chunk(root, meta_sinfo))
goto again;
flushed++;
} else {
spin_unlock(&meta_sinfo->lock);
alloc:
trans = btrfs_start_transaction(root, 1);
if (!trans)
return -ENOMEM;
}
ret = do_chunk_alloc(trans, root->fs_info->extent_root,
2 * 1024 * 1024, alloc_target, 0);
btrfs_end_transaction(trans, root);
if (!meta_sinfo) {
meta_sinfo = __find_space_info(info,
alloc_target);
}
if (flushed == 2) {
filemap_flush(inode->i_mapping);
goto again;
} else if (flushed == 3) {
btrfs_start_delalloc_inodes(root);
btrfs_wait_ordered_extents(root, 0);
goto again;
}
spin_lock(&meta_sinfo->lock);
meta_sinfo->bytes_delalloc -= num_bytes;
spin_unlock(&meta_sinfo->lock);
printk(KERN_ERR "enospc, has %d, reserved %d\n",
BTRFS_I(inode)->delalloc_extents,
BTRFS_I(inode)->delalloc_reserved_extents);
dump_space_info(meta_sinfo, 0, 0);
return -ENOSPC;
}
if (!committed) {
committed = 1;
trans = btrfs_join_transaction(root, 1);
if (!trans)
return -ENOMEM;
ret = btrfs_commit_transaction(trans, root);
if (ret)
return ret;
BTRFS_I(inode)->delalloc_reserved_extents++;
check_force_delalloc(meta_sinfo);
spin_unlock(&meta_sinfo->lock);
if (!flushed && force_delalloc)
filemap_flush(inode->i_mapping);
return 0;
}
/*
 * Give back the metadata space reserved for num_items items.  Every call
 * must be paired with an earlier btrfs_reserve_metadata_space.
 *
 * NOTE: when possible, call this _AFTER_ btrfs_end_transaction, since
 * btrfs_end_transaction will run delayed ref operations which will result in
 * more used metadata, and we want to be sure that can happen without issue.
 *
 * Always returns 0.  Releasing more than bytes_may_use currently holds
 * clamps the counter to zero and then trips a BUG_ON.
 */
int btrfs_unreserve_metadata_space(struct btrfs_root *root, int num_items)
{
	struct btrfs_space_info *meta_sinfo;
	u64 released;
	bool underflow;

	/* get the space info for where the metadata will live */
	meta_sinfo = __find_space_info(root->fs_info,
				       btrfs_get_alloc_profile(root, 0));

	released = calculate_bytes_needed(root, num_items);

	spin_lock(&meta_sinfo->lock);
	underflow = meta_sinfo->bytes_may_use < released;
	if (underflow)
		meta_sinfo->bytes_may_use = 0;
	else
		meta_sinfo->bytes_may_use -= released;
	spin_unlock(&meta_sinfo->lock);

	/* complain only after the lock has been dropped */
	BUG_ON(underflow);

	return 0;
}
/*
* Reserve some metadata space for use. We'll calculate the worst case number
* of bytes that would be needed to modify num_items number of items. If we
* have space, fantastic, if not, you get -ENOSPC. Please call
* btrfs_unreserve_metadata_space when you are done for the _SAME_ number of
* items you reserved, since whatever metadata you needed should have already
* been allocated.
*
* This will commit the transaction to make more space if we don't have enough
* metadata space. The only time we don't do this is if we're reserving space
* inside of a transaction, then we will just return -ENOSPC and it is the
* caller's responsibility to handle it properly.
*/
int btrfs_reserve_metadata_space(struct btrfs_root *root, int num_items)
{
struct btrfs_fs_info *info = root->fs_info;
struct btrfs_space_info *meta_sinfo;
u64 num_bytes;
u64 used;
u64 alloc_target;
int retries = 0;
/* get the space info for where the metadata will live */
alloc_target = btrfs_get_alloc_profile(root, 0);
meta_sinfo = __find_space_info(info, alloc_target);
num_bytes = calculate_bytes_needed(root, num_items);
again:
spin_lock(&meta_sinfo->lock);
if (unlikely(!meta_sinfo->bytes_root))
meta_sinfo->bytes_root = calculate_bytes_needed(root, 6);
if (!retries)
meta_sinfo->bytes_may_use += num_bytes;
used = meta_sinfo->bytes_used + meta_sinfo->bytes_reserved +
meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly +
meta_sinfo->bytes_super + meta_sinfo->bytes_root +
meta_sinfo->bytes_may_use + meta_sinfo->bytes_delalloc;
if (used > meta_sinfo->total_bytes) {
retries++;
if (retries == 1) {
if (maybe_allocate_chunk(root, meta_sinfo))
goto again;
retries++;
} else {
spin_unlock(&meta_sinfo->lock);
}
if (retries == 2) {
btrfs_start_delalloc_inodes(root);
btrfs_wait_ordered_extents(root, 0);
goto again;
}
spin_lock(&meta_sinfo->lock);
meta_sinfo->bytes_may_use -= num_bytes;
spin_unlock(&meta_sinfo->lock);
dump_space_info(meta_sinfo, 0, 0);
return -ENOSPC;
}
check_force_delalloc(meta_sinfo);
spin_unlock(&meta_sinfo->lock);
return 0;
......@@ -2915,7 +3196,7 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode,
BTRFS_I(inode)->reserved_bytes += bytes;
spin_unlock(&data_sinfo->lock);
return btrfs_check_metadata_free_space(root);
return 0;
}
/*
......@@ -3014,17 +3295,15 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
BUG_ON(!space_info);
spin_lock(&space_info->lock);
if (space_info->force_alloc) {
if (space_info->force_alloc)
force = 1;
space_info->force_alloc = 0;
}
if (space_info->full) {
spin_unlock(&space_info->lock);
goto out;
}
thresh = space_info->total_bytes - space_info->bytes_readonly;
thresh = div_factor(thresh, 6);
thresh = div_factor(thresh, 8);
if (!force &&
(space_info->bytes_used + space_info->bytes_pinned +
space_info->bytes_reserved + alloc_bytes) < thresh) {
......@@ -3038,7 +3317,7 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
* we keep a reasonable number of metadata chunks allocated in the
* FS as well.
*/
if (flags & BTRFS_BLOCK_GROUP_DATA) {
if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
fs_info->data_chunk_allocations++;
if (!(fs_info->data_chunk_allocations %
fs_info->metadata_ratio))
......@@ -3046,8 +3325,11 @@ static int do_chunk_alloc(struct btrfs_trans_handle *trans,
}
ret = btrfs_alloc_chunk(trans, extent_root, flags);
spin_lock(&space_info->lock);
if (ret)
space_info->full = 1;
space_info->force_alloc = 0;
spin_unlock(&space_info->lock);
out:
mutex_unlock(&extent_root->fs_info->chunk_mutex);
return ret;
......@@ -4062,21 +4344,32 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,
return ret;
}
static void dump_space_info(struct btrfs_space_info *info, u64 bytes)
static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
int dump_block_groups)
{
struct btrfs_block_group_cache *cache;
spin_lock(&info->lock);
printk(KERN_INFO "space_info has %llu free, is %sfull\n",
(unsigned long long)(info->total_bytes - info->bytes_used -
info->bytes_pinned - info->bytes_reserved),
info->bytes_pinned - info->bytes_reserved -
info->bytes_super),
(info->full) ? "" : "not ");
printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu,"
" may_use=%llu, used=%llu\n",
" may_use=%llu, used=%llu, root=%llu, super=%llu, reserved=%llu"
"\n",
(unsigned long long)info->total_bytes,
(unsigned long long)info->bytes_pinned,
(unsigned long long)info->bytes_delalloc,
(unsigned long long)info->bytes_may_use,
(unsigned long long)info->bytes_used);
(unsigned long long)info->bytes_used,
(unsigned long long)info->bytes_root,
(unsigned long long)info->bytes_super,
(unsigned long long)info->bytes_reserved);
spin_unlock(&info->lock);
if (!dump_block_groups)
return;
down_read(&info->groups_sem);
list_for_each_entry(cache, &info->block_groups, list) {
......@@ -4144,7 +4437,7 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
printk(KERN_ERR "btrfs allocation failed flags %llu, "
"wanted %llu\n", (unsigned long long)data,
(unsigned long long)num_bytes);
dump_space_info(sinfo, num_bytes);
dump_space_info(sinfo, num_bytes, 1);
}
return ret;
......
......@@ -280,6 +280,14 @@ static struct extent_buffer *buffer_search(struct extent_io_tree *tree,
return NULL;
}
/*
 * Forward a pending merge of 'new' and 'other' to the tree's
 * merge_extent_hook, if the tree has ops and the hook is set.  The hook's
 * return value is not used.
 */
static void merge_cb(struct extent_io_tree *tree, struct extent_state *new,
		     struct extent_state *other)
{
	if (!tree->ops || !tree->ops->merge_extent_hook)
		return;

	tree->ops->merge_extent_hook(tree->mapping->host, new, other);
}
/*
* utility function to look for merge candidates inside a given range.
* Any extents with matching state are merged together into a single
......@@ -303,6 +311,7 @@ static int merge_state(struct extent_io_tree *tree,
other = rb_entry(other_node, struct extent_state, rb_node);
if (other->end == state->start - 1 &&
other->state == state->state) {
merge_cb(tree, state, other);
state->start = other->start;
other->tree = NULL;
rb_erase(&other->rb_node, &tree->state);
......@@ -314,33 +323,37 @@ static int merge_state(struct extent_io_tree *tree,
other = rb_entry(other_node, struct extent_state, rb_node);
if (other->start == state->end + 1 &&
other->state == state->state) {
merge_cb(tree, state, other);
other->start = state->start;
state->tree = NULL;
rb_erase(&state->rb_node, &tree->state);
free_extent_state(state);
state = NULL;
}
}
return 0;
}
static void set_state_cb(struct extent_io_tree *tree,
static int set_state_cb(struct extent_io_tree *tree,
struct extent_state *state,
unsigned long bits)
{
if (tree->ops && tree->ops->set_bit_hook) {
tree->ops->set_bit_hook(tree->mapping->host, state->start,
state->end, state->state, bits);
return tree->ops->set_bit_hook(tree->mapping->host,
state->start, state->end,
state->state, bits);
}
return 0;
}
static void clear_state_cb(struct extent_io_tree *tree,
struct extent_state *state,
unsigned long bits)
{
if (tree->ops && tree->ops->clear_bit_hook) {
tree->ops->clear_bit_hook(tree->mapping->host, state->start,
state->end, state->state, bits);
}
if (tree->ops && tree->ops->clear_bit_hook)
tree->ops->clear_bit_hook(tree->mapping->host, state, bits);
}
/*
......@@ -358,6 +371,7 @@ static int insert_state(struct extent_io_tree *tree,
int bits)
{
struct rb_node *node;
int ret;
if (end < start) {
printk(KERN_ERR "btrfs end < start %llu %llu\n",
......@@ -365,11 +379,14 @@ static int insert_state(struct extent_io_tree *tree,
(unsigned long long)start);
WARN_ON(1);
}
if (bits & EXTENT_DIRTY)
tree->dirty_bytes += end - start + 1;
state->start = start;
state->end = end;
set_state_cb(tree, state, bits);
ret = set_state_cb(tree, state, bits);
if (ret)
return ret;
if (bits & EXTENT_DIRTY)
tree->dirty_bytes += end - start + 1;
state->state |= bits;
node = tree_insert(&tree->state, end, &state->rb_node);
if (node) {
......@@ -387,6 +404,15 @@ static int insert_state(struct extent_io_tree *tree,
return 0;
}
/*
 * Forward a split of 'orig' at offset 'split' to the tree's
 * split_extent_hook.  Returns the hook's result, or 0 when the tree has no
 * ops or no such hook.
 */
static int split_cb(struct extent_io_tree *tree, struct extent_state *orig,
		    u64 split)
{
	if (!tree->ops || !tree->ops->split_extent_hook)
		return 0;

	return tree->ops->split_extent_hook(tree->mapping->host,
					    orig, split);
}
/*
* split a given extent state struct in two, inserting the preallocated
* struct 'prealloc' as the newly created second half. 'split' indicates an
......@@ -405,6 +431,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
struct extent_state *prealloc, u64 split)
{
struct rb_node *node;
split_cb(tree, orig, split);
prealloc->start = orig->start;
prealloc->end = split - 1;
prealloc->state = orig->state;
......@@ -542,8 +571,8 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
if (err)
goto out;
if (state->end <= end) {
set |= clear_state_bit(tree, state, bits,
wake, delete);
set |= clear_state_bit(tree, state, bits, wake,
delete);
if (last_end == (u64)-1)
goto out;
start = last_end + 1;
......@@ -561,12 +590,11 @@ int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
prealloc = alloc_extent_state(GFP_ATOMIC);
err = split_state(tree, state, prealloc, end + 1);
BUG_ON(err == -EEXIST);
if (wake)
wake_up(&state->wq);
set |= clear_state_bit(tree, prealloc, bits,
wake, delete);
set |= clear_state_bit(tree, prealloc, bits, wake, delete);
prealloc = NULL;
goto out;
}
......@@ -667,16 +695,23 @@ int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits)
return 0;
}
static void set_state_bits(struct extent_io_tree *tree,
static int set_state_bits(struct extent_io_tree *tree,
struct extent_state *state,
int bits)
{
int ret;
ret = set_state_cb(tree, state, bits);
if (ret)
return ret;
if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) {
u64 range = state->end - state->start + 1;
tree->dirty_bytes += range;
}
set_state_cb(tree, state, bits);
state->state |= bits;
return 0;
}
static void cache_state(struct extent_state *state,
......@@ -758,7 +793,10 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
goto out;
}
set_state_bits(tree, state, bits);
err = set_state_bits(tree, state, bits);
if (err)
goto out;
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
......@@ -805,7 +843,9 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
if (err)
goto out;
if (state->end <= end) {
set_state_bits(tree, state, bits);
err = set_state_bits(tree, state, bits);
if (err)
goto out;
cache_state(state, cached_state);
merge_state(tree, state);
if (last_end == (u64)-1)
......@@ -829,11 +869,13 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
this_end = last_start - 1;
err = insert_state(tree, prealloc, start, this_end,
bits);
cache_state(prealloc, cached_state);
prealloc = NULL;
BUG_ON(err == -EEXIST);
if (err)
if (err) {
prealloc = NULL;
goto out;
}
cache_state(prealloc, cached_state);
prealloc = NULL;
start = this_end + 1;
goto search_again;
}
......@@ -852,7 +894,11 @@ static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
err = split_state(tree, state, prealloc, end + 1);
BUG_ON(err == -EEXIST);
set_state_bits(tree, prealloc, bits);
err = set_state_bits(tree, prealloc, bits);
if (err) {
prealloc = NULL;
goto out;
}
cache_state(prealloc, cached_state);
merge_state(tree, prealloc);
prealloc = NULL;
......
......@@ -60,8 +60,13 @@ struct extent_io_ops {
struct extent_state *state, int uptodate);
int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
unsigned long old, unsigned long bits);
int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
unsigned long old, unsigned long bits);
int (*clear_bit_hook)(struct inode *inode, struct extent_state *state,
unsigned long bits);
int (*merge_extent_hook)(struct inode *inode,
struct extent_state *new,
struct extent_state *other);
int (*split_extent_hook)(struct inode *inode,
struct extent_state *orig, u64 split);
int (*write_cache_pages_lock_hook)(struct page *page);
};
......@@ -79,10 +84,14 @@ struct extent_state {
u64 start;
u64 end; /* inclusive */
struct rb_node rb_node;
/* ADD NEW ELEMENTS AFTER THIS */
struct extent_io_tree *tree;
wait_queue_head_t wq;
atomic_t refs;
unsigned long state;
u64 split_start;
u64 split_end;
/* for use by the FS */
u64 private;
......
......@@ -123,7 +123,10 @@ static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans,
root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
end_of_last_block = start_pos + num_bytes - 1;
btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block);
if (err)
return err;
for (i = 0; i < num_pages; i++) {
struct page *p = pages[i];
SetPageUptodate(p);
......@@ -927,6 +930,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
err = file_remove_suid(file);
if (err)
goto out_nolock;
err = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
if (err)
goto out_nolock;
file_update_time(file);
pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
......@@ -1028,6 +1036,7 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf,
mutex_unlock(&inode->i_mutex);
if (ret)
err = ret;
btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
out_nolock:
kfree(pages);
......
......@@ -1159,6 +1159,83 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
return ret;
}
/*
 * extent_io.c split_extent_hook, used to keep the inode's delalloc_extents
 * count in step when a delalloc extent state is split at 'split'.
 *
 * States without EXTENT_DELALLOC are ignored.  Otherwise delalloc_extents is
 * incremented, except when 'orig' is larger than max_extent and the piece
 * running from 'split' to orig->end needs fewer max_extent sized extents than
 * all of 'orig' did.  Always returns 0.
 */
static int btrfs_split_extent_hook(struct inode *inode,
				   struct extent_state *orig, u64 split)
{
	u64 max_extent = BTRFS_I(inode)->root->fs_info->max_extent;
	u64 whole;

	if (!(orig->state & EXTENT_DELALLOC))
		return 0;

	whole = orig->end - orig->start + 1;
	if (whole > max_extent) {
		u64 tail = orig->end - split + 1;
		u64 whole_cnt = div64_u64(whole + max_extent - 1, max_extent);
		u64 tail_cnt = div64_u64(tail + max_extent - 1, max_extent);

		/*
		 * if we break a large extent up then leave delalloc_extents be,
		 * since we've already accounted for the large extent.
		 */
		if (tail_cnt < whole_cnt)
			return 0;
	}

	BTRFS_I(inode)->delalloc_extents++;

	return 0;
}
/*
 * extent_io.c merge_extent_hook, used to track merged delayed allocation
 * extents.  When a new extent is simply merged onto an old one, as happens
 * with sequential writes, the pair only needs to be counted once, so we can
 * properly account for the metadata space we'll need.
 *
 * States without EXTENT_DELALLOC on 'other' are ignored.  delalloc_extents is
 * decremented unless the merged range is larger than max_extent and needs
 * more max_extent sized extents than 'other' alone did.  Always returns 0.
 */
static int btrfs_merge_extent_hook(struct inode *inode,
				   struct extent_state *new,
				   struct extent_state *other)
{
	u64 max_extent = BTRFS_I(inode)->root->fs_info->max_extent;
	u64 before;
	u64 after;

	/* not delalloc, ignore it */
	if (!(other->state & EXTENT_DELALLOC))
		return 0;

	before = other->end - other->start + 1;
	after = (new->start < other->start) ?
		other->end - new->start + 1 :
		new->end - other->start + 1;

	if (after > max_extent) {
		u64 before_cnt = div64_u64(before + max_extent - 1,
					   max_extent);
		u64 after_cnt = div64_u64(after + max_extent - 1,
					  max_extent);

		/*
		 * If we grew by another max_extent, just return, we want to
		 * keep that reserved amount.
		 */
		if (after_cnt > before_cnt)
			return 0;
	}

	/* the merged extent needs no extra accounting, drop one */
	BTRFS_I(inode)->delalloc_extents--;

	return 0;
}
/*
* extent_io.c set_bit_hook, used to track delayed allocation
* bytes in this file, and to maintain the list of inodes that
......@@ -1167,6 +1244,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
unsigned long old, unsigned long bits)
{
/*
* set_bit and clear bit hooks normally require _irqsave/restore
* but in this case, we are only testeing for the DELALLOC
......@@ -1174,6 +1252,8 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
*/
if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
BTRFS_I(inode)->delalloc_extents++;
btrfs_delalloc_reserve_space(root, inode, end - start + 1);
spin_lock(&root->fs_info->delalloc_lock);
BTRFS_I(inode)->delalloc_bytes += end - start + 1;
......@@ -1190,22 +1270,27 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end,
/*
* extent_io.c clear_bit_hook, see set_bit_hook for why
*/
static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
unsigned long old, unsigned long bits)
static int btrfs_clear_bit_hook(struct inode *inode,
struct extent_state *state, unsigned long bits)
{
/*
* set_bit and clear bit hooks normally require _irqsave/restore
* but in this case, we are only testeing for the DELALLOC
* bit, which is only set or cleared with irqs on
*/
if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
struct btrfs_root *root = BTRFS_I(inode)->root;
BTRFS_I(inode)->delalloc_extents--;
btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
spin_lock(&root->fs_info->delalloc_lock);
if (end - start + 1 > root->fs_info->delalloc_bytes) {
if (state->end - state->start + 1 >
root->fs_info->delalloc_bytes) {
printk(KERN_INFO "btrfs warning: delalloc account "
"%llu %llu\n",
(unsigned long long)end - start + 1,
(unsigned long long)
state->end - state->start + 1,
(unsigned long long)
root->fs_info->delalloc_bytes);
btrfs_delalloc_free_space(root, inode, (u64)-1);
......@@ -1213,9 +1298,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end,
BTRFS_I(inode)->delalloc_bytes = 0;
} else {
btrfs_delalloc_free_space(root, inode,
end - start + 1);
root->fs_info->delalloc_bytes -= end - start + 1;
BTRFS_I(inode)->delalloc_bytes -= end - start + 1;
state->end -
state->start + 1);
root->fs_info->delalloc_bytes -= state->end -
state->start + 1;
BTRFS_I(inode)->delalloc_bytes -= state->end -
state->start + 1;
}
if (BTRFS_I(inode)->delalloc_bytes == 0 &&
!list_empty(&BTRFS_I(inode)->delalloc_inodes)) {
......@@ -2950,7 +3038,12 @@ static int btrfs_truncate_page(struct address_space *mapping, loff_t from)
goto again;
}
btrfs_set_extent_delalloc(inode, page_start, page_end);
ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
if (ret) {
unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
goto out_unlock;
}
ret = 0;
if (offset != PAGE_CACHE_SIZE) {
kaddr = kmap(page);
......@@ -2981,15 +3074,11 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
u64 last_byte;
u64 cur_offset;
u64 hole_size;
int err;
int err = 0;
if (size <= hole_start)
return 0;
err = btrfs_check_metadata_free_space(root);
if (err)
return err;
btrfs_truncate_page(inode->i_mapping, inode->i_size);
while (1) {
......@@ -3024,12 +3113,18 @@ int btrfs_cont_expand(struct inode *inode, loff_t size)
cur_offset, &hint_byte, 1);
if (err)
break;
err = btrfs_reserve_metadata_space(root, 1);
if (err)
break;
err = btrfs_insert_file_extent(trans, root,
inode->i_ino, cur_offset, 0,
0, hole_size, 0, hole_size,
0, 0, 0);
btrfs_drop_extent_cache(inode, hole_start,
last_byte - 1, 0);
btrfs_unreserve_metadata_space(root, 1);
}
free_extent_map(em);
cur_offset = last_byte;
......@@ -3990,11 +4085,18 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
if (!new_valid_dev(rdev))
return -EINVAL;
err = btrfs_check_metadata_free_space(root);
/*
* 2 for inode item and ref
* 2 for dir items
* 1 for xattr if selinux is on
*/
err = btrfs_reserve_metadata_space(root, 5);
if (err)
goto fail;
return err;
trans = btrfs_start_transaction(root, 1);
if (!trans)
goto fail;
btrfs_set_trans_block_group(trans, dir);
err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
......@@ -4032,6 +4134,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
fail:
btrfs_unreserve_metadata_space(root, 5);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
......@@ -4052,10 +4155,18 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
u64 objectid;
u64 index = 0;
err = btrfs_check_metadata_free_space(root);
/*
* 2 for inode item and ref
* 2 for dir items
* 1 for xattr if selinux is on
*/
err = btrfs_reserve_metadata_space(root, 5);
if (err)
goto fail;
return err;
trans = btrfs_start_transaction(root, 1);
if (!trans)
goto fail;
btrfs_set_trans_block_group(trans, dir);
err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
......@@ -4096,6 +4207,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
fail:
btrfs_unreserve_metadata_space(root, 5);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
......@@ -4118,10 +4230,16 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
if (inode->i_nlink == 0)
return -ENOENT;
btrfs_inc_nlink(inode);
err = btrfs_check_metadata_free_space(root);
/*
* 1 item for inode ref
* 2 items for dir items
*/
err = btrfs_reserve_metadata_space(root, 3);
if (err)
goto fail;
return err;
btrfs_inc_nlink(inode);
err = btrfs_set_inode_index(dir, &index);
if (err)
goto fail;
......@@ -4145,6 +4263,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
fail:
btrfs_unreserve_metadata_space(root, 3);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
......@@ -4164,17 +4283,21 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
u64 index = 0;
unsigned long nr = 1;
err = btrfs_check_metadata_free_space(root);
/*
* 2 items for inode and ref
* 2 items for dir items
* 1 for xattr if selinux is on
*/
err = btrfs_reserve_metadata_space(root, 5);
if (err)
goto out_unlock;
return err;
trans = btrfs_start_transaction(root, 1);
btrfs_set_trans_block_group(trans, dir);
if (IS_ERR(trans)) {
err = PTR_ERR(trans);
if (!trans) {
err = -ENOMEM;
goto out_unlock;
}
btrfs_set_trans_block_group(trans, dir);
err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
if (err) {
......@@ -4223,6 +4346,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
btrfs_end_transaction_throttle(trans, root);
out_unlock:
btrfs_unreserve_metadata_space(root, 5);
if (drop_on_err)
iput(inode);
btrfs_btree_balance_dirty(root, nr);
......@@ -4747,6 +4871,13 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
goto out;
}
ret = btrfs_reserve_metadata_for_delalloc(root, inode, 1);
if (ret) {
btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE);
ret = VM_FAULT_SIGBUS;
goto out;
}
ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
again:
lock_page(page);
......@@ -4778,7 +4909,12 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
goto again;
}
btrfs_set_extent_delalloc(inode, page_start, page_end);
ret = btrfs_set_extent_delalloc(inode, page_start, page_end);
if (ret) {
unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
ret = VM_FAULT_SIGBUS;
goto out_unlock;
}
ret = 0;
/* page is wholly or partially inside EOF */
......@@ -4801,6 +4937,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
out_unlock:
btrfs_unreserve_metadata_for_delalloc(root, inode, 1);
if (!ret)
return VM_FAULT_LOCKED;
unlock_page(page);
......@@ -4917,6 +5054,8 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
return NULL;
ei->last_trans = 0;
ei->logged_trans = 0;
ei->delalloc_extents = 0;
ei->delalloc_reserved_extents = 0;
btrfs_ordered_inode_tree_init(&ei->ordered_tree);
INIT_LIST_HEAD(&ei->i_orphan);
INIT_LIST_HEAD(&ei->ordered_operations);
......@@ -5070,7 +5209,12 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
new_inode->i_size > BTRFS_EMPTY_DIR_SIZE)
return -ENOTEMPTY;
ret = btrfs_check_metadata_free_space(root);
/*
* 2 items for dir items
* 1 item for orphan entry
* 1 item for ref
*/
ret = btrfs_reserve_metadata_space(root, 4);
if (ret)
return ret;
......@@ -5185,6 +5329,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID)
up_read(&root->fs_info->subvol_sem);
btrfs_unreserve_metadata_space(root, 4);
return ret;
}
......@@ -5256,11 +5402,18 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root))
return -ENAMETOOLONG;
err = btrfs_check_metadata_free_space(root);
/*
* 2 items for inode item and ref
* 2 items for dir items
* 1 item for xattr if selinux is on
*/
err = btrfs_reserve_metadata_space(root, 5);
if (err)
goto out_fail;
return err;
trans = btrfs_start_transaction(root, 1);
if (!trans)
goto out_fail;
btrfs_set_trans_block_group(trans, dir);
err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid);
......@@ -5341,6 +5494,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
nr = trans->blocks_used;
btrfs_end_transaction_throttle(trans, root);
out_fail:
btrfs_unreserve_metadata_space(root, 5);
if (drop_inode) {
inode_dec_link_count(inode);
iput(inode);
......@@ -5362,6 +5516,11 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
while (num_bytes > 0) {
alloc_size = min(num_bytes, root->fs_info->max_extent);
ret = btrfs_reserve_metadata_space(root, 1);
if (ret)
goto out;
ret = btrfs_reserve_extent(trans, root, alloc_size,
root->sectorsize, 0, alloc_hint,
(u64)-1, &ins, 1);
......@@ -5381,6 +5540,7 @@ static int prealloc_file_range(struct btrfs_trans_handle *trans,
num_bytes -= ins.offset;
cur_offset += ins.offset;
alloc_hint = ins.objectid + ins.offset;
btrfs_unreserve_metadata_space(root, 1);
}
out:
if (cur_offset > start) {
......@@ -5566,6 +5726,8 @@ static struct extent_io_ops btrfs_extent_io_ops = {
.readpage_io_failed_hook = btrfs_io_failed_hook,
.set_bit_hook = btrfs_set_bit_hook,
.clear_bit_hook = btrfs_clear_bit_hook,
.merge_extent_hook = btrfs_merge_extent_hook,
.split_extent_hook = btrfs_split_extent_hook,
};
/*
......
......@@ -239,7 +239,13 @@ static noinline int create_subvol(struct btrfs_root *root,
u64 index = 0;
unsigned long nr = 1;
ret = btrfs_check_metadata_free_space(root);
/*
* 1 - inode item
* 2 - refs
* 1 - root item
* 2 - dir items
*/
ret = btrfs_reserve_metadata_space(root, 6);
if (ret)
return ret;
......@@ -340,6 +346,9 @@ static noinline int create_subvol(struct btrfs_root *root,
err = btrfs_commit_transaction(trans, root);
if (err && !ret)
ret = err;
btrfs_unreserve_metadata_space(root, 6);
btrfs_btree_balance_dirty(root, nr);
return ret;
}
......@@ -355,19 +364,27 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry,
if (!root->ref_cows)
return -EINVAL;
ret = btrfs_check_metadata_free_space(root);
/*
* 1 - inode item
* 2 - refs
* 1 - root item
* 2 - dir items
*/
ret = btrfs_reserve_metadata_space(root, 6);
if (ret)
goto fail_unlock;
pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS);
if (!pending_snapshot) {
ret = -ENOMEM;
btrfs_unreserve_metadata_space(root, 6);
goto fail_unlock;
}
pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS);
if (!pending_snapshot->name) {
ret = -ENOMEM;
kfree(pending_snapshot);
btrfs_unreserve_metadata_space(root, 6);
goto fail_unlock;
}
memcpy(pending_snapshot->name, name, namelen);
......
......@@ -186,6 +186,9 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
h->alloc_exclude_start = 0;
h->delayed_ref_updates = 0;
if (!current->journal_info)
current->journal_info = h;
root->fs_info->running_transaction->use_count++;
record_root_in_trans(h, root);
mutex_unlock(&root->fs_info->trans_mutex);
......@@ -317,6 +320,9 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
wake_up(&cur_trans->writer_wait);
put_transaction(cur_trans);
mutex_unlock(&info->trans_mutex);
if (current->journal_info == trans)
current->journal_info = NULL;
memset(trans, 0, sizeof(*trans));
kmem_cache_free(btrfs_trans_handle_cachep, trans);
......@@ -743,6 +749,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
memcpy(&pending->root_key, &key, sizeof(key));
fail:
kfree(new_root_item);
btrfs_unreserve_metadata_space(root, 6);
return ret;
}
......@@ -1059,6 +1066,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
mutex_unlock(&root->fs_info->trans_mutex);
if (current->journal_info == trans)
current->journal_info = NULL;
kmem_cache_free(btrfs_trans_handle_cachep, trans);
return ret;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment